In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotnine as p9
import os

from lib.lib import Import_data

path = Import_data()
# get all files in the directory
files = os.listdir(path)
print(files)

Path to dataset files: C:\Users\kris\.cache\kagglehub\datasets\arashnic\earthquake-magnitude-damage-and-impact\versions\6
['csv_building_damage_assessment.csv', 'csv_building_ownership_and_use.csv', 'csv_building_structure.csv', 'csv_household_demographics.csv', 'csv_household_earthquake_impact.csv', 'csv_household_resources.csv', 'csv_individual_demographics.csv', 'mapping.csv', 'ward_vdcmun_district_name_mapping.csv']


In [14]:
building_structure = pd.read_csv(path + "/csv_building_structure.csv")
building_structure.keys()

Index(['building_id', 'district_id', 'vdcmun_id', 'ward_id',
       'count_floors_pre_eq', 'count_floors_post_eq', 'age_building',
       'plinth_area_sq_ft', 'height_ft_pre_eq', 'height_ft_post_eq',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'condition_post_eq', 'damage_grade', 'technical_solution_proposed'],
      dtype='object')

In [15]:
to_drop = ["district_id",
           "building_id",
           "vdcmun_id",
           "ward_id",
           "count_floors_post_eq",
           "height_ft_post_eq",
           "position",
           "condition_post_eq",
           "technical_solution_proposed"]

# Drop the specified columns
building_structure = building_structure.drop(columns=to_drop)

# Verify the columns have been dropped
print(building_structure.keys())


Index(['count_floors_pre_eq', 'age_building', 'plinth_area_sq_ft',
       'height_ft_pre_eq', 'land_surface_condition', 'foundation_type',
       'roof_type', 'ground_floor_type', 'other_floor_type',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'damage_grade'],
      dtype='object')


In [16]:
# Dropping rows with nan values
building_structure = building_structure.dropna(axis=0, how='any')

In [17]:
# Drop rows where 'building_age' is greater than 100
building_structure = building_structure[building_structure["age_building"] <= 100]

In [18]:
# Dropping floors above 6
building_structure = building_structure[building_structure["count_floors_pre_eq"] > 6]

In [19]:
# Dropping rows with plinith area above 1500
building_structure = building_structure[building_structure["plinth_area_sq_ft"] <= 1500]

In [20]:
# One hot encode the relevant columns
building_structure_onehot = pd.get_dummies(building_structure, columns=["foundation_type", "plan_configuration", 'land_surface_condition', 'roof_type', 'ground_floor_type', 'other_floor_type'] )

In [21]:
building_structure.keys()

Index(['count_floors_pre_eq', 'age_building', 'plinth_area_sq_ft',
       'height_ft_pre_eq', 'land_surface_condition', 'foundation_type',
       'roof_type', 'ground_floor_type', 'other_floor_type',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'damage_grade'],
      dtype='object')

In [22]:
# Changing damage categories into numerical values
building_structure["damage_grade"] = building_structure["damage_grade"].replace({"Grade 1": 1, "Grade 2": 2, "Grade 3": 3, "Grade 4": 4, "Grade 5": 5})



In [23]:
building_structure.describe()

Unnamed: 0,count_floors_pre_eq,age_building,plinth_area_sq_ft,height_ft_pre_eq,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,damage_grade
count,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0
mean,7.326316,11.842105,800.410526,53.863158,0.010526,0.136842,0.010526,0.042105,0.010526,0.378947,0.063158,0.0,0.4,0.368421,0.010526,2.157895
std,0.675437,15.461906,353.987243,21.115493,0.102598,0.345504,0.102598,0.201895,0.102598,0.487699,0.244537,0.0,0.492497,0.484935,0.102598,1.205623
min,7.0,0.0,155.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,7.0,4.0,497.5,48.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,7.0,6.0,740.0,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,7.0,12.0,1106.0,67.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,3.0
max,9.0,80.0,1500.0,90.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,5.0
