### This notebook contains the code to generate the preprocessed data for the "Richter's Predictor: Modeling Earthquake Damage" competition.

We are given the training data train_values.csv and train_labels.csv, and test data test_values.csv.

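This notebook one-hot encodes the categorical columns of the training and test values and saves the results as train_values_preprocessed.csv and test_values_preprocessed.csv. The labels are not modified, so they are not saved again.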

In [1]:
import pandas as pd

# Read the training features, the training labels and the test features.
X, y, X_test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_values.csv', 'train_labels.csv', 'test_values.csv')
)
X.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Preview the first five rows of the label frame.
y.head(n=5)

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [3]:
# Check for missing values: count the NaNs in each frame and fail loudly if
# any appear, so the "nothing missing" claim is re-verified on every run
# instead of relying on a one-off manual inspection.
missing_counts = pd.Series(
    {
        "X": int(X.isnull().sum().sum()),
        "y": int(y.isnull().sum().sum()),
        "X_test": int(X_test.isnull().sum().sum()),
    },
    name="missing_values",
)
assert (missing_counts == 0).all(), f"Unexpected missing values:\n{missing_counts}"
# Nothing missing!
missing_counts

In [5]:
# Categorical encoding: all categorical features are nominal, so first check
# that none of them has too many unique values to one-hot encode.
categorical_cols = [
    column_name
    for column_name, column_dtype in X.dtypes.items()
    if column_dtype == "object"
]
# Print "<column> <number of distinct values>" for every categorical column.
for column_name, n_unique in X[categorical_cols].nunique().items():
    print(column_name, n_unique)

land_surface_condition 3
foundation_type 5
roof_type 3
ground_floor_type 5
other_floor_type 4
position 4
plan_configuration 10
legal_ownership_status 4


In [6]:
# Cardinality is low, so one-hot encode every categorical column.
X = pd.get_dummies(X, columns=categorical_cols, dtype=int)
X_test = pd.get_dummies(X_test, columns=categorical_cols, dtype=int)
# get_dummies only creates columns for the categories present in the frame it
# is given, so encoding train and test separately can produce different column
# sets. Align the test frame to the training columns: dummies absent from the
# test set are added as all-zero columns, dummies for categories that never
# occur in the training set are dropped, and the column order matches X.
X_test = X_test.reindex(columns=X.columns, fill_value=0)
X.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,802906,6,487,12198,2,30,6,5,1,1,...,0,0,0,0,0,0,0,0,1,0
1,28830,8,900,2812,2,10,8,7,0,1,...,0,0,0,0,0,0,0,0,1,0
2,94947,21,363,8973,2,10,5,5,0,1,...,0,0,0,0,0,0,0,0,1,0
3,590882,22,418,10694,2,10,6,5,0,1,...,0,0,0,0,0,0,0,0,1,0
4,201944,11,131,1488,3,30,8,9,1,0,...,0,0,0,0,0,0,0,0,1,0


In [7]:
# Persist the encoded feature frames. The labels (y) were not changed, so they
# do not need to be written again.
for frame, out_path in (
    (X, 'train_values_preprocessed.csv'),
    (X_test, 'test_values_preprocessed.csv'),
):
    frame.to_csv(out_path, index=False)