In [783]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

# importing machine learning models for prediction
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC

In [784]:
pd.options.display.max_columns = 999

In [785]:
# Read data
train_values = pd.read_csv('data/train_values.csv')
train_labels = pd.read_csv('data/train_labels.csv')
test_values = pd.read_csv('data/test_values.csv')

In [786]:
train_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [787]:
train_values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             260601 non-null  int64 
 1   geo_level_1_id                          260601 non-null  int64 
 2   geo_level_2_id                          260601 non-null  int64 
 3   geo_level_3_id                          260601 non-null  int64 
 4   count_floors_pre_eq                     260601 non-null  int64 
 5   age                                     260601 non-null  int64 
 6   area_percentage                         260601 non-null  int64 
 7   height_percentage                       260601 non-null  int64 
 8   land_surface_condition                  260601 non-null  object
 9   foundation_type                         260601 non-null  object
 10  roof_type                               260601 non-null 

In [788]:
X = train_values.iloc[:,4:14]
X = pd.get_dummies(X, prefix_sep='_')
#X = StandardScaler().fit_transform(X)

In [789]:
X.head()

Unnamed: 0,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition_n,land_surface_condition_o,land_surface_condition_t,foundation_type_h,foundation_type_i,foundation_type_r,foundation_type_u,foundation_type_w,roof_type_n,roof_type_q,roof_type_x,ground_floor_type_f,ground_floor_type_m,ground_floor_type_v,ground_floor_type_x,ground_floor_type_z,other_floor_type_j,other_floor_type_q,other_floor_type_s,other_floor_type_x,position_j,position_o,position_s,position_t
0,2,30,6,5,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1
1,2,10,8,7,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0
2,2,10,5,5,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1
3,2,10,6,5,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0
4,3,30,8,9,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0


In [790]:
#PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale 
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error
from sklearn import model_selection

In [791]:
pca = PCA()
principalComponents = pca.fit_transform(scale(X))

# #define cross validation
# cv = RepeatedKFold(
#     n_splits=10, 
#     n_repeats=3,
#     random_state=1
#     )

# RF = RandomForestClassifier()
# mse = []

# #calculate MSE
# # score = -1 * model_selection.cross_val_score(
# #     RF,
# #     np.ones((len(principalComponents),1)), 
# #     train_labels.iloc[0:1000,:], 
# #     cv=cv,
# #     scoring='neg_mean_squared_error'
# #     ).mean()  
# # mse.append(score)

# for i in np.arange(1, 9):
#     score = -1 * model_selection.cross_val_score(
#         RF,
#         principalComponents[:,:i], 
#         train_labels.iloc[0:1000,:], 
#         cv=cv, 
#         scoring='neg_mean_squared_error'
#         ).mean()
#     mse.append(score)
    
    
# # Plot cross-validation results    
# plt.plot(mse)
# plt.xlabel('Number of Principal Components')
# plt.ylabel('MSE')
# plt.title('hp')

In [792]:
principalComponents.shape

(260601, 28)

In [793]:
PCA_df = pd.DataFrame(data=principalComponents[:,:2], columns=['PC1', 'PC2'])
PCA_df.shape

(260601, 2)

In [794]:
train_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [795]:
# Add PCA components to DF
# new_train_values = train_values.iloc[0:1000,:]
new_train_values = pd.concat([train_values, PCA_df], axis=1)

In [796]:
new_train_values

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,PC1,PC2
0,802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.857356,0.973888
1,28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.222276,-0.024820
2,94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.466756,0.269184
3,590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.821413,-0.685101
4,201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.565560,1.030754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,25,1335,1621,1,55,6,3,n,r,n,f,j,s,q,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.847747,-3.314194
260597,669485,17,715,2060,2,0,6,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-1.247525,-0.032507
260598,602512,17,51,8163,3,55,6,7,t,r,q,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.837444,1.655597
260599,151409,26,39,1851,2,10,14,6,t,r,x,v,s,j,d,0,0,0,0,0,1,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,6.315027,1.936951


In [797]:
# Drop columns used for PCA
# columns #4 ~ #14: "count_floors_pre_eq" ~ "plan_configuration"
drop_list = new_train_values.columns[4:15]
new_train_values.drop(drop_list, axis=1, inplace=True)

In [798]:
new_train_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,PC1,PC2
0,802906,6,487,12198,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.857356,0.973888
1,28830,8,900,2812,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.222276,-0.02482
2,94947,21,363,8973,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.466756,0.269184
3,590882,22,418,10694,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.821413,-0.685101
4,201944,11,131,1488,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.56556,1.030754


In [799]:
new_train_df = pd.merge(new_train_values, train_labels, on="building_id")

In [800]:
new_train_df.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,PC1,PC2,damage_grade
0,802906,6,487,12198,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.857356,0.973888,3
1,28830,8,900,2812,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.222276,-0.02482,2
2,94947,21,363,8973,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.466756,0.269184,3
3,590882,22,418,10694,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.821413,-0.685101,2
4,201944,11,131,1488,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.56556,1.030754,3


In [801]:
geo1_count = pd.DataFrame(new_train_df['geo_level_1_id'].value_counts().sort_index())
geo2_count = pd.DataFrame(new_train_df['geo_level_2_id'].value_counts().sort_index())
geo3_count = pd.DataFrame(new_train_df['geo_level_3_id'].value_counts().sort_index())

In [802]:
geo1_count['prob'] = geo1_count['geo_level_1_id'] / geo1_count.sum(axis=0)[0]
geo2_count['prob'] = geo2_count['geo_level_2_id'] / geo2_count.sum(axis=0)[0]
geo3_count['prob'] = geo3_count['geo_level_3_id'] / geo3_count.sum(axis=0)[0]

In [803]:
index_geo2 = list(range(1428))
a = pd.DataFrame()
a['id'] = index_geo2
geo2_count=pd.merge(a, geo2_count, how='left', left_on='id', right_index=True)
geo2_count=geo2_count.interpolate()

In [804]:
index_geo3 = list(range(12568))
b = pd.DataFrame()
b['id'] = index_geo3
geo3_count=pd.merge(b, geo3_count, how='left', left_on='id', right_index=True)
geo3_count=geo3_count.interpolate()

In [805]:
# Calculate conditional probability of damage_grade == 1,2,3 based on geo_level_1
cond_prob_geo = []

for i in range(len(new_train_df)):
    geo1 = new_train_df['geo_level_1_id'][i]
    geo2 = new_train_df['geo_level_2_id'][i]
    geo3 = new_train_df['geo_level_3_id'][i]

    prob1 = geo1_count[geo1_count.index==geo1].iloc[0,1]
    prob2 = geo2_count[geo2_count.index==geo2].iloc[0,2]
    prob3 = geo3_count[geo3_count.index==geo3].iloc[0,2]

    cond_prob_geo.append([prob1, prob2, prob3])

geo_prob_temp_df = pd.DataFrame(cond_prob_geo)
geo_prob_temp_df.columns = ['geo1_dam_prob', 'geo2_dam_prob', 'geo3_dam_prob']

new_train_df = pd.concat([new_train_df, geo_prob_temp_df], axis=1)

In [806]:
new_train_df.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,PC1,PC2,damage_grade,geo1_dam_prob,geo2_dam_prob,geo3_dam_prob
0,802906,6,487,12198,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.857356,0.973888,3,0.093557,0.001036,0.000142
1,28830,8,900,2812,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.222276,-0.02482,2,0.073215,0.000764,6.1e-05
2,94947,21,363,8973,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.466756,0.269184,3,0.057133,0.006754,0.000522
3,590882,22,418,10694,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.821413,-0.685101,2,0.023991,0.000787,0.000119
4,201944,11,131,1488,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.56556,1.030754,3,0.031542,0.003983,0.000468


In [807]:
new_train_df = pd.get_dummies(new_train_df, prefix_sep='_')
new_train_labels = new_train_df['damage_grade']
new_train_df.drop(['building_id','damage_grade','geo_level_1_id','geo_level_2_id','geo_level_3_id'], axis=1, inplace=True)

In [808]:
new_train_df.head()

Unnamed: 0,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,PC1,PC2,geo1_dam_prob,geo2_dam_prob,geo3_dam_prob,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,-0.857356,0.973888,0.093557,0.001036,0.000142,0,0,1,0
1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,-0.222276,-0.02482,0.073215,0.000764,6.1e-05,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,-0.466756,0.269184,0.057133,0.006754,0.000522,0,0,1,0
3,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,-0.821413,-0.685101,0.023991,0.000787,0.000119,0,0,1,0
4,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,-0.56556,1.030754,0.031542,0.003983,0.000468,0,0,1,0


In [809]:
# # Create geo_level == 1 Dataframe
# temp = []
# damage1 = pd.DataFrame()
# for i in range(1,4):
#     for j in range(0,31):
#         geo = len(new_train_df[new_train_df['damage_grade']==i][new_train_df['geo_level_1_id']==j])
#         temp.append(geo)
#     damage1[i] = pd.DataFrame(temp)
#     temp = []

# damage1.columns = ['dam1', 'dam2', 'dam3']

In [810]:
# # Create geo_level == 2 Dataframe
# temp = []
# damage2 = pd.DataFrame()
# for i in range(1,4):
#     for j in range(0,1428):
#         geo = len(new_train_df[new_train_df['damage_grade']==i][new_train_df['geo_level_2_id']==j])
#         temp.append(geo)
#     damage2[i] = pd.DataFrame(temp)
#     temp = []

# damage2.columns = ['dam1', 'dam2', 'dam3']

In [811]:
# # Create geo_level == 3 Dataframe
# temp = []
# damage3 = pd.DataFrame()
# for i in range(1,4):
#     for j in range(0,12562):
#         geo = len(new_train_df[new_train_df['damage_grade']==i][new_train_df['geo_level_3_id']==j])
#         temp.append(geo)
#     damage3[i] = pd.DataFrame(temp)
#     temp = []

# damage3.columns = ['dam1', 'dam2', 'dam3']

In [812]:
# geo1 = new_train_df['geo_level_1_id'][57]
# dam1 = new_train_df['damage_grade'][57] - 1
# prob = damage1.iloc[geo1,dam1] / geo1_count[geo1_count.index == geo1].iloc[0,0]
# geo1, dam1+1, damage1.iloc[geo1,dam1], geo1_count[geo1_count.index == geo1].iloc[0,0], prob

In [813]:
# Calculate conditional probability of damage_grade == 1,2,3 based on geo_level_1
# cond_prob_geo1 = []

# for i in range(len(new_train_df)):
#     geo11 = new_train_df['geo_level_1_id'][i]

#     temp = []
#     for j in range(0,3):     
#         dam = j
#         prob11 = damage1[damage1.index ==geo11].iloc[0,dam] / geo1_count[geo1_count.index == geo11].iloc[0,0]
#         temp.append(prob11)
    
#     cond_prob_geo1.append(temp)

# geo1_prob_temp_df = pd.DataFrame(cond_prob_geo1)
# geo1_prob_temp_df.columns = ['geo1_dam1_prob', 'geo1_dam2_prob', 'geo1_dam3_prob']

# new_train_df = pd.concat([new_train_df, geo1_prob_temp_df], axis=1)

In [814]:
# # Calculate conditional probability of damage_grade == 1,2,3 based on geo_level_2
# cond_prob_geo2 = []

# for i in range(len(new_train_df)):
#     geo11 = new_train_df['geo_level_2_id'][i]

#     temp = []
#     for j in range(0,3):     
#         dam = j
#         prob11 = damage2[damage2.index ==geo11].iloc[0,dam] / geo2_count[geo2_count.index == geo11].iloc[0,0]
#         temp.append(prob11)
    
#     cond_prob_geo2.append(temp)

# geo2_prob_temp_df = pd.DataFrame(cond_prob_geo2)
# geo2_prob_temp_df.columns = ['geo2_dam1_prob', 'geo2_dam2_prob', 'geo2_dam3_prob']

# new_train_df = pd.concat([new_train_df, geo2_prob_temp_df], axis=1)

In [815]:
############################
### Clean Test Values
############################

In [816]:
test_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,300051,17,596,11307,3,20,7,6,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,99355,6,141,11987,2,25,13,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,1,1,0,0,0,0,0,0,0,0,0
2,890251,22,19,10044,2,5,4,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,745817,26,39,633,1,0,19,3,t,r,x,v,j,t,d,0,0,0,0,0,1,0,0,0,0,0,v,2,1,0,0,1,0,0,0,0,0,0,0
4,421793,17,289,7970,3,15,8,7,t,r,q,f,q,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [817]:
test_values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86868 entries, 0 to 86867
Data columns (total 39 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   building_id                             86868 non-null  int64 
 1   geo_level_1_id                          86868 non-null  int64 
 2   geo_level_2_id                          86868 non-null  int64 
 3   geo_level_3_id                          86868 non-null  int64 
 4   count_floors_pre_eq                     86868 non-null  int64 
 5   age                                     86868 non-null  int64 
 6   area_percentage                         86868 non-null  int64 
 7   height_percentage                       86868 non-null  int64 
 8   land_surface_condition                  86868 non-null  object
 9   foundation_type                         86868 non-null  object
 10  roof_type                               86868 non-null  object
 11  gr

In [818]:
# PCA
X2 = test_values.iloc[:,4:14]
X2 = pd.get_dummies(X2, prefix_sep='_')

pca_test = PCA()
principalComponents_test = pca_test.fit_transform(scale(X2))

In [819]:
PCA_df_test = pd.DataFrame(data=principalComponents_test[:,:2], columns=['PC1', 'PC2'])
PCA_df_test.shape

(86868, 2)

In [820]:
test_values = pd.concat([test_values, PCA_df_test], axis=1)

In [821]:
test_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,PC1,PC2
0,300051,17,596,11307,3,20,7,6,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-1.151532,0.990749
1,99355,6,141,11987,2,25,13,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,1,1,0,0,0,0,0,0,0,0,0,-0.932726,0.173909
2,890251,22,19,10044,2,5,4,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-1.362276,-0.017848
3,745817,26,39,633,1,0,19,3,t,r,x,v,j,t,d,0,0,0,0,0,1,0,0,0,0,0,v,2,1,0,0,1,0,0,0,0,0,0,0,4.61113,-1.794479
4,421793,17,289,7970,3,15,8,7,t,r,q,f,q,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.235821,2.590723


In [822]:
# Drop columns used for PCA
# columns #4 ~ #14: "count_floors_pre_eq" ~ "plan_configuration"
drop_list2 = test_values.columns[4:15]
test_values.drop(drop_list2, axis=1, inplace=True)

In [823]:
test_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,PC1,PC2
0,300051,17,596,11307,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-1.151532,0.990749
1,99355,6,141,11987,0,1,0,0,0,0,0,0,0,0,0,v,1,1,1,0,0,0,0,0,0,0,0,0,-0.932726,0.173909
2,890251,22,19,10044,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-1.362276,-0.017848
3,745817,26,39,633,0,0,0,0,0,1,0,0,0,0,0,v,2,1,0,0,1,0,0,0,0,0,0,0,4.61113,-1.794479
4,421793,17,289,7970,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0,-0.235821,2.590723


In [824]:
# Calculate conditional probability of damage_grade == 1,2,3 based on geo_levels
cond_prob_geo = []

for i in range(0,12157):
    geo1 = test_values['geo_level_1_id'][i]
    geo2 = test_values['geo_level_2_id'][i]
    geo3 = test_values['geo_level_3_id'][i]

    prob1 = geo1_count[geo1_count.index==geo1].iloc[0,1]
    prob2 = geo2_count[geo2_count.index==geo2].iloc[0,2]
    prob3 = geo3_count[geo3_count.index==geo3].iloc[0,2]

    cond_prob_geo.append([prob1, prob2, prob3])

geo_prob_temp_df = pd.DataFrame(cond_prob_geo)
geo_prob_temp_df.columns = ['geo1_dam_prob', 'geo2_dam_prob', 'geo3_dam_prob']

test_values = pd.concat([test_values, geo_prob_temp_df], axis=1)

In [825]:
test_values = pd.get_dummies(test_values, prefix_sep='_')
test_values.drop(['building_id','geo_level_1_id','geo_level_2_id','geo_level_3_id'], axis=1, inplace=True)

In [826]:
geo3_count[geo3_count.index==1].iloc[0,2]


2.302370290213775e-05

In [827]:
geo3_count

Unnamed: 0,id,geo_level_3_id,prob
0,0,2.0,0.000008
1,1,6.0,0.000023
2,2,7.5,0.000029
3,3,9.0,0.000035
4,4,11.5,0.000044
...,...,...,...
12563,12563,24.0,0.000092
12564,12564,6.0,0.000023
12565,12565,7.0,0.000027
12566,12566,4.0,0.000015


In [828]:
new_train_df.shape

(260601, 32)

In [829]:
new_train_df.to_csv('train_feat_engineered.csv')
new_train_labels.to_csv('train_labels_feat_engineered.csv')
test_values.to_csv('test_feat_engineered.csv')