In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [2]:
train_val = pd.read_csv('../input/train_val.csv', index_col=False)

In [3]:
train_val['damage_grade'].head()

0    3
1    2
2    3
3    2
4    3
Name: damage_grade, dtype: int64

In [4]:
train_val.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,legal_ownership_status_w,CntFloorAge,CntFloorsArea,CntFloorsHeight,AreaPerAge,HeightPerAge,AreaPerHeight,CntFamFloors,CntFamArea,CntFamHeight
0,802906,6,487,12198,2,30.0,6.0,5.0,1,1,...,0,0.066445,0.333333,0.4,0.199336,0.166113,1.2,0.5,0.166667,0.2
1,28830,8,900,2812,2,10.0,8.0,7.0,0,1,...,0,0.19802,0.25,0.285714,0.792079,0.693069,1.142857,0.5,0.125,0.142857
2,94947,21,363,8973,2,10.0,5.0,5.0,0,1,...,0,0.19802,0.4,0.4,0.49505,0.49505,1.0,0.5,0.2,0.2
3,590882,22,418,10694,2,10.0,6.0,5.0,0,1,...,0,0.19802,0.333333,0.4,0.594059,0.49505,1.2,0.5,0.166667,0.2
4,201944,11,131,1488,3,30.0,8.0,9.0,1,0,...,0,0.099668,0.375,0.333333,0.265781,0.299003,0.888889,0.333333,0.125,0.111111


In [5]:
train_val.drop('building_id', axis='columns', inplace=True)

In [6]:
train_val.describe()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,legal_ownership_status_w,CntFloorAge,CntFloorsArea,CntFloorsHeight,AreaPerAge,HeightPerAge,AreaPerHeight,CntFamFloors,CntFamArea,CntFamHeight
count,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,...,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0
mean,13.900353,701.074685,6257.876148,2.088626,21.514442,7.900967,5.414523,0.088645,0.761935,0.034332,...,0.010272,1.955842,0.317041,0.397145,8.919656,5.241824,1.595962,0.521725,0.15012,0.202842
std,8.033617,412.710734,3646.369645,0.625412,19.268494,3.727604,1.800279,0.284231,0.4259,0.182081,...,0.100831,5.830457,0.170883,0.08694,28.451033,15.802215,0.939023,0.290443,0.093597,0.112687
min,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,...,0.0,0.011099,0.047619,0.083333,0.011751,0.022198,0.083333,0.0,0.0,0.0
25%,7.0,350.0,3073.0,2.0,10.0,5.0,4.0,0.0,1.0,0.0,...,0.0,0.066445,0.2,0.333333,0.232558,0.179641,1.0,0.333333,0.1,0.142857
50%,12.0,702.0,6270.0,2.0,15.0,7.0,5.0,0.0,1.0,0.0,...,0.0,0.13245,0.285714,0.4,0.431894,0.298507,1.4,0.5,0.142857,0.2
75%,21.0,1050.0,9412.0,2.0,30.0,9.0,6.0,0.0,1.0,0.0,...,0.0,0.198675,0.4,0.5,0.980392,0.594059,1.857143,0.5,0.2,0.25
max,30.0,1427.0,12567.0,3.0,90.0,21.0,12.0,1.0,1.0,1.0,...,1.0,30.0,3.0,1.5,210.0,120.0,10.5,9.0,2.5,3.5


In [7]:
X = train_val.loc[:, train_val.columns != 'damage_grade']
y = train_val.loc[:, train_val.columns == 'damage_grade']

In [15]:
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
]

In [16]:
def model_and_test(X, y, classifiers):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 35)
    
    for model in classifiers:
        this_model = model.__class__.__name__
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print(f'{this_model} f1 score:')
        score = f1_score(y_test, y_pred, average='micro')
        print(f'{score:.4f}')
        print('\n')

In [17]:
model_and_test(X, y, classifiers)

KNeighborsClassifier f1 score:
0.7020


DecisionTreeClassifier f1 score:
0.6758


RandomForestClassifier f1 score:
0.7433


AdaBoostClassifier f1 score:
0.7482


GradientBoostingClassifier f1 score:
0.7614




In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 35)
    
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f'{model} f1 score:')
score = f1_score(y_test, y_pred, average='micro')
print(f'{score:.4f}')
print('\n')

RandomForestClassifier() f1 score:
0.7454




In [21]:
important_features = pd.DataFrame(model.feature_importances_, index = X_train.columns,
columns = ['importance']).sort_values('importance', ascending=False)

In [25]:
important_features.iloc[1:30, :]

Unnamed: 0,importance
prob2_geo3,0.100189
prob3_geo2,0.060624
prob2_geo2,0.054308
prob1_geo3,0.04569
AreaPerAge,0.045466
HeightPerAge,0.037239
AreaPerHeight,0.03276
prob1_geo2,0.032618
CntFloorAge,0.029521
CntFloorsArea,0.026981
