In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib.patches as mpatches
import seaborn as sb
from sklearn.metrics import f1_score


%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')
 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [6]:
train_values = pd.read_csv('../train_values.csv', index_col='building_id')
train_labels = pd.read_csv('../train_labels.csv', index_col='building_id')
cleaned_train_labels = pd.read_csv('../cleaned_train.csv')

In [22]:
train = pd.read_pickle('../cleaned_train.pkl')
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 0 to 260600
Data columns (total 40 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   building_id                             260601 non-null  int32   
 1   damage_grade                            260601 non-null  int8    
 2   geo_level_1_id                          260601 non-null  int8    
 3   geo_level_2_id                          260601 non-null  int16   
 4   geo_level_3_id                          260601 non-null  int16   
 5   count_floors_pre_eq                     260601 non-null  int8    
 6   age                                     260601 non-null  int16   
 7   area_percentage                         260601 non-null  int8    
 8   height_percentage                       260601 non-null  int8    
 9   land_surface_condition                  260601 non-null  category
 10  foundation_type                 

In [23]:
test = pd.read_pickle('../cleaned_test.pkl')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86868 entries, 0 to 86867
Data columns (total 39 columns):
 #   Column                                  Non-Null Count  Dtype   
---  ------                                  --------------  -----   
 0   building_id                             86868 non-null  int32   
 1   geo_level_1_id                          86868 non-null  int8    
 2   geo_level_2_id                          86868 non-null  int16   
 3   geo_level_3_id                          86868 non-null  int16   
 4   count_floors_pre_eq                     86868 non-null  int8    
 5   age                                     86868 non-null  int16   
 6   area_percentage                         86868 non-null  int8    
 7   height_percentage                       86868 non-null  int8    
 8   land_surface_condition                  86868 non-null  category
 9   foundation_type                         86868 non-null  category
 10  roof_type                               86868 

In [24]:
cat_cols = train.select_dtypes(include=['category']).columns.to_list()
cat_cols

['land_surface_condition',
 'foundation_type',
 'roof_type',
 'ground_floor_type',
 'other_floor_type',
 'position',
 'plan_configuration',
 'legal_ownership_status']

In [25]:
oneHot = pd.get_dummies(train[cat_cols], prefix=cat_cols)
oneHot.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,260591,260592,260593,260594,260595,260596,260597,260598,260599,260600
land_surface_condition_n,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,1
land_surface_condition_o,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
land_surface_condition_t,1,0,1,1,1,1,0,1,1,1,...,0,1,1,1,1,0,1,1,1,0
foundation_type_h,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
foundation_type_i,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
foundation_type_r,1,1,1,1,1,1,1,0,1,0,...,1,1,1,1,1,1,1,1,1,1
foundation_type_u,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
foundation_type_w,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
roof_type_n,1,1,1,1,1,1,1,0,0,1,...,1,1,1,1,1,1,1,0,0,1
roof_type_q,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0


In [26]:
train = train.drop(columns=cat_cols)
train

Unnamed: 0,building_id,damage_grade,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,3,6,487,12198,2,30,6,5,True,...,False,False,False,False,False,False,False,False,False,False
1,28830,2,8,900,2812,2,10,8,7,False,...,False,False,False,False,False,False,False,False,False,False
2,94947,3,21,363,8973,2,10,5,5,False,...,False,False,False,False,False,False,False,False,False,False
3,590882,2,22,418,10694,2,10,6,5,False,...,False,False,False,False,False,False,False,False,False,False
4,201944,3,11,131,1488,3,30,8,9,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,2,25,1335,1621,1,55,6,3,False,...,False,False,False,False,False,False,False,False,False,False
260597,669485,3,17,715,2060,2,0,6,5,False,...,False,False,False,False,False,False,False,False,False,False
260598,602512,3,17,51,8163,3,55,6,7,False,...,False,False,False,False,False,False,False,False,False,False
260599,151409,2,26,39,1851,2,10,14,6,False,...,False,False,False,False,False,False,False,False,False,False


In [27]:
train = pd.concat([train, oneHot], axis=1)
train

Unnamed: 0,building_id,damage_grade,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,802906,3,6,487,12198,2,30,6,5,True,...,0,0,0,0,0,0,0,0,1,0
1,28830,2,8,900,2812,2,10,8,7,False,...,0,0,0,0,0,0,0,0,1,0
2,94947,3,21,363,8973,2,10,5,5,False,...,0,0,0,0,0,0,0,0,1,0
3,590882,2,22,418,10694,2,10,6,5,False,...,0,0,0,0,0,0,0,0,1,0
4,201944,3,11,131,1488,3,30,8,9,True,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,2,25,1335,1621,1,55,6,3,False,...,0,0,0,1,0,0,0,0,1,0
260597,669485,3,17,715,2060,2,0,6,5,False,...,0,0,0,0,0,0,0,0,1,0
260598,602512,3,17,51,8163,3,55,6,7,False,...,0,0,0,0,0,0,0,0,1,0
260599,151409,2,26,39,1851,2,10,14,6,False,...,0,0,0,0,0,0,0,0,1,0


In [28]:
oneHot = pd.get_dummies(test[cat_cols], prefix=cat_cols)

test = test.drop(columns=cat_cols)

test = pd.concat([test, oneHot], axis=1)
test = test.set_index('building_id')
test

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,17,596,11307,3,20,7,6,False,True,False,...,0,0,0,0,0,0,0,0,1,0
99355,6,141,11987,2,25,13,5,False,True,False,...,0,0,0,0,0,0,0,0,1,0
890251,22,19,10044,2,5,4,5,False,True,False,...,0,0,0,0,0,0,0,0,1,0
745817,26,39,633,1,0,19,3,False,False,False,...,0,0,0,0,0,0,0,0,1,0
421793,17,289,7970,3,15,8,7,False,True,False,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310028,4,605,3623,3,70,20,6,False,True,False,...,0,0,0,0,0,0,0,0,0,1
663567,10,1407,11907,3,25,6,7,True,True,True,...,0,0,0,0,0,0,0,0,1,0
1049160,22,1136,7712,1,50,3,3,False,True,False,...,0,0,0,0,0,0,0,0,1,0
442785,6,1041,912,2,5,9,5,True,True,False,...,0,0,0,0,0,0,1,0,0,0


In [60]:
X = train.drop(columns='damage_grade')
X = X.set_index('building_id')
# X = X.iloc[:, :3]
X = X[['geo_level_1_id', 'geo_level_2_id','geo_level_3_id', 'age', 'area_percentage', 'height_percentage']]
X
X

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,age,area_percentage,height_percentage
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
802906,6,487,12198,30,6,5
28830,8,900,2812,10,8,7
94947,21,363,8973,10,5,5
590882,22,418,10694,10,6,5
201944,11,131,1488,30,8,9
...,...,...,...,...,...,...
688636,25,1335,1621,55,6,3
669485,17,715,2060,0,6,5
602512,17,51,8163,55,6,7
151409,26,39,1851,10,14,6


In [61]:
y = train.damage_grade
y

0         3
1         2
2         3
3         2
4         3
         ..
260596    2
260597    3
260598    3
260599    2
260600    3
Name: damage_grade, Length: 260601, dtype: int8

In [62]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=240)

In [63]:
# X = cleaned_train_labels[['geo_level_1_id','geo_level_2_id', 'geo_level_3_id']].values
# y = cleaned_train_labels['damage_grade'].values

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)

In [64]:
n_neighbors = 20
 
knn = KNeighborsClassifier(n_neighbors)
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_valid, y_valid)))

Accuracy of K-NN classifier on training set: 0.72
Accuracy of K-NN classifier on test set: 0.70


In [65]:
#Precision
pred = knn.predict(X_valid)
print(confusion_matrix(y_valid, pred))
print(classification_report(y_valid, pred))

[[ 1771  3042   285]
 [ 1068 24876  3826]
 [  159  7278  9816]]
              precision    recall  f1-score   support

           1       0.59      0.35      0.44      5098
           2       0.71      0.84      0.77     29770
           3       0.70      0.57      0.63     17253

    accuracy                           0.70     52121
   macro avg       0.67      0.58      0.61     52121
weighted avg       0.69      0.70      0.69     52121



In [66]:
f1_score(y_valid, pred, average='micro')

0.6995836610962951

In [67]:
pred

array([2, 2, 3, ..., 3, 2, 2], dtype=int8)

## Prediccion

In [79]:
test_pred = test[['geo_level_1_id', 'geo_level_2_id','geo_level_3_id', 'age', 'area_percentage', 'height_percentage']]
pred_final = knn.predict(test_pred)
pred_final

array([2, 1, 2, ..., 2, 2, 1], dtype=int8)

In [80]:
submission_format = pd.read_csv('../TP1/submission_format.csv', index_col='building_id')
submission_format

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,1
99355,1
890251,1
745817,1
421793,1
...,...
310028,1
663567,1
1049160,1
442785,1


In [81]:
my_submission = pd.DataFrame(data=pred_final, columns=submission_format.columns, index=submission_format.index)
my_submission

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,2
99355,1
890251,2
745817,2
421793,1
...,...
310028,2
663567,2
1049160,2
442785,2


In [82]:
my_submission.to_csv('Submits/KNN.csv')