In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from src.load_data import load_data
from src.explore import missing_values_check,explore_categorical_columns,explore_numerical_columns




# Load Dataset and Explore

In [19]:
train_values=load_data("data/raw_data/train_values.csv")
test_values=load_data("data/raw_data/test_values.csv")
train_labels=load_data("data/raw_data/train_labels.csv")

In [20]:
train_values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             260601 non-null  int64 
 1   geo_level_1_id                          260601 non-null  int64 
 2   geo_level_2_id                          260601 non-null  int64 
 3   geo_level_3_id                          260601 non-null  int64 
 4   count_floors_pre_eq                     260601 non-null  int64 
 5   age                                     260601 non-null  int64 
 6   area_percentage                         260601 non-null  int64 
 7   height_percentage                       260601 non-null  int64 
 8   land_surface_condition                  260601 non-null  object
 9   foundation_type                         260601 non-null  object
 10  roof_type                               260601 non-null 

In [23]:
explore_numerical_columns(train_values)

Index(['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
       'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_secondary_use_health_post',
       'has_secondary_use_gov_office', 'has_secondary_use_use_police',
       'has_secondary_use_other'],
      dtype='object')

In [24]:
explore_categorical_columns(train_values)

Index(['land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'legal_ownership_status'],
      dtype='object')

In [25]:
missing_values_check(train_values)

there is no missing value


In [26]:
missing_values_check(train_labels)

there is no missing value


# Converting Categorical data to Numerical 

In [80]:
# Create a LabelEncoder object
le = LabelEncoder()

for c in col:
    print(c)
    le.fit(train_values[c])
    # Fit and transform the categorical data in train 
    train_values[c]=le.transform(train_values[c])
    # Fit and transform the categorical data in test
    test_values[c]=le.transform(test_values[c])

land_surface_condition
foundation_type
roof_type
ground_floor_type
other_floor_type
position
plan_configuration
legal_ownership_status


In [93]:
train_labels['damage_grade'].unique()

array([3, 2, 1])

In [106]:
data=pd.merge(train_values,train_labels,on='building_id')

# Creating Model and Train

In [108]:
data

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,6,487,12198,2,30,6,5,2,2,...,0,0,0,0,0,0,0,0,0,3
1,28830,8,900,2812,2,10,8,7,1,2,...,0,0,0,0,0,0,0,0,0,2
2,94947,21,363,8973,2,10,5,5,2,2,...,0,0,0,0,0,0,0,0,0,3
3,590882,22,418,10694,2,10,6,5,2,2,...,0,0,0,0,0,0,0,0,0,2
4,201944,11,131,1488,3,30,8,9,2,2,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,25,1335,1621,1,55,6,3,0,2,...,0,0,0,0,0,0,0,0,0,2
260597,669485,17,715,2060,2,0,6,5,2,2,...,0,0,0,0,0,0,0,0,0,3
260598,602512,17,51,8163,3,55,6,7,2,2,...,0,0,0,0,0,0,0,0,0,3
260599,151409,26,39,1851,2,10,14,6,2,2,...,0,0,0,0,0,0,0,0,0,2


In [109]:
data=data.drop('building_id',axis=1)
X = data.iloc[:, :-1].values
y = data['damage_grade']

In [126]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.20)

In [140]:
classifier = KNeighborsClassifier(n_neighbors=7)
classifier.fit(X_train, y_train) 

# Testing Model Accuracy

In [141]:
y_predict = classifier.predict(X_test)


In [142]:
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict)) 

[[ 2391  2333   242]
 [ 1319 23765  4560]
 [  212  6378 10921]]
              precision    recall  f1-score   support

           1       0.61      0.48      0.54      4966
           2       0.73      0.80      0.77     29644
           3       0.69      0.62      0.66     17511

    accuracy                           0.71     52121
   macro avg       0.68      0.64      0.65     52121
weighted avg       0.71      0.71      0.71     52121



In [143]:
predict_df=pd.DataFrame()
predict_df['test']=y_test
predict_df['predicted']=y_predict

In [144]:
predict_df

Unnamed: 0,test,predicted
111999,2,3
238595,3,2
253059,3,3
234521,3,3
253868,3,2
...,...,...
126710,2,2
135781,2,3
111083,3,3
231646,3,2
