In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from src.encoding import encoding


# Load Dataset and Explore

In [10]:
train_values=pd.read_csv("data/raw_data/train_values.csv")
train_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


In [11]:
train_labels=pd.read_csv("data/raw_data/train_labels.csv")
train_labels.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [12]:
train_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int64
 1   damage_grade  260601 non-null  int64
dtypes: int64(2)
memory usage: 4.0 MB


In [19]:
test_values=pd.read_csv("data/raw_data/test_values.csv")
test_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,300051,17,596,11307,3,20,7,6,t,r,...,0,0,0,0,0,0,0,0,0,0
1,99355,6,141,11987,2,25,13,5,t,r,...,1,0,0,0,0,0,0,0,0,0
2,890251,22,19,10044,2,5,4,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,745817,26,39,633,1,0,19,3,t,r,...,0,0,1,0,0,0,0,0,0,0
4,421793,17,289,7970,3,15,8,7,t,r,...,0,0,0,0,0,0,0,0,0,0


# Controlling Data and Searching Missing Values

In [14]:
train_values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             260601 non-null  int64 
 1   geo_level_1_id                          260601 non-null  int64 
 2   geo_level_2_id                          260601 non-null  int64 
 3   geo_level_3_id                          260601 non-null  int64 
 4   count_floors_pre_eq                     260601 non-null  int64 
 5   age                                     260601 non-null  int64 
 6   area_percentage                         260601 non-null  int64 
 7   height_percentage                       260601 non-null  int64 
 8   land_surface_condition                  260601 non-null  object
 9   foundation_type                         260601 non-null  object
 10  roof_type                               260601 non-null 

In [15]:
train_values.isnull().sum().sum()

0

In [16]:
train_values.select_dtypes("object").columns

Index(['land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'legal_ownership_status'],
      dtype='object')

In [77]:
col=train_values.select_dtypes("object").columns
for c in col:
    print(train_values[c].unique())

['t' 'o' 'n']
['r' 'w' 'i' 'u' 'h']
['n' 'q' 'x']
['f' 'x' 'v' 'z' 'm']
['q' 'x' 'j' 's']
['t' 's' 'j' 'o']
['d' 'u' 's' 'q' 'm' 'c' 'a' 'n' 'f' 'o']
['v' 'a' 'r' 'w']


# Converting Categorical data to Numerical 

In [80]:
from sklearn.preprocessing import LabelEncoder
 
le = LabelEncoder()
col = train_values.select_dtypes("object").columns
print(col)
for c in col:
    le.fit(train_values[c])
    # Fit and transform the categorical data in train set
    train_values[c]=le.transform(train_values[c])
     # Fit and transform the categorical data in test set
    test_values[c]=le.transform(test_values[c])
    

Index(['foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type',
       'position', 'plan_configuration', 'legal_ownership_status'],
      dtype='object')


In [22]:
a, b = encoding(train_values, test_values)

In [24]:
b

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,300051,17,596,11307,3,20,7,6,2,r,...,0,0,0,0,0,0,0,0,0,0
1,99355,6,141,11987,2,25,13,5,2,r,...,1,0,0,0,0,0,0,0,0,0
2,890251,22,19,10044,2,5,4,5,2,r,...,0,0,0,0,0,0,0,0,0,0
3,745817,26,39,633,1,0,19,3,2,r,...,0,0,1,0,0,0,0,0,0,0
4,421793,17,289,7970,3,15,8,7,2,r,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86863,310028,4,605,3623,3,70,20,6,2,r,...,1,0,0,0,0,0,0,0,0,0
86864,663567,10,1407,11907,3,25,6,7,0,r,...,0,0,0,0,0,0,0,0,0,0
86865,1049160,22,1136,7712,1,50,3,3,2,r,...,0,0,0,0,0,0,0,0,0,0
86866,442785,6,1041,912,2,5,9,5,2,r,...,0,0,0,0,0,0,0,0,0,0


In [93]:
a

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,2,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,1,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,2,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,2,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,2,r,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,25,1335,1621,1,55,6,3,0,r,...,0,0,0,0,0,0,0,0,0,0
260597,669485,17,715,2060,2,0,6,5,2,r,...,0,0,0,0,0,0,0,0,0,0
260598,602512,17,51,8163,3,55,6,7,2,r,...,0,0,0,0,0,0,0,0,0,0
260599,151409,26,39,1851,2,10,14,6,2,r,...,0,0,0,0,0,0,0,0,0,0


In [47]:
data=pd.merge(train_values,train_labels,on='building_id')

# Creating Model and Train

In [48]:
data

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,6,487,12198,2,30,6,5,2,2,...,0,0,0,0,0,0,0,0,0,3
1,28830,8,900,2812,2,10,8,7,1,2,...,0,0,0,0,0,0,0,0,0,2
2,94947,21,363,8973,2,10,5,5,2,2,...,0,0,0,0,0,0,0,0,0,3
3,590882,22,418,10694,2,10,6,5,2,2,...,0,0,0,0,0,0,0,0,0,2
4,201944,11,131,1488,3,30,8,9,2,2,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,25,1335,1621,1,55,6,3,0,2,...,0,0,0,0,0,0,0,0,0,2
260597,669485,17,715,2060,2,0,6,5,2,2,...,0,0,0,0,0,0,0,0,0,3
260598,602512,17,51,8163,3,55,6,7,2,2,...,0,0,0,0,0,0,0,0,0,3
260599,151409,26,39,1851,2,10,14,6,2,2,...,0,0,0,0,0,0,0,0,0,2


In [49]:
data=data.drop('building_id',axis=1)
X = data.iloc[:, :-1].values
y = data['damage_grade']

In [50]:
from sklearn.model_selection import cross_val_score

clf=KNeighborsClassifier(n_neighbors=3)
scores = cross_val_score(clf, X, y, cv=5)


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/mrv/opt/anaconda3/envs/dsr-38/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/mrv/opt/anaconda3/envs/dsr-38/lib/python3.12/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mrv/opt/anaconda3/envs/dsr-38/lib/python3.12/site-packages/sklearn/neighbors/_classification.py", line 233, in fit
    return self._fit(X, y)
           ^^^^^^^^^^^^^^^
  File "/Users/mrv/opt/anaconda3/envs/dsr-38/lib/python3.12/site-packages/sklearn/neighbors/_base.py", line 456, in _fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "/Users/mrv/opt/anaconda3/envs/dsr-38/lib/python3.12/site-packages/sklearn/base.py", line 622, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mrv/opt/anaconda3/envs/dsr-38/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1146, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/Users/mrv/opt/anaconda3/envs/dsr-38/lib/python3.12/site-packages/sklearn/utils/validation.py", line 915, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mrv/opt/anaconda3/envs/dsr-38/lib/python3.12/site-packages/sklearn/utils/_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'n'


In [126]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.20)

In [140]:
classifier = KNeighborsClassifier(n_neighbors=7)

#classifier.fit(X_train, y_train) 

# Testing Model Accuracy

In [141]:
y_predict = classifier.predict(X_test)


In [142]:
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict)) 

[[ 2391  2333   242]
 [ 1319 23765  4560]
 [  212  6378 10921]]
              precision    recall  f1-score   support

           1       0.61      0.48      0.54      4966
           2       0.73      0.80      0.77     29644
           3       0.69      0.62      0.66     17511

    accuracy                           0.71     52121
   macro avg       0.68      0.64      0.65     52121
weighted avg       0.71      0.71      0.71     52121



In [143]:
predict_df=pd.DataFrame()
predict_df['test']=y_test
predict_df['predicted']=y_predict

In [144]:
predict_df

Unnamed: 0,test,predicted
111999,2,3
238595,3,2
253059,3,3
234521,3,3
253868,3,2
...,...,...
126710,2,2
135781,2,3
111083,3,3
231646,3,2
