## Import packages

In [1]:
import numpy as np
import pandas as pd

#sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

# custom methods
from src.load_data import load_data
from src.encoding import encoding,one_hot_encoding
from src.create_training_data import create_training_data
from src.cross_validate import cross_validate
from src.submit import submit
from sklearn.preprocessing import StandardScaler


In [2]:
# Notebook display settings: show every column when a DataFrame is rendered.
pd.options.display.max_columns = None

# Load Dataset

In [3]:
# Read the three raw CSV files (train values, train labels, test values)
# through the project's loader, in that order.
df_train_values, df_train_labels, df_test_values = map(
    load_data,
    (
        "data/raw_data/train_values.csv",
        "data/raw_data/train_labels.csv",
        "data/raw_data/test_values.csv",
    ),
)

## 3. Preprocessing

### 3.1 Encoding: Converting Categorical columns into numerical for Training and Test data

In [5]:
# Replace the train/test frames with the versions returned by the project's
# `encoding` helper.
# NOTE(review): presumably this converts the categorical columns to integer
# codes, as the section heading says — confirm in src/encoding.py.
# The inputs are overwritten, so re-running this cell feeds the already-encoded
# frames back into the helper.
df_train_values, df_test_values = encoding(df_train_values, df_test_values)

In [6]:
# Display the training frame returned by `encoding` to inspect the result.
df_train_values

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,2,2,0,0,1,3,2,1,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,1,2,0,3,1,2,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,2,2,0,0,3,3,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,2,2,0,0,3,2,2,0,1,0,0,0,0,1,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,2,2,0,0,3,2,2,1,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,25,1335,1621,1,55,6,3,0,2,0,0,0,2,7,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
260597,669485,17,715,2060,2,0,6,5,2,2,0,0,1,2,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
260598,602512,17,51,8163,3,55,6,7,2,2,1,0,1,2,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
260599,151409,26,39,1851,2,10,14,6,2,2,2,2,2,0,2,0,0,0,0,0,1,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0


### 3.2  Data Preparation for training

In [7]:
# Build the feature matrix and the target vector with the project helper.
# Earlier inline version, kept for reference (presumably what the helper
# does — verify against src/create_training_data.py):
# data = pd.merge(df_train_values, df_train_labels, on='building_id')
# X_train, y_train = data.iloc[:, 1: -1].values, data['damage_grade']
X_train, y_train = create_training_data(df_train_values, df_train_labels)

In [8]:
# Sanity check on the dimensions of the training matrix.
X_train.shape

(260601, 38)

# Standard Scaler

In [9]:
# Chain feature standardisation with a 5-nearest-neighbours classifier so the
# scaling fitted on the training data is re-applied automatically at predict time.
pipe = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=5),
)
# Fit both steps (scaler first, then the classifier) on the training data.
pipe.fit(X_train, y_train)

In [11]:
# Use every column except the first as test features
# (assumes the first column is the building_id identifier — TODO confirm).
X_test = df_test_values.iloc[:, 1:].to_numpy()
prediction = pipe.predict(X_test)

In [13]:
# Hand the test frame and the pipeline's predictions to the project's `submit` helper.
# NOTE(review): presumably this writes the submission file — confirm in src/submit.py.
submit(df_test_values, prediction)

# One hot encoding

In [13]:
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): this encoder instance is not used by any visible cell (the
# encoding below is done with pandas); it is kept so the name stays defined.
# Consider removing it and moving the import to the import cell.
one_hot_encoder = OneHotEncoder()

# One-hot encode every object-dtype column of each frame. dtype=int makes the
# indicator columns 0/1 integers directly instead of booleans.
one_hot_encoded_data_train = pd.get_dummies(
    df_train_values,
    columns=df_train_values.select_dtypes('object').columns,
    dtype=int,
)
one_hot_encoded_data_test = pd.get_dummies(
    df_test_values,
    columns=df_test_values.select_dtypes('object').columns,
    dtype=int,
)

# The two frames are encoded independently, so a category present in only one
# of them would give the frames different columns. Align the test frame to the
# training columns: indicators missing from the test data are filled with 0,
# indicators unseen during training are dropped, and the column order matches.
one_hot_encoded_data_test = one_hot_encoded_data_test.reindex(
    columns=one_hot_encoded_data_train.columns, fill_value=0
)
   

In [26]:
# Turn boolean indicator values into 0/1 in both one-hot frames.
bool_to_int = {True: 1, False: 0}
one_hot_encoded_data_test = one_hot_encoded_data_test.replace(bool_to_int)
one_hot_encoded_data_train = one_hot_encoded_data_train.replace(bool_to_int)

# Rebuild the training matrix and target from the one-hot encoded frame;
# this overwrites the X_train / y_train created earlier in the notebook.
X_train, y_train = create_training_data(
    one_hot_encoded_data_train,
    df_train_labels,
)

In [27]:
# Display the training matrix rebuilt from the one-hot encoded frame.
X_train

array([[    6,   487, 12198, ...,     0,     1,     0],
       [    8,   900,  2812, ...,     0,     1,     0],
       [   21,   363,  8973, ...,     0,     1,     0],
       ...,
       [   17,    51,  8163, ...,     0,     1,     0],
       [   26,    39,  1851, ...,     0,     1,     0],
       [   21,     9,  9101, ...,     0,     1,     0]])

## 4. Cross Validating - Find the best model
Here we also test StratifiedKFold, because the target values are not evenly distributed, as suggested in
[this Kaggle discussion](https://www.kaggle.com/discussions/general/237857).

### 4.1 Listing of potential Classifiers 

In [28]:
# Candidate estimators handed to the model-selection step. Only k-NN is active.
# The commented-out entries are alternatives; SVC, DecisionTreeClassifier and
# MLPClassifier are not imported in the visible import cell and would need an
# import before being enabled.
classifiers = [
    KNeighborsClassifier(n_neighbors=5),
    # SVC(kernel="linear", C=0.025, random_state=42),
    # DecisionTreeClassifier(max_depth=5, random_state=42),
    # RandomForestClassifier(
    #     max_depth=5, n_estimators=10, max_features=1, random_state=42
    # ),
    # MLPClassifier(alpha=1, max_iter=1000, random_state=42),
]

### 4.2 Finding best model among chosen classifiers

In [29]:
# Evaluate every candidate with the project's `cross_validate` helper, which
# returns the scores together with the selected model.
# NOTE(review): the fold scheme and metric are defined in src/cross_validate.py —
# confirm there (the section text mentions StratifiedKFold).
scores, best_model = cross_validate(classifiers, X_train, y_train)
print(scores)

[0.7049051968379323]


## 5. Train the best model

In [23]:
# Fit the selected model on the current X_train / y_train.
best_model.fit(X_train, y_train)

## Make Predictions on test data

In [None]:
# best_model was fitted on features built from one_hot_encoded_data_train, so
# the test features must come from the matching one-hot frame rather than from
# df_test_values. Reindexing to the training columns guarantees the same
# columns in the same order: indicators absent from the test data become 0 and
# indicators unseen during training are dropped.
test_features = one_hot_encoded_data_test.reindex(
    columns=one_hot_encoded_data_train.columns, fill_value=0
)

# Drop the first column, as the original slicing did
# (assumes it is the building_id identifier — TODO confirm against
# create_training_data, which must drop the same column for training).
X_test = test_features.iloc[:, 1:].values
y_predict = best_model.predict(X_test)

In [25]:
# Display the predicted labels for the test data.
y_predict

array([3, 2, 2, ..., 2, 2, 1])

## Bring predictions into the right format

In [26]:
# Pass the test frame and the best model's predictions to the project's
# `submit` helper (presumably writes the submission file — verify in src/submit.py).
submit(df_test_values, y_predict)