## Import packages

In [8]:
import numpy as np
import pandas as pd

#data exploration
from ydata_profiling import ProfileReport

#sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

# custom methods
from src.load_data import load_data
from src.encoding import encoding
from src.create_training_data import create_training_data
from src.cross_validate import cross_validate
from src.train import train
from src.submit import submit

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Set jupyter notebook options
pd.set_option('display.max_columns', None)

# 1. Load Dataset

In [10]:
df_train_values = load_data("data/raw_data/train_values.csv")
df_train_labels = load_data("data/raw_data/train_labels.csv")
df_test_values = load_data("data/raw_data/test_values.csv")

## 2. Exploration

In [13]:
profile =  ProfileReport(df_train_values, title="Earthquake Data")
profile.to_notebook_iframe()
profile.to_file("data/Explore_Earthquake_data.html")

Summarize dataset: 100%|██████████| 129/129 [00:45<00:00,  2.80it/s, Completed]                                             
Generate report structure: 100%|██████████| 1/1 [00:15<00:00, 15.77s/it]
Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.81s/it]


Export report to file: 100%|██████████| 1/1 [00:00<00:00, 57.93it/s]


## 3 Preprocessing

### 3.1 Encoding: Converting Categorical columns into numerical for Training and Test data

In [4]:
df_train_values, df_test_values = encoding(df_train_values, df_test_values)

In [5]:
df_train_values

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,2,2,0,0,1,3,2,1,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,1,2,0,3,1,2,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,2,2,0,0,3,3,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,2,2,0,0,3,2,2,0,1,0,0,0,0,1,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,2,2,0,0,3,2,2,1,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,25,1335,1621,1,55,6,3,0,2,0,0,0,2,7,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
260597,669485,17,715,2060,2,0,6,5,2,2,0,0,1,2,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
260598,602512,17,51,8163,3,55,6,7,2,2,1,0,1,2,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
260599,151409,26,39,1851,2,10,14,6,2,2,2,2,2,0,2,0,0,0,0,0,1,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0


### 3.2  Data Preparation for training

In [6]:
# data = pd.merge(df_train_values, df_train_labels, on='building_id')
# X_train, y_train = data.iloc[:, 1: -1].values, data['damage_grade']
X_train, y_train = create_training_data(df_train_values, df_train_labels)

## 4. Cross Validating - Find the best model
Here we test StratifiedKFold also since the target values are not evely distributed as suggested in 
[Link](https://www.kaggle.com/discussions/general/237857)

### 4.1 Listing of potential Classifiers 

In [13]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=42
    ),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
]

### 4.2 Finding best model among chosen classifiers

In [14]:
sample_train_X = X_train[:100]
sample_train_y = y_train[:100]

scores, best_model = cross_validate(classifiers, sample_train_X, sample_train_y)
print(scores)
cv_scores = cross_val_score(classifier, X, y, cv=StratifiedKFold(n_splits=5, shuffle=True))


[0.4600000000000001, 0.5399999999999999, 0.54, 0.58, 0.45999999999999996]


## 5. Train the best model

In [15]:
best_model.fit(X_train, y_train)

## Make Predictions on test data

In [16]:
X_test = df_test_values.iloc[:, 1:].values
y_predict = best_model.predict(X_test)

In [17]:
y_predict

array([2, 2, 2, ..., 2, 2, 2])

 ## Bring predictions into the right format

In [18]:
submit(df_test_values, y_predict )

In [65]:
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict)) 

NameError: name 'y_test' is not defined

In [None]:
predict_df=pd.DataFrame()
predict_df['test']=y_test
predict_df['predicted']=y_predict

In [None]:
predict_df

Unnamed: 0,test,predicted
111999,2,3
238595,3,2
253059,3,3
234521,3,3
253868,3,2
...,...,...
126710,2,2
135781,2,3
111083,3,3
231646,3,2
