In [25]:
%matplotlib inline

from tqdm import tqdm_notebook as tqdm
from pathlib import Path

import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Evaluation
from scipy.stats import iqr
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# for preprocessing the data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# the model
from sklearn import svm
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [26]:
X = pd.read_csv('train_values.csv', index_col = 'building_id')
y = pd.read_csv('train_labels.csv', index_col = 'building_id')

X_original = X
X.head()

# Original data X
X = X.merge(y, on = 'building_id')

In [28]:
y = np.asarray(y)
y = y.ravel()
# sns.countplot(y)
# plt.hist(X['age'], bins = 120)

# PREPROCESSING DATA

In [29]:
unique = X['age'].unique()
unique[::-1].sort()
unique
age = X['age'].values.tolist()
new_age = [200 if x >= 200 else x for x in age]
X.drop(['age'], axis = 1, inplace = True)
new_age = np.array(new_age)
age = (new_age - np.mean(new_age)) / np.std(new_age)
X['age'] = age.T
# sns.distplot(X['age'], bins = 15, kde = True)

In [30]:
# sns.distplot(X['count_floors_pre_eq'], bins = 25, kde = True)
X['count_floors_pre_eq'].value_counts()
floors = X['count_floors_pre_eq'].values.tolist()
new_floors = [5 if x >= 5 else x for x in floors]
new_floors = np.array(new_floors)
# sns.distplot(new_floors, bins = 20, kde = True)
floors = (new_floors - np.mean(new_floors)) / np.std(new_floors)
X.drop(['count_floors_pre_eq'], axis = 1, inplace = True)
X['count_floors_pre_eq'] = floors.T

In [31]:
# plt.hist(X['area_percentage'], bins = 10)
ap = X['area_percentage'].values
ap = (ap - min(ap)) / (max(ap) - min(ap))
X.drop(['area_percentage'], axis = 1, inplace = True)
X['area_percentage'] = ap.T
# sns.distplot(X['area_percentage'], bins = 10)

In [32]:
# plt.hist(X['height_percentage'], bins = 25)
hp = X['height_percentage'].values
hp = (hp - min(hp)) / (max(hp) - min(hp))
X.drop(['height_percentage'], axis = 1, inplace = True)
X['height_percentage'] = hp.T
# sns.distplot(X['height_percentage'], bins = 10)

In [33]:
# sns.distplot(X['geo_level_1_id'], bins = 21, kde = True)
cuts = pd.cut(X['geo_level_1_id'], 21)
X.drop(['geo_level_1_id'], axis = 1, inplace = True)
le = LabelEncoder()
cuts = le.fit_transform(cuts)
X['geo_level_1_id'] = cuts.T

In [34]:
# sns.distplot(X['geo_level_2_id'], bins = 23)
cuts = pd.cut(X['geo_level_2_id'], 23)
X.drop(['geo_level_2_id'], axis = 1, inplace = True)
le = LabelEncoder()
cuts = le.fit_transform(cuts)
X['geo_level_2_id'] = cuts.T

In [35]:
# sns.distplot(X['geo_level_3_id'], bins = 21)
temp = X['geo_level_3_id'].values
temp = (temp - min(temp)) / (max(temp) - min(temp))
X.drop(['geo_level_3_id'], axis = 1, inplace = True)
X['geo_level_3_id'] = temp.T
# sns.distplot(X['count_families'], bins = 2)

In [36]:
X['count_families'].value_counts()
cf = X['count_families'].values.tolist()
cf_new = [4 if x >= 4 else x for x in cf]
X.drop(['count_families'], axis = 1, inplace = True)
X['count_families'] = np.array(cf_new).T
# sns.distplot(X['count_families'], bins = 2)

In [47]:
# Modified X after get_dummies
## Data after rescaled by StandardScaler
X = pd.get_dummies(X)
scaler = MinMaxScaler(feature_range=(0,1)).fit(X)
data_rescaled = scaler.fit_transform(X)
data_rescaled_df = pd.DataFrame(data_rescaled)
data_rescaled_df

Unnamed: 0,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade,age,count_floors_pre_eq,area_percentage,height_percentage,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_families,land_surface_condition_n,land_surface_condition_o,land_surface_condition_t,foundation_type_h,foundation_type_i,foundation_type_r,foundation_type_u,foundation_type_w,roof_type_n,roof_type_q,roof_type_x,ground_floor_type_f,ground_floor_type_m,ground_floor_type_v,ground_floor_type_x,ground_floor_type_z,other_floor_type_j,other_floor_type_q,other_floor_type_s,other_floor_type_x,position_j,position_o,position_s,position_t,plan_configuration_a,plan_configuration_c,plan_configuration_d,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-0.7,-0.5,-0.89899,-0.8,-0.6,-0.363636,0.941275,-0.5,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
1,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-0.9,-0.5,-0.858586,-0.666667,-0.5,0.272727,-0.552479,-0.5,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
2,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-0.9,-0.5,-0.919192,-0.8,0.4,-0.545455,0.428026,-0.5,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
3,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-0.9,-0.5,-0.89899,-0.8,0.5,-0.454545,0.701918,-0.5,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
4,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-0.7,0.0,-0.858586,-0.533333,-0.3,-0.818182,-0.763189,-0.5,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
5,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-0.9,-0.5,-0.838384,-0.8,-0.5,-0.272727,-0.030954,-0.5,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
6,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-0.75,-0.5,-0.959596,-0.866667,-0.4,-0.363636,0.920267,-0.5,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
7,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.5,-0.858586,-0.733333,0.3,-0.545455,0.947322,-0.5,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0
8,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-0.85,-0.5,-0.858586,-0.733333,-1.0,0.090909,0.148882,-0.5,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
9,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.757576,-0.866667,0.8,0.272727,-0.841808,-0.5,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0


In [48]:
X.head()

Unnamed: 0_level_0,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade,age,count_floors_pre_eq,area_percentage,height_percentage,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_families,land_surface_condition_n,land_surface_condition_o,land_surface_condition_t,foundation_type_h,foundation_type_i,foundation_type_r,foundation_type_u,foundation_type_w,roof_type_n,roof_type_q,roof_type_x,ground_floor_type_f,ground_floor_type_m,ground_floor_type_v,ground_floor_type_x,ground_floor_type_z,other_floor_type_j,other_floor_type_q,other_floor_type_s,other_floor_type_x,position_j,position_o,position_s,position_t,plan_configuration_a,plan_configuration_c,plan_configuration_d,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1
802906,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0.328046,-0.178069,0.050505,0.1,4,7,0.970637,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
28830,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,-0.523429,-0.178069,0.070707,0.166667,5,14,0.223761,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
94947,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,-0.523429,-0.178069,0.040404,0.1,14,5,0.714013,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
590882,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,-0.523429,-0.178069,0.050505,0.1,15,6,0.850959,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
201944,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0.328046,1.206659,0.070707,0.233333,7,2,0.118405,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0


# FEATURE SELECTION

In [40]:
# # Data with training cols and test cols
# X_sub = X.iloc[:,0:37]  #independent columns
# y = X.iloc[:,-1]    #target column i.e price range

In [None]:
# ## Univariate selection 
# ### apply SelectKBest class to extract top 10 best features
# bestfeatures = SelectKBest(score_func=chi2, k=20)

# fit = bestfeatures.fit(X_sub,y)

# dfscores = pd.DataFrame(fit.scores_)
# dfcolumns = pd.DataFrame(X_sub.columns)
# #concat two dataframes for better visualization 
# featureScores = pd.concat([dfcolumns,dfscores],axis=1)
# featureScores.columns = ['Specs','Score']  #naming the dataframe columns
# print(featureScores.nlargest(11,'Score'))  #print 10 best features

In [None]:
# ## Feature importance
# from sklearn.ensemble import ExtraTreesClassifier
# import matplotlib.pyplot as plt
# model = ExtraTreesClassifier()

# model.fit(X_sub,y)

# print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
# #plot graph of feature importances for better visualization
# feat_importances = pd.Series(model.feature_importances_, index=X_sub.columns)
# feat_importances.nlargest(10).plot(kind='barh')
# plt.show()

In [49]:
selected_features = ['has_secondary_use',
                    'roof_type_x', 
                    'geo_level_3_id', 
                    'geo_level_2_id', 
                    'area_percentage', 
                    'has_secondary_use', 
                    'foundation_type_i', 
                    'has_secondary_use_rental', 
                    'geo_level_1_id', 
                    'has_secondary_use_hotel']
X_subset = pd.get_dummies(X, columns = selected_features)

In [50]:
X_subset.head()

Unnamed: 0_level_0,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,has_secondary_use_agriculture,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade,age,count_floors_pre_eq,height_percentage,count_families,land_surface_condition_n,land_surface_condition_o,land_surface_condition_t,foundation_type_h,foundation_type_r,foundation_type_u,foundation_type_w,roof_type_n,roof_type_q,ground_floor_type_f,ground_floor_type_m,ground_floor_type_v,ground_floor_type_x,ground_floor_type_z,other_floor_type_j,other_floor_type_q,other_floor_type_s,other_floor_type_x,position_j,position_o,position_s,position_t,plan_configuration_a,plan_configuration_c,plan_configuration_d,plan_configuration_f,...,area_percentage_0.6363636363636364,area_percentage_0.6464646464646465,area_percentage_0.6565656565656566,area_percentage_0.6666666666666666,area_percentage_0.6868686868686869,area_percentage_0.696969696969697,area_percentage_0.7171717171717171,area_percentage_0.7272727272727273,area_percentage_0.7474747474747475,area_percentage_0.7575757575757576,area_percentage_0.7676767676767676,area_percentage_0.7777777777777778,area_percentage_0.797979797979798,area_percentage_0.8181818181818182,area_percentage_0.8282828282828283,area_percentage_0.8383838383838383,area_percentage_0.8484848484848485,area_percentage_0.8585858585858586,area_percentage_0.898989898989899,area_percentage_0.9595959595959596,area_percentage_1.0,has_secondary_use_0,has_secondary_use_1,foundation_type_i_0,foundation_type_i_1,has_secondary_use_rental_0,has_secondary_use_rental_1,geo_level_1_id_0,geo_level_1_id_1,geo_level_1_id_2,geo_level_1_id_3,geo_level_1_id_4,geo_level_1_id_5,geo_level_1_id_6,geo_level_1_id_7,geo_level_1_id_8,geo_level_1_id_9,geo_level_1_id_10,geo_level_1_id_11,geo_level_1_id_12,geo_level_1_id_13,geo_level_1_id_14,geo_level_1_id_15,geo_level_1_id_16,geo_level_1_id_17,geo_level_1_id_18,geo_level_1_id_19,geo_level_1_id_20,has_secondary_use_hotel_0,has_secondary_use_hotel_1
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
802906,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0.328046,-0.178069,0.1,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
28830,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,-0.523429,-0.178069,0.166667,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
94947,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,-0.523429,-0.178069,0.1,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
590882,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,2,-0.523429,-0.178069,0.1,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
201944,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0.328046,1.206659,0.233333,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [52]:
#START TRAINING DATA
X_train, X_val, y_train, y_val = train_test_split(X_subset, y, test_size = 0.20, random_state = 42)

In [None]:
# Decide what n_components for PCA
scaler = StandardScaler()
data_rescaled = scaler.fit_transform(X_train)

#Fitting the PCA algorithm with our Data
pca = PCA().fit(data_rescaled)

print(np.cumsum(pca.explained_variance_ratio_))

#Plotting the Cumulative Summation of the Explained Variance
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Pulsar Dataset Explained Variance')
plt.show()

In [None]:
# xgb = RandomForestClassifier(n_estimators = 300)
# xgb.fit(X_train, y_train)
# y_pred = xgb.predict(X_val)
# y_pred
# f1_score(y_val, y_pred, average = 'micro')

In [None]:
#Train the model
## RBF kernels
pipe_steps = [('pca', PCA()), ('SupVM', SVC(kernel='rbf'))]
pipe = Pipeline(pipe_steps)
param_grid = {'pca__n_components': [10],
              'SupVM__C': [0.1, 0.5, 1, 10, 30, 40, 50, 70, 100, 500, 1000],
              'SupVM__gamma': [0.001, 0.005, 0.01, 0.05, 0.07, 0.1, 0.5, 1, 5, 10, 50],
             }
print('Start fitting training data')

num_cv = 3
gs = GridSearchCV(pipe, param_grid, cv=num_cv)
gs.fit(X_train, y_train)
print("Best fit parameter for %d fold CV" % num_cv, gs.best_params_)

#Evaluate the model
from sklearn.metrics import f1_score

in_sample_preds = gs.predict(train_values_subset)
f1_score(train_labels, in_sample_preds, average='micro')

Start fitting training data


In [None]:
#READ TEST VALUES
X_test = pd.read_csv('test_values.csv')
building_id = X_test['building_id'].values.tolist()
X_test.head()

#PRE_PROCESSING DATA
## AGE
age = X_test['age'].values.tolist()
new_age = [200 if x >= 200 else x for x in age]
X_test.drop(['age'], axis = 1, inplace = True)
new_age = np.array(new_age)
age = (new_age - np.mean(new_age)) / np.std(new_age)
X_test['age'] = age.T

## COUNT_FLOORS
floors = X_test['count_floors_pre_eq'].values.tolist()
new_floors = [5 if x >= 5 else x for x in floors]
new_floors = np.array(new_floors)
floors = (new_floors - np.mean(new_floors)) / np.std(new_floors)
X_test.drop(['count_floors_pre_eq'], axis = 1, inplace = True)
X_test['count_floors_pre_eq'] = floors.T

## AREA PRECENTAGES
ap = X_test['area_percentage'].values
ap = (ap - min(ap)) / (max(ap) - min(ap))
X_test.drop(['area_percentage'], axis = 1, inplace = True)
X_test['area_percentage'] = ap.T

## HEIGHT PERCENTAGE
hp = X_test['height_percentage'].values
hp = (hp - min(hp)) / (max(hp) - min(hp))
X_test.drop(['height_percentage'], axis = 1, inplace = True)
X_test['height_percentage'] = hp.T

## GEO_LEVEL 1
cuts = pd.cut(X_test['geo_level_1_id'], 21)
X_test.drop(['geo_level_1_id'], axis = 1, inplace = True)
le = LabelEncoder()
cuts = le.fit_transform(cuts)
X_test['geo_level_1_id'] = cuts.T

## GEO_LEVEL 2
cuts = pd.cut(X_test['geo_level_2_id'], 23)
X_test.drop(['geo_level_2_id'], axis = 1, inplace = True)
le = LabelEncoder()
cuts = le.fit_transform(cuts)
X_test['geo_level_2_id'] = cuts.T

## GEO_LEVEL 3
temp = X_test['geo_level_3_id'].values
temp = (temp - min(temp)) / (max(temp) - min(temp))
X_test.drop(['geo_level_3_id'], axis = 1, inplace = True)
X_test['geo_level_3_id'] = temp.T

## COUNT_FAMILIES
cf = X_test['count_families'].values.tolist()
cf_new = [4 if x >= 4 else x for x in cf]
X_test.drop(['count_families'], axis = 1, inplace = True)
X_test['count_families'] = np.array(cf_new).T

category_cols = ['geo_level_1_id', 'geo_level_2_id', 'roof_type', 'foundation_type', 'land_surface_condition', 
                 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'legal_ownership_status']
X_test = pd.get_dummies(X_test, columns = category_cols)

X_test.drop(['building_id'], axis = 1, inplace = True)
X_test.head()

y_test = xgb.predict(X_test)

df = pd.DataFrame()
df['building_id'] = np.array(building_id).T
df['damage_grade'] = np.array(y_test).T
df

df.to_csv('solution.csv', index = False)