### IMPORTS

In [1]:
import pandas as pd
import numpy as np
import sklearn

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from xgboost import XGBRegressor
from xgboost import XGBClassifier

### READING CSVs

In [2]:
train_values = pd.read_pickle('../cleaned_train.pkl')  #CLEANED
train_labels = pd.read_csv('../TP1/train_labels.csv')
test_values = pd.read_pickle('../cleaned_test.pkl') #CLEANED

### FEATURE ENGINEERING 

In [72]:
train_values.columns

Index(['building_id', 'damage_grade', 'geo_level_1_id', 'geo_level_2_id',
       'geo_level_3_id', 'count_floors_pre_eq', 'age', 'area_percentage',
       'height_percentage', 'land_surface_condition', 'foundation_type',
       'roof_type', 'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has

In [73]:
selected_features = ['geo_level_1_id', 'geo_level_2_id',
       'geo_level_3_id', 'count_floors_pre_eq', 'age', 'area_percentage',
       'height_percentage', 'land_surface_condition', 'foundation_type',
       'roof_type', 'ground_floor_type', 'other_floor_type', 'position','count_families']

#ONE HOT ENCODING
train_values_subset = train_values[selected_features]
train_values_subset = pd.get_dummies(train_values_subset)

train_values_subset

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,count_families,land_surface_condition_n,land_surface_condition_o,...,ground_floor_type_x,ground_floor_type_z,other_floor_type_j,other_floor_type_q,other_floor_type_s,other_floor_type_x,position_j,position_o,position_s,position_t
0,6,487,12198,2,30,6,5,1,0,0,...,0,0,0,1,0,0,0,0,0,1
1,8,900,2812,2,10,8,7,1,0,1,...,1,0,0,1,0,0,0,0,1,0
2,21,363,8973,2,10,5,5,1,0,0,...,0,0,0,0,0,1,0,0,0,1
3,22,418,10694,2,10,6,5,1,0,0,...,0,0,0,0,0,1,0,0,1,0
4,11,131,1488,3,30,8,9,1,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,25,1335,1621,1,55,6,3,1,1,0,...,0,0,1,0,0,0,0,0,1,0
260597,17,715,2060,2,0,6,5,1,0,0,...,0,0,0,1,0,0,0,0,1,0
260598,17,51,8163,3,55,6,7,1,0,0,...,0,0,0,1,0,0,0,0,1,0
260599,26,39,1851,2,10,14,6,1,0,0,...,0,0,0,0,1,0,1,0,0,0


### TRYING XGBOOST WITH CROSS VALIDATION

In [5]:
X = train_values_subset.to_numpy()
y = train_labels.damage_grade.to_numpy()

In [6]:
xg_class = XGBClassifier()

In [7]:
scores = cross_val_score(xg_class, X, y=y, scoring='f1_micro')





















In [8]:
scores

array([0.72471748, 0.72054873, 0.7277053 , 0.72586339, 0.72551804])

### SIMPLE XGBOOST REGRESSOR WITH TRAIN SPLIT 

In [9]:
X = train_values_subset.to_numpy();
y = train_labels.damage_grade.to_numpy();
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
xg_reg = XGBRegressor(objective='reg:squarederror')

In [11]:
xg_reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=6, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [12]:
prediction = xg_reg.predict(X_test)

In [13]:
prediction = prediction.astype(int)

In [14]:
error = f1_score(y_test, prediction, average='micro')
error

0.5247788799140461

### TRYING WITH XGBOOST REGRESSOR WITH GRIDSEARCH 

In [15]:
param_grid = {'learning_rate': [0,1],
             'max_depth': [2,6],
             'n_estimators': [30,80]}

xg_reg = XGBRegressor(objective='reg:squarederror')
gs = GridSearchCV(xg_reg, param_grid, cv=10)

In [16]:
gs.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_par

In [17]:
gs.best_params_

{'learning_rate': 1, 'max_depth': 6, 'n_estimators': 80}

In [18]:
gs

GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_par

In [19]:
prediction = gs.predict(X_test)
prediction = prediction.astype(int)
prediction

array([2, 1, 1, ..., 2, 2, 1])

In [20]:
error = f1_score(y_test, prediction, average='micro')
error

0.5190230425356382

### NOW USING XGBOOST CLASSIFIER

In [60]:
xg_class = XGBClassifier()

X = train_values_subset.to_numpy();
y = train_labels.damage_grade.to_numpy();

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [61]:
xg_class.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=6, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [62]:
preds = xg_class.predict(X_test)
preds

array([3, 2, 2, ..., 2, 3, 2])

In [63]:
f1_score(y_test, preds, average='micro')

0.7251779513056158

### XGBOOST CLASSIFIER WITH GS

In [31]:
xg_class = XGBClassifier()

X = train_values_subset.to_numpy();
y = train_labels.damage_grade.to_numpy();

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
param_grid = {'learning_rate': [0,1],
             'max_depth': [2,10],
             'min_child_weight': [0,5],
             'gamma': [0,0.5],
             'colsample_bytree': [0.3,0.9],
             'n_estimators': [10,100]}

gs = GridSearchCV(xg_class, param_grid, cv=10)

In [33]:
gs.fit(X_train, y_train)













GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None

In [34]:
gs.best_params_

{'colsample_bytree': 0.3,
 'gamma': 0.5,
 'learning_rate': 1,
 'max_depth': 10,
 'min_child_weight': 0,
 'n_estimators': 100}

In [35]:
preds = gs.predict(X_test)
preds

array([3, 2, 2, ..., 2, 3, 2])

In [36]:
f1_score(y_test, preds, average='micro')

0.7329483317664665

In [None]:
xg_class.feature_importances_

### PREDICTION

In [74]:
xg_class = XGBClassifier(colsample_bytree = 0.3,
                         gamma = 0.5,
                         learning_rate = 1,
                         max_depth = 10,
                         min_child_weight = 0,
                         n_estimators = 100)

X = train_values_subset.to_numpy();
y = train_labels.damage_grade.to_numpy();

In [75]:
xg_class.fit(X, y)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0.5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=1, max_delta_step=0, max_depth=10,
              min_child_weight=0, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=6, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [81]:
test_values = test_values.set_index('building_id')
test_values_subset = test_values[selected_features]
test_values_subset = pd.get_dummies(test_values_subset)
test_values_subset

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,count_families,land_surface_condition_n,land_surface_condition_o,...,ground_floor_type_x,ground_floor_type_z,other_floor_type_j,other_floor_type_q,other_floor_type_s,other_floor_type_x,position_j,position_o,position_s,position_t
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,17,596,11307,3,20,7,6,1,0,0,...,0,0,0,1,0,0,0,0,1,0
99355,6,141,11987,2,25,13,5,1,0,0,...,0,0,0,1,0,0,0,0,1,0
890251,22,19,10044,2,5,4,5,1,0,0,...,0,0,0,1,0,0,0,0,1,0
745817,26,39,633,1,0,19,3,2,0,0,...,0,0,1,0,0,0,0,0,0,1
421793,17,289,7970,3,15,8,7,1,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310028,4,605,3623,3,70,20,6,1,0,0,...,0,0,0,1,0,0,0,0,0,1
663567,10,1407,11907,3,25,6,7,1,1,0,...,0,0,0,1,0,0,0,0,1,0
1049160,22,1136,7712,1,50,3,3,1,0,0,...,0,0,1,0,0,0,0,0,1,0
442785,6,1041,912,2,5,9,5,1,0,0,...,0,0,0,1,0,0,0,0,1,0


In [82]:
preds = xg_class.predict(test_values_subset)
preds

array([3, 2, 2, ..., 2, 2, 2])

In [83]:
submission_format = pd.read_csv('../TP1/submission_format.csv', index_col='building_id')
my_submission = pd.DataFrame(data=preds, columns=submission_format.columns, index=submission_format.index)
my_submission

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,1
421793,3
...,...
310028,2
663567,2
1049160,2
442785,2


In [85]:
my_submission.to_csv('Submits/XGBoostWithGSandFE.csv')