## Catboost

In [1]:
!pip install catboost



In [2]:
from catboost import CatBoostClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree

### Importamos los datos originales

In [3]:
df_train_labels = pd.read_csv('./../../datasets/train_labels.csv')
df_train_values = pd.read_csv('./../../datasets/train_values.csv')
df_test_values = pd.read_csv('./../../datasets/test_values.csv')
submission_format = pd.read_csv('./../../datasets/submission_format.csv', index_col='building_id')

### Pre_procesamiento de datos

In [4]:
CB_cat_train=df_train_values

#### Pasamos 'geo_level..' a str

In [5]:
CB_cat_train['geo_level_1_id'] =CB_cat_train['geo_level_1_id'].astype(str)
CB_cat_train['geo_level_2_id'] =CB_cat_train['geo_level_2_id'].astype(str)
CB_cat_train['geo_level_3_id'] =CB_cat_train['geo_level_3_id'].astype(str)

#### Creamos 2 nuevos features combinando los features has_superstructure en uno y has_secondary en otro

#### Has_superstructure¶


In [6]:
CB_cat_train['has_superstructure_adobe_mud']=\
    CB_cat_train['has_superstructure_adobe_mud'].replace([1,0],['a','n'])

CB_cat_train['has_superstructure_mud_mortar_stone']=\
    CB_cat_train['has_superstructure_mud_mortar_stone'].replace([1,0],['b','n'])

CB_cat_train['has_superstructure_stone_flag']=\
    CB_cat_train['has_superstructure_stone_flag'].replace([1,0],['c','n'])

CB_cat_train['has_superstructure_cement_mortar_stone']=\
    CB_cat_train['has_superstructure_cement_mortar_stone'].replace([1,0],['d','n'])

CB_cat_train['has_superstructure_mud_mortar_brick']=\
    CB_cat_train['has_superstructure_mud_mortar_brick'].replace([1,0],['e','n'])

CB_cat_train['has_superstructure_cement_mortar_brick']=\
    CB_cat_train['has_superstructure_cement_mortar_brick'].replace([1,0],['f','n'])

CB_cat_train['has_superstructure_timber']=\
    CB_cat_train['has_superstructure_timber'].replace([1,0],['g','n'])

CB_cat_train['has_superstructure_bamboo']=\
    CB_cat_train['has_superstructure_bamboo'].replace([1,0],['h','n'])

CB_cat_train['has_superstructure_rc_non_engineered']=\
    CB_cat_train['has_superstructure_rc_non_engineered'].replace([1,0],['i','n'])

CB_cat_train['has_superstructure_rc_engineered']=\
    CB_cat_train['has_superstructure_rc_engineered'].replace([1,0],['j','n'])

CB_cat_train['has_superstructure_other']=\
    CB_cat_train['has_superstructure_other'].replace([1,0],['k','n'])

In [7]:
CB_cat_train['has_superstructure']=\
    CB_cat_train['has_superstructure_adobe_mud'].str.cat([
    CB_cat_train['has_superstructure_mud_mortar_stone'],
    CB_cat_train['has_superstructure_stone_flag'],
    CB_cat_train['has_superstructure_cement_mortar_stone'],
    CB_cat_train['has_superstructure_mud_mortar_brick'],
    CB_cat_train['has_superstructure_cement_mortar_brick'],
    CB_cat_train['has_superstructure_timber'],
    CB_cat_train['has_superstructure_bamboo'],
    CB_cat_train['has_superstructure_rc_non_engineered'],
    CB_cat_train['has_superstructure_rc_engineered'],
    CB_cat_train['has_superstructure_other']])

In [8]:
CB_cat_train['has_superstructure'].value_counts()

nbnnnnnnnnn    137134
nbnnnngnnnn     28861
nnnnnfnnnnn     10106
nbnnnnghnnn      9980
annnnnnnnnn      7229
                ...  
abcdenghnnk         1
nncnnnghinn         1
nbndnnnhnnk         1
abnnefnninn         1
nnnnnfghnjn         1
Name: has_superstructure, Length: 505, dtype: int64

##### Has_secondary_use

In [9]:
CB_cat_train['has_secondary_use']=\
    CB_cat_train['has_secondary_use'].replace([1,0],['a','x'])

CB_cat_train['has_secondary_use_agriculture']=\
    CB_cat_train['has_secondary_use_agriculture'].replace([1,0],['b','x'])

CB_cat_train['has_secondary_use_hotel']=\
    CB_cat_train['has_secondary_use_hotel'].replace([1,0],['c','x'])

CB_cat_train['has_secondary_use_rental']=\
    CB_cat_train['has_secondary_use_rental'].replace([1,0],['d','x'])

CB_cat_train['has_secondary_use_institution']=\
    CB_cat_train['has_secondary_use_institution'].replace([1,0],['e','x'])

CB_cat_train['has_secondary_use_school']=\
    CB_cat_train['has_secondary_use_school'].replace([1,0],['f','x'])

CB_cat_train['has_secondary_use_industry']=\
    CB_cat_train['has_secondary_use_industry'].replace([1,0],['g','x'])

CB_cat_train['has_secondary_use_health_post']=\
    CB_cat_train['has_secondary_use_health_post'].replace([1,0],['h','x'])

CB_cat_train['has_secondary_use_gov_office']=\
    CB_cat_train['has_secondary_use_gov_office'].replace([1,0],['i','x'])

CB_cat_train['has_secondary_use_use_police']=\
    CB_cat_train['has_secondary_use_use_police'].replace([1,0],['j','x'])

CB_cat_train['has_secondary_use_other']=\
    CB_cat_train['has_secondary_use_other'].replace([1,0],['k','x'])

In [10]:
CB_cat_train['has_secondary']=\
    CB_cat_train['has_secondary_use'].str.cat([
    CB_cat_train['has_secondary_use_agriculture'],
    CB_cat_train['has_secondary_use_hotel'],
    CB_cat_train['has_secondary_use_rental'],
    CB_cat_train['has_secondary_use_institution'],
    CB_cat_train['has_secondary_use_school'],
    CB_cat_train['has_secondary_use_industry'],
    CB_cat_train['has_secondary_use_health_post'],
    CB_cat_train['has_secondary_use_gov_office'],
    CB_cat_train['has_secondary_use_use_police'],
    CB_cat_train['has_secondary_use_other']])

In [11]:
CB_cat_train['has_secondary'].value_counts()

xxxxxxxxxxx    231445
abxxxxxxxxx     16303
axcxxxxxxxx      8705
axxdxxxxxxx      2096
axxxxxxxxxk       777
abxxxxxxxxk       474
axxxxxgxxxx       275
axxxexxxxxx       241
axxxxfxxxxx        93
axcxxxxxxxk        58
axxxxxxhxxx        49
axxxxxxxixx        37
axxxxxxxxjx        23
axxdxxxxxxk        15
axxxexxxxxk         4
axxxxxgxxxk         4
axxxxxxxixk         1
axxxxfxxxxk         1
Name: has_secondary, dtype: int64

In [12]:
CB_cat_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 41 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             260601 non-null  int64 
 1   geo_level_1_id                          260601 non-null  object
 2   geo_level_2_id                          260601 non-null  object
 3   geo_level_3_id                          260601 non-null  object
 4   count_floors_pre_eq                     260601 non-null  int64 
 5   age                                     260601 non-null  int64 
 6   area_percentage                         260601 non-null  int64 
 7   height_percentage                       260601 non-null  int64 
 8   land_surface_condition                  260601 non-null  object
 9   foundation_type                         260601 non-null  object
 10  roof_type                               260601 non-null 

#### Agragamos el label

In [13]:
CB_cat_train_lab = CB_cat_train.merge(df_train_labels)

#### Quitamos el feature 'building_id'

In [14]:
CB_cat_train_lab = CB_cat_train_lab.iloc[:,1:]

In [15]:
CB_cat_train_lab.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 0 to 260600
Data columns (total 41 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   geo_level_1_id                          260601 non-null  object
 1   geo_level_2_id                          260601 non-null  object
 2   geo_level_3_id                          260601 non-null  object
 3   count_floors_pre_eq                     260601 non-null  int64 
 4   age                                     260601 non-null  int64 
 5   area_percentage                         260601 non-null  int64 
 6   height_percentage                       260601 non-null  int64 
 7   land_surface_condition                  260601 non-null  object
 8   foundation_type                         260601 non-null  object
 9   roof_type                               260601 non-null  object
 10  ground_floor_type                       260601 non-null 

#### Separamos la variable a predecir

In [16]:
X, y = CB_cat_train_lab.iloc[:,:-1],CB_cat_train_lab.iloc[:,-1]
categorical_features_indices = np.where(X.dtypes != np.float)[0]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X.dtypes != np.float)[0]


#### Separamos el set train - test

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

### Creamos el modelo

In [18]:
model16 = CatBoostClassifier(
    loss_function='MultiClass', 
    custom_metric='TotalF1:average=Micro',
    iterations=1500,
    cat_features=categorical_features_indices,
    learning_rate= 0.225,
    depth=5,
    l2_leaf_reg=1,
    early_stopping_rounds=6
)
model16.fit(X, y)

0:	learn: 0.9486581	total: 1.05s	remaining: 26m 9s
1:	learn: 0.8605583	total: 2.06s	remaining: 25m 43s
2:	learn: 0.8022909	total: 2.79s	remaining: 23m 14s
3:	learn: 0.7601350	total: 3.78s	remaining: 23m 33s
4:	learn: 0.7293721	total: 4.57s	remaining: 22m 45s
5:	learn: 0.7038733	total: 5.48s	remaining: 22m 44s
6:	learn: 0.6829186	total: 6.41s	remaining: 22m 46s
7:	learn: 0.6696108	total: 7.62s	remaining: 23m 41s
8:	learn: 0.6577634	total: 8.71s	remaining: 24m 3s
9:	learn: 0.6481620	total: 9.88s	remaining: 24m 31s
10:	learn: 0.6403854	total: 10.9s	remaining: 24m 29s
11:	learn: 0.6345128	total: 11.8s	remaining: 24m 26s
12:	learn: 0.6299089	total: 12.8s	remaining: 24m 29s
13:	learn: 0.6252841	total: 13.9s	remaining: 24m 31s
14:	learn: 0.6206611	total: 14.9s	remaining: 24m 34s
15:	learn: 0.6171097	total: 15.9s	remaining: 24m 31s
16:	learn: 0.6144070	total: 17.1s	remaining: 24m 50s
17:	learn: 0.6130412	total: 18.2s	remaining: 25m
18:	learn: 0.6107136	total: 19.4s	remaining: 25m 11s
19:	learn

<catboost.core.CatBoostClassifier at 0x7fc01921ef70>

In [19]:
preds = model16.predict(X_test)
f1_score(y_test, preds, average='micro')

0.8030544310354751

In [19]:
model_b = CatBoostClassifier(
    loss_function='MultiClass', 
    custom_metric='TotalF1:average=Micro',
    iterations=4000,
    cat_features=categorical_features_indices,
    learning_rate= 0.2,
    depth=5,
    l2_leaf_reg=1,
    early_stopping_rounds=6,
)
model_b.fit(X_train, y_train)

0:	learn: 0.9650080	total: 791ms	remaining: 52m 42s
1:	learn: 0.8805889	total: 1.53s	remaining: 50m 53s
2:	learn: 0.8220016	total: 2.33s	remaining: 51m 49s
3:	learn: 0.7817319	total: 2.89s	remaining: 48m 8s
4:	learn: 0.7520111	total: 3.56s	remaining: 47m 22s
5:	learn: 0.7281890	total: 4.21s	remaining: 46m 40s
6:	learn: 0.7050206	total: 4.98s	remaining: 47m 18s
7:	learn: 0.6868424	total: 5.6s	remaining: 46m 36s
8:	learn: 0.6726766	total: 6.25s	remaining: 46m 13s
9:	learn: 0.6607437	total: 6.9s	remaining: 45m 51s
10:	learn: 0.6510982	total: 7.74s	remaining: 46m 48s
11:	learn: 0.6437580	total: 8.31s	remaining: 46m 1s
12:	learn: 0.6386378	total: 8.9s	remaining: 45m 29s
13:	learn: 0.6333890	total: 9.65s	remaining: 45m 48s
14:	learn: 0.6285710	total: 10.6s	remaining: 46m 55s
15:	learn: 0.6247894	total: 11.4s	remaining: 47m 22s
16:	learn: 0.6215285	total: 12.3s	remaining: 48m 13s
17:	learn: 0.6188769	total: 13.2s	remaining: 48m 29s
18:	learn: 0.6165345	total: 13.9s	remaining: 48m 38s
19:	lear

<catboost.core.CatBoostClassifier at 0x7ff6ea165b80>

In [20]:
preds_b = model_b.predict(X_test)
f1_score(y_test, preds_b, average='micro')

0.749544329540876

#### Competencia: Predict and Submit

In [20]:
df_test_values['geo_level_1_id'] = df_test_values['geo_level_1_id'].astype(str)
df_test_values['geo_level_2_id'] = df_test_values['geo_level_2_id'].astype(str)
df_test_values['geo_level_3_id'] = df_test_values['geo_level_3_id'].astype(str)
df_test_values = df_test_values.iloc[:,1:]

In [21]:
# Has_superstructure¶

df_test_values['has_superstructure_adobe_mud']=\
    df_test_values['has_superstructure_adobe_mud'].replace([1,0],['a','n'])

df_test_values['has_superstructure_mud_mortar_stone']=\
    df_test_values['has_superstructure_mud_mortar_stone'].replace([1,0],['b','n'])

df_test_values['has_superstructure_stone_flag']=\
    df_test_values['has_superstructure_stone_flag'].replace([1,0],['c','n'])

df_test_values['has_superstructure_cement_mortar_stone']=\
    df_test_values['has_superstructure_cement_mortar_stone'].replace([1,0],['d','n'])

df_test_values['has_superstructure_mud_mortar_brick']=\
    df_test_values['has_superstructure_mud_mortar_brick'].replace([1,0],['e','n'])

df_test_values['has_superstructure_cement_mortar_brick']=\
    df_test_values['has_superstructure_cement_mortar_brick'].replace([1,0],['f','n'])

df_test_values['has_superstructure_timber']=\
    df_test_values['has_superstructure_timber'].replace([1,0],['g','n'])

df_test_values['has_superstructure_bamboo']=\
    df_test_values['has_superstructure_bamboo'].replace([1,0],['h','n'])

df_test_values['has_superstructure_rc_non_engineered']=\
    df_test_values['has_superstructure_rc_non_engineered'].replace([1,0],['i','n'])

df_test_values['has_superstructure_rc_engineered']=\
    df_test_values['has_superstructure_rc_engineered'].replace([1,0],['j','n'])

df_test_values['has_superstructure_other']=\
    df_test_values['has_superstructure_other'].replace([1,0],['k','n'])


df_test_values['has_superstructure']=\
    df_test_values['has_superstructure_adobe_mud'].str.cat([
    df_test_values['has_superstructure_mud_mortar_stone'],
    df_test_values['has_superstructure_stone_flag'],
    df_test_values['has_superstructure_cement_mortar_stone'],
    df_test_values['has_superstructure_mud_mortar_brick'],
    df_test_values['has_superstructure_cement_mortar_brick'],
    df_test_values['has_superstructure_timber'],
    df_test_values['has_superstructure_bamboo'],
    df_test_values['has_superstructure_rc_non_engineered'],
    df_test_values['has_superstructure_rc_engineered'],
    df_test_values['has_superstructure_other']])

In [22]:
#Has_secondary_use

df_test_values['has_secondary_use']=\
    df_test_values['has_secondary_use'].replace([1,0],['a','x'])

df_test_values['has_secondary_use_agriculture']=\
    df_test_values['has_secondary_use_agriculture'].replace([1,0],['b','x'])

df_test_values['has_secondary_use_hotel']=\
    df_test_values['has_secondary_use_hotel'].replace([1,0],['c','x'])

df_test_values['has_secondary_use_rental']=\
    df_test_values['has_secondary_use_rental'].replace([1,0],['d','x'])

df_test_values['has_secondary_use_institution']=\
    df_test_values['has_secondary_use_institution'].replace([1,0],['e','x'])

df_test_values['has_secondary_use_school']=\
    df_test_values['has_secondary_use_school'].replace([1,0],['f','x'])

df_test_values['has_secondary_use_industry']=\
    df_test_values['has_secondary_use_industry'].replace([1,0],['g','x'])

df_test_values['has_secondary_use_health_post']=\
    df_test_values['has_secondary_use_health_post'].replace([1,0],['h','x'])

df_test_values['has_secondary_use_gov_office']=\
    df_test_values['has_secondary_use_gov_office'].replace([1,0],['i','x'])

df_test_values['has_secondary_use_use_police']=\
    df_test_values['has_secondary_use_use_police'].replace([1,0],['j','x'])

df_test_values['has_secondary_use_other']=\
    df_test_values['has_secondary_use_other'].replace([1,0],['k','x'])

df_test_values['has_secondary']=\
    df_test_values['has_secondary_use'].str.cat([
    df_test_values['has_secondary_use_agriculture'],
    df_test_values['has_secondary_use_hotel'],
    df_test_values['has_secondary_use_rental'],
    df_test_values['has_secondary_use_institution'],
    df_test_values['has_secondary_use_school'],
    df_test_values['has_secondary_use_industry'],
    df_test_values['has_secondary_use_health_post'],
    df_test_values['has_secondary_use_gov_office'],
    df_test_values['has_secondary_use_use_police'],
    df_test_values['has_secondary_use_other']])

In [23]:
df_test_values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86868 entries, 0 to 86867
Data columns (total 40 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   geo_level_1_id                          86868 non-null  object
 1   geo_level_2_id                          86868 non-null  object
 2   geo_level_3_id                          86868 non-null  object
 3   count_floors_pre_eq                     86868 non-null  int64 
 4   age                                     86868 non-null  int64 
 5   area_percentage                         86868 non-null  int64 
 6   height_percentage                       86868 non-null  int64 
 7   land_surface_condition                  86868 non-null  object
 8   foundation_type                         86868 non-null  object
 9   roof_type                               86868 non-null  object
 10  ground_floor_type                       86868 non-null  object
 11  ot

In [24]:
preds = model16.predict(df_test_values)

In [25]:
our_submission = pd.DataFrame(
    data=preds,
    columns=submission_format.columns,
    index=submission_format.index
)

In [26]:
our_submission.to_csv('submission_Cat17.csv')

In [27]:
!head submission_Cat17.csv

building_id,damage_grade
300051,3
99355,2
890251,2
745817,1
421793,3
871976,2
691228,2
896100,3
343471,2


In [28]:
feature_imp=model16.get_feature_importance(
    data=None,
    prettified=False,
    thread_count=-1,
    verbose=False
)

In [29]:
feature_imp

array([6.16170899e+00, 2.43655630e+01, 1.77152554e+01, 2.62743043e+00,
       7.14404673e+00, 3.99415224e+00, 3.55908625e+00, 1.49174515e+00,
       3.20360580e+00, 2.49141760e+00, 4.75223237e+00, 2.85687173e+00,
       2.39628797e+00, 9.23776307e-01, 2.48453310e-01, 1.13393309e-01,
       9.27938004e-02, 1.53782116e-01, 2.57581044e-01, 3.13658723e-01,
       2.68826358e-01, 2.18440914e-01, 9.99460918e-02, 4.32987503e-02,
       1.19905196e-01, 1.26180252e+00, 2.22185491e+00, 8.86543882e-02,
       6.14395354e-02, 2.44683483e-02, 2.96294450e-02, 8.83929888e-03,
       1.33511844e-02, 2.63395035e-02, 9.10992597e-03, 4.80726704e-03,
       3.24140258e-03, 1.40691089e-01, 8.98651332e+00, 1.50599830e+00])

In [30]:
df_fi=pd.DataFrame()
df_fi['f_imp'] = pd.Series(feature_imp)
df_sin_id= CB_cat_train.iloc[:,1:]
df_fi['f_name'] = pd.Series(df_sin_id.columns)


df_fi.sort_values('f_imp',ascending=False)

Unnamed: 0,f_imp,f_name
1,24.365563,geo_level_2_id
2,17.715255,geo_level_3_id
38,8.986513,has_superstructure
4,7.144047,age
0,6.161709,geo_level_1_id
10,4.752232,ground_floor_type
5,3.994152,area_percentage
6,3.559086,height_percentage
8,3.203606,foundation_type
11,2.856872,other_floor_type
