## Feature Engineering

In [2]:
from sklearn import tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

from sklearn.datasets import make_classification
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split

import xgboost as xgb

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Input/output locations for one feature-engineering run.

# Training split: raw features, their labels, and where the engineered copy is written.
RUTA_PARA_ENTRENAR = './../datasets/train_values.csv'
ORIGEN_LABEL = './../datasets/train_labels.csv'
RUTA_PARA_ACTUALIZAR_DS_DE_TRAINING = './../datasets/train_values_FE.csv'

# Test split: raw features and where the engineered copy is written.
# NOTE(review): these live under './../x/' while the training files are under
# './../datasets/' -- confirm the directory is intended.
RUTA_PARA_TESTEAR, RUTA_PARA_ACTUALIZAR_DS_DE_TEST = (
    './../x/test_values.csv',
    './../x/test_values_FE.csv',
)

# Active source and destination for this run (currently the training split).
ORIGEN, DESTINO = RUTA_PARA_ENTRENAR, RUTA_PARA_ACTUALIZAR_DS_DE_TRAINING

### Importamos las features originales

In [4]:
# Read the raw feature table and the label table from the configured locations.
df_train_values, df_train_labels = (
    pd.read_csv(ruta_csv) for ruta_csv in (ORIGEN, ORIGEN_LABEL)
)

In [5]:
# Work on an independent copy: the encoding cells add columns in place, and a
# plain alias would silently modify df_train_values as well (making this cell
# non-repeatable without reloading the CSV).
df_a_filtrar = df_train_values.copy()

In [6]:
# Show the first row to check the raw column layout.
df_a_filtrar.head(1)

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0


### Building_id

In [7]:
# building_id has to stay in the frame: the export cell later calls set_index(['building_id']).
#df_a_filtrar = df_a_filtrar.drop(['building_id'], axis=1)

## Encoding

In [8]:
# NOTE(review): this encoder is not referenced by any of the visible cells; the
# one-hot columns in this notebook are built with pd.get_dummies instead.
encoder = OneHotEncoder()

### Geo Level 1 id 

Como `geo_level_1_id` no tiene una relación de orden entre sus elementos, debemos hacer un encoding (one-hot) para poder usarlo en la predicción.

In [9]:
# One-hot encode geo_level_1_id (a nominal id with no ordering).
# The output schema is fixed to gl0..gl30 and every indicator is matched to its
# level by name, so a split that lacks some level still produces the same 31
# columns instead of failing with a column-count mismatch.
GEO_LEVEL_1_COLUMNS = [f'gl{nivel}' for nivel in range(31)]

df_a_filtrar['geo_level_1_id'] = pd.Categorical(df_a_filtrar['geo_level_1_id'])
geo1_dummies = pd.get_dummies(df_a_filtrar['geo_level_1_id'], prefix='gl', prefix_sep='')

# Fail loudly on a level outside the expected schema rather than dropping it.
assert set(geo1_dummies.columns) <= set(GEO_LEVEL_1_COLUMNS), (
    f"unexpected geo_level_1_id levels: {sorted(set(geo1_dummies.columns) - set(GEO_LEVEL_1_COLUMNS))}"
)
df_a_filtrar[GEO_LEVEL_1_COLUMNS] = geo1_dummies.reindex(columns=GEO_LEVEL_1_COLUMNS, fill_value=0)

In [10]:
# Remove the raw geo ids: level 1 has been one-hot encoded; levels 2 and 3 are
# discarded without being encoded.
df_a_filtrar = df_a_filtrar.drop(
    columns=['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
)

## count_floors_pre_eq

Al tener una relación de orden, no es necesario hacer un encoding de los datos.

### TODO: corregir si es necesario, descartando los valores que tienen muy pocas muestras (por ejemplo, 8 y 9 pisos tienen una sola muestra cada uno).

In [11]:
# Number of buildings per floor count.
df_a_filtrar['count_floors_pre_eq'].value_counts()

2    156623
3     55617
1     40441
4      5424
5      2246
6       209
7        39
8         1
9         1
Name: count_floors_pre_eq, dtype: int64

## Age

In [12]:
# First ten entries of the age frequency table.
df_a_filtrar['age'].value_counts().head(10)

10    38896
15    36010
5     33697
20    32182
0     26041
25    24366
30    18028
35    10710
40    10559
50     7257
Name: age, dtype: int64

## area_percentage

In [13]:
# Frequency table of area_percentage values.
df_a_filtrar['area_percentage'].value_counts()

6      42013
7      36752
5      32724
8      28445
9      22199
       ...  
82         1
80         1
78         1
75         1
100        1
Name: area_percentage, Length: 84, dtype: int64

## Categóricos 

### land_surface_condition

In [14]:
# Frequency of each land_surface_condition category.
df_a_filtrar['land_surface_condition'].value_counts()

t    216757
n     35528
o      8316
Name: land_surface_condition, dtype: int64

In [15]:
# One-hot encode land_surface_condition.
# pd.get_dummies emits its columns in sorted category order (n, o, t), so
# assigning them positionally to names listed in frequency order puts the wrong
# indicator under each name. Let get_dummies name the columns from the category
# itself and then select them by name.
LSC_COLUMNS = ['lsc_t', 'lsc_n', 'lsc_o']
lsc_dummies = pd.get_dummies(df_a_filtrar['land_surface_condition'], prefix='lsc')

# Fail loudly on a category outside the expected schema rather than dropping it.
assert set(lsc_dummies.columns) <= set(LSC_COLUMNS), (
    f"unexpected land_surface_condition categories: {sorted(set(lsc_dummies.columns) - set(LSC_COLUMNS))}"
)
# reindex keeps the schema fixed even when a category is absent from this split.
df_a_filtrar[LSC_COLUMNS] = lsc_dummies.reindex(columns=LSC_COLUMNS, fill_value=0)

In [16]:
# The raw categorical column is no longer needed once its indicators exist.
df_a_filtrar = df_a_filtrar.drop(columns=['land_surface_condition'])

### foundation_type

In [17]:
# Frequency of each foundation_type category.
df_a_filtrar['foundation_type'].value_counts()

r    219196
w     15118
u     14260
i     10579
h      1448
Name: foundation_type, dtype: int64

In [18]:
# One-hot encode foundation_type.
# pd.get_dummies emits its columns in sorted category order (h, i, r, u, w), so
# assigning them positionally to names listed in frequency order puts the wrong
# indicator under each name. Let get_dummies name the columns from the category
# itself and then select them by name.
FT_COLUMNS = ['ft_r', 'ft_w', 'ft_u', 'ft_i', 'ft_h']
ft_dummies = pd.get_dummies(df_a_filtrar['foundation_type'], prefix='ft')

# Fail loudly on a category outside the expected schema rather than dropping it.
assert set(ft_dummies.columns) <= set(FT_COLUMNS), (
    f"unexpected foundation_type categories: {sorted(set(ft_dummies.columns) - set(FT_COLUMNS))}"
)
# reindex keeps the schema fixed even when a category is absent from this split.
df_a_filtrar[FT_COLUMNS] = ft_dummies.reindex(columns=FT_COLUMNS, fill_value=0)

In [19]:
# The raw categorical column is no longer needed once its indicators exist.
df_a_filtrar = df_a_filtrar.drop(columns=['foundation_type'])

### roof_type

In [20]:
# Frequency of each roof_type category.
df_a_filtrar['roof_type'].value_counts()

n    182842
q     61576
x     16183
Name: roof_type, dtype: int64

In [21]:
# One-hot encode roof_type.
# The positional assignment used before only produced correct labels because
# the listed order (n, q, x) happens to equal the sorted order get_dummies
# emits. Name the columns from the category itself so the pairing does not
# depend on that coincidence.
RT_COLUMNS = ['rt_n', 'rt_q', 'rt_x']
rt_dummies = pd.get_dummies(df_a_filtrar['roof_type'], prefix='rt')

# Fail loudly on a category outside the expected schema rather than dropping it.
assert set(rt_dummies.columns) <= set(RT_COLUMNS), (
    f"unexpected roof_type categories: {sorted(set(rt_dummies.columns) - set(RT_COLUMNS))}"
)
# reindex keeps the schema fixed even when a category is absent from this split.
df_a_filtrar[RT_COLUMNS] = rt_dummies.reindex(columns=RT_COLUMNS, fill_value=0)

In [22]:
# The raw categorical column is no longer needed once its indicators exist.
df_a_filtrar = df_a_filtrar.drop(columns=['roof_type'])

### ground_floor_type

In [23]:
# Frequency of each ground_floor_type category.
df_a_filtrar['ground_floor_type'].value_counts()

f    209619
x     24877
v     24593
z      1004
m       508
Name: ground_floor_type, dtype: int64

In [24]:
# One-hot encode ground_floor_type.
# pd.get_dummies emits its columns in sorted category order (f, m, v, x, z), so
# assigning them positionally to names listed in frequency order puts the wrong
# indicator under each name. Let get_dummies name the columns from the category
# itself and then select them by name.
GFT_COLUMNS = ['gft_f', 'gft_x', 'gft_v', 'gft_z', 'gft_m']
gft_dummies = pd.get_dummies(df_a_filtrar['ground_floor_type'], prefix='gft')

# Fail loudly on a category outside the expected schema rather than dropping it.
assert set(gft_dummies.columns) <= set(GFT_COLUMNS), (
    f"unexpected ground_floor_type categories: {sorted(set(gft_dummies.columns) - set(GFT_COLUMNS))}"
)
# reindex keeps the schema fixed even when a category is absent from this split.
df_a_filtrar[GFT_COLUMNS] = gft_dummies.reindex(columns=GFT_COLUMNS, fill_value=0)

In [25]:
# The raw categorical column is no longer needed once its indicators exist.
df_a_filtrar = df_a_filtrar.drop(columns=['ground_floor_type'])

### other_floor_type

Quizá convenga combinarlo con `ground_floor_type`.

In [26]:
# Frequency of each other_floor_type category.
df_a_filtrar['other_floor_type'].value_counts()

q    165282
x     43448
j     39843
s     12028
Name: other_floor_type, dtype: int64

In [27]:
# One-hot encode other_floor_type.
# pd.get_dummies emits its columns in sorted category order (j, q, s, x), so
# assigning them positionally to names listed in frequency order puts the wrong
# indicator under each name. Let get_dummies name the columns from the category
# itself and then select them by name.
OFT_COLUMNS = ['oft_q', 'oft_x', 'oft_j', 'oft_s']
oft_dummies = pd.get_dummies(df_a_filtrar['other_floor_type'], prefix='oft')

# Fail loudly on a category outside the expected schema rather than dropping it.
assert set(oft_dummies.columns) <= set(OFT_COLUMNS), (
    f"unexpected other_floor_type categories: {sorted(set(oft_dummies.columns) - set(OFT_COLUMNS))}"
)
# reindex keeps the schema fixed even when a category is absent from this split.
df_a_filtrar[OFT_COLUMNS] = oft_dummies.reindex(columns=OFT_COLUMNS, fill_value=0)

In [28]:
# The raw categorical column is no longer needed once its indicators exist.
df_a_filtrar = df_a_filtrar.drop(columns=['other_floor_type'])

### position

In [29]:
# Frequency of each position category.
df_a_filtrar['position'].value_counts()

s    202090
t     42896
j     13282
o      2333
Name: position, dtype: int64

In [30]:
# One-hot encode position.
# pd.get_dummies emits its columns in sorted category order (j, o, s, t), so
# assigning them positionally to names listed in frequency order puts the wrong
# indicator under each name. Let get_dummies name the columns from the category
# itself and then select them by name.
POSITION_COLUMNS = ['p_s', 'p_t', 'p_j', 'p_o']
position_dummies = pd.get_dummies(df_a_filtrar['position'], prefix='p')

# Fail loudly on a category outside the expected schema rather than dropping it.
assert set(position_dummies.columns) <= set(POSITION_COLUMNS), (
    f"unexpected position categories: {sorted(set(position_dummies.columns) - set(POSITION_COLUMNS))}"
)
# reindex keeps the schema fixed even when a category is absent from this split.
df_a_filtrar[POSITION_COLUMNS] = position_dummies.reindex(columns=POSITION_COLUMNS, fill_value=0)

In [31]:
# The raw categorical column is no longer needed once its indicators exist.
df_a_filtrar = df_a_filtrar.drop(columns=['position'])

### plan_configuration

In [32]:
# Frequency of each plan_configuration category.
df_a_filtrar['plan_configuration'].value_counts()

d    250072
q      5692
u      3649
s       346
c       325
a       252
o       159
m        46
n        38
f        22
Name: plan_configuration, dtype: int64

In [33]:
# One-hot encode plan_configuration.
# pd.get_dummies emits its columns in sorted category order (a, c, d, f, ...),
# so assigning them positionally to names listed in frequency order puts the
# wrong indicator under each name. Let get_dummies name the columns from the
# category itself and then select them by name.
# The indicator for category 'u' is named 'pc_u'; it was previously written as
# 'pq_u' (typo), so anything that reads that column by name must be updated.
PC_COLUMNS = ['pc_d', 'pc_q', 'pc_u', 'pc_s', 'pc_c', 'pc_a', 'pc_o', 'pc_m', 'pc_n', 'pc_f']
pc_dummies = pd.get_dummies(df_a_filtrar['plan_configuration'], prefix='pc')

# Fail loudly on a category outside the expected schema rather than dropping it.
assert set(pc_dummies.columns) <= set(PC_COLUMNS), (
    f"unexpected plan_configuration categories: {sorted(set(pc_dummies.columns) - set(PC_COLUMNS))}"
)
# reindex keeps the schema fixed even when a rare category is absent from this split.
df_a_filtrar[PC_COLUMNS] = pc_dummies.reindex(columns=PC_COLUMNS, fill_value=0)

In [34]:
# The raw categorical column is no longer needed once its indicators exist.
df_a_filtrar = df_a_filtrar.drop(columns=['plan_configuration'])

## legal_ownership_status

In [35]:
# Frequency of each legal_ownership_status category.
df_a_filtrar['legal_ownership_status'].value_counts()

v    250939
a      5512
w      2677
r      1473
Name: legal_ownership_status, dtype: int64

In [36]:
# One-hot encode legal_ownership_status.
# pd.get_dummies emits its columns in sorted category order (a, r, v, w), so
# assigning them positionally to names listed in frequency order puts the wrong
# indicator under each name. Let get_dummies name the columns from the category
# itself and then select them by name.
LOS_COLUMNS = ['los_v', 'los_a', 'los_w', 'los_r']
los_dummies = pd.get_dummies(df_a_filtrar['legal_ownership_status'], prefix='los')

# Fail loudly on a category outside the expected schema rather than dropping it.
assert set(los_dummies.columns) <= set(LOS_COLUMNS), (
    f"unexpected legal_ownership_status categories: {sorted(set(los_dummies.columns) - set(LOS_COLUMNS))}"
)
# reindex keeps the schema fixed even when a category is absent from this split.
df_a_filtrar[LOS_COLUMNS] = los_dummies.reindex(columns=LOS_COLUMNS, fill_value=0)

In [37]:
# The raw categorical column is no longer needed once its indicators exist.
df_a_filtrar = df_a_filtrar.drop(columns=['legal_ownership_status'])

## Estado de features

In [38]:
# Column inventory after encoding: names, non-null counts and dtypes.
df_a_filtrar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 97 columns):
 #   Column                                  Non-Null Count   Dtype
---  ------                                  --------------   -----
 0   building_id                             260601 non-null  int64
 1   count_floors_pre_eq                     260601 non-null  int64
 2   age                                     260601 non-null  int64
 3   area_percentage                         260601 non-null  int64
 4   height_percentage                       260601 non-null  int64
 5   has_superstructure_adobe_mud            260601 non-null  int64
 6   has_superstructure_mud_mortar_stone     260601 non-null  int64
 7   has_superstructure_stone_flag           260601 non-null  int64
 8   has_superstructure_cement_mortar_stone  260601 non-null  int64
 9   has_superstructure_mud_mortar_brick     260601 non-null  int64
 10  has_superstructure_cement_mortar_brick  260601 non-null  int64
 11  

## DataFrame con primer encoding

Usando one-hot encoding (generado con `pd.get_dummies`).

In [345]:
# Promote building_id to the index, then persist the engineered features;
# to_csv writes the index, so building_id is kept in the output file.
df_a_filtrar = df_a_filtrar.set_index('building_id')
df_a_filtrar.to_csv(path_or_buf=DESTINO)

In [346]:
# Column inventory of the exported frame (building_id is now the index).
df_a_filtrar.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 96 columns):
 #   Column                                  Non-Null Count   Dtype
---  ------                                  --------------   -----
 0   count_floors_pre_eq                     260601 non-null  int64
 1   age                                     260601 non-null  int64
 2   area_percentage                         260601 non-null  int64
 3   height_percentage                       260601 non-null  int64
 4   has_superstructure_adobe_mud            260601 non-null  int64
 5   has_superstructure_mud_mortar_stone     260601 non-null  int64
 6   has_superstructure_stone_flag           260601 non-null  int64
 7   has_superstructure_cement_mortar_stone  260601 non-null  int64
 8   has_superstructure_mud_mortar_brick     260601 non-null  int64
 9   has_superstructure_cement_mortar_brick  260601 non-null  int64
 10  has_superstructure_timber               260601 non-null  int64


In [219]:
# Column inventory of the label table.
df_train_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int64
 1   damage_grade  260601 non-null  int64
dtypes: int64(2)
memory usage: 4.0 MB


In [367]:
# Gradient-boosted multi-class classifier for damage_grade.
xgb_cl = xgb.XGBClassifier(
    objective="multi:softprob",
    seed=42,
    nthread=3,
    subsample=0.8,
    num_parallel_tree=2,
    n_estimators=500,
    max_depth=14,
    learning_rate=0.1,
    min_child_weight=30
)

# Pair every feature row with its own label through building_id instead of
# relying on both tables happening to share the same row order.
assert df_a_filtrar.index.name == 'building_id', (
    "df_a_filtrar must be indexed by building_id before training"
)
df_train_labels_filtrados = (
    df_train_labels
    .set_index('building_id')
    .loc[df_a_filtrar.index]  # raises KeyError if a building has no label
    .reset_index(drop=True)
)
df_train_values_filtrados = df_a_filtrar.reset_index(drop=True)
assert len(df_train_labels_filtrados) == len(df_train_values_filtrados), (
    "label/feature row counts differ (duplicated building_id in the labels?)"
)

X_train, X_test, y_train, y_test = train_test_split(
    df_train_values_filtrados,
    df_train_labels_filtrados,
    test_size=0.3,
    random_state=123
)

# The target is handed over as a 1-D Series; a single-column DataFrame makes
# scikit-learn warn about a column-vector y.
# NOTE(review): this hold-out split drives early stopping and is also the one
# scored afterwards as the generalization estimate, which makes that estimate
# optimistic. Consider a separate validation split for early stopping.
eval_set = [(X_test, y_test['damage_grade'])]
training = xgb_cl.fit(
    X_train,
    y_train['damage_grade'],
    early_stopping_rounds=10,
    verbose=True,
    eval_set=eval_set
)

  return f(*args, **kwargs)


[0]	validation_0-mlogloss:1.03109
[1]	validation_0-mlogloss:0.97975
[2]	validation_0-mlogloss:0.93695
[3]	validation_0-mlogloss:0.89957
[4]	validation_0-mlogloss:0.86852
[5]	validation_0-mlogloss:0.84628
[6]	validation_0-mlogloss:0.82117
[7]	validation_0-mlogloss:0.80590
[8]	validation_0-mlogloss:0.79109
[9]	validation_0-mlogloss:0.77974
[10]	validation_0-mlogloss:0.76967
[11]	validation_0-mlogloss:0.76118
[12]	validation_0-mlogloss:0.75281
[13]	validation_0-mlogloss:0.74600
[14]	validation_0-mlogloss:0.73979
[15]	validation_0-mlogloss:0.73358
[16]	validation_0-mlogloss:0.72831
[17]	validation_0-mlogloss:0.72355
[18]	validation_0-mlogloss:0.71927
[19]	validation_0-mlogloss:0.71473
[20]	validation_0-mlogloss:0.71198
[21]	validation_0-mlogloss:0.70900
[22]	validation_0-mlogloss:0.70655
[23]	validation_0-mlogloss:0.70339
[24]	validation_0-mlogloss:0.70154
[25]	validation_0-mlogloss:0.69804
[26]	validation_0-mlogloss:0.69584
[27]	validation_0-mlogloss:0.69424
[28]	validation_0-mlogloss:0.6

### Se hace la predicción

In [368]:
# Predictions for the held-out rows; `training` is whatever xgb_cl.fit(...) returned.
preds = training.predict(X_test)

### Exactitud (accuracy) en entrenamiento

In [369]:
# Score on the rows the model was fitted on.
# NOTE(review): presumably mean accuracy (higher is better), as for scikit-learn
# classifiers -- not an error rate; confirm against the installed xgboost version.
xgb_cl.score(X_train, y_train)

0.7286810656726236

### Exactitud (accuracy) de generalización o testing

In [370]:
# Score on the held-out rows.
# NOTE(review): presumably mean accuracy (higher is better), as for scikit-learn
# classifiers -- not an error rate; confirm against the installed xgboost version.
xgb_cl.score(X_test, y_test)

0.7001189547332473

### Se calcula el F1-score (micro)

In [357]:
# Micro-averaged F1 of the hold-out predictions.
score = f1_score(
    y_true=y_test,
    y_pred=preds,
    average='micro',
)

In [358]:
# Report the F1 value computed in the previous cell.
print(f"El score es: {score}")

El score es: 0.7026771210396388
