## Random Forest

### Importamos las librerías

In [2]:
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
import xgboost as xgb

In [3]:
# Read the two training CSVs (labels and `_FE` values) from the shared
# datasets folder, addressed relative to the notebook's location.
train_labels_path = './../../datasets/train_labels.csv'
train_values_path = './../../datasets/train_values_FE.csv'
df_train_labels = pd.read_csv(train_labels_path)
df_train_values = pd.read_csv(train_values_path)

In [4]:
# Remove the 'building_id' column from both frames; the originals are kept
# untouched under their unfiltered names.
id_column = ['building_id']
df_train_labels_filtrados = df_train_labels.drop(columns=id_column)
df_train_values_filtrados = df_train_values.drop(columns=id_column)

In [5]:
# One-hot encode the feature frame. With default arguments pd.get_dummies
# only expands object/string/category columns and passes the rest through.
df_train_values_filtrados = pd.get_dummies(data=df_train_values_filtrados)

In [6]:
# Two-step pipeline: standardise the features, then fit a random forest with
# a fixed seed. make_pipeline names each step after its lowercased class name,
# which is what the grid-search parameter keys refer to.
scaler_step = StandardScaler()
forest_step = RandomForestClassifier(random_state=2018)
pipe = make_pipeline(scaler_step, forest_step)
pipe

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=2018))])

In [7]:
# Hyper-parameter grid. Keys follow the `<step name>__<parameter>` pattern
# that addresses a parameter of one step inside `pipe`.
param_grid = {
    'randomforestclassifier__n_estimators': [50, 100],
    'randomforestclassifier__min_samples_leaf': [1, 5],
}
# scoring='f1_micro' makes model selection use the same metric that is
# reported after fitting (f1_score with average='micro').
# n_jobs=-1 spreads the 4 candidates x 5 folds over all available cores
# instead of fitting them one after another in a single process.
gs = GridSearchCV(pipe, param_grid, cv=5, scoring='f1_micro', n_jobs=-1)

In [8]:
# Flatten the label frame into a 1-D array and run the grid search on the
# encoded training features. The fitted search object is the cell's output.
X_train = df_train_values_filtrados
y_train = df_train_labels_filtrados.to_numpy().ravel()
gs.fit(X_train, y_train)

KeyboardInterrupt: 

In [26]:
# Show the hyper-parameter combination selected by the grid search.
best_params = gs.best_params_
print(best_params)

{'randomforestclassifier__min_samples_leaf': 5, 'randomforestclassifier__n_estimators': 100}


In [27]:
# Score on the same rows that were used for fitting: this is an in-sample
# figure, not a held-out estimate.
in_sample_preds = gs.predict(df_train_values_filtrados)
f1_score(
    y_true=df_train_labels_filtrados,
    y_pred=in_sample_preds,
    average='micro',
)

0.7207109719456181

### Prueba de los datos de test

In [28]:
# Load the test features. The raw frame (with 'building_id') is kept because
# the submission table needs the ids later.
df_test_values = pd.read_csv('./../../datasets/test_values_FE.csv')

# Apply the same preparation as the training features: drop the id, one-hot
# encode, then align to the exact column set and order the model was fitted on.
# Dummy columns that are missing in the test data are filled with 0, and
# columns the model never saw are discarded, so predict() receives a matrix
# with the training layout.
df_test_values_filtrados = (
    pd.get_dummies(df_test_values.drop(['building_id'], axis=1))
    .reindex(columns=df_train_values_filtrados.columns, fill_value=0)
)

In [29]:
# Predict on the prepared test features with the fitted grid search and
# display the resulting array.
test_features = df_test_values_filtrados
preds = gs.predict(test_features)
preds

array([3, 2, 2, ..., 2, 2, 2], dtype=int64)

In [30]:
# Build the submission table: one row per test building, pairing its id with
# the predicted damage grade. `preds` is positional, in the row order of
# `df_test_values`.
my_df = pd.DataFrame({
    'building_id': df_test_values['building_id'],
    'damage_grade': preds,
})
# Index by building_id only, so damage_grade stays a real column rather than
# being moved into the index. Writing this frame with to_csv() still yields
# exactly the two columns building_id,damage_grade.
my_df = my_df.set_index('building_id')
my_df

building_id,damage_grade
300051,3
99355,2
890251,2
745817,1
421793,3
...,...
310028,2
663567,2
1049160,2
442785,2


In [31]:
# Write the submission file; the frame's index is emitted as the leading
# CSV column(s).
submission_path = './../../datasets/prueba_RandomForest_GridSearchCV.csv'
my_df.to_csv(submission_path)