## Random Forest

### Importamos las librerías

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree

### Importamos los datos

In [4]:
# Read the training feature table and the matching label table from the datasets folder.
df_train_values = pd.read_csv('./../../datasets/train_values_FE.csv')
df_train_labels = pd.read_csv('./../../datasets/train_labels.csv')

### Filtramos los datos

In [5]:
# Remove the building_id column from both tables so it is not fed to the model.
# NOTE(review): from here on features and labels are paired purely by row position —
# confirm that both CSVs list the buildings in the same order.
df_train_values_filtrados = df_train_values.drop(columns=['building_id'])
df_train_labels_filtrados = df_train_labels.drop(columns=['building_id'])

### Implementación del algoritmo

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_train_values_filtrados,
    df_train_labels_filtrados,
    test_size=0.2,
    random_state=123,
)

In [8]:
# Random forest whose trees are capped at depth 30; the seed is fixed for repeatable fits.
clf = RandomForestClassifier(random_state=42, max_depth=30)
# Flatten the label DataFrame into a 1-D array before handing it to fit().
clf.fit(X_train, y_train.to_numpy().ravel())

RandomForestClassifier(max_depth=30, random_state=42)

In [12]:
# Predict labels for the held-out rows produced by train_test_split.
preds = clf.predict(X_test)

In [13]:
# NOTE: despite its name, `error` holds a score (micro-averaged F1 on the held-out
# rows), so higher is better.
error = f1_score(y_true=y_test, y_pred=preds, average='micro')

In [14]:
# Display the micro-averaged F1 value stored in `error`.
error

0.6931371232324782

### Exactitud (accuracy) en entrenamiento

In [15]:
# Score of the fitted forest on the same rows it was trained on.
# NOTE(review): for scikit-learn classifiers score() reports mean accuracy, not an
# error rate — higher is better.
clf.score(X_train, y_train)

0.8388670376055257

### Exactitud (accuracy) de generalización o testing

In [16]:
# Score of the fitted forest on the held-out rows.
# NOTE(review): for scikit-learn classifiers score() reports mean accuracy, not an
# error rate — higher is better.
clf.score(X_test, y_test)

0.6931371232324782

In [None]:
# Visualise the first tree of the forest. The forest was built with max_depth=30, and
# drawing every level of such a tree is very slow and unreadable, so only the top
# levels are rendered.
tree0 = clf.estimators_[0]
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(25, 25), dpi=100)
tree.plot_tree(
    tree0,
    max_depth=3,                          # render only the first few levels
    feature_names=list(X_train.columns),  # plain list of names instead of a pandas Index
    filled=True,
    ax=axes,                              # draw explicitly on the axes created above
);  # trailing ';' keeps the list of annotation objects out of the cell output

### Prueba de los datos de tests

In [34]:
# Read the test feature table. The raw frame is kept because its building_id column is
# used later to label the predictions; the id-free copy is what the model receives.
df_test_values = pd.read_csv('./../../datasets/test_values_FE.csv')
df_test_values_filtrados = df_test_values.drop(columns=['building_id'])

### Predicción

In [35]:
# Predict on the test features. This rebinds `preds`, which until now held the
# predictions for the held-out split (X_test).
preds = clf.predict(df_test_values_filtrados)
preds

array([3, 2, 2, ..., 2, 2, 2])

In [36]:
# Build the prediction table: one row per test building with its predicted damage_grade.
# Only building_id becomes the index, so damage_grade stays available as a regular
# column (indexing by both columns would leave a frame with no data columns at all).
# The CSV produced by my_df.to_csv(...) is unchanged: building_id,damage_grade.
my_df = pd.DataFrame(
    {
        # Positional pairing: row i of the test table <-> prediction i.
        'building_id': df_test_values['building_id'].to_numpy(),
        'damage_grade': preds,
    }
).set_index('building_id')
my_df

building_id,damage_grade
300051,3
99355,2
890251,2
745817,1
421793,3
...,...
310028,2
663567,2
1049160,2
442785,2


### Pasaje a CSV

In [37]:
# Write the prediction table to disk as a CSV file.
my_df.to_csv('./../../datasets/prueba.csv')