In [13]:
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
import xgboost as xgb

# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

In [20]:
# Load the training features and the training labels from the sibling
# `datasets` directory (the `_FE` suffix presumably marks a
# feature-engineered file -- TODO confirm).
df_train_values = pd.read_csv('./../datasets/train_values_FE.csv')
df_train_labels = pd.read_csv('./../datasets/train_labels.csv')

In [21]:
# Remove the `building_id` column from both frames; the originals are kept
# untouched under their own names.
df_train_values_filtrados = df_train_values.drop(columns=['building_id'])
df_train_labels_filtrados = df_train_labels.drop(columns=['building_id'])

In [22]:
# One-hot encode the training features; the encoded frame replaces the
# filtered one under the same name.
df_train_values_filtrados = pd.get_dummies(data=df_train_values_filtrados)

In [23]:
# Two-step pipeline: standardize the features, then fit a random forest
# with a fixed seed. `make_pipeline` names each step after its class.
scaler = StandardScaler()
forest = RandomForestClassifier(random_state=2018)
pipe = make_pipeline(scaler, forest)
pipe

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=2018))])

In [24]:
# Candidate hyperparameters, keyed as `<pipeline step>__<parameter>` so the
# search can route them to the random forest step of `pipe`.
param_grid = dict(
    randomforestclassifier__n_estimators=[50, 100],
    randomforestclassifier__min_samples_leaf=[1, 5],
)

# Exhaustive search over every combination with 5-fold cross-validation.
gs = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

In [25]:
# Flatten the label frame into a 1-D array, then run the grid search on the
# encoded training features.
y_train = df_train_labels_filtrados.values.ravel()
gs.fit(df_train_values_filtrados, y_train)

Traceback (most recent call last):
  File "C:\Users\gianc\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\gianc\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\gianc\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "C:\Users\gianc\anaconda3\lib\site-packages\joblib\parallel.py", line 1051, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Users\gianc\anaconda3\lib\site-packages\joblib\parallel.py", line 866, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\gianc\anaconda3\lib\site-packages\joblib\parallel.py", line 784, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\gianc\anaconda3\lib\site-packages\joblib\_par

Traceback (most recent call last):
  File "C:\Users\gianc\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\gianc\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\gianc\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "C:\Users\gianc\anaconda3\lib\site-packages\joblib\parallel.py", line 1051, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Users\gianc\anaconda3\lib\site-packages\joblib\parallel.py", line 866, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\gianc\anaconda3\lib\site-packages\joblib\parallel.py", line 784, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\gianc\anaconda3\lib\site-packages\joblib\_par

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(random_state=2018))]),
             param_grid={'randomforestclassifier__min_samples_leaf': [1, 5],
                         'randomforestclassifier__n_estimators': [50, 100]})

In [26]:
# Show the hyperparameter combination selected by the grid search.
best_params = gs.best_params_
print(best_params)

{'randomforestclassifier__min_samples_leaf': 5, 'randomforestclassifier__n_estimators': 100}


In [27]:
# Micro-averaged F1 on the training set itself.
# NOTE(review): this is an in-sample score (same rows the search was fitted
# on), so it is not an estimate of performance on unseen data.
in_sample_preds = gs.predict(df_train_values_filtrados)
f1_score(
    y_true=df_train_labels_filtrados,
    y_pred=in_sample_preds,
    average='micro',
)

0.7207109719456181

### Prueba de los datos de test

In [28]:
# Load the test features and prepare them exactly like the training features.
# NOTE(review): the training CSVs are read from './../datasets/'; confirm
# that this file really lives in the working directory.
df_test_values = pd.read_csv('test_values_FE.csv')

# Drop the identifier, apply the same one-hot encoding used for training, and
# align to the training columns (same set, same order). A dummy column that is
# absent from the test data is filled with 0; a column unseen during training
# is discarded, since the fitted pipeline cannot use it.
df_test_values_filtrados = (
    pd.get_dummies(df_test_values.drop(columns=['building_id']))
    .reindex(columns=df_train_values_filtrados.columns, fill_value=0)
)

In [29]:
# Predict a label for every test row with the estimator held by the
# grid search, then display the resulting array.
test_features = df_test_values_filtrados
preds = gs.predict(test_features)
preds

array([3, 2, 2, ..., 2, 2, 2], dtype=int64)

In [30]:
# Build the submission table: one row per test building with its predicted
# `damage_grade`.
# Only `building_id` is used as the index so that `damage_grade` remains a
# real data column; indexing by both columns left a frame with no columns.
# The CSV written from this frame is unchanged (header and rows are the same).
my_df = pd.DataFrame(
    {
        'building_id': df_test_values['building_id'],
        'damage_grade': preds,
    }
).set_index('building_id')
my_df

building_id,damage_grade
300051,3
99355,2
890251,2
745817,1
421793,3
...,...
310028,2
663567,2
1049160,2
442785,2


In [31]:
# Write the submission table (index included) to a CSV in the working directory.
submission_path = 'prueba_RandomForest_GridSearchCV.csv'
my_df.to_csv(submission_path)