# GradientBoostingClassifier

### Importamos las librerías

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
import xgboost as xgb

### Leemos los archivos

In [2]:
df_train_labels = pd.read_csv('train_labels.csv')
df_train_values = pd.read_csv('train_values_FE.csv')

#### Eliminamos el building id para pdoer hacer las predicciones

In [3]:
df_train_labels_filtrados = df_train_labels.drop(['building_id'], axis=1)
df_train_values_filtrados = df_train_values.drop(['building_id'], axis=1)

#### Implementamos el algoritmo

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_train_values_filtrados, df_train_labels_filtrados, test_size=0.3, random_state=123)

In [5]:
gb_clas = GradientBoostingClassifier(n_estimators = 700, max_depth = 8,learning_rate = 0.1,subsample = 0.5)

In [None]:
gb_model = gb_clas.fit(X_train, y_train.values.ravel())

In [8]:
print("train score: " + str(gb_model.score(X_train,y_train)))
print("test score: " + str(gb_model.score(X_test,y_test)))

train score: 0.8151463655300953
test score: 0.6940049372609713


### Entrenamos el modelo usando RandomSearchCV para el ajuste de hiperparametro

### Busqueda de hiperparametros con Random Search

In [6]:
params = {
    "max_depth": [8],
    "subsample": [0.5,1],
    "n_estimators": [500,600],
    "learning_rate": [0.1]
}

random_search = RandomizedSearchCV(
    gb_clas, 
    param_distributions=params, 
    n_iter=50,
    scoring='f1_micro',
    n_jobs=5,
    verbose=3,
    random_state=1001
)

In [7]:
random_search.fit(X_train, y_train)



Fitting 5 folds for each of 4 candidates, totalling 20 fits


  return f(*args, **kwargs)


RandomizedSearchCV(estimator=GradientBoostingClassifier(max_depth=8,
                                                        n_estimators=700,
                                                        subsample=0.5),
                   n_iter=50, n_jobs=5,
                   param_distributions={'learning_rate': [0.1],
                                        'max_depth': [8],
                                        'n_estimators': [500, 600],
                                        'subsample': [0.5, 1]},
                   random_state=1001, scoring='f1_micro', verbose=3)

In [8]:
print(random_search.best_params_)

{'subsample': 1, 'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.1}


In [9]:
in_sample_preds = random_search.predict(df_train_values_filtrados)
f1_score(df_train_labels_filtrados, in_sample_preds, average='micro')

0.7570845852471786

In [10]:
gb_clas = GradientBoostingClassifier(n_estimators = 500, max_depth = 8,learning_rate = 0.1,subsample = 1)

In [11]:
gb_model = gb_clas.fit(X_train, y_train.values.ravel())

In [12]:
print("train score: " + str(gb_model.score(X_train,y_train)))
print("test score: " + str(gb_model.score(X_test,y_test)))

train score: 0.782721192851661
test score: 0.6980852125196659


In [17]:
in_sample_preds = gb_model.predict(X_test)
f1_score(y_test, in_sample_preds, average='micro')

0.6980852125196659

In [20]:
in_sample_preds = gb_clas.predict(df_train_values_filtrados)
f1_score(df_train_labels_filtrados, in_sample_preds, average='micro')

0.7573301714114681