# 0.4.0 Boosting

In [4]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before executing each cell, so edits to local modules (e.g. the
# `utils` package imported further down) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [26]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import AdaBoostClassifier

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error as MSE

# Display settings: never truncate columns or rows when rendering DataFrames.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation warnings; consider
# narrowing the filter once the notebook is stable.
import warnings
warnings.simplefilter("ignore")

In [6]:
import sys
# Add the parent directory to the import search path before importing the
# local `utils` modules below (presumably that is where the package lives —
# TODO confirm). The path is relative, so it resolves against the kernel's
# current working directory.
sys.path.append("../") 

# `path` provides data_raw_dir(), used later to locate the CSV inputs.
import utils.paths as path
# NOTE(review): `direcciones` is not referenced in any cell visible here.
from utils.paths2 import direcciones

# Set SEED for reproducibility; it is passed as random_state to both
# train_test_split calls in this notebook.
SEED = 1

## 0.4.2 Define the AdaBoost classifier 

In [7]:
# Load the preprocessed Indian Liver Patient table from the raw-data directory.
liver_csv = path.data_raw_dir("indian_liver_patient_preprocessed.csv")
indian = pd.read_csv(liver_csv, encoding='utf-8')

# Discard the 'Unnamed: 0' column (presumably a saved index — TODO confirm),
# binding the result to a new frame rather than mutating in place.
indian = indian.drop(columns=['Unnamed: 0'])
indian.head()

Unnamed: 0,Age_std,Total_Bilirubin_std,Direct_Bilirubin_std,Alkaline_Phosphotase_std,Alamine_Aminotransferase_std,Aspartate_Aminotransferase_std,Total_Protiens_std,Albumin_std,Albumin_and_Globulin_Ratio_std,Is_male_std,Liver_disease
0,1.247403,-0.42032,-0.495414,-0.42887,-0.355832,-0.319111,0.293722,0.203446,-0.14739,0,1
1,1.062306,1.218936,1.423518,1.675083,-0.093573,-0.035962,0.939655,0.077462,-0.648461,1,1
2,1.062306,0.640375,0.926017,0.816243,-0.115428,-0.146459,0.478274,0.203446,-0.178707,1,1
3,0.815511,-0.372106,-0.388807,-0.449416,-0.36676,-0.312205,0.293722,0.329431,0.16578,1,1
4,1.679294,0.093956,0.179766,-0.395996,-0.295731,-0.177537,0.755102,-0.930414,-1.713237,1,1


In [9]:
# Target vector: the 'Liver_disease' column as a NumPy array.
y = indian['Liver_disease'].values
# Feature matrix: every column except the target.
X = indian.drop(columns="Liver_disease")

In [10]:
# Hold out 20% of the rows as a test set; SEED fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

In [8]:
# Weak learner: a decision tree limited to depth 2, seeded for reproducibility.
dt = DecisionTreeClassifier(max_depth=2, random_state=SEED)

# AdaBoost ensemble of 180 boosting rounds built on the tree above.
# The tree is passed positionally on purpose: scikit-learn renamed the keyword
# from `base_estimator` to `estimator` in 1.2 and removed the old name in 1.4,
# so `base_estimator=dt` raises TypeError on current releases. The first
# positional parameter is the base learner on both sides of the rename.
ada = AdaBoostClassifier(dt, n_estimators=180, random_state=SEED)

## 0.4.3 Train the AdaBoost classifier

In [12]:
# Train the boosted ensemble on the training split.
ada.fit(X_train, y_train)

# predict_proba returns one column per class; column index 1 is kept as the
# positive-class probability for each test row.
test_proba = ada.predict_proba(X_test)
y_pred_proba = test_proba[:, 1]

## 0.4.4 Evaluate the AdaBoost classifier

In [14]:
# Area under the ROC curve on the held-out test split.
ada_roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

# Report the score rounded to two decimals.
roc_message = 'ROC AUC score: {:.2f}'.format(ada_roc_auc)
print(roc_message)

ROC AUC score: 0.63


## 0.4.6 Define the Gradient Boosting (GB) regressor

In [15]:
# Read the bike data from the raw-data directory and preview the first rows.
bikes_csv = path.data_raw_dir("bikes.csv")
bikes = pd.read_csv(bikes_csv, encoding='utf-8')
bikes.head()

Unnamed: 0,hr,holiday,workingday,temp,hum,windspeed,cnt,instant,mnth,yr,Clear to partly cloudy,Light Precipitation,Misty
0,0,0,0,0.76,0.66,0.0,149,13004,7,1,1,0,0
1,1,0,0,0.74,0.7,0.1343,93,13005,7,1,1,0,0
2,2,0,0,0.72,0.74,0.0896,90,13006,7,1,1,0,0
3,3,0,0,0.72,0.84,0.1343,33,13007,7,1,1,0,0
4,4,0,0,0.7,0.79,0.194,4,13008,7,1,1,0,0


In [16]:
# NOTE: this rebinds X and y, replacing the liver-disease data used earlier.
# Target vector: the 'cnt' column as a NumPy array.
y = bikes['cnt'].values
# Feature matrix: every column except the target.
X = bikes.drop(columns="cnt")

In [17]:
# Hold out 20% of the bike rows for testing; SEED fixes the shuffle.
# This overwrites the train/test variables from the classification section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

In [20]:
# Gradient-boosting regressor: 200 boosting stages, trees capped at depth 4,
# with a fixed random_state so the fit is repeatable.
gb = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=4,
    random_state=2,
)

## 0.4.7 Train the GB regressor

In [21]:
# Train gb on the training split (the fitted estimator is the cell's output).
gb.fit(X=X_train, y=y_train)

GradientBoostingRegressor(max_depth=4, n_estimators=200, random_state=2)

In [23]:
# Predicted 'cnt' values for the held-out rows.
y_pred = gb.predict(X=X_test)

## 0.4.8 Evaluate the GB regressor

In [25]:
# Mean squared error between the held-out targets and gb's predictions,
# then its square root (RMSE).
mse_test = MSE(y_true=y_test, y_pred=y_pred)
rmse_test = mse_test ** 0.5

# Report RMSE to three decimals.
gb_report = 'Test set RMSE of gb: {:.3f}'.format(rmse_test)
print(gb_report)

Test set RMSE of gb: 43.113


## 0.4.10 Regression with Stochastic Gradient Boosting (SGB)

In [27]:
# Stochastic variant of the regressor above: same depth, number of stages and
# seed, plus row subsampling (subsample=0.9) and feature subsampling
# (max_features=0.75).
sgbr = GradientBoostingRegressor(
    max_depth=4,
    subsample=0.9,
    max_features=0.75,
    n_estimators=200,
    random_state=2,
)

## 0.4.11 Train the SGB regressor

In [28]:
# Train sgbr on the training split.
sgbr.fit(X=X_train, y=y_train)

# Predicted 'cnt' values for the held-out rows; this rebinds y_pred,
# replacing gb's predictions.
y_pred = sgbr.predict(X=X_test)

## 0.4.12 Evaluate the SGB regressor

In [29]:
# Mean squared error between the held-out targets and sgbr's predictions,
# then its square root (RMSE). Rebinds mse_test / rmse_test from the gb cell.
mse_test = MSE(y_true=y_test, y_pred=y_pred)
rmse_test = mse_test ** 0.5

# Report RMSE to three decimals.
sgbr_report = 'Test set RMSE of sgbr: {:.3f}'.format(rmse_test)
print(sgbr_report)

Test set RMSE of sgbr: 45.143


In [4]:
# Prints a fixed marker string; NOTE(review): looks like a leftover
# "ran to the end" check — confirm whether it is still needed.
print('ok_')

ok_
