In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from sklearn.datasets import load_digits, load_boston, load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from GradientBoost import GradientBoost

# Lets try binary classification

In [3]:
cancer = load_breast_cancer()

In [4]:
X_train, X_test, y_train, y_test \
= train_test_split(cancer.data, cancer.target, test_size=0.3, random_state=17)

#### Lets try sklearn's GradientBoostingClassifier

In [5]:
from sklearn.ensemble import GradientBoostingClassifier

In [6]:
gb = GradientBoostingClassifier(n_estimators=10, random_state=17, criterion='mse')

In [7]:
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=10,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=17, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [8]:
preds = gb.predict(X_test)

In [9]:
accuracy_score(y_test, preds)

0.9649122807017544

In [10]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.98      0.92      0.95        61
           1       0.96      0.99      0.97       110

    accuracy                           0.96       171
   macro avg       0.97      0.95      0.96       171
weighted avg       0.97      0.96      0.96       171



#### Lets try our GradientBoost implementation

In [11]:
gb_my = GradientBoost(n_estimators=10, random_seed=17)

In [12]:
gb_my.fit(X_train, y_train)

In [13]:
preds = gb_my.predict(X_test)

In [14]:
accuracy_score(y_test, preds)

0.9649122807017544

In [15]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.98      0.92      0.95        61
           1       0.96      0.99      0.97       110

    accuracy                           0.96       171
   macro avg       0.97      0.95      0.96       171
weighted avg       0.97      0.96      0.96       171



# Now lets try multiclass classification

In [16]:
digits = load_digits()

In [17]:
X_train, X_test, y_train, y_test \
= train_test_split(digits.data, digits.target, test_size=0.3, random_state=17)

#### Lets try sklearn's GradientBoostingClassifier

In [18]:
gb = GradientBoostingClassifier(n_estimators=10, random_state=17, criterion='mse')

In [19]:
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=10,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=17, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [20]:
preds = gb.predict(X_test)

In [21]:
accuracy_score(y_test, preds)

0.8777777777777778

In [22]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        51
           1       0.86      0.86      0.86        59
           2       0.94      0.96      0.95        49
           3       0.90      0.81      0.85        57
           4       0.96      0.82      0.88        56
           5       0.89      0.80      0.84        59
           6       0.93      0.87      0.90        47
           7       0.86      1.00      0.93        56
           8       0.76      0.92      0.83        48
           9       0.74      0.79      0.77        58

    accuracy                           0.88       540
   macro avg       0.88      0.88      0.88       540
weighted avg       0.88      0.88      0.88       540



#### Lets try our GradientBoost implementation

In [23]:
gb_my = GradientBoost(n_estimators=10, random_seed=17)

In [24]:
gb_my.fit(X_train, y_train)

In [25]:
preds = gb_my.predict(X_test)

In [26]:
accuracy_score(y_test, preds)

0.8796296296296297

## Let's do gridsearch

In [27]:
gb_params = {
    'n_estimators' : [10, 20, 40],
    'learning_rate' : [0.1, 0.05, 0.01],
    'max_depth' : [3, 5, 7] 
}

In [28]:
gb_grid = GridSearchCV(gb_my, gb_params, cv=3, verbose=1, scoring='accuracy')

In [29]:
gb_grid.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed: 536.4min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=GradientBoost(criterion='mse', is_classification=True,
                                     learning_rate=0.1, max_depth=3,
                                     max_features=0, min_samples_split=2,
                                     n_estimators=10, random_seed=17),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.1, 0.05, 0.01],
                         'max_depth': [3, 5, 7], 'n_estimators': [10, 20, 40]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=1)

In [30]:
gb_grid.best_params_, gb_grid.best_score_

({'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 40}, 0.92442322991249)

In [31]:
preds = gb_grid.predict(X_test)

In [32]:
accuracy_score(y_test, preds)

0.9407407407407408

In [33]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        51
           1       0.95      0.93      0.94        59
           2       0.96      0.98      0.97        49
           3       0.96      0.89      0.93        57
           4       1.00      0.95      0.97        56
           5       0.91      0.90      0.91        59
           6       1.00      0.94      0.97        47
           7       0.92      1.00      0.96        56
           8       0.81      1.00      0.90        48
           9       0.93      0.86      0.89        58

    accuracy                           0.94       540
   macro avg       0.94      0.94      0.94       540
weighted avg       0.94      0.94      0.94       540



# Regression

Let's try our hands at regression

In [34]:
ds = load_boston()

In [35]:
X_train, X_test, y_train, y_test = train_test_split(ds.data, ds.target, test_size=0.3, random_state=17)

#### Lets try sklearn's GradientBoostingRegressor

In [36]:
from sklearn.ensemble import GradientBoostingRegressor

In [37]:
gb_reg = GradientBoostingRegressor(n_estimators=10, random_state=17, criterion='mse')

In [38]:
gb_reg.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=10,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=17, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [39]:
preds = gb_reg.predict(X_test)

In [40]:
mean_squared_error(y_test, preds)

19.211175508346717

#### Let's try our implementation

In [41]:
gb_reg_my = GradientBoost(n_estimators=10, is_classification=False, random_seed=17)

In [42]:
gb_reg_my.fit(X_train, y_train)

In [43]:
preds = gb_reg_my.predict(X_test)

In [44]:
mean_squared_error(y_test, preds)

19.032410516174092