# Gradient boosting - handwritten number image recognition

## Import required libraries

In [22]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

## Import data and convert to a dataframe

In [23]:
digits = load_digits()
X = digits.data
y = digits.target

In [24]:
df = pd.DataFrame(data=X, columns=[f"pixel_{i+1}" for i in range(X.shape[1])])
df["target"] = y

In [25]:
df.head(100)

Unnamed: 0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,pixel_10,...,pixel_56,pixel_57,pixel_58,pixel_59,pixel_60,pixel_61,pixel_62,pixel_63,pixel_64,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,11.0,16.0,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,12.0,16.0,15.0,0.0,0.0,6
96,0.0,1.0,9.0,16.0,15.0,10.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,11.0,16.0,8.0,0.0,0.0,0.0,8
97,0.0,0.0,0.0,3.0,14.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,7.0,0.0,0.0,4
98,0.0,2.0,15.0,16.0,16.0,13.0,2.0,0.0,0.0,1.0,...,0.0,0.0,2.0,15.0,16.0,14.0,5.0,0.0,0.0,3


## Split the data into training and test sets

In [26]:
X = df.drop(['target'], axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Fitting the model

For this model, we will be using the Gradient Boosting classifier, this is a supervised learning model that belongs to a family of emsemble methods.The Gradient Boosting Classifier, uses decision trees as the base models and combines them to make predictions. It works by iteratively improving the base model by focusing on the previous miscalculated samples, and the algorithm produces a model that predicts the residuals and adds it to the previous calculated model. It does this until either a stopping criterion is met or the model reaches a satisfactory performance. 

In [31]:
gb = GradientBoostingClassifier(random_state=12)

The code above creates a class instance of the Gradient Boosting classifier, with the parameter `random_state` ensuring that the output is reproducible rather than a random outcome each time the code is run.

In [28]:
param_grid = {'n_estimators': [25, 50, 75, 100],
              'learning_rate': [0.05, 0.1, 0.15, 0.2]}

The `param_grid` is the values of the different parameters that we test for in the cross-validation.

In [29]:
grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(random_state=12),
             param_grid={'learning_rate': [0.05, 0.1, 0.15, 0.2],
                         'n_estimators': [25, 50, 75, 100]},
             scoring='neg_mean_squared_error')

The `GridSearchCV` is a class which performs a search of the different hyperparameter space of an estimator to determine which is the best fit. In this case, we have four different parameters in our class instance, which are:
* `cv` specifies the number of folds to be used in KFold cross validation.
* `param_grid` is a dictionary of the different hyperparameters and values the class will iterate over and test for in our grid search, in this case it is pre-determined values in our `param_grid` above.
* `estimator` is the machine learning estimator we want to optimize the use of in the model.
* `scoring` is the metric we use to evaluate the performance of each model fit, in this case we use the `neg_mean_squared_error` metric.

The grid search fit method runs the class instance with our dataset.

In [30]:
print(f"Best n_estimators: {grid_search.best_params_['n_estimators']}")
print(f"Best learning_rate: {grid_search.best_params_['learning_rate']}")
print(f"Best MSE: {-grid_search.best_score_}")

Best n_estimators: 50
Best learning_rate: 0.2
Best MSE: 1.2767641597028785
