# Training a classifier for hand-written digits

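Data Source:
* MNIST (`mnist_784`, version 1), loaded from OpenML with `sklearn.datasets.fetch_openml`
* Related scikit-learn example: https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html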

Useful reading:
* https://stackoverflow.com/questions/42471523/how-can-i-generate-a-proper-mnist-image
* https://stackoverflow.com/questions/45539289/convert-image-from-28-28-4-to-2d-flat-array-and-write-to-csv
* https://stackoverflow.com/questions/61552402/if-image-has-28-28-3-shape-how-do-i-convert-it-to-28-28-1
* https://stackoverflow.com/questions/51205502/convert-a-black-and-white-image-to-array-of-numbers

In [1]:
# ! conda install xgboost -y

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.10.3
  latest version: 4.12.0

Please update conda by running

    $ conda update -n base conda



## Package Plan ##

  environment location: /home/studio-lab-user/.conda/envs/default

  added / updated specs:
    - xgboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _py-xgboost-mutex-2.0      |            cpu_0           8 KB  conda-forge
    ca-certificates-2022.5.18.1|       ha878542_0         144 KB  conda-forge
    certifi-2022.5.18.1        |   py39hf3d152e_0         150 KB  conda-forge
    libxgboost-1.5.1           |   cpu_h3d145d1_2         3.6 MB  conda-forge
    openssl-1.1.1o             |       h166bdaf_0         2.1 MB  conda-forge
    py-xgboost-1.5.1           |cpu_py39h4655687_2         152 KB  conda-forge
    xgboost-1.5.1              |cpu_py39h4655687_2   

In [3]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn import metrics
import numpy as np
import pandas as pd
import pickle
from sklearn.datasets import fetch_openml

In [2]:
# Load the MNIST digits dataset ("mnist_784", version 1) from OpenML
mnist = fetch_openml(name='mnist_784', version=1)
# Last expression of the cell: list the fields available on the result
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
# Pull the feature matrix and the label vector out of the loaded dataset
X = mnist["data"]
y = mnist["target"]
# Report the dimensions of each
for part in (X, y):
    print(part.shape)

(70000, 784)
(70000,)


In [4]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Exploratory Analysis

In [5]:
# Take the first image as a flat NumPy vector of pixel values
first_row = X.iloc[0]
some_digit = np.array(first_row)
print(some_digit.shape[0])
# Last expression of the cell: peek at a 100-pixel window of the image
some_digit[400:500]

784


array([  0.,   0.,   0.,   0.,   0.,  81., 240., 253., 253., 119.,  25.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,  45., 186., 253., 253., 150.,  27.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,  16.,  93., 252.,
       253., 187.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0., 249., 253., 249.,  64.,   0.,   0.,
         0.])

In [6]:
# Fold the flat pixel vector back into a 28x28 grid
some_digit_image = np.reshape(some_digit, (28, 28))
# Print the number of rows, then the number of columns
n_rows, n_cols = some_digit_image.shape
print(n_rows)
print(n_cols)
# Wrap the grid in a DataFrame so it can be viewed as a table
some_digit_df = pd.DataFrame(some_digit_image)

28
28


In [None]:
# Lift pandas' column display limit so every column of the grid is rendered
pd.options.display.max_columns = None
some_digit_df

In [None]:
# Look at the first label: its value, its Python type, and its value
# after converting the whole label vector to unsigned 8-bit integers
first_label = y[0]
print(first_label)
print(type(first_label))
y_int = y.astype(np.uint8)
print(y_int[0])

In [None]:
# Render the 28x28 digit as an image, write it to disk, then display it
import matplotlib as mpl
import matplotlib.pyplot as plt

fig = plt.figure()
plt.imshow(
    some_digit_image,
    cmap=mpl.cm.binary,
    interpolation="nearest",
)
plt.axis("off")  # hide the axes and tick marks
plt.savefig('model_outputs/foo_five.png')
plt.show()

## Preprocessing

In [None]:
# Standardize the pixel columns: the scaler is fitted on the training
# split only and then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Persist the fitted scaler to disk.
# The context manager closes the file even if pickling raises.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

## Single Decision Tree

In [None]:
# Configure a single decision tree; the hyperparameter values are arbitrary
tree_params = {
    'criterion': 'entropy',
    'max_depth': 7,
    'min_samples_leaf': 10,
    'class_weight': 'balanced',
}
tree_model = DecisionTreeClassifier(**tree_params)

In [None]:
# Fit the decision tree on the standardized training split
tree_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict labels for the test split, then print the first ten
# predictions followed by the first ten true labels
y_preds = tree_model.predict(X_test_scaled)
for labels in (y_preds, y_test):
    print(list(labels[:10]))

In [None]:
# Score the decision tree: overall accuracy first, then precision,
# recall and F1, each macro-averaged over the classes
print('Accuracy:', metrics.accuracy_score(y_test, y_preds))
macro_scorers = (
    ('Precision:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
    ('F1 Score:', metrics.f1_score),
)
for label, scorer in macro_scorers:
    print(label, scorer(y_test, y_preds, average='macro'))

## Random Forest

In [None]:
# Configure a random forest; the hyperparameter values are arbitrary
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    min_samples_leaf=10,
)


In [None]:
# Training the forest is very time-consuming; to retrain, uncomment the next line
# rf_model.fit(X_train_scaled, y_train)

# As a workaround, reload the model pickled by a previous run.
# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open('model_outputs/rf_model.pkl', 'rb') as model_file:
    rf_model = pickle.load(model_file)

In [None]:
# Predict test-split labels with the random forest, then print the
# first ten predictions followed by the first ten true labels
y_preds = rf_model.predict(X_test_scaled)
for labels in (y_preds, y_test):
    print(list(labels[:10]))

In [None]:
# Score the random forest: overall accuracy first, then precision,
# recall and F1, each macro-averaged over the classes
print('Accuracy:', metrics.accuracy_score(y_test, y_preds))
macro_scorers = (
    ('Precision:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
    ('F1 Score:', metrics.f1_score),
)
for label, scorer in macro_scorers:
    print(label, scorer(y_test, y_preds, average='macro'))

## XGBoost

There are in general two ways that you can control overfitting in XGBoost:

- The first way is to directly control model complexity.

    - This includes max_depth, min_child_weight and gamma.

- The second way is to add randomness to make training robust to noise.

    - This includes subsample and colsample_bytree.

    - You can also reduce stepsize eta. Remember to increase num_round when you do so.

[source](https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html#:~:text=There%20are%20in,you%20do%20so.)

In [None]:
# Configure the XGBoost classifier; the hyperparameter values are arbitrary
xgb_params = {
    'learning_rate': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 1,
}
xgb_model = XGBClassifier(**xgb_params)

In [None]:
# Training XGBoost is very time-consuming; to retrain, uncomment the next line
# xgb_model.fit(X_train_scaled, y_train)

# As a workaround, reload the model pickled by a previous run.
# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open('model_outputs/xgb_model.pkl', 'rb') as model_file:
    xgb_model = pickle.load(model_file)

In [None]:
# Predict test-split labels with XGBoost, then print the first ten
# predictions followed by the first ten true labels
y_preds = xgb_model.predict(X_test_scaled)
for labels in (y_preds, y_test):
    print(list(labels[:10]))

In [None]:
# Score XGBoost: overall accuracy first, then precision, recall and
# F1, each macro-averaged over the classes
print('Accuracy:', metrics.accuracy_score(y_test, y_preds))
macro_scorers = (
    ('Precision:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
    ('F1 Score:', metrics.f1_score),
)
for label, scorer in macro_scorers:
    print(label, scorer(y_test, y_preds, average='macro'))

## Evaluate on new data

In [None]:
# Read the example user input from its pickle file.
# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('analysis/example-user-input.pkl', 'rb') as input_file:
    array_to_data_output = pickle.load(input_file)

In [None]:
# Inspect the loaded object: its Python type, then its dimensions
loaded_type = type(array_to_data_output)
print(loaded_type)
loaded_shape = array_to_data_output.shape
print(loaded_shape)

In [None]:
# Lift pandas' column display limit, then render the loaded digit
pd.options.display.max_columns = None
array_to_data_output

In [None]:
# Flatten the user input into a single 1-D vector, the layout the models expect
input_values = array_to_data_output.values
some_digit_array = input_values.reshape(-1)
print(some_digit_array[:150])

In [None]:
# Standardize the input with the already-fitted scaler; the vector is
# wrapped in a list so it is treated as a batch of one sample
single_sample_batch = [some_digit_array]
some_digit_scaled = scaler.transform(single_sample_batch)
first_scaled_row = some_digit_scaled[0]
print(first_scaled_row[:50])

In [None]:
# Random Forest: predicted digit for the user input, plus the model's
# confidence (highest class probability) expressed as a percentage
rf_pred = rf_model.predict(some_digit_scaled)
rf_prob_array = rf_model.predict_proba(some_digit_scaled)
top_probability = max(rf_prob_array[0])
rf_prob = round(top_probability * 100, 2)
print(f'Digit: {rf_pred[0]}', f'Probability: {rf_prob}%')

In [None]:
# make a prediction: XGBoost
# Query xgb_model (not tree_model) so the reported digit and confidence
# really come from the XGBoost classifier.
xgb_pred = xgb_model.predict(some_digit_scaled)
xgb_prob_array = xgb_model.predict_proba(some_digit_scaled)
# Confidence = highest class probability for the single input row, as a percentage
xgb_prob = max(xgb_prob_array[0])
xgb_prob = round(xgb_prob * 100, 2)
print(f'Digit: {xgb_pred[0]}', f'Probability: {xgb_prob}%')

## Pickle the trained models

In [None]:
# Persist the random forest model to disk.
# The context manager closes the file even if pickling raises.
with open('model_outputs/rf_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

In [None]:
# Persist the XGBoost model to disk.
# The context manager closes the file even if pickling raises.
with open('model_outputs/xgb_model.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)