# ML Exercise: Handwritten Digit Recognition

This is a personal exercise to help me understand and practice basic machine learning concepts. The goal is to build a model that can recognize handwritten digits based on the MNIST dataset.

# Setup

In [None]:
# Helper functions

import matplotlib.pyplot as plt
from sklearn.preprocessing import FunctionTransformer

# Plot some of the 28x28 images in the dataset along with
# the corresponding label.
def visualize(dataset, labels, count=12, filename='sample_digits.png'):
    """Draw a grid of sample images titled with their labels, save it and show it.

    Args:
        dataset: Indexable collection of images. Every drawn item must
            support ``.reshape(28, 28)``.
        labels: Indexable collection; ``labels[i]`` is the title of
            ``dataset[i]``.
        count: Maximum number of samples to draw (default 12). Fewer are
            drawn when ``dataset`` or ``labels`` hold fewer items, instead of
            failing with an IndexError.
        filename: Path the figure is written to before it is displayed.
            Pass ``None`` to skip writing a file.
    """
    columns = 4
    # Never index past the end of either collection.
    shown = max(0, min(count, len(dataset), len(labels)))
    # Ceiling division; at least one row so the grid spec is always valid.
    # With the default of 12 samples this is the original 3x4 layout.
    rows = max(1, -(-shown // columns))

    plt.figure(figsize=(6, 2))
    for i in range(shown):
        plt.subplot(rows, columns, i + 1)
        plt.imshow(dataset[i].reshape(28, 28), cmap='gray')
        plt.title(f"Label: {labels[i]}")
        plt.axis('off')
    plt.tight_layout()
    if filename is not None:
        plt.savefig(filename)
    plt.show()

def flatten_images(X):
    """Collapse every axis after the first, giving one 1D row of values per sample."""
    return X.reshape(X.shape[0], -1)


# A model step which flattens the 2D images into a 1D array of pixel values.
# It wraps a named function rather than a lambda: a FunctionTransformer built
# on a lambda cannot be pickled, which would prevent saving a fitted pipeline.
reshape = FunctionTransformer(
    flatten_images,
    validate=False
)

In [None]:
# Load the MNIST dataset

from keras.datasets import mnist

# Load both MNIST splits through Keras, then unpack each into images and labels.
train_split, test_split = mnist.load_data()
xtrain, ytrain = train_split
xtest, ytest = test_split

# Report the sample count, element type and array shape of each split.
print("MNIST dataset loaded.")
print(f"Dataset has {len(xtrain)} examples of type: {xtrain.dtype}. Dataset shape: {xtrain.shape}")
print(f"Test data has {len(xtest)} examples of type: {xtest.dtype}. Dataset shape: {xtest.shape}")

In [None]:
# Preview the first training images of the full dataset with their labels.
visualize(dataset=xtrain, labels=ytrain)

# Exercise 1: Classify zeros and ones

In [None]:
# Boolean mask selecting the training samples whose label is 0 or 1.
mask = (ytrain == 0) | (ytrain == 1)
training_data, training_labels = xtrain[mask], ytrain[mask]

# Same selection for the test split; `mask` is rebuilt because the two splits
# have different lengths.
mask = (ytest == 0) | (ytest == 1)
test_data, test_labels = xtest[mask], ytest[mask]

# Report what is left after filtering.
print("Filtered the original dataset for only zeros and ones.")
print(f"Dataset has {len(training_data)} examples of type: {training_data.dtype}. Dataset shape: {training_data.shape}")
print(f"Test data has {len(test_data)} examples of type: {test_data.dtype}. Dataset shape: {test_data.shape}")

In [None]:
# Preview the first images of the zeros-and-ones subset with their labels.
visualize(dataset=training_data, labels=training_labels)

## 1.1 Attempting a linear regression

Linear regression will not understand this as a classification problem of distinguishing between zeros and ones. It outputs floating-point predictions. The output is what the model actually expects the label to be, and NOT some probability of the label being zero or one. That's why some outputs will be outside the [0, 1] range.

However, because the training labels were always either 0 or 1, the model's predictions don't stray too far from these values in either direction, and it's reasonable to expect that predictions "close enough" to 0 mean the handwritten digit is 0, and predictions "close enough" to 1 mean the handwritten digit is 1.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Three-step pipeline: flatten each image into a vector of pixel values,
# standardize the features so they have comparable magnitude, then fit a
# linear regression on the result.
model = make_pipeline(reshape, StandardScaler(), LinearRegression())

# Train on the zeros-and-ones subset and predict the matching test images.
# fit() returns the fitted pipeline, so the two calls can be chained.
predictions = model.fit(training_data, training_labels).predict(test_data)

# Raw floating-point predictions, followed by the true labels for comparison.
print(predictions)
print(test_labels)

### Binarizing the results

We can convert the outputs of linear regression to a 0/1 classification by setting a threshold separating what we consider to be zero-predictions and what we consider to be one-predictions. Because the predictions cluster around 0 and 1, a threshold near the midpoint, such as 0.49, gives an accuracy of nearly 100%.

In [None]:
import numpy as np

# Cut-off separating zero-predictions from one-predictions. The trailing
# "@param" comment looks like a Colab form annotation (slider from 0 to 1) and
# has to stay on the same line as the assignment.
threshold = 0.49 # @param {type:"slider", min:0, max:1, step:0.01}

# With a single bin edge, digitize returns 0 for predictions below the
# threshold and 1 for predictions at or above it.
binary_predictions = np.digitize(predictions, [threshold])

# Score the thresholded predictions against the true 0/1 test labels.
print(f"Using threshold: {threshold:.2f}")
print("\nClassification Report:")
print(classification_report(test_labels, binary_predictions))
print("Confusion Matrix:")
print(confusion_matrix(test_labels, binary_predictions))

## 1.2 Using logistic regression

Logistic regression directly models this as a classification problem, so it should perform at least as well as linear regression with a threshold.

In this specific example, both approaches are likely to perform similarly, because the data is linearly separable with clear boundaries.

In [None]:
from sklearn.linear_model import LogisticRegression

# Flatten each image, standardize the pixel features (comparable magnitudes
# help the solver converge), then fit an L2-penalized logistic regression
# with C=0.01.
model = make_pipeline(
    reshape,
    StandardScaler(),
    LogisticRegression(penalty='l2', C=0.01),
)

# Train on the zeros-and-ones subset and classify the matching test images.
# fit() returns the fitted pipeline, so the two calls can be chained.
predictions = model.fit(training_data, training_labels).predict(test_data)

# Predicted classes next to the true labels, then the summary metrics.
print(predictions)
print(test_labels)
print("Classification Report:")
print(classification_report(test_labels, predictions))
print("Confusion Matrix:")
print(confusion_matrix(test_labels, predictions))

# Exercise 2: Classify the full dataset



## 2.1 Attempting a linear regression

Again, we can try using linear regression with thresholds to sort the model's output into 10 buckets, one for each digit.

The model does typically output values around the digit. However, the thresholds no longer do a good enough job of neatly separating predictions, because the shapes overlap quite a bit more. It's harder to distinguish between 6 and 7 than it is to distinguish between a simple circle (0) and a simple line (1).

In [None]:
# Flatten each image, standardize the pixel features, then fit a linear
# regression that predicts the digit value itself as a number.
model = make_pipeline(reshape, StandardScaler(), LinearRegression())

# Train on all ten digits and predict the full test set.
# fit() returns the fitted pipeline, so the two calls can be chained.
predictions = model.fit(xtrain, ytrain).predict(xtest)

# Raw floating-point predictions, followed by the true labels for comparison.
print(predictions)
print(ytest)

# Bucket edges halfway between consecutive digits: 0.5, 1.5, ..., 8.5.
# Anything below 0.5 becomes class 0 and anything at or above 8.5 becomes 9.
thresholds = [digit + 0.5 for digit in range(9)]
predicted_classes = np.digitize(predictions, bins=thresholds)

print("Classification Report:")
print(classification_report(ytest, predicted_classes))
print("Confusion Matrix:")
print(confusion_matrix(ytest, predicted_classes))


## 2.2 Using a one-versus-rest approach

Instead of manually setting thresholds to bucket the predictions of a linear regression model, we could use a one-versus-rest approach to train 10 models, one for each digit.

This gives much better accuracy than linear regression with thresholds.


In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Flattening and scaling do not depend on the labels, so they are done once,
# in front of the one-vs-rest step. Wrapping the whole pipeline in
# OneVsRestClassifier would re-fit the same scaler once per digit and produce
# exactly the same per-digit classifiers.
model = make_pipeline(
    # Reshape 2D images into a 1D array of pixel values
    reshape,
    # Normalize the values to have comparable magnitude for
    # better performance and faster convergence
    StandardScaler(),
    # Train one binary logistic regression per digit (that digit vs. the rest)
    OneVsRestClassifier(LogisticRegression(max_iter=2000))
)

# Train the model
model.fit(xtrain, ytrain)

# Test the model
predictions = model.predict(xtest)

# Predicted digits next to the true labels, then the summary metrics.
print(predictions)
print(ytest)

print("Classification Report:")
print(classification_report(ytest, predictions))
print("Confusion Matrix:")
print(confusion_matrix(ytest, predictions))


## 2.3 Multiclass approach

In [None]:
# Flatten each image, standardize the pixel features (comparable magnitudes
# help the solver converge), then fit a single logistic regression over all
# ten classes. No multiclass option is passed; the original note here says
# scikit-learn infers the multinomial formulation on its own.
model = make_pipeline(
    reshape,
    StandardScaler(),
    LogisticRegression(max_iter=2000),
)

# Train on all ten digits and classify the full test set.
# fit() returns the fitted pipeline, so the two calls can be chained.
predictions = model.fit(xtrain, ytrain).predict(xtest)

# Predicted digits next to the true labels, then the summary metrics.
print(predictions)
print(ytest)

print("Classification Report:")
print(classification_report(ytest, predictions))
print("Confusion Matrix:")
print(confusion_matrix(ytest, predictions))