# MNIST dataset classification

In [None]:
import scipy.io as sio
import numpy as np
import random
from sklearn.svm import SVC
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import math
import csv

## Data partitioning

In [None]:
# Read in the file: load the MATLAB training file "train.mat" from the
# current working directory. scipy returns a dict keyed by variable name.
train_mat = sio.loadmat("train.mat")
# Dump the loaded dict to see what the file contains.
print(train_mat)

In [None]:
# Understand the data content: list the keys of the loaded dict so the
# name of the data variable can be identified (it is read as 'trainX' below).
train_mat.keys()

In [None]:
# Get the training data: the array stored under the 'trainX' key.
# Later cells read column 784 as the label and columns 0-783 as features.
data_set = train_mat['trainX']
# Bare expression so the notebook displays the array.
data_set

In [None]:
# Check the amount of data in the training set: (rows, columns) of the
# loaded array; at least 10000 rows are needed for the validation draw below.
data_set.shape

In [None]:
# Shuffle the whole provided data set in place. np.random.shuffle permutes
# along the first axis only, so each row (features + label) stays intact.
# NOTE(review): no seed is set, so the split differs from run to run.
np.random.shuffle(data_set)

In [None]:
# Hold out 10000 rows as the validation set: draw 10000 distinct row
# indices without replacement, then gather those rows into their own array.
validation_index = np.array(random.sample(range(len(data_set)), 10000))
validation_set = data_set[validation_index, :]

In [None]:
# Verify the validation set size: the first dimension should be 10000,
# matching the number of indices sampled above.
validation_set.shape

In [None]:
# Build the training set from every row that was NOT drawn for validation
training_data = np.delete(data_set, validation_index, axis=0)
print(training_data.shape)

# Split the label column (index 784) off from the 784 feature columns.
# Both right-hand sides are evaluated before either name is rebound.
training_data, training_labels = training_data[:, :784], training_data[:, 784]

In [None]:
# Normalize the given data
def normalize_data(data):
    """Scale every row of ``data`` to unit Euclidean (L2) length.

    Vectorized replacement for normalizing one row at a time: all row norms
    are computed in a single pass and the whole array is divided at once.

    Args:
        data: 2-D array-like of shape (n_samples, n_features). Integer input
            is converted to float64 before any arithmetic, so small integer
            dtypes cannot overflow.

    Returns:
        A new float64 array of the same shape whose rows have L2 norm 1.
        Rows that are entirely zero are returned as zeros. The input is
        not modified.
    """
    data = np.asarray(data, dtype=np.float64)
    norms = np.linalg.norm(data, axis=1, keepdims=True)
    # Avoid 0/0 for all-zero rows: dividing them by 1 leaves them at zero.
    norms[norms == 0] = 1.0
    return data / norms

## SVM

### Separate labels and data from validation_set

In [None]:
# Column 784 is the label; columns 0-783 are the features.
validation_data, validation_labels = validation_set[:, :784], validation_set[:, 784]
# Sanity-check both shapes (labels first, then data), one per line.
print(validation_labels.shape, validation_data.shape, sep="\n")

In [None]:
# Row-normalize the validation split and the training split with the same
# helper (validation first, then training) so both live in the same scale.
norm_validation_data, norm_training_data = map(
    normalize_data, (validation_data, training_data)
)
    

### Training on n examples

In [None]:
# Module-level accumulators for the learning-curve experiment.
accuracy_score = []
training_accuracy_score = []
def training_classifier(n, param = 1.0, l = accuracy_score, l2 = None):
    """Fit a linear-kernel SVC on the first ``n`` training rows and score it.

    Reads the module-level ``norm_training_data``, ``training_labels``,
    ``norm_validation_data`` and ``validation_labels``.

    Args:
        n: Number of leading rows of the normalized training set to fit on.
        param: Value passed to ``SVC`` as the regularization parameter ``C``.
        l: List that receives the validation accuracy. The default is the
            module-level ``accuracy_score`` list object that existed when
            this function was defined.
        l2: Optional list that receives the accuracy on the same ``n``
            training rows. Skipped when ``None``.

    Returns:
        ``(validation_score, training_score)``; ``training_score`` is
        ``None`` when ``l2`` is ``None``.
    """
    # Take the first n training examples and their labels
    train_x = norm_training_data[0:n]
    train_y = training_labels[0:n]
    # Build and fit the SVC classifier
    clf = SVC(C = param, kernel='linear')
    clf.fit(train_x, train_y)
    # Accuracy on the held-out validation set
    validation_score = clf.score(norm_validation_data, validation_labels)
    l.append(validation_score)
    training_score = None
    # Identity test: `!= None` would be evaluated element-wise (and then
    # fail in a boolean context) if an array-like were ever passed as l2.
    if l2 is not None:
        training_score = clf.score(train_x, train_y)
        l2.append(training_score)
    return validation_score, training_score

In [None]:
# Train on progressively larger prefixes of the training set and record
# validation and training accuracy for each size.
training_size = [100, 200, 500, 1000, 2000, 5000, 10000]
# Empty the accumulators IN PLACE so re-running this cell does not keep
# appending: the plot needs exactly one score per entry of training_size.
# (In-place clearing keeps the same list objects that other code refers to.)
accuracy_score.clear()
training_accuracy_score.clear()
for n in training_size:
    training_classifier(n, l = accuracy_score, l2 = training_accuracy_score)
print(accuracy_score)

## Plot the accuracy score 

In [None]:
# Learning curves: validation accuracy in red, training accuracy in green.
# Each curve is drawn twice -- a dashed line (which carries the legend
# label) and then circle markers on the same points.
for curve_scores, curve_color, curve_label in (
    (accuracy_score, "r", "validation data set accuracy_score"),
    (training_accuracy_score, "g", "training data set accuracy_score"),
):
    plt.plot(training_size, curve_scores, curve_color + "--", label = curve_label)
    plt.plot(training_size, curve_scores, curve_color + "o")
plt.xlabel("number of training examples")
plt.ylabel("accuracy_score")
# Anchor the legend just outside the top-right corner of the axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
# x range covers the largest training size; accuracy lives in [0, 1].
plt.axis([0, 11000, 0, 1])
plt.show()

## Hyperparameter tuning

In [None]:
# Candidate C values, in descending order: the integers 100 down to 11,
# followed by powers of ten from 10^1 down to 10^-30 (as floats).
Cs = [*range(100, 10, -1)] + [
    math.pow(10, exponent)
    for exponent in (1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -10, -20, -30)
]

In [None]:
def tunner_with_size(n, score_list, c_s):
    """Score every candidate C on ``n`` training rows and track the best one.

    For each value in ``c_s`` a classifier is trained via
    ``training_classifier`` and its validation accuracy is appended to
    ``score_list``. If the best accuracy from this call beats the
    module-level ``max_accuracy``, both ``max_accuracy`` and
    ``max_accuracy_index`` are updated.

    Args:
        n: Number of training rows to fit on.
        score_list: List that receives one validation accuracy per candidate.
        c_s: Iterable of candidate C values.

    Side effects:
        Updates the module-level ``max_accuracy`` and ``max_accuracy_index``;
        both must be initialized before the first call. The stored index is
        the position of the winning value within ``c_s``.
    """
    # Without this declaration the assignments below would create locals
    # and the module-level trackers would never change.
    global max_accuracy, max_accuracy_index
    # Remember where this call's scores start so the index stays aligned
    # with c_s even if score_list was not empty on entry.
    first_new = len(score_list)
    for c in c_s:
        training_classifier(n, param = c, l = score_list)
    new_scores = score_list[first_new:]
    if not new_scores:
        # Empty candidate grid: nothing to compare.
        return
    best_score = max(new_scores)
    if best_score > max_accuracy:
        max_accuracy = best_score
        max_accuracy_index = new_scores.index(best_score)

In [None]:
# Tuning the hyperparameter with different training sizes
sizes = [100,200,500,1000,2000,5000,10000]
# Module-level trackers: the best validation accuracy seen so far and the
# index of the corresponding C. max_accuracy is read inside
# tunner_with_size; max_accuracy_index is used later to pick best_c.
max_accuracy = 0
max_accuracy_index = 0
for size in sizes:
    # Fresh list per size so that entry k is the score for Cs[k].
    scores = []
    tunner_with_size(size, scores, Cs)
    print(size,":", scores)

## Kaggle Prediction

In [None]:
# Look up the C value at the index recorded during hyperparameter tuning.
# NOTE(review): max_accuracy_index is initialized to 0 -- confirm the tuning
# step really updates it, otherwise this always selects Cs[0].
best_c = Cs[max_accuracy_index]
# Bare expression so the notebook displays the chosen value.
best_c

In [None]:
# Load the test file "test.mat" from the working directory, then show the
# full loaded dict followed by just its keys (one per output line).
test_mat = sio.loadmat("test.mat")
print(test_mat, test_mat.keys(), sep="\n")

In [None]:
# Make predictions with a linear SVC using the tuned C value
best_clf = SVC(C = best_c, kernel='linear')

test_data = test_mat['testX']

# Normalize the test data the same way as the training/validation data
norm_test_data = normalize_data(test_data)

# Fit and predict on the NORMALIZED features: C was selected on normalized
# data, so the final model must see inputs on that same scale. (The model is
# fitted on the training split only; the validation rows are not included.)
best_clf.fit(norm_training_data, training_labels)
predictions = best_clf.predict(norm_test_data)

In [None]:
# Write the submission file: a header row, then one "Id,Category" row per
# prediction, with Ids counting up from 0 in prediction order.
# newline='' lets the csv module control line endings itself.
with open('mnist_submission.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['Id', 'Category'])
    # enumerate yields (row_id, predicted_label) pairs, replacing the
    # hand-maintained counter.
    writer.writerows(enumerate(predictions))