In [9]:
# Import module
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV


In [19]:
# load data
TRAIN_DATASET = "./../data/csv/mnist_train.csv"
TEST_DATASET = "./../data/csv/mnist_test.csv"


def load_data(filename):
    """Load the data of the file given in parameter."""
    
    data = np.loadtxt(filename, delimiter=",")
    train = np.asfarray(data[:, 1:])
    labels = np.reshape(data[:, :1], -1)
    
    #Random data for Cross-validation
    random_indices = np.random.randint(0,data.shape[0],1000)
    random_data = data[random_indices]
    trainingShort = np.asfarray(random_data[:, 1:])
    labelsShort = np.reshape(random_data[:, :1],-1)

    print("> {} loaded".format(filename.split("/")[-1]))
    
    return train, labels, trainingShort, labelsShort

# Load the training and the test set.
training_data, training_labels, training_dataShort, training_labelsShort = load_data(TRAIN_DATASET)
test_data, test_labels, test_dataShort, test_labelsShort = load_data(TEST_DATASET)

> mnist_train.csv loaded
> mnist_test.csv loaded


# SVM scikit-learn
* First classifier with default values to see the difference in terms of accuracy between an optimized classifier and an not optimized one
* Second classifier with cross-validation on 1000 random digits from the entire training data set to find the best parameters 
* These paremeters will be used for the SVM applied on the entire data test set 


In [45]:
# SVM of sklearn 

# First classifier with default values -> Not optimized"
clfNotOptimized = svm.SVC(gamma='auto')
clfNotOptimized.fit(training_dataShort, training_labelsShort)

# Second classifier -> optimized with cross-validation to get the best parameters
parameters = {'kernel':('rbf','linear'), 'C':[1, 10, 100, 10000], 'gamma':[1, 10, 100, 1000]}
nb_folds = 5
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=nb_folds)
clf.fit(training_dataShort, training_labelsShort)
print("> The best parameters are: ", clf.best_params_)




> The best parameters are:  {'C': 1, 'gamma': 1, 'kernel': 'linear'}


In [47]:
from sklearn import metrics
y1_pred = clfNotOptimized.predict(test_data)
y2_pred = clf.predict(test_data)

print("> Accuracy of the default classifier:",metrics.accuracy_score(test_labels, y1_pred)*100,"%")
print("> Accuracy of the classifier with best parameters:",metrics.accuracy_score(test_labels, y2_pred)*100,"%")

> Accuracy of the default classifier: 9.58 %
> Accuracy of the classifier with best parameters: 88.33 %
