In [38]:
# import dependencies
import pandas as pd
from sklearn import svm, metrics # support vector machine
import numpy as np

In [30]:
# import data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
# data exploration
print("The shape of the data is:", train.shape, "for the training set, and:", test.shape, "for the test set")
print("For the training set, the first column is the label of the digit (0 to 9), and the remaining columns are the pixel color value (0 to 255)")
print("Head of the training set:")
train.head()

The shape of the data is: (42000, 785) for the training set, and: (28000, 784) for the test set
For the training set, the first column is the label of the digit (0 to 9), and the remaining columns are the pixel color value (0 to 255)
Head of the training set:


Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# prepare data for training
x = train.loc[:, train.columns != "label"].values
y = train["label"].values

In [6]:
split_point = int(0.66 * len(x))
print(split_point)

27720


In [10]:
x_train = x[:split_point]
y_train = y[:split_point]
print("Size of the training set:", len(x_train), "images")

Size of the training set: 27720 images


In [12]:
# define & train the SVM classifier
classifier = svm.LinearSVC()
classifier.fit(x_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [14]:
# Preparation of the test set
x_test = x[split_point:]
y_test = y[split_point:]

print("The size of the test set is", len(x_test), "images")

The size of the test set is 14280 images


In [15]:
# test the model
predicted = classifier.predict(x_test)

In [28]:
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.98      0.90      0.94      1442
          1       0.94      0.97      0.96      1613
          2       0.77      0.87      0.82      1376
          3       0.82      0.85      0.83      1468
          4       0.89      0.87      0.88      1339
          5       0.91      0.66      0.77      1296
          6       0.96      0.89      0.93      1388
          7       0.75      0.96      0.84      1504
          8       0.68      0.89      0.77      1401
          9       0.92      0.60      0.72      1453

avg / total       0.86      0.85      0.85     14280



In [29]:
print(metrics.confusion_matrix(y_test, predicted))

[[1294    0   40   21    4   23   12    5   43    0]
 [   0 1562   13    5    2    0    1    5   25    0]
 [   3   16 1197   35   10    2   12   33   63    5]
 [   0    8   77 1243    0    6    2   25  101    6]
 [   3    9   43    9 1162    2   10   24   43   34]
 [   7   13   38  127   20  857   12   12  200   10]
 [  10    7   57    9    7   17 1242    4   34    1]
 [   3    2   14    1   14    3    1 1443    6   17]
 [   1   34   37   32    6   14    2   25 1243    7]
 [   4    4   41   31   79   16    0  345   67  866]]


In [31]:
# now make a prediction on the "real" data set
predicted_real = classifier.predict(test)

In [37]:
print(len(predicted_real))
predicted_real

28000


array([2, 0, 8, ..., 3, 9, 2], dtype=int64)

In [62]:
df_predictions = pd.DataFrame(predicted_real)
df_predictions.shape

(28000, 1)

In [63]:
df_predictions.columns = ["Label"]
df_predictions["ImageId"] = df_predictions.index

In [66]:
f = lambda x: x+1
df_predictions["ImageId"] = df_predictions["ImageId"].apply(f)

In [67]:
df_predictions.head()

Unnamed: 0,Label,ImageId
0,2,1
1,0,2
2,8,3
3,7,4
4,2,5


In [68]:
df_predictions.to_csv("predicted_data.csv", header = True, index = False)