In [1]:
import pandas as pd
from sklearn.utils import parallel_backend
from sklearn.model_selection import cross_val_score, KFold
from sklearn import datasets
from sklearn import svm
from joblibspark import register_spark

In [2]:
# Must run before any `parallel_backend('spark', ...)` block below, which
# selects the backend by the name 'spark'.
register_spark() # register spark backend

In [3]:
# Load the MNIST training set from a CSV under ./data (path is relative to the
# notebook's working directory) and preview the first rows.
trainDF = pd.read_csv("./data/mnist_train.csv")
trainDF.head()

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Split positionally: first column -> target, every remaining column -> features.
# NOTE(review): this relies on `label` being the first column of the CSV, as the
# preview above suggests; it would silently pick the wrong target if an index
# column were ever written into the file.
y_train = trainDF.iloc[:,0]
X_train = trainDF.iloc[:,1:]

In [5]:
# Load the MNIST test set from a CSV under ./data (path is relative to the
# notebook's working directory) and preview the first rows.
testDF = pd.read_csv("./data/mnist_test.csv")
testDF.head()

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Same positional split as for the training data: first column -> target,
# remaining columns -> features.
# NOTE(review): neither y_test nor X_test is used by any later cell in this
# notebook — confirm whether a held-out evaluation step is missing.
y_test = testDF.iloc[:,0]
X_test = testDF.iloc[:,1:]

## Poly Kernel One (C=1)

In [7]:
# Shared 5-fold splitter passed as `cv=` to every cross_val_score call below,
# so all six models are scored on the same folds. No shuffle/random_state is
# given, so KFold's defaults apply.
kfold = KFold(n_splits=5)

In [8]:
# Polynomial-kernel SVC with C=1; all other hyperparameters are left at the
# SVC defaults.
clf_PolyOne = svm.SVC(kernel='poly', C=1)

In [9]:
# Cross-validate clf_PolyOne on the training data using the shared `kfold`
# splitter, with the 'spark' backend selected for parallel work (n_jobs=-1).
with parallel_backend('spark', n_jobs=-1):
    p1scores = cross_val_score(clf_PolyOne, X_train, y_train, cv=kfold)

# One score per fold (no `scoring=` is passed, so the estimator's default
# scorer is used).
print(p1scores)

[0.97441667 0.97316667 0.97316667 0.97108333 0.97675   ]


## Poly Kernel Two (C=0.5)

In [10]:
# Polynomial-kernel SVC with a smaller C (0.5) than Poly Model 1; all other
# hyperparameters are left at the SVC defaults.
clf_PolyTwo = svm.SVC(kernel='poly', C=0.5)

In [11]:
# Cross-validate clf_PolyTwo on the training data using the shared `kfold`
# splitter, with the 'spark' backend selected for parallel work (n_jobs=-1).
with parallel_backend('spark', n_jobs=-1):
    p2scores = cross_val_score(clf_PolyTwo, X_train, y_train, cv=kfold)

# One score per fold (no `scoring=` is passed, so the estimator's default
# scorer is used).
print(p2scores)

[0.97041667 0.96866667 0.96766667 0.967      0.97333333]


## Poly Kernel Three (C=10)

In [12]:
# Polynomial-kernel SVC with a larger C (10) than Poly Model 1; all other
# hyperparameters are left at the SVC defaults.
clf_PolyThree = svm.SVC(kernel='poly', C=10)

In [13]:
# Cross-validate clf_PolyThree on the training data using the shared `kfold`
# splitter, with the 'spark' backend selected for parallel work (n_jobs=-1).
with parallel_backend('spark', n_jobs=-1):
    p3scores = cross_val_score(clf_PolyThree, X_train, y_train, cv=kfold)

# One score per fold (no `scoring=` is passed, so the estimator's default
# scorer is used).
print(p3scores)

[0.97833333 0.976      0.97558333 0.97391667 0.97791667]


## Radial Kernel One (C=1)

In [15]:
# RBF-kernel SVC with C=1; all other hyperparameters are left at the SVC
# defaults.
clf_RadialOne = svm.SVC(kernel='rbf', C=1)

In [16]:
# Cross-validate clf_RadialOne on the training data using the shared `kfold`
# splitter, with the 'spark' backend selected for parallel work (n_jobs=-1).
with parallel_backend('spark', n_jobs=-1):
    r1scores = cross_val_score(clf_RadialOne, X_train, y_train, cv=kfold)

# One score per fold (no `scoring=` is passed, so the estimator's default
# scorer is used).
print(r1scores)

[0.97925    0.97575    0.97733333 0.97383333 0.97858333]


## Radial Kernel Two (C=0.5)

In [17]:
# RBF-kernel SVC with a smaller C (0.5) than Radial Model 1; all other
# hyperparameters are left at the SVC defaults.
clf_RadialTwo = svm.SVC(kernel='rbf', C=0.5)

In [18]:
# Cross-validate clf_RadialTwo on the training data using the shared `kfold`
# splitter, with the 'spark' backend selected for parallel work (n_jobs=-1).
with parallel_backend('spark', n_jobs=-1):
    r2scores = cross_val_score(clf_RadialTwo, X_train, y_train, cv=kfold)

# One score per fold (no `scoring=` is passed, so the estimator's default
# scorer is used).
print(r2scores)

[0.97458333 0.97208333 0.97016667 0.969      0.97408333]


## Radial Kernel Three (C=10)

In [19]:
# RBF-kernel SVC with a larger C (10) than Radial Model 1; all other
# hyperparameters are left at the SVC defaults.
clf_RadialThree = svm.SVC(kernel='rbf', C=10)

In [20]:
# Cross-validate clf_RadialThree on the training data using the shared `kfold`
# splitter, with the 'spark' backend selected for parallel work (n_jobs=-1).
with parallel_backend('spark', n_jobs=-1):
    r3scores = cross_val_score(clf_RadialThree, X_train, y_train, cv=kfold)

# One score per fold (no `scoring=` is passed, so the estimator's default
# scorer is used).
print(r3scores)

[0.98466667 0.98216667 0.98       0.97875    0.98341667]


## Visualization

In [29]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Bar chart comparing the six SVC configurations: bar height is the mean
# cross-validation score and the error bar is the standard deviation across
# folds. Needs p1scores..r3scores from the cross-validation cells above.
model_names = ["Poly Model 1",
               "Poly Model 2",
               "Poly Model 3",
               "Radial Model 1",
               "Radial Model 2",
               "Radial Model 3"]
fold_scores = [p1scores, p2scores, p3scores,
               r1scores, r2scores, r3scores]

# One colour per kernel family so poly and radial models are easy to tell
# apart. (The original cell passed an empty colour list and never closed the
# plt.bar(...) call, so it could not run.)
bar_colors = ["tab:blue"] * 3 + ["tab:orange"] * 3

plt.bar(x = model_names,
        height = [np.mean(scores) for scores in fold_scores],
        yerr = [np.std(scores) for scores in fold_scores],
        color = bar_colors,
        capsize = 4)
plt.ylabel("Mean cross-validation score")
plt.title("SVC cross-validation score by model")
# Six long tick labels overlap at the default figure width; tilt them.
plt.xticks(rotation = 45, ha = "right")
plt.show()

## With Grid Search CV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Base estimator for the grid search. It is built with no arguments; the
# kernel, C and gamma values to try are listed in the `parameters` grid.
est = svm.SVC()

In [None]:
# Search space for the grid search: 2 kernels x 5 values of C x 2 gamma
# settings = 20 candidate configurations.
parameters = dict(
    kernel=('poly', 'rbf'),
    C=[0.5, 1, 1.5, 5, 20],
    gamma=["scale", "auto"],
)

In [None]:
# Exhaustive search over `parameters` with 5-fold cross-validation.
# NOTE(review): `cv=5` lets GridSearchCV build its own splitter instead of
# reusing the `kfold` object from the manual runs above, so the folds (and
# therefore the scores) may not be directly comparable — confirm whether
# `cv=kfold` was intended.
clf = GridSearchCV(est, parameters, cv=5)

In [None]:
# Run the grid search on the full training set with the 'spark' backend
# selected for parallel work (n_jobs=-1). The grid has 20 parameter
# combinations evaluated with 5 folds each, so expect this to be far more
# expensive than any single cross_val_score cell above.
with parallel_backend('spark',n_jobs=-1):
    clf.fit(X_train, y_train)