# Import libraries

In [1]:
import numpy as np
from sklearn.svm import SVC, LinearSVC

# Import dataset

In [2]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

# Feature selection

In [3]:
X, y = mnist["data"], mnist["target"]
X.shape

(70000, 784)

In [4]:
y.shape

(70000,)

# Train test split

In [5]:
X_train = X[:60000]
y_train = y[:60000]
X_test = X[60000:]
y_test = y[60000:]

# Linear SVC model

In [6]:
lin_clf = LinearSVC(random_state=42)
lin_clf.fit(X_train[:10000], y_train[:10000])



LinearSVC(random_state=42)

In [7]:
from sklearn.metrics import accuracy_score

y_pred = lin_clf.predict(X_train)
accuracy_score(y_train, y_pred)

0.8536666666666667

The model gave only 85.37% accuracy on training dataset, but we have to consider the fact that we only trained 10000 rows.

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float32))
X_test_scaled = scaler.transform(X_test.astype(np.float32))

In [9]:
lin_clf = LinearSVC(random_state=42)
lin_clf.fit(X_train_scaled[:10000], y_train[:10000])



LinearSVC(random_state=42)

In [10]:
y_pred = lin_clf.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

0.8745333333333334

After scaling, the model gave only 87.45% accuracy on training dataset, but again we have to consider the fact that we only trained 10000 rows.

# SVC model

In [11]:
svm_clf = SVC(gamma="scale")
svm_clf.fit(X_train_scaled[:10000], y_train[:10000])

SVC()

In [12]:
y_pred = svm_clf.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

0.9455333333333333

The model gave 94.55% accuracy on training dataset which is pretty good, but we have to consider the fact that we only trained 10000 rows.

In [13]:
svm_clf.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

## Hyperparameter tuning on SVC

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
rnd_search_cv = RandomizedSearchCV(svm_clf, param_distributions, n_iter=10, verbose=2, cv=3)
rnd_search_cv.fit(X_train_scaled[:1000], y_train[:1000])

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END .....C=8.984166586383836, gamma=0.00587145898433773; total time=   0.2s
[CV] END .....C=8.984166586383836, gamma=0.00587145898433773; total time=   0.1s
[CV] END .....C=8.984166586383836, gamma=0.00587145898433773; total time=   0.1s
[CV] END .....C=6.324968402404182, gamma=0.07280989176885158; total time=   0.1s
[CV] END .....C=6.324968402404182, gamma=0.07280989176885158; total time=   0.1s
[CV] END .....C=6.324968402404182, gamma=0.07280989176885158; total time=   0.2s
[CV] END .....C=7.84352403020776, gamma=0.001453040969930766; total time=   0.1s
[CV] END .....C=7.84352403020776, gamma=0.001453040969930766; total time=   0.1s
[CV] END .....C=7.84352403020776, gamma=0.001453040969930766; total time=   0.1s
[CV] END ....C=3.455964919668295, gamma=0.017209643003464777; total time=   0.2s
[CV] END ....C=3.455964919668295, gamma=0.017209643003464777; total time=   0.2s
[CV] END ....C=3.455964919668295, gamma=0.017209

RandomizedSearchCV(cv=3, estimator=SVC(),
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F2B407F460>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F2B407F3D0>},
                   verbose=2)

In [15]:
rnd_search_cv.best_estimator_

SVC(C=7.84352403020776, gamma=0.001453040969930766)

In [16]:
rnd_search_cv.best_score_

0.8649937362512213

In [17]:
rnd_search_cv.best_estimator_.fit(X_train_scaled[:10000], y_train[:10000])

SVC(C=7.84352403020776, gamma=0.001453040969930766)

In [18]:
y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

0.9551333333333333

The tuned model gave 95.51% accuracy on training dataset which is pretty good, but we have to consider the fact that we only trained 10000 rows.

In [19]:
y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

0.9488

The tuned model gave 94.88% accuracy on testing dataset which is really good, but we have to consider the fact that we only trained 10000 rows.

In [20]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, 
                            target_names = np.unique(y)))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97       980
           1       0.98      0.99      0.99      1135
           2       0.92      0.95      0.94      1032
           3       0.94      0.95      0.94      1010
           4       0.95      0.95      0.95       982
           5       0.96      0.92      0.94       892
           6       0.96      0.96      0.96       958
           7       0.89      0.94      0.91      1028
           8       0.95      0.93      0.94       974
           9       0.96      0.92      0.94      1009

    accuracy                           0.95     10000
   macro avg       0.95      0.95      0.95     10000
weighted avg       0.95      0.95      0.95     10000



Apparently the final model is overfitting slightly. It's tempting to tweak the hyperparameters a bit more (e.g. decreasing C and/or gamma), but we would run the risk of overfitting the test set. 