## General Imports

In [2]:
import pandas as pd
import sklearn
import numpy as np
from tqdm import tqdm

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold, ParameterGrid
from sklearn.metrics import accuracy_score

### SVM

In [3]:
POSITIVE_LETTERS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M']

In [4]:
# Import letter recognition data, transform label, and convert to array
letter_df = pd.read_csv('../data/letter-recognition.data', names=np.arange(1,18))
letter_df[1] = letter_df[1].apply(lambda letter: 1 if letter in POSITIVE_LETTERS else -1)
letter_data = letter_df.values

In [5]:
# Create SVM classifier for letter recognition data
# NOTE: Use pipeline to scale first


In [6]:
X, y = letter_data[:, 1:], letter_data[:, :1]

In [7]:
# Split data
# Want to do multiple iterations here
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)
kf = KFold(n_splits=5)

In [8]:
# Create grid
param_grid = {'kernel': ['linear', 'poly', 'rbf'], 'degree': [0, 2, 3], 'C': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]}

In [None]:
# Cycle across each param combo
performance_dict = {}
for param_dict in tqdm(list(ParameterGrid(param_grid))):
    performance = 0
    C, degree, kernel = param_dict.values()
    if (kernel in ('linear', 'rbf') and degree > 0) or (kernel == 'poly' and degree == 0): 
        continue
    # Do k fold validation
    for train, validate in tqdm(kf.split(X_train)):
        X_train_cross, X_val_cross, y_train_cross, y_val_cross = X_train[train], X_train[validate], y_train[train], y_train[validate] # get data folds
        svm_letter = SVC(C=C, degree=degree, kernel=kernel) # create the model
        svm_letter.fit(X_train_cross, y_train_cross.ravel()) # fit the model
        y_val_cross_pred = svm_letter.predict(X_val_cross) # predict validation data
        performance += accuracy_score(y_val_cross, y_val_cross_pred) # keep track of performance
    # Average the performance
    performance /= 5
    
    # Add performance info to dict
    performance_dict[(C, degree, kernel)] = performance

  0%|          | 0/99 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:00,  2.54it/s][A
2it [00:00,  2.64it/s][A
3it [00:01,  2.73it/s][A
4it [00:01,  2.79it/s][A
5it [00:01,  2.85it/s][A
  1%|          | 1/99 [00:01<02:52,  1.76s/it]
0it [00:00, ?it/s][A
1it [00:00,  1.83it/s][A
2it [00:01,  1.78it/s][A
3it [00:01,  1.77it/s][A
4it [00:02,  1.77it/s][A
5it [00:02,  1.76it/s][A
  3%|▎         | 3/99 [00:04<02:39,  1.66s/it]
0it [00:00, ?it/s][A
1it [00:00,  2.75it/s][A
2it [00:00,  2.73it/s][A
3it [00:01,  2.71it/s][A
4it [00:01,  2.66it/s][A
5it [00:01,  2.64it/s][A
  5%|▌         | 5/99 [00:06<02:15,  1.44s/it]
0it [00:00, ?it/s][A
1it [00:00,  2.60it/s][A
2it [00:00,  2.54it/s][A
3it [00:01,  2.54it/s][A
4it [00:01,  2.50it/s][A
5it [00:02,  2.49it/s][A
  8%|▊         | 8/99 [00:08<01:50,  1.21s/it]
0it [00:00, ?it/s][A
1it [00:00,  2.95it/s][A
2it [00:00,  2.90it/s][A
3it [00:01,  2.86it/s][A

In [None]:
performance_dict

In [66]:
svm_letter.fit(X_train, y_train.ravel())

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [67]:
y_train_pred = svm_letter.predict(X_train)
accuracy_score(y_train, y_train_pred)

0.8926

In [68]:
y_test_pred = svm_letter.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.8721333333333333

In [9]:
for i in tqdm(list(ParameterGrid(param_grid))):
    pass

100%|██████████| 99/99 [00:00<00:00, 92956.37it/s]
