## General Imports

In [1]:
import pandas as pd
import sklearn
import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

## Data Loading

In [3]:
# Import letter recognition data, transform label, and convert to array
POSITIVE_LETTERS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M']

letter_df = pd.read_csv('../data/letter-recognition.data', names=np.arange(1,18))
letter_df[1] = letter_df[1].apply(lambda letter: 1 if letter in POSITIVE_LETTERS else -1)
letter_data = letter_df.values

### SVM

In [4]:
# Get data
X_letter, y_letter = letter_data[:, 1:], letter_data[:, :1]

In [5]:
# Split data
# Want to do multiple iterations here
X_letter_train, X_letter_test, y_letter_train, y_letter_test = train_test_split(X_letter, y_letter, train_size=5000, shuffle=True)

In [6]:
# Create grid
c_vals = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
param_grid = [{'kernel': ['linear'], 'C': c_vals}, {'kernel': ['poly'], 'degree': [0,2,3], 'C': c_vals}, {'kernel': ['rbf'], 'gamma': [0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2], 'C': c_vals}]

In [7]:
# Create model & grid search object
svc = SVC()
clf_svc = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, n_jobs=2, verbose=5, scoring='accuracy')

clf_svc.fit(X_letter_train, y_letter_train.ravel())
clf_svc.cv_results_

Fitting 5 folds for each of 143 candidates, totalling 715 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    4.7s
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed: 45.5min
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed: 46.3min
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed: 80.9min
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed: 83.1min
[Parallel(n_jobs=2)]: Done 644 tasks      | elapsed: 85.7min
[Parallel(n_jobs=2)]: Done 715 out of 715 | elapsed: 88.1min finished


{'mean_fit_time': array([5.02764034e-01, 5.01872873e-01, 5.01892757e-01, 4.42322540e-01,
        4.10009098e-01, 3.86174631e-01, 6.82751751e-01, 2.46379108e+00,
        1.59837862e+01, 1.29171826e+02, 1.15115161e+03, 4.03417349e-01,
        5.23229218e-01, 5.20933151e-01, 4.02800131e-01, 5.23405838e-01,
        5.18744135e-01, 4.02636528e-01, 5.21752357e-01, 5.19067478e-01,
        4.02730989e-01, 5.22144794e-01, 4.89948225e-01, 4.03102875e-01,
        4.73005819e-01, 4.09091377e-01, 4.03197718e-01, 4.10048819e-01,
        3.75821257e-01, 4.02608156e-01, 3.85338593e-01, 4.78411531e-01,
        4.03215170e-01, 5.04001093e-01, 1.26507444e+00, 4.02812862e-01,
        1.58741627e+00, 9.57951031e+00, 4.03323364e-01, 1.20638954e+01,
        7.15348955e+01, 4.02951765e-01, 1.44707632e+02, 4.59503136e+02,
        4.96108055e-01, 6.74510765e-01, 6.81567907e-01, 6.81769848e-01,
        6.79310894e-01, 6.70266151e-01, 6.49084568e-01, 6.55442524e-01,
        7.94046593e-01, 4.95707226e-01, 6.73966

In [9]:
y_letter_test_pred = clf_svc.predict(X_letter_test)
accuracy_score(y_letter_test, y_letter_test_pred)

0.9662666666666667

In [None]:
###### DEPRACATED FOR MULTITHREAD SKLEARN GRID SEARCH, KEPT IN CASE OF MEASURING OTHER METRICS
from sklearn.model_selection import KFold, ParameterGrid
from tqdm import tqdm

# Cycle across each param combo
performance_dict = {}
for param_dict in tqdm(list(ParameterGrid(param_grid))):
    performance = 0
    C, degree, gamma, kernel = param_dict.values()
    if ((kernel in ('linear', 'rbf') and degree > 0) or  # Don't want to run linear or rbf with polynomial degrees (degree will be ignored but we'll get duplicate trials)
        (kernel == 'poly' and degree == 0) or # Don't want polynomial with degree 0
        (kernel in ('linear', 'poly') and gamma > 0) or # Don't want linear or poly with gamma param
        (kernel == 'rbf' and gamma == 0)): # Don't want rbf with 0 gamma
        continue
    # Do k fold validation
    for train, validate in kf.split(X_letter_train):
        X_letter_train_cross, X_letter_val_cross, y_letter_train_cross, y_letter_val_cross = X_letter_train[train], X_letter_train[validate], y_letter_train[train], y_letter_train[validate] # get data folds
        svm_letter = SVC(C=C, degree=degree, kernel=kernel) # create the model #NOTE: not scaling because all data appears to follow the same scaling regardless
        svm_letter.fit(X_letter_train_cross, y_letter_train_cross.ravel()) # fit the model
        y_letter_val_cross_pred = svm_letter.predict(X_letter_val_cross) # predict validation data
        performance += accuracy_score(y_letter_val_cross, y_letter_val_cross_pred) # keep track of performance
    # Average the performance
    performance /= 5
    
    # Add performance info to dict
    performance_dict[(C, degree, kernel)] = performance