## General Imports

In [None]:
# Data Handling
import pandas as pd
import numpy as np

# Misc
import json # saving/loading metrics

# ML
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

## Helper Functions

In [None]:
def save_dict(dictionary, filename, verbose=False):
    '''
    Saves a dictionary object as a json file for reloading and easy viewing

    Args:
    - dictionary (dict): data to be saved; must be serializable by json.dump
    - filename (str): filename for dictionary to be stored in
    - verbose=False (bool): specifies if the exact filename should be used. If False, the .json extension is appended to filename unless filename already ends with it
    Return:
    - filename (str): filename the dictionary was actually written to (including any appended extension)
    '''
    # Check the suffix, not substring membership: a path that merely contains
    # ".json" somewhere (e.g. "run.json.d/metrics") still needs the extension.
    if (not verbose) and (not filename.endswith('.json')):
        filename += '.json'

    with open(filename, "w") as outfile:
        json.dump(dictionary, outfile)

    return filename
        
def load_dict(filename, verbose=False):
    '''
    Loads a dictionary of metrics from the given filename

    Args:
    - filename (str): file to load
    - verbose=False (bool): specifies if the exact filename should be used. If False, the .json extension is appended to filename unless filename already ends with it
    Return:
    - dictionary (dict): data found in file
    - None (None): returned (after printing the error) when the file does not exist
    '''
    # Check the suffix, not substring membership: a path that merely contains
    # ".json" somewhere (e.g. "run.json.d/metrics") still needs the extension.
    if (not verbose) and (not filename.endswith('.json')):
        filename += '.json'

    try:
        with open(filename) as json_file:
            dictionary = json.load(json_file)
    except FileNotFoundError as e:
        # A missing file is reported and signalled with None instead of raising.
        print(e)
        return None

    return dictionary

## Data Loading

In [None]:
# Load the letter recognition data, binarise the label column, and expose the table as an array
POSITIVE_LETTERS = list('ABCDEFGHIJKLM')  # letters mapped to the positive class (+1); all others become -1

column_names = np.arange(1, 18)  # columns are numbered 1..17; column 1 holds the letter label
letter_df = pd.read_csv('../data/letter-recognition.data', names=column_names)
letter_df[1] = letter_df[1].apply(lambda letter: -1 if letter not in POSITIVE_LETTERS else 1)
letter_data = letter_df.values

### SVM

In [None]:
# Create metric dict: filled by the training loop below with one entry per
# (dataset, trial), each holding the grid-search 'cv_results' and test 'acc'
svm_metric_dict = {}

In [None]:
# Hyper-parameter grid: one sub-grid per kernel so that each kernel is only
# combined with the parameters that apply to it.
# NOTE(review): degree 0 and gamma 0 are skipped by the manual search further
# down in this file, but GridSearchCV will still try them - confirm they are intended.
c_vals = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
poly_degrees = [0, 2, 3]
rbf_gammas = [0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
param_grid = [
    {'kernel': ['linear'], 'C': c_vals},
    {'kernel': ['poly'], 'degree': poly_degrees, 'C': c_vals},
    {'kernel': ['rbf'], 'gamma': rbf_gammas, 'C': c_vals},
]

In [None]:
# Base SVM estimator wrapped in a 5-fold grid search over param_grid,
# scored by accuracy and run on two parallel jobs with progress output
svc = SVC()
clf_svc = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=2,
    verbose=5,
)

In [None]:
# For every dataset run three trials: draw a 5000-row training split, grid-search
# the SVM on it, score the refit best model on the remaining rows, and checkpoint
# the accumulated metrics after each trial.
import os  # local import: only needed to make sure the checkpoint directory exists

# Create the checkpoint directory up front so a long grid search is not lost
# to a missing folder when the first checkpoint is written.
os.makedirs('../checkpoints/svm', exist_ok=True)

# Datasets are paired with a short name: the array itself cannot be used as
# (part of) a dict key (numpy arrays are unhashable) or as a filename component.
for dataset_name, dataset in [('letter', letter_data)]:
    # Treat the first column as the label and the remaining columns as features
    X, y = dataset[:, 1:], dataset[:, 0]
    for i in range(3):
        # Split inside the trial loop so each trial sees a different random
        # training sample; splitting once outside would repeat identical work.
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)
        clf_svc.fit(X_train, y_train)  # Fit training data to model
        y_test_pred = clf_svc.predict(X_test)  # Predict test values using best parameters from classifier
        acc = accuracy_score(y_test, y_test_pred)  # Get accuracy for predictions
        # cv_results_ stores numpy (masked) arrays, which json cannot encode;
        # convert them to plain lists and leave other entries untouched.
        cv_results = {
            key: (value.tolist() if hasattr(value, 'tolist') else value)
            for key, value in clf_svc.cv_results_.items()
        }
        # String key ("<dataset>_<trial>") because JSON object keys cannot be tuples
        svm_metric_dict['{}_{}'.format(dataset_name, i)] = {'cv_results': cv_results, 'acc': float(acc)}
        # Save checkpoint results in case of hardware failure
        save_dict(svm_metric_dict, '../checkpoints/svm/svm_{}_{}.json'.format(dataset_name, i))

In [None]:
###### DEPRECATED IN FAVOUR OF MULTITHREADED SKLEARN GRID SEARCH, KEPT IN CASE OF MEASURING OTHER METRICS
# NOTE(review): this cell uses `kf`, `X_letter_train` and `y_letter_train`, none of which are
# defined in the visible part of this file - they must be created before the cell is run.
from sklearn.model_selection import KFold, ParameterGrid
from tqdm import tqdm

# Cycle across each param combo
performance_dict = {}
for param_dict in tqdm(ParameterGrid(param_grid)):
    # Look parameters up by name: positional unpacking of .values() needs exactly four
    # entries per combination, which the per-kernel sub-grids of param_grid do not provide.
    # A missing degree/gamma is treated as 0, i.e. "not applicable to this kernel".
    C = param_dict['C']
    kernel = param_dict['kernel']
    degree = param_dict.get('degree', 0)
    gamma = param_dict.get('gamma', 0)
    if ((kernel in ('linear', 'rbf') and degree > 0) or  # Don't want to run linear or rbf with polynomial degrees (degree will be ignored but we'll get duplicate trials)
        (kernel == 'poly' and degree == 0) or # Don't want polynomial with degree 0
        (kernel in ('linear', 'poly') and gamma > 0) or # Don't want linear or poly with gamma param
        (kernel == 'rbf' and gamma == 0)): # Don't want rbf with 0 gamma
        continue

    # Pass only the parameter that belongs to the kernel. Previously gamma was never
    # handed to SVC, so every rbf trial silently ran with the default gamma.
    svc_kwargs = {'C': C, 'kernel': kernel}
    if kernel == 'poly':
        svc_kwargs['degree'] = degree
    elif kernel == 'rbf':
        svc_kwargs['gamma'] = gamma

    # Do k fold validation
    fold_scores = []
    for train, validate in kf.split(X_letter_train):
        # Get data folds
        X_letter_train_cross, X_letter_val_cross = X_letter_train[train], X_letter_train[validate]
        y_letter_train_cross, y_letter_val_cross = y_letter_train[train], y_letter_train[validate]
        svm_letter = SVC(**svc_kwargs) # create the model #NOTE: not scaling because all data appears to follow the same scaling regardless
        svm_letter.fit(X_letter_train_cross, y_letter_train_cross.ravel()) # fit the model
        y_letter_val_cross_pred = svm_letter.predict(X_letter_val_cross) # predict validation data
        fold_scores.append(accuracy_score(y_letter_val_cross, y_letter_val_cross_pred)) # keep track of performance

    # Average over the folds actually produced instead of assuming there are 5
    performance = sum(fold_scores) / len(fold_scores)

    # Add performance info to dict; gamma is part of the key so that rbf trials
    # with different gamma values no longer overwrite each other
    performance_dict[(C, degree, gamma, kernel)] = performance