In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import scipy as sp
from scipy import io
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import csv

In [2]:
''' 
Helper Function: plot_accuracy
Purpose: Plots Accuracy of Classifier on Input Data
Params: 
        x - list of sample_sizes
        Y - Accuracy Scores
        name - Name of data_set
Return: plots accuracy, returns None
'''
def plot_accuracy(x, y, name):
    plt.plot(x, y)
    plt.xlabel('Num_samples')
    plt.ylabel('Accuracy')
    plt.title(name)
    plt.show()

In [3]:
''' 
Function: train_svm
Purpose: Trains Classifier and Plot Valid Accuracy and Training Accuracy
Params:
    samples - list of sample_sizes
    clf - classifier to train
    train_set - training data set
    train_y - training labels
    valid_set - valid data set
    valid_y - valid labels
    name - name of data set
Return: list of validation accuracies, list of training accuracies across sample sizes
'''
def train_svm(samples, clf, train_set, train_y, valid_set, valid_y, name):
    valid_score, train_score = train_svm_no_plot(samples, clf, train_set, train_y, valid_set, valid_y)
    plot_accuracy(samples, train_score, name + ' Training_Accuracy')
    plot_accuracy(samples, valid_score, name + ' Validation_Accuracy')
    return valid_score, train_score

In [4]:
'''
Function: train_svm_no_plot
Purpose: Trains Classifier Without Plot, calls train_svm and plot_accuracy
Params: 
    samples - list of sample_sizes
    clf - classifier to train
    train_set - training data set
    train_y - training labels
    valid_set - valid data set
    valid_y - valid labels
    name - name of data set
Return: list of validation accuracies, list of training accuracies across sample sizes
'''
def train_svm_no_plot(samples, clf, train_set, train_y, valid_set, valid_y):
    train_scores = list()
    valid_scores = list()
    print("Sample_size --> valid_score:  train_score ")
    for sample_size in samples:
        clf.fit(train_set[:sample_size], train_y[:sample_size])
        train_score = clf.score(train_set, train_y)
        train_scores.append(train_score)
        valid_score = clf.score(valid_set, valid_y)
        valid_scores.append(valid_score)
        print(str(sample_size) + ' --> ' + str(valid_score) + ' : ' + str(train_score))
    return valid_scores, train_scores

In [5]:
'''
Function: split_train_and_valid_sets
Purpose: To Split Data into Training and Valid Sets
Param:
    data - data_set to split
    size - size of sample
Return: The shape and parititoned valid data set, valid data labels, training data set, training labels
'''
def split_train_and_valid_sets(data, size):
    np.random.shuffle(data)
    shape = data.shape[0] 
    valid_set = data[:size, :-1] 
    valid_y = data[:size, -1] 
    train_set = data[size:, :-1] 
    train_y = data[size:, -1]
    return shape, valid_set, valid_y, train_set, train_y

In [6]:
# -----------MNIST SET-------- #
mnist_dict = io.loadmat('../mnist/train.mat')
mnist_trainX = mnist_dict['trainX']
mnist_train_set, mnist_valid_set, mnist_train_y, mnist_valid_y = train_test_split(mnist_trainX[:, :-1], mnist_trainX[:, -1], test_size=10000, random_state=42)
print('mnist_train_set ' + str(mnist_train_set.shape))
print('mnist_valid_set ' + str(mnist_valid_set.shape))
print('mnist_train_y ' + str(mnist_train_y.shape))
print('mnist_valid_y ' + str(mnist_valid_y.shape))

mnist_train_set (50000, 784)
mnist_valid_set (10000, 784)
mnist_train_y (50000,)
mnist_valid_y (10000,)


In [7]:
# -----------CIFAR-10 SET -------- (sklearn fn)
cifar_dict = sp.io.loadmat('../cifar/train.mat')
cifar_trainX = cifar_dict['trainX']
cifar_train_set, cifar_valid_set, cifar_train_y, cifar_valid_y = train_test_split(cifar_trainX[:, :-1], cifar_trainX[:, -1], test_size=5000, random_state=42)

In [8]:
print('cifar_train_set ' + str(cifar_train_set.shape))
print('cifar_valid_set ' + str(cifar_valid_set.shape))
print('cifar_train_y ' + str(cifar_train_y.shape))
print('cifar_valid_y ' + str(cifar_valid_y.shape))

cifar_train_set (45000, 3072)
cifar_valid_set (5000, 3072)
cifar_train_y (45000,)
cifar_valid_y (5000,)


In [9]:
# --------------SPAM DataSet (sklearn fn)----------------
spam_dict = sp.io.loadmat('../spam/spam_data.mat')
spam_trainX= spam_dict['training_data']
spam_labels = spam_dict['training_labels']
spam_train_set, spam_valid_set, spam_train_y, spam_valid_y = train_test_split(spam_trainX, spam_labels.T, test_size=0.2, random_state=42)

In [10]:
print('spam_train_set ' + str(spam_train_set.shape))
print('spam_valid_set ' + str(spam_valid_set.shape))
print('spam_train_y ' + str(spam_train_y.shape))
print('spam_valid_y ' + str(spam_valid_y.shape))

spam_train_set (4137, 32)
spam_valid_set (1035, 32)
spam_train_y (4137, 1)
spam_valid_y (1035, 1)


In [11]:
'''
PROBLEM 2: TRAIN CLASSIFIERS AND PLOT ACCURACY
'''

# ----------TRAIN MNIST DATA------------#
# expect between 70-90% accuracy
print("Training MNIST")
clf_mnist = SVC(kernel="linear")
experiments = [100, 200, 500, 1000, 2000, 5000, 10000]
valid_error, train_error = train_svm(experiments, clf_mnist, mnist_train_set, mnist_train_y, mnist_valid_set, mnist_valid_y,
          'MNIST')

Training MNIST
Sample_size --> valid_score:  train_score 
100 --> 0.6908 : 0.69646
Sample_size --> valid_score:  train_score 
200 --> 0.8101 : 0.81514
Sample_size --> valid_score:  train_score 
500 --> 0.8686 : 0.8658


KeyboardInterrupt: 

In [None]:
# -----------TRAIN CIFAR DATA-----------#
# expect between 25-35% accuracy
print("Training CIFAR")
clf_cifar = SVC(kernel='linear')
experiments = [100, 200, 500, 1000, 2000, 5000]
valid_error, train_error = train_svm(experiments, clf_cifar, cifar_train_set, cifar_train_y, cifar_valid_set, cifar_valid_y, 'CIFAR')


In [None]:
# ----------TRAIN SPAM DATA -------------#
# expect between 70-90% accuracy
print("Training SPAM")
clf_spam = SVC(kernel="linear")
experiments = [100, 200, 500, 1000, 2000, 4137]

valid_error, train_error = train_svm(experiments, clf_spam, spam_train_set, spam_train_y, spam_valid_set, spam_valid_y, 'SPAM')

In [None]:
'''
PROBLEM 3: FIND BEST C VALUE MNIST SET
'''

# ATTEMPT 1: SMALL C-VALUES
from sklearn.svm import SVC
C_range = [.01, .001, .0001, .00001, .000001, .0000001, .00000001, .000000001]
experiments = [10000]
for c in C_range:
    clf_mnist = SVC(C=c, kernel="linear")
    scores = train_svm_no_plot(experiments, clf_mnist, mnist_train_set, mnist_train_y, mnist_valid_set, mnist_valid_y)
    print('C value: ' + str(c))
    print('valid_score ' + str(scores[0][0]))

In [None]:
# ATTEMPT 2: SMALL C-VALUES CLOSE TO 10^-6
from sklearn.svm import SVC
C_range = [.000002, .000003, .000004, .000005, .000006]
experiments = [10000]
for c in C_range:
    clf_mnist = SVC(C=c, kernel="linear")
    scores = train_svm_no_plot(experiments, clf_mnist, mnist_train_set, mnist_train_y, mnist_valid_set, mnist_valid_y)
    print('C value: ' + str(c))
    print('valid_score ' + str(scores[0][0]))

In [None]:
# ATTEMPT 3: Larger C Values
from sklearn.svm import SVC
C_range = [1, 5, 10, 100]
experiments = [10000]
for c in C_range:
    clf_mnist = SVC(C=c, kernel="linear")
    scores = train_svm_no_plot(experiments, clf_mnist, mnist_train_set, mnist_train_y, mnist_valid_set, mnist_valid_y)
    print('C value: ' + str(c))
    print('valid_score ' + str(scores[0][0]))

In [None]:
# ************************** PROBLEM 3: BEST C Value ********************************
#got this from http://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html #
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range, kernel=('linear', 'rbf'))
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(mnist_train_set, mnist_train_y)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

In [None]:
'''
PROBLEM 4: K-FOLD
'''

In [None]:
'''
Function k_fold_split
Purpose: Performs k-fold Cross Validation to Compute Classifier Accuracy
Params:
    data - training_data to fit to model
    clf - the classifier
    k - the number of splits
Returns Accuracy of classifier on validation data
    
'''
def k_fold_split(data, clf, k):
    np.random.shuffle(data)
    valid_scores = list()
    copy_data = data[:, :]
    for i in range(k):
        if is_divisible_by_k(copy_data, k):
            k_arrays = np.split(copy_data, k)
        else:
            divisible_data, extra_data = preprocess_data(copy_data, k)
            k_arrays = np.split(divisible_data, k)
            np.vstack((k_arrays[-1], extra_data)) 
        valid_set = k_arrays.pop(i)
        valid_data = valid_set[:, :-1]
        valid_y = valid_set[:, -1]
        train_set = np.vstack(k_arrays)
        train_data = train_set[:, :-1]
        train_y = train_set[:, -1]
        valid_scores.append(train_svm_no_plot([train_data.shape[0]], clf, train_data, train_y, valid_data, valid_y)[0])
    valid_score = np.sum(valid_scores)/k
    print("Valid Score is:")
    print(valid_score)
    return valid_score

In [None]:
'''
Helper Function: preprocess_data
Purpose: Divides data into a set of points divisible by k, and the remainder of sample points
Param: 
    data - the data to divide into two sets
    k - the number of splits in k-fold algorithm
Return: a set of divisible samples and a set of extra samples, ie the remainder
'''
def preprocess_data(data, k):
    remainder = data.shape[0] % k
    extra_samples = data[-remainder:, :]
    divisible_samples = data[:-remainder, :]
    return divisible_samples, extra_samples

In [None]:
'''
Helper Function: is_divisible_by_k
Purpose: Checks if number of samples in data set is divisible by k for k-fold split
Param:
    data - the data fitting classifer to
    k - the number of splits in k-fold algorithm
Return: True or False
'''
def is_divisible_by_k(data, k):
    if data.shape[0] % k == 0:
        return True
    return False

In [None]:
# Running KFold on k = 5 #
print("Running KFOLD")
clf_spam = SVC(C=1000)
two_dim_label = np.array(spam_train_y)
spam_training = np.hstack((spam_train_set, two_dim_label))
k_fold_split(spam_training, clf_spam, 5, 'SPAM')

In [None]:
# ****************** PROBLEM 5: KAGGLE COMPETITION ***************
# import csv

In [None]:
# --------------- EXPERIMENT WITH MNIST CLF HYPER-PARAMETERS ------------------------#

In [None]:
# clf_mnist = SVC(C=0.000001, kernel="linear", )
# experiments = [1000]
# # expect between 70-90% accuracy
# valid_error, train_error = train_svm_no_plot(experiments, clf_mnist, mnist_train_set, mnist_train_y, mnist_valid_set, mnist_valid_y,
#           'MNIST')

In [None]:
# mnist submission
# clf_mnist = SVC(C=0.000001, kernel="linear")
# mnist_data = io.loadmat('../mnist/test.mat')['testX']
# clf_mnist.fit(mnist_train_set, mnist_train_y)
# predicted_labels = clf_mnist.predict(mnist_data)
# # file = open('mnist_submission.csv', 'w')
# # w = csv.writer(f)

# with open('mnist_kaggle.csv', 'w') as csvfile:
#     writer = csv.writer(csvfile)
#     for i in range(len(predicted_labels)):
#         writer.writerow([i, predicted_labels[i]])
# csvfile.close()

In [None]:
# spam_data = io.loadmat('spam/spam_data.mat')
# print(spam_data.keys())

In [None]:
# spam submission
# spam_data = io.loadmat('spam/spam_data.mat')['testX']
# predicted_labels = clf_spam.predict(spam_data)
# file = open('mnist_submission.csv', 'w')
# w = csv.writer(f)

# with open('spam_sample_submission.csv', 'w') as csvfile:
#     writer = csv.writer(csvfile)
#     for i in range(len(predicted_labels)):
#         writer.writerow([i, predicted_labels[i]])
# csvfile.close()