In [1]:
#Load packages
import pandas as pd
import numpy as np
import scipy.io as sio
import random
import scipy
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix

In [2]:
#This function is used to encode labels since labels are categorical.
def encode_labels(labels):
    le = LabelEncoder()
    le.fit(labels)
    encoded_labels = le.transform(labels)
    
    return encoded_labels, le

def decode_labels(encoded_predict_labels, le):
    test_predictions = le.inverse_transform(encoded_predict_labels)
    
    return test_predictions

#Load data
train = sio.loadmat('train.mat')
validation = sio.loadmat('validation.mat')
test = sio.loadmat('test_wolabels.mat')

train_classid = np.squeeze(train['classid'])
train_class_labels = []
for item in train_classid:
    train_class_labels.append(item[0])
train_features = train['features']
train_imid = train['imid']
train_sampleid = train['sampleid']
print(train_features.shape)
train_unique_labels = sorted(np.unique(train_class_labels))
train_unique_labels_count = len(train_unique_labels)
print(train_unique_labels_count)

validation_classid = np.squeeze(validation['classid'])
validation_class_labels = []
for item in validation_classid:
    validation_class_labels.append(item[0])
validation_features = validation['features']
validation_imid = validation['imid']
validation_sampleid = validation['sampleid']
print(validation_features.shape)
validation_unique_labels = sorted(np.unique(validation_class_labels))
validation_unique_labels_count = len(validation_unique_labels)
print(validation_unique_labels_count)

#encoded train labels
train_labels, le = encode_labels(train_class_labels)
print(len(train_labels))

#encoded validation labels
validation_labels = le.transform(validation_class_labels)
print(len(validation_labels))

(7849, 2048)
1013
(1379, 2048)
1013
7849
1379


In [3]:
scalar = MinMaxScaler()
train_features_norm = scalar.fit_transform(train_features)
validation_features_norm = scalar.transform(validation_features)

SVM with Support vector counts Design of Experiment

In [4]:
N_components = [500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]
cost_param = [10, 50, 100]
mean_acc = []
support_vector_counts = []

for n_components in N_components:
    pca = PCA(n_components=n_components)
    train_features_red = pca.fit_transform(train_features_norm)
    validation_features_red = pca.transform(validation_features_norm)
    print("No. of principal components =", n_components)
    for c in cost_param:
        clf = SVC(C=c, kernel = 'linear', class_weight='balanced', random_state=0)
        clf.fit(train_features_red, train_labels)
        predictions = clf.predict(validation_features_red)
        matrix = confusion_matrix(validation_labels, predictions)
        acc = matrix.diagonal()/matrix.sum(axis=1)
        print("Mean class accuracy score:", sum(acc)/len(acc))
        n_support = clf.n_support_
        score = clf.score(validation_features_red, validation_labels)
        mean_acc.append(score)
        support_vector_counts.append(n_support)
        print("Cost parameter: ", c)
        print("accuracy scores:", score)

No. of principal components = 500
Mean class accuracy score: 0.6944859681286137
Cost parameter:  10
accuracy scores: 0.7447425670775925
Mean class accuracy score: 0.6944859681286137
Cost parameter:  50
accuracy scores: 0.7447425670775925
Mean class accuracy score: 0.6944859681286137
Cost parameter:  100
accuracy scores: 0.7447425670775925
No. of principal components = 550
Mean class accuracy score: 0.6976942603299958
Cost parameter:  10
accuracy scores: 0.7469180565627266
Mean class accuracy score: 0.6976942603299958
Cost parameter:  50
accuracy scores: 0.7469180565627266
Mean class accuracy score: 0.6976942603299958
Cost parameter:  100
accuracy scores: 0.7469180565627266
No. of principal components = 600
Mean class accuracy score: 0.6969538852066
Cost parameter:  10
accuracy scores: 0.7454677302393038
Mean class accuracy score: 0.6969538852066
Cost parameter:  50
accuracy scores: 0.7454677302393038
Mean class accuracy score: 0.6969538852066
Cost parameter:  100
accuracy scores: 0.745

KeyboardInterrupt: 

In [6]:
for i in range(len(support_vector_counts)):
    print(np.sum(support_vector_counts[i]))

7784
7784
7784
7785
7785
7785
7788
7788
7788
7790
7790
7790
7792
7792
7792
7791
7791


No. of principal components = 500
Mean class accuracy score: 0.6952263432520096
Cost parameter:  10
accuracy scores: 0.7440174039158811
No.of support vectors: [3, 3, 6, 8, 5, 3, 8, 4, 5, 9, 3, 4, 3, 12, 4, 9, 10, 3, 3, 3, 6, 5, 4, 6, 20, 10, 4, 3, 5, 3, 3, 5, 3, 11, 2, 4, 7, 8, 4, 16, 5, 22, 4, 4, 3, 6, 9, 7, 22, 11, 8, 10, 3, 5, 3, 6, 3, 7, 5, 3, 13, 13, 4, 3, 3, 4, 3, 9, 4, 8, 3, 3, 6, 4, 4, 5, 9, 3, 13, 5, 29, 4, 8, 7, 10, 27, 16, 5, 14, 16, 3, 23, 40, 21, 3, 32, 19, 5, 8, 29, 13, 3, 19, 11, 23, 7, 11, 9, 3, 9, 11, 13, 4, 10, 4, 6, 12, 5, 3, 6, 19, 15, 3, 8, 3, 4, 3, 6, 6, 4, 3, 5, 13, 9, 12, 3, 5, 18, 4, 4, 3, 4, 3, 5, 4, 9, 12, 3, 17, 4, 4, 3, 2, 6, 9, 4, 4, 18, 15, 4, 8, 2, 7, 3, 3, 4, 3, 3, 15, 4, 8, 5, 5, 3, 6, 11, 4, 10, 3, 4, 3, 3, 4, 30, 4, 6, 3, 6, 8, 6, 21, 3, 5, 9, 3, 5, 8, 2, 20, 14, 8, 7, 6, 4, 9, 9, 3, 10, 3, 11, 5, 4, 10, 6, 4, 23, 4, 3, 4, 4, 6, 15, 10, 3, 3, 16, 6, 9, 5, 3, 6, 3, 10, 9, 2, 4, 15, 5, 4, 6, 3, 9, 5, 3, 3, 9, 8, 3, 4, 15, 3, 12, 3, 3, 9, 11, 11, 14, 21

Inference from experiment: PCA dim 500 is optimum

In [7]:
N_components = [250, 300, 350, 400, 450]
cost_param = [1, 10, 25, 50, 100]
mean_acc = []
support_vector_counts = []

for n_components in N_components:
    pca = PCA(n_components=n_components)
    train_features_red = pca.fit_transform(train_features_norm)
    validation_features_red = pca.transform(validation_features_norm)
    print("No. of principal components =", n_components)
    for c in cost_param:
        clf = SVC(C=c, kernel = 'linear', class_weight='balanced', random_state=0)
        clf.fit(train_features_red, train_labels)
        n_support = clf.n_support_
        score = clf.score(validation_features_red, validation_labels)
        mean_acc.append(score)
        support_vector_counts.append(n_support)
        print("Cost parameter: ", c)
        print("Mean class accuracy scores:", score)

No. of principal components = 250
Cost parameter:  1
Mean class accuracy scores: 0.7353154459753445
No.of support vectors: [3, 3, 6, 8, 5, 3, 8, 4, 5, 9, 3, 4, 3, 12, 4, 9, 10, 3, 3, 3, 6, 5, 4, 6, 20, 10, 4, 3, 5, 3, 3, 5, 3, 11, 2, 4, 7, 8, 4, 15, 5, 22, 4, 4, 3, 6, 9, 7, 22, 11, 8, 10, 3, 5, 3, 6, 3, 7, 5, 3, 13, 13, 4, 3, 3, 4, 3, 9, 4, 8, 3, 3, 6, 4, 4, 5, 9, 3, 12, 5, 28, 4, 8, 7, 10, 27, 16, 5, 14, 16, 3, 23, 40, 21, 3, 32, 19, 5, 8, 29, 13, 3, 19, 11, 23, 7, 11, 9, 3, 9, 11, 13, 4, 10, 4, 6, 12, 5, 3, 6, 18, 15, 3, 8, 3, 4, 3, 6, 6, 4, 3, 5, 13, 9, 12, 3, 5, 18, 4, 4, 3, 4, 3, 5, 4, 9, 12, 3, 16, 4, 4, 3, 2, 6, 9, 4, 4, 18, 15, 4, 8, 2, 7, 3, 3, 4, 3, 3, 15, 4, 8, 5, 5, 3, 6, 11, 4, 10, 3, 4, 3, 3, 4, 30, 4, 6, 3, 6, 8, 6, 21, 3, 5, 9, 3, 5, 8, 2, 20, 14, 8, 7, 6, 4, 9, 9, 3, 10, 3, 11, 5, 4, 10, 6, 4, 22, 4, 3, 4, 4, 6, 15, 10, 3, 3, 16, 6, 9, 5, 3, 6, 3, 10, 9, 2, 4, 15, 5, 4, 6, 3, 9, 5, 3, 3, 9, 8, 3, 4, 15, 3, 12, 3, 3, 9, 11, 11, 14, 21, 7, 4, 4, 4, 5, 4, 20, 6, 5, 4, 3, 

In [8]:
for i in range(len(support_vector_counts)):
    print(np.sum(support_vector_counts[i]))

7765
7765
7765
7765
7765
7771
7772
7772
7772
7772
7780
7779
7779
7779
7779
7783
7782
7782
7782
7782
7787
7786
7786
7786
7786


In [9]:
print(mean_acc)

[0.7353154459753445, 0.7374909354604786, 0.73821609862219, 0.73821609862219, 0.73821609862219, 0.7396664249456127, 0.7418419144307469, 0.7418419144307469, 0.7418419144307469, 0.7418419144307469, 0.7418419144307469, 0.7425670775924583, 0.7425670775924583, 0.7425670775924583, 0.7425670775924583, 0.7432922407541697, 0.7432922407541697, 0.7432922407541697, 0.7432922407541697, 0.7432922407541697, 0.7469180565627266, 0.7454677302393038, 0.7454677302393038, 0.7454677302393038, 0.7454677302393038]
