In [1]:
#Load packages
import pandas as pd
import numpy as np
import scipy.io as sio
import random
import scipy
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier

In [20]:
def report_mean_class_acc(ytest, ypred, all_labels):
    class_accuracies = []
    for label in all_labels:
        if(np.sum(ytest==label)>0):
            acc = np.sum(np.logical_and(ytest == label,ypred == label))/np.sum(ytest==label)
            class_accuracies.append(acc)
    mean_acc = np.mean(class_accuracies)
        
    return mean_acc

In [2]:
#This function is used to encode labels since labels are categorical.
def encode_labels(labels):
    le = LabelEncoder()
    le.fit(labels)
    encoded_labels = le.transform(labels)
    
    return encoded_labels, le

def decode_labels(encoded_predict_labels, le):
    test_predictions = le.inverse_transform(encoded_predict_labels)
    
    return test_predictions

#Load data
train = sio.loadmat('train.mat')
validation = sio.loadmat('validation.mat')
test = sio.loadmat('test_wolabels.mat')

train_classid = np.squeeze(train['classid'])
train_class_labels = []
for item in train_classid:
    train_class_labels.append(item[0])
train_features = train['features']
train_imid = train['imid']
train_sampleid = train['sampleid']
print(train_features.shape)
train_unique_labels = sorted(np.unique(train_class_labels))
train_unique_labels_count = len(train_unique_labels)
print(train_unique_labels_count)

validation_classid = np.squeeze(validation['classid'])
validation_class_labels = []
for item in validation_classid:
    validation_class_labels.append(item[0])
validation_features = validation['features']
validation_imid = validation['imid']
validation_sampleid = validation['sampleid']
print(validation_features.shape)
validation_unique_labels = sorted(np.unique(validation_class_labels))
validation_unique_labels_count = len(validation_unique_labels)
print(validation_unique_labels_count)

#encoded train labels
train_labels, le = encode_labels(train_class_labels)
print(len(train_labels))

#encoded validation labels
validation_labels = le.transform(validation_class_labels)
print(len(validation_labels))

(7849, 2048)
1013
(1379, 2048)
1013
7849
1379


SGDClassifier with LinearSVM Design of Experiment

In [5]:
N_components = [500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]
alpha_param = [0.0001, 0.001, 0.01, 0.1]
mean_acc = []

for n_components in N_components:
    pca = PCA(n_components=n_components)
    train_features_red = pca.fit_transform(train_features)
    validation_features_red = pca.transform(validation_features)
    print("No. of principal components =", n_components)
    for alpha in alpha_param:
        clf = make_pipeline(StandardScaler(), SGDClassifier(alpha = alpha, max_iter=10000, tol=1e-4, class_weight='balanced', random_state=0))
        clf.fit(train_features_red, train_labels)
        score = clf.score(validation_features_red, validation_labels)
        mean_acc.append(score)
        print("Alhpa parameter: ", alpha)
        print("Mean class accuracy scores:", score)

No. of principal components = 500
Alhpa parameter:  0.0001
Mean class accuracy scores: 0.7751994198694706
Alhpa parameter:  0.001
Mean class accuracy scores: 0.5953589557650472
Alhpa parameter:  0.01
Mean class accuracy scores: 0.13850616388687453
Alhpa parameter:  0.1
Mean class accuracy scores: 0.05148658448150834
No. of principal components = 550
Alhpa parameter:  0.0001
Mean class accuracy scores: 0.766497461928934
Alhpa parameter:  0.001
Mean class accuracy scores: 0.6171138506163887
Alhpa parameter:  0.01
Mean class accuracy scores: 0.29949238578680204
Alhpa parameter:  0.1
Mean class accuracy scores: 0.08556925308194344
No. of principal components = 600
Alhpa parameter:  0.0001
Mean class accuracy scores: 0.7751994198694706
Alhpa parameter:  0.001
Mean class accuracy scores: 0.640319071791153
Alhpa parameter:  0.01
Mean class accuracy scores: 0.30384336475707036
Alhpa parameter:  0.1
Mean class accuracy scores: 0.09572153734590283
No. of principal components = 650
Alhpa paramete

Inference from experiment: alpha 0.0001 is optimum

In [6]:
#Running SGDClassifier without PCA and with with default parameters

Mean class accuracy scores: 0.6562726613488035


In [7]:
N_components = [1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800]
mean_acc = []

for n_components in N_components:
    pca = PCA(n_components=n_components)
    train_features_red = pca.fit_transform(train_features)
    validation_features_red = pca.transform(validation_features)
    print("No. of principal components =", n_components)
    clf = make_pipeline(StandardScaler(), SGDClassifier(max_iter=10000, tol=1e-4, class_weight='balanced', random_state=0))
    clf.fit(train_features_red, train_labels)
    score = clf.score(validation_features_red, validation_labels)
    mean_acc.append(score)
    print("Mean class accuracy scores:", score)

No. of principal components = 1000
Mean class accuracy scores: 0.7926033357505439
No. of principal components = 1100
Mean class accuracy scores: 0.7955039883973894
No. of principal components = 1200
Mean class accuracy scores: 0.7911530094271211
No. of principal components = 1300
Mean class accuracy scores: 0.7933284989122552
No. of principal components = 1400
Mean class accuracy scores: 0.8013052936910805
No. of principal components = 1500
Mean class accuracy scores: 0.7991298042059464
No. of principal components = 1600
Mean class accuracy scores: 0.7940536620739667
No. of principal components = 1700
Mean class accuracy scores: 0.7875271936185642
No. of principal components = 1800
Mean class accuracy scores: 0.7875271936185642


optimum model

PCA dim = 1400

In [None]:
n_components = 1400
pca = PCA(n_components=n_components)
train_features_red = pca.fit_transform(train_features)
validation_features_red = pca.transform(validation_features)
clf = make_pipeline(StandardScaler(), SGDClassifier(max_iter=10000, tol=1e-4, class_weight='balanced', random_state=0))
clf.fit(train_features_red, train_labels)
SGDClassifier_predictions = clf.predict(validation_features_red)
score = clf.score(validation_features_red, validation_labels)

In [18]:
print("Mean class accuracy scores:", score)

Mean class accuracy scores: 0.7962291515591008


In [19]:
#Save predictions in an excel file
output_df = pd.DataFrame(SGDClassifier_predictions)
output_df.to_csv('Task1_SGDClassifier_predictions_dim1400.csv', index=False,  header=False)

In [23]:
n_components = 1000
pca = PCA(n_components=n_components)
train_features_red = pca.fit_transform(train_features)
validation_features_red = pca.transform(validation_features)
print("No. of principal components =", n_components)
clf_1000 = make_pipeline(StandardScaler(), SGDClassifier(max_iter=10000, tol=1e-4, class_weight='balanced', random_state=0))
clf_1000.fit(train_features_red, train_labels)
SGDClassifier_predictions = clf_1000.predict(validation_features_red)
score = clf_1000.score(validation_features_red, validation_labels)
print("Mean class accuracy scores:", score)

No. of principal components = 1000
Mean class accuracy scores: 0.7984046410442349


In [24]:
#Save predictions in an excel file
output_df = pd.DataFrame(SGDClassifier_predictions)
output_df.to_csv('Task1_SGDClassifier_predictions_dim1000.csv', index=False,  header=False)

In [25]:
from sklearn.metrics import accuracy_score

print(accuracy_score(SGDClassifier_predictions, validation_labels))

0.7984046410442349


In [28]:
from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(validation_labels, SGDClassifier_predictions)
acc = matrix.diagonal()/matrix.sum(axis=1)
print(sum(acc)/len(acc))

0.7790133032482489
