In [23]:
#Load packages
import pandas as pd
import numpy as np
import scipy.io as sio
import random
import scipy
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.model_selection import cross_val_score

In [24]:
#This function is used to encode labels since labels are categorical.
def encode_labels(labels):
    """Fit a ``LabelEncoder`` on ``labels`` and encode them.

    Parameters
    ----------
    labels : sequence of categorical labels to fit the encoder on.

    Returns
    -------
    tuple
        ``(encoded_labels, le)`` - the transformed labels and the fitted
        encoder, so the same mapping can be reused on other splits or
        inverted later with ``decode_labels``.
    """
    encoder = LabelEncoder().fit(labels)
    return encoder.transform(labels), encoder

def decode_labels(encoded_predict_labels, le):
    """Convert encoded predictions back to the original labels.

    Parameters
    ----------
    encoded_predict_labels : encoded labels to convert back.
    le : fitted encoder exposing ``inverse_transform`` (e.g. the one
        returned by ``encode_labels``).

    Returns
    -------
    Whatever ``le.inverse_transform`` returns for the given input.
    """
    return le.inverse_transform(encoded_predict_labels)

#Mean class accuracy
def mean_class_acc(predictions, true_labels):
    """Return the mean per-class accuracy (macro-averaged recall).

    For every class that occurs in ``true_labels`` the fraction of its
    samples that were predicted correctly is computed; the result is the
    unweighted mean of those fractions.

    Classes that occur only in ``predictions`` have no true samples, so
    their per-class accuracy is undefined. They are left out of the mean
    instead of contributing a 0/0 = NaN term that would turn the whole
    score into NaN.

    Parameters
    ----------
    predictions : array-like of predicted labels.
    true_labels : array-like of ground-truth labels, same length.

    Returns
    -------
    float
        Mean of the per-class accuracies, in [0, 1].

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    predictions = np.asarray(predictions).ravel()
    true_labels = np.asarray(true_labels).ravel()

    if predictions.shape[0] != true_labels.shape[0]:
        raise ValueError(
            "predictions and true_labels must have the same length, got "
            "%d and %d" % (predictions.shape[0], true_labels.shape[0])
        )
    if true_labels.shape[0] == 0:
        raise ValueError("mean_class_acc requires at least one sample")

    # class_index[i] is the position of true_labels[i] among the sorted classes.
    classes, class_index = np.unique(true_labels, return_inverse=True)
    class_index = class_index.ravel()
    n_classes = classes.shape[0]

    # Samples per true class; always >= 1 because classes come from true_labels.
    support = np.bincount(class_index, minlength=n_classes)
    # Correctly predicted samples per true class.
    correct = (predictions == true_labels).astype(float)
    hits = np.bincount(class_index, weights=correct, minlength=n_classes)

    return float(np.mean(hits / support))

In [4]:
#Load data
# Read both .mat files (paths are relative to the working directory).
whole, parts = (
    sio.loadmat(mat_path)
    for mat_path in ('whole_wolabels.mat', 'parts_wolabels.mat')
)

In [5]:
#whole
# Pull the per-split arrays out of the `whole` dict. For the class-id and
# image-id arrays only the first element of every entry is kept.

#train
train_classid = np.squeeze(whole['train_classid'])
train_class_labels = [entry[0] for entry in train_classid]
train_features = whole['train_feats']
train_imid = whole['train_imgid']
train_imgid = np.squeeze([entry[0] for entry in train_imid])
train_sampleid = whole['train_sampleid']

#validation
validation_classid = np.squeeze(whole['val_classid'])
validation_class_labels = [entry[0] for entry in validation_classid]
validation_features = whole['val_feats']
validation_imid = whole['val_imgid']
validation_imgid = np.squeeze([entry[0] for entry in validation_imid])
validation_sampleid = whole['val_sampleid']

#test (no class ids are read for this split)
test_features = whole['test_feats']
test_imid = whole['test_imgid']
test_imgid = np.squeeze([entry[0] for entry in test_imid])
test_sampleid = whole['test_sampleid']

In [6]:
#whole statistics

# Distinct labels per labelled split, computed up front so the section
# below is only reporting.
train_unique_labels = sorted(np.unique(train_class_labels))
train_unique_labels_count = len(train_unique_labels)
validation_unique_labels = sorted(np.unique(validation_class_labels))
validation_unique_labels_count = len(validation_unique_labels)

#train
print("Train image ids:", Counter(train_imgid))
print("Train sample id shape:", train_sampleid.shape)
print("Train features shape:", train_features.shape)
print("Train unique labels count:", train_unique_labels_count)
print("\n")

#validation
print("Validation image ids:", Counter(validation_imgid))
print("Validation sample id shape:", validation_sampleid.shape)
print("Validation features shape:", validation_features.shape)
print("Validation unique labels count:", validation_unique_labels_count)
print("\n")

#test
print("Test image ids:", Counter(test_imgid))
print("Test sample id shape:", test_sampleid.shape)
print("Test features shape:", test_features.shape)

Train image ids: Counter({0: 7550, 1: 297, 2: 2})
Train sample id shape: (7849, 1)
Train features shape: (7849, 384)
Train unique labels count: 1013


Validation image ids: Counter({0: 1329, 1: 50})
Validation sample id shape: (1379, 1)
Validation features shape: (1379, 384)
Validation unique labels count: 1013


Test image ids: Counter({0: 1407, 1: 766, 2: 189, 3: 32, 4: 30, 5: 30})
Test sample id shape: (2454, 1)
Test features shape: (2454, 384)


In [27]:
#z-score normalization
# Fit the scaler on the training features only, then apply the same
# statistics to both splits so validation data does not influence the fit.
scalar = StandardScaler().fit(train_features)
train_features_norm = scalar.transform(train_features)
validation_features_norm = scalar.transform(validation_features)

In [10]:
#encoded train labels
# The encoder is fitted on the training labels; `le` is kept so the same
# mapping can be applied to other splits.
train_labels, le = encode_labels(train_class_labels)
print(train_labels.shape[0])

#encoded validation labels
# Reuse the training encoder so class ids agree across splits.
validation_labels = le.transform(validation_class_labels)
print(validation_labels.shape[0])

7849
1379


Checking effect of dimensionality reduction

Normalization used is StandardScaler

In [28]:
# Candidate PCA output dimensions to compare (the input has 384 feature columns).
N_components = [150, 200, 250, 300, 350]

for n_components in N_components:
    # Seed PCA as well: with the default svd_solver='auto' scikit-learn may
    # choose a randomized solver, which would make the sweep non-reproducible.
    pca = PCA(n_components = n_components, random_state=0)
    # Fit the projection on the training split only, then apply it to validation.
    train_features_red = pca.fit_transform(train_features_norm)
    validation_features_red = pca.transform(validation_features_norm)
    print("PCA dimension:", n_components)
    clf = SGDClassifier(class_weight = 'balanced', random_state=0)
    clf.fit(train_features_red, train_labels)
    preds = clf.predict(validation_features_red)
    # Score from the predictions already computed; clf.score() would run a
    # second, identical prediction pass over the validation set.
    score = accuracy_score(validation_labels, preds)
    print("Overall Accuracy:", score)
    mean_acc = mean_class_acc(preds, validation_labels)
    print("Mean class accuracy:", mean_acc)

PCA dimension: 150
Overall Accuracy: 0.7164612037708484
Mean class accuracy: 0.6815164762844922
PCA dimension: 200
Overall Accuracy: 0.7106598984771574
Mean class accuracy: 0.6761928265876933
PCA dimension: 250
Overall Accuracy: 0.7135605511240029
Mean class accuracy: 0.6799440605462324
PCA dimension: 300
Overall Accuracy: 0.7244379985496737
Mean class accuracy: 0.6866497438067034
PCA dimension: 350
Overall Accuracy: 0.7237128353879623
Mean class accuracy: 0.68930569266206


Normalization used is MinMaxScaler

In [26]:
# Candidate PCA output dimensions to compare (the input has 384 feature columns).
N_components = [150, 200, 250, 300, 350]

# Build the min-max normalized features explicitly in this cell. Reading
# `train_features_norm` here would silently reuse whatever scaler was run
# last (StandardScaler as the notebook is written), so a fresh
# Restart & Run All would not evaluate MinMaxScaler at all. Separate names
# also keep the StandardScaler features intact for the later cells.
minmax_scaler = MinMaxScaler()
train_features_minmax = minmax_scaler.fit_transform(train_features)
validation_features_minmax = minmax_scaler.transform(validation_features)

for n_components in N_components:
    # Seed PCA as well: with the default svd_solver='auto' scikit-learn may
    # choose a randomized solver, which would make the sweep non-reproducible.
    pca = PCA(n_components = n_components, random_state=0)
    # Fit the projection on the training split only, then apply it to validation.
    train_features_red = pca.fit_transform(train_features_minmax)
    validation_features_red = pca.transform(validation_features_minmax)
    print("PCA dimension:", n_components)
    clf = SGDClassifier(class_weight = 'balanced', random_state=0)
    clf.fit(train_features_red, train_labels)
    preds = clf.predict(validation_features_red)
    # Score from the predictions already computed; clf.score() would run a
    # second, identical prediction pass over the validation set.
    score = accuracy_score(validation_labels, preds)
    print("Overall Accuracy:", score)
    mean_acc = mean_class_acc(preds, validation_labels)
    print("Mean class accuracy:", mean_acc)

PCA dimension: 150
Overall Accuracy: 0.6838288614938361
Mean class accuracy: 0.638095238095238
PCA dimension: 200
Overall Accuracy: 0.6838288614938361
Mean class accuracy: 0.6358247543834908
PCA dimension: 250
Overall Accuracy: 0.6903553299492385
Mean class accuracy: 0.6422178348140836
PCA dimension: 300
Overall Accuracy: 0.6983321247280638
Mean class accuracy: 0.6493183848070324
PCA dimension: 350
Overall Accuracy: 0.6889050036258159
Mean class accuracy: 0.6378014384430969


Checking effect of the alpha (regularization strength) parameter

Normalization used is StandardScaler

In [29]:
# Regularization strengths to compare for SGDClassifier, one decade apart.
alpha = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1]

for a in alpha:
    # Label each result with its alpha; without this the printed scores
    # cannot be matched to the setting that produced them.
    print("alpha:", a)
    clf = SGDClassifier(alpha= a, class_weight = 'balanced', random_state=0)
    clf.fit(train_features_norm, train_labels)
    preds = clf.predict(validation_features_norm)
    # Score from the predictions already computed; clf.score() would run a
    # second, identical prediction pass over the validation set.
    score = accuracy_score(validation_labels, preds)
    print("Overall Accuracy:", score)
    mean_acc = mean_class_acc(preds, validation_labels)
    print("Mean class accuracy:", mean_acc)

Overall Accuracy: 0.6838288614938361
Mean class accuracy: 0.6309923377050721
Overall Accuracy: 0.7229876722262509
Mean class accuracy: 0.6715155361256052
Overall Accuracy: 0.7229876722262509
Mean class accuracy: 0.6878837023456964
Overall Accuracy: 0.5431472081218274
Mean class accuracy: 0.5433084191228318
Overall Accuracy: 0.20522117476432197
Mean class accuracy: 0.20533070088845018
Overall Accuracy: 0.06453952139231327
Mean class accuracy: 0.06498848305363605


SGD Classifier on whole data

In [19]:
#without PCA
#normalization StandardScaler
# Baseline: linear SGD classifier on the full normalized feature set,
# scored by overall accuracy on the validation split.
clf = SGDClassifier(class_weight = 'balanced', random_state=0).fit(
    train_features_norm, train_labels
)
score = clf.score(validation_features_norm, validation_labels)
print(score)

0.7229876722262509


Ridge Classifier on whole data

In [13]:
# Ridge classifier on the same normalized features, scored by overall
# accuracy on the validation split.
clf = RidgeClassifier(class_weight = 'balanced', random_state=0).fit(
    train_features_norm, train_labels
)
score = clf.score(validation_features_norm, validation_labels)
print(score)

0.7708484408992023
