In [18]:
#Load packages
import pandas as pd
import numpy as np
import scipy.io as sio
import random
import scipy
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier

In [4]:
#This function is used to encode labels since labels are categorical.
def encode_labels(labels):
    le = LabelEncoder()
    le.fit(labels)
    encoded_labels = le.transform(labels)
    
    return encoded_labels, le

def decode_labels(encoded_predict_labels, le):
    test_predictions = le.inverse_transform(encoded_predict_labels)
    
    return test_predictions

#Load data
train = sio.loadmat('train.mat')
validation = sio.loadmat('validation.mat')
test = sio.loadmat('test_wolabels.mat')

train_classid = np.squeeze(train['classid'])
train_class_labels = []
for item in train_classid:
    train_class_labels.append(item[0])
train_features = train['features']
train_imid = train['imid']
train_sampleid = train['sampleid']
print(train_features.shape)
train_unique_labels = sorted(np.unique(train_class_labels))
train_unique_labels_count = len(train_unique_labels)
print(train_unique_labels_count)

validation_classid = np.squeeze(validation['classid'])
validation_class_labels = []
for item in validation_classid:
    validation_class_labels.append(item[0])
validation_features = validation['features']
validation_imid = validation['imid']
validation_sampleid = validation['sampleid']
print(validation_features.shape)
validation_unique_labels = sorted(np.unique(validation_class_labels))
validation_unique_labels_count = len(validation_unique_labels)
print(validation_unique_labels_count)

#encoded train labels
train_labels, le = encode_labels(train_class_labels)
print(len(train_labels))

#encoded validation labels
validation_labels = le.transform(validation_class_labels)
print(len(validation_labels))

(7849, 2048)
1013
(1379, 2048)
1013
7849
1379


In [5]:
#0-1 normalization
scalar = MinMaxScaler()
train_features_norm = scalar.fit_transform(train_features)
validation_features_norm = scalar.transform(validation_features)

Logistic Regression with Cross-validation Design of Experiment

In [6]:
cost_param = [1, 10]
for c in cost_param:
    clf = LogisticRegression(C = c, solver = 'liblinear', class_weight = 'balanced', random_state=0)
    clf.fit(train_features, train_labels)
    score = clf.score(validation_features, validation_labels)
    print("Cost parameter: ", c)
    print("Mean class accuracy scores:", score)

Cost parameter:  1
Mean class accuracy scores: 0.78535170413343
Cost parameter:  10
Mean class accuracy scores: 0.7882523567802756


Logistic Regression

PCA dim 1000

Cost parameter 100

solver-Liblinear

In [6]:
n_components = 1000
pca = PCA(n_components=n_components)
train_features_red = pca.fit_transform(train_features_norm)
validation_features_red = pca.transform(validation_features_norm)
print("No. of principal components =", n_components)
clf = LogisticRegression(C = 100, solver = 'liblinear', class_weight = 'balanced', random_state=0)
clf.fit(train_features_red, train_labels)
logistic_Regression_predictions = clf.predict(validation_features_red)
score = clf.score(validation_features_red, validation_labels)
print("accuracy score:", score)

No. of principal components = 1000
Mean class accuracy score: 0.794778825235678


In [16]:
from sklearn.metrics import confusion_matrix
logistic_Regression_predictions = clf.predict(validation_features_red)
matrix = confusion_matrix(validation_labels, logistic_Regression_predictions)
acc = matrix.diagonal()/matrix.sum(axis=1)
print(sum(acc)/len(acc))

0.7549405349504067


In [13]:
clf_LR2 = LogisticRegression(C = 100, solver = 'liblinear', class_weight = 'balanced', random_state=0)
clf_LR2.fit(train_features, train_labels)
logistic_Regression_predictions = clf_LR2.predict(validation_features)
score = clf_LR2.score(validation_features, validation_labels)
print("Mean class accuracy score:", score)

Mean class accuracy score: 0.7846265409717187


Ridge Classifier

In [7]:
clf_ridge = RidgeClassifier(class_weight = 'balanced', random_state=0)
clf_ridge.fit(train_features_norm, train_labels)
RidgeClassifier_predictions = clf_ridge.predict(validation_features_norm)
score = clf_ridge.score(validation_features_norm, validation_labels)
print("Mean class accuracy score:", score)

Mean class accuracy score: 0.8042059463379261


SGDClassifier-SVM

PCA dim 1000

Alpha parameter 0.0001

In [8]:
n_components = 1000
pca = PCA(n_components=n_components)
train_features_red_sgd = pca.fit_transform(train_features)
validation_features_red_sgd = pca.transform(validation_features)
print("No. of principal components =", n_components)
clf_sgd = make_pipeline(StandardScaler(), SGDClassifier(max_iter=10000, tol=1e-4, class_weight='balanced', random_state=0))
clf_sgd.fit(train_features_red_sgd, train_labels)
SGDClassifier_predictions = clf_sgd.predict(validation_features_red_sgd)
score = clf_sgd.score(validation_features_red_sgd, validation_labels)
print("Mean class accuracy scores:", score)

No. of principal components = 1000
Mean class accuracy scores: 0.7918781725888325


Ensemble method-VotingClassifier

In [11]:
from sklearn.ensemble import VotingClassifier #create a dictionary of our models

estimators=[('log_reg', clf), ('ridge', clf_ridge), ('sgd_svm', clf_sgd)] #create our voting classifier, inputting our models

ensemble = VotingClassifier(estimators, voting='hard')

#fit model to training data
ensemble.fit(train_features_red, train_labels) #test our model on the test data
ensemble.score(validation_features_red, validation_labels)

0.7897026831036983

In [12]:
estimators2=[('log_reg', clf), ('ridge', clf_ridge)] #create our voting classifier, inputting our models

ensemble2 = VotingClassifier(estimators2, voting='hard')

#fit model to training data
ensemble2.fit(train_features_red, train_labels) #test our model on the test data
ensemble2.score(validation_features_red, validation_labels)

0.78535170413343

In [17]:
ensemble_predict = ensemble.predict(validation_features_red)

matrix = confusion_matrix(validation_labels, ensemble_predict)
acc = matrix.diagonal()/matrix.sum(axis=1)
print(sum(acc)/len(acc))

0.7618789075353735


In [25]:
final_pred = []
for i in range(1379):
    temp_list = [RidgeClassifier_predictions[i], logistic_Regression_predictions[i], SGDClassifier_predictions[i]]
    count_dict = Counter(temp_list)
    key = max(count_dict, key=count_dict.get)
    if count_dict[key] > 1:
        final_pred.append(key)
    else:
        final_pred.append(RidgeClassifier_predictions[i])

print(final_pred)

[815, 1, 2, 3, 4, 5, 392, 26, 8, 14, 799, 11, 622, 13, 13, 33, 15, 16, 14, 27, 21, 19, 20, 291, 22, 23, 24, 24, 723, 25, 25, 26, 29, 20, 20, 30, 31, 32, 344, 35, 34, 72, 23, 37, 38, 39, 39, 40, 41, 41, 41, 42, 43, 44, 45, 46, 47, 48, 48, 48, 49, 49, 50, 51, 51, 701, 53, 54, 55, 589, 57, 58, 59, 60, 60, 61, 61, 62, 64, 64, 65, 66, 67, 68, 69, 71, 70, 72, 73, 74, 75, 76, 77, 78, 78, 79, 80, 80, 80, 80, 81, 82, 794, 84, 84, 85, 85, 85, 86, 86, 87, 88, 88, 87, 89, 90, 102, 91, 91, 92, 92, 92, 92, 652, 773, 93, 93, 103, 95, 95, 95, 95, 96, 96, 96, 97, 98, 444, 99, 99, 452, 100, 100, 653, 102, 102, 102, 103, 103, 104, 104, 104, 105, 113, 106, 107, 107, 493, 109, 110, 110, 111, 111, 114, 106, 106, 114, 115, 116, 498, 745, 968, 769, 120, 120, 120, 121, 120, 122, 123, 5, 125, 126, 127, 128, 129, 130, 131, 132, 132, 133, 134, 134, 135, 1006, 137, 137, 137, 138, 137, 140, 141, 142, 143, 144, 145, 145, 146, 146, 147, 148, 148, 682, 150, 151, 152, 152, 153, 154, 154, 155, 687, 157, 157, 157, 158, 1

In [26]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(validation_labels, final_pred)
print (acc)

matrix = confusion_matrix(validation_labels, final_pred)
acc2 = matrix.diagonal()/matrix.sum(axis=1)
print(sum(acc2)/len(acc2))

0.8085569253081943
0.784757674046914


In [None]:
estimators3=[('log_reg', clf), ('ridge', clf_ridge), ('sgd_svm', clf_sgd)] #create our voting classifier, inputting our models

ensemble3 = VotingClassifier(estimators3, voting='soft')

#fit model to training data
ensemble3.fit(train_features_red, train_labels) #test our model on the test data
ensemble3.score(validation_features_red, validation_labels)