In [8]:
#Load packages
import pandas as pd
import numpy as np
import scipy.io as sio
import random
import scipy
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, Lasso
from sklearn.model_selection import cross_val_score

In [2]:
#This function is used to encode labels since labels are categorical.
def encode_labels(labels):
    """Encode categorical labels as integers.

    Parameters
    ----------
    labels : sequence
        The categorical labels to encode.

    Returns
    -------
    tuple
        ``(encoded_labels, le)``: the integer codes produced by a freshly
        fitted ``LabelEncoder`` and the encoder itself, so the same mapping
        can be reused to transform or decode other label sets.
    """
    encoder = LabelEncoder()
    # fit + transform in a single call; equivalent to fitting first and
    # transforming the same labels afterwards.
    codes = encoder.fit_transform(labels)
    return codes, encoder

def decode_labels(encoded_predict_labels, le):
    """Map encoded predictions back to their original labels.

    Parameters
    ----------
    encoded_predict_labels : sequence
        Encoded labels, as produced by the encoder ``le``.
    le : object
        Fitted encoder exposing ``inverse_transform`` (e.g. the encoder
        returned by ``encode_labels``).

    Returns
    -------
    Whatever ``le.inverse_transform`` returns for the given codes.
    """
    return le.inverse_transform(encoded_predict_labels)

In [3]:
# Load data
# Both .mat files are read from the current working directory with
# scipy.io.loadmat; the results are indexed by variable name below
# (e.g. whole['train_feats']).
# `whole` is used for the per-image features, `parts` for the per-tile features.
whole = sio.loadmat('whole_wolabels.mat')
parts = sio.loadmat('parts_wolabels.mat')

In [4]:
#whole
# For the class-id and image-id arrays, element [0] of every entry is kept as
# the actual value.

#train
train_classid = np.squeeze(whole['train_classid'])
train_class_labels = [entry[0] for entry in train_classid]
train_features = whole['train_feats']
train_imid = whole['train_imgid']
train_imgid = np.squeeze([entry[0] for entry in train_imid])
train_sampleid = whole['train_sampleid']

#validation
validation_classid = np.squeeze(whole['val_classid'])
validation_class_labels = [entry[0] for entry in validation_classid]
validation_features = whole['val_feats']
validation_imid = whole['val_imgid']
validation_imgid = np.squeeze([entry[0] for entry in validation_imid])
validation_sampleid = whole['val_sampleid']

#test (no class ids are read for the test split)
test_features = whole['test_feats']
test_imid = whole['test_imgid']
test_imgid = np.squeeze([entry[0] for entry in test_imid])
test_sampleid = whole['test_sampleid']

#encoded train labels
train_labels, le = encode_labels(train_class_labels)
print(len(train_labels))

#encoded validation labels
# Reuse the encoder fitted on the train labels so both splits share one mapping.
validation_labels = le.transform(validation_class_labels)
print(len(validation_labels))

7849
1379


In [5]:
#parts
# Same unpacking as for the whole-image data, plus a tile id per row.

#train
train_classid_parts = np.squeeze(parts['train_classid'])
train_class_labels_parts = [entry[0] for entry in train_classid_parts]
train_features_parts = parts['train_feats']
train_imid_parts = parts['train_imgid']
train_imgid_parts = np.squeeze([entry[0] for entry in train_imid_parts])
train_sampleid_parts = parts['train_sampleid']
train_tileid_parts = parts['train_tileid']

#validation
validation_classid_parts = np.squeeze(parts['val_classid'])
validation_class_labels_parts = [entry[0] for entry in validation_classid_parts]
validation_features_parts = parts['val_feats']
validation_imid_parts = parts['val_imgid']
validation_imgid_parts = np.squeeze([entry[0] for entry in validation_imid_parts])
validation_sampleid_parts = parts['val_sampleid']
validation_tileid_parts = parts['val_tileid']

#test (no class ids are read for the test split)
test_features_parts = parts['test_feats']
test_imid_parts = parts['test_imgid']
test_imgid_parts = np.squeeze([entry[0] for entry in test_imid_parts])
test_sampleid_parts = parts['test_sampleid']
test_tileid_parts = parts['test_tileid']

combine features

In [6]:
# combine train and validation data
# NOTE: this cell rebinds train_features / train_labels / train_features_parts
# to the stacked arrays, so running it a second time stacks the validation
# rows again. Re-run the loading cells first if it has to be repeated.

#whole
train_features = np.concatenate((train_features, validation_features), axis=0)
print(train_features.shape)
train_labels = np.concatenate((train_labels, validation_labels))
print(len(train_labels))

#parts
train_features_parts = np.concatenate(
    (train_features_parts, validation_features_parts), axis=0
)
print(train_features_parts.shape)

(9228, 384)
9228
(83052, 384)


Converting into bag representation

In [7]:
# Group every 9 consecutive part rows into one bag: (rows, d) -> (rows / 9, 9, d).
# assumes the 9 tiles of an image are stored contiguously and in the same
# image order as the whole-image features — TODO confirm
train_n, d = train_features_parts.shape
print(train_n,d)
train_features_parts_bags = train_features_parts.reshape(train_n // 9, 9, d)
print(train_features_parts_bags.shape)

test_n, d = test_features_parts.shape
print(test_n,d)
test_features_parts_bags = test_features_parts.reshape(test_n // 9, 9, d)
print(test_features_parts_bags.shape)

83052 384
(9228, 9, 384)
22086 384
(2454, 9, 384)


In [9]:
# For every image, flatten its whole-image feature vector together with its
# bag of part features into one long vector (axis=None flattens both inputs
# before joining). The results are kept as Python lists of 1-D arrays.
combined_train_features = [
    np.concatenate((train_features[idx], bag), axis=None)
    for idx, bag in enumerate(train_features_parts_bags)
]
print(np.array(combined_train_features).shape)

combined_test_features = [
    np.concatenate((test_features[idx], bag), axis=None)
    for idx, bag in enumerate(test_features_parts_bags)
]
print(np.array(combined_test_features).shape)

(9228, 3840)
(2454, 3840)


combined features and Logistic Regression

In [None]:
# Logistic regression on the un-reduced combined features.
# `c` was previously read here without being assigned in this or any earlier
# cell (its first assignment is in the later PCA cell), which raises NameError
# on a fresh kernel. The value 10 matches the "cost parameter: 10" output
# recorded for this cell.
c = 10
clf = LogisticRegression(C = c, solver = 'liblinear', class_weight = 'balanced', random_state=0)
print("cost parameter:", c)
clf.fit(combined_train_features, train_labels)
preds = clf.predict(combined_test_features)

PCA dimension: 1000
cost parameter: 10


In [None]:
# Turn the integer predictions back into the original class labels, then show
# the labels followed by how many there are.
comb_feats_PCA_predictions_decoded = le.inverse_transform(preds)
print(comb_feats_PCA_predictions_decoded, len(comb_feats_PCA_predictions_decoded), sep="\n")

In [None]:
# Save the final decoded predictions as a CSV file (one label per row,
# written without index and without header).
# NOTE(review): the file name mentions PCA / d1000, but in the visible cell
# order `preds` comes from the classifier fitted on the un-reduced combined
# features — confirm the name matches the model that produced it.
output_df = pd.DataFrame(comb_feats_PCA_predictions_decoded)
output_df.to_csv('Task2_comb_feats_PCA_c10_d1000_LR_liblinear.csv', index=False,  header=False)

PCA on combined features and Logistic Regression

In [None]:
# Reduce the combined features with PCA, then fit logistic regression on the
# reduced representation.
n_components = 1000
c = 10
# random_state pins PCA's solver when it falls back to a randomized SVD, so the
# projection (and therefore the predictions) is reproducible across runs. The
# seed matches the one used for the classifiers in this notebook.
pca = PCA(n_components=n_components, random_state=0)
# The projection is learned on the training features only and then applied to
# the test features.
train_features_red = pca.fit_transform(combined_train_features)
test_features_red = pca.transform(combined_test_features)
print("PCA dimension:", n_components)
clf = LogisticRegression(C = c, solver = 'liblinear', class_weight = 'balanced', random_state=0)
print("cost parameter:", c)
clf.fit(train_features_red, train_labels)
preds_pca = clf.predict(test_features_red)

Averaging by test sample id

In [31]:
# Replace every test row by the mean of all test rows that share its sample id.
test_features_matrix = np.asarray(combined_test_features)
nc, dc = test_features_matrix.shape
test_sampleid_list = np.squeeze(test_sampleid)

# Group row indices by sample id in a single pass instead of comparing every
# pair of rows (the pairwise version was O(nc^2)).
# Ids are unwrapped with .item() so that numpy scalars / 1-element arrays become
# hashable dictionary keys.
rows_by_sample = {}
for row, sample_id in enumerate(test_sampleid_list):
    key = sample_id.item() if isinstance(sample_id, (np.ndarray, np.generic)) else sample_id
    rows_by_sample.setdefault(key, []).append(row)

# Per-row list of all rows (ascending) with the same sample id; each row gets
# its own copy of the list.
test_sampleid_index = [None] * nc
combined_test_features_sm = np.zeros((nc, dc))
for rows in rows_by_sample.values():
    group_mean = test_features_matrix[rows].mean(axis=0)
    for row in rows:
        test_sampleid_index[row] = list(rows)
        combined_test_features_sm[row] = group_mean

print(combined_test_features_sm.shape)

In [None]:
# In top-to-bottom order `clf` is the model fitted on the PCA-reduced features,
# so the sample-averaged features must go through the same fitted `pca`
# projection first; passing the un-reduced matrix fails with a feature-count
# mismatch.
preds_sm = clf.predict(pca.transform(combined_test_features_sm))

In [None]:
# Turn the sample-averaged integer predictions back into class labels, then
# show the labels followed by how many there are.
comb_feats_PCA_predictions_sm_decoded = le.inverse_transform(preds_sm)
print(comb_feats_PCA_predictions_sm_decoded, len(comb_feats_PCA_predictions_sm_decoded), sep="\n")

In [None]:
# Save the final sample-averaged predictions as a CSV file (one label per row,
# written without index and without header).
output_df = pd.DataFrame(comb_feats_PCA_predictions_sm_decoded)
output_df.to_csv('Task2_comb_feats_PCA_c10_d1000_LR_liblinear_sm.csv', index=False,  header=False)

with sample id averaging

In [47]:
# Logistic regression (C = 10) on the sample-averaged features, scored on the
# sample-averaged validation set.
# NOTE(review): the *_sm / *_bags inputs and mean_class_acc are not defined in
# the visible cells — confirm where they come from.
clf = LogisticRegression(
    C=10,
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
).fit(combined_train_features_sm, train_labels_bags)
preds = clf.predict(combined_validation_features_sm)
# Plain accuracy of the predictions, i.e. the same number clf.score() reports.
score = accuracy_score(validation_labels_bags, preds)
print(score)
mean_acc = mean_class_acc(preds, validation_labels_bags)
print("Mean class accuracy:", mean_acc)

0.873821609862219
Mean class accuracy: 0.8501034174775537


In [48]:
# Same experiment as the C = 10 run, with a weaker regularisation (C = 20).
clf = LogisticRegression(
    C=20,
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
).fit(combined_train_features_sm, train_labels_bags)
preds = clf.predict(combined_validation_features_sm)
# Plain accuracy of the predictions, i.e. the same number clf.score() reports.
score = accuracy_score(validation_labels_bags, preds)
print(score)
mean_acc = mean_class_acc(preds, validation_labels_bags)
print("Mean class accuracy:", mean_acc)

0.8730964467005076
Mean class accuracy: 0.8496098340619564


In [49]:
# Same experiment as the C = 10 run, with a weaker regularisation (C = 30).
clf = LogisticRegression(
    C=30,
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
).fit(combined_train_features_sm, train_labels_bags)
preds = clf.predict(combined_validation_features_sm)
# Plain accuracy of the predictions, i.e. the same number clf.score() reports.
score = accuracy_score(validation_labels_bags, preds)
print(score)
mean_acc = mean_class_acc(preds, validation_labels_bags)
print("Mean class accuracy:", mean_acc)

0.8701957940536621
Mean class accuracy: 0.8451675833215814


with minmax normalization

In [41]:
#0-1 normalization
# The scaler is fitted on the training features only; the fitted scaler is
# then applied to both the training and the validation features.
scalar = MinMaxScaler().fit(combined_train_features)
combined_train_features_norm = scalar.transform(combined_train_features)
combined_validation_features_norm = scalar.transform(combined_validation_features)

In [42]:
# Logistic regression (C = 10) on the scaled features in *_norm.
clf = LogisticRegression(
    C=10,
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
).fit(combined_train_features_norm, train_labels_bags)
preds = clf.predict(combined_validation_features_norm)
# Plain accuracy of the predictions, i.e. the same number clf.score() reports.
score = accuracy_score(validation_labels_bags, preds)
print(score)
mean_acc = mean_class_acc(preds, validation_labels_bags)
print("Mean class accuracy:", mean_acc)

0.8658448150833937
Mean class accuracy: 0.8383420298030367


with standard normalization

In [43]:
# Standardization with StandardScaler (not 0-1 scaling).
# The scaler is fitted on the training features and reused for validation.
# Note: this overwrites the *_norm variables produced by the MinMaxScaler cell.
scalar = StandardScaler()
combined_train_features_norm = scalar.fit_transform(combined_train_features)
combined_validation_features_norm = scalar.transform(combined_validation_features)

In [44]:
# Logistic regression (C = 10) on the scaled features in *_norm.
clf = LogisticRegression(
    C=10,
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
).fit(combined_train_features_norm, train_labels_bags)
preds = clf.predict(combined_validation_features_norm)
# Plain accuracy of the predictions, i.e. the same number clf.score() reports.
score = accuracy_score(validation_labels_bags, preds)
print(score)
mean_acc = mean_class_acc(preds, validation_labels_bags)
print("Mean class accuracy:", mean_acc)

0.8672951414068165
Mean class accuracy: 0.8381516476284492


In [None]:
# Debug output: dump train_sampleid_index and its length.
# NOTE(review): train_sampleid_index is not defined in any visible cell —
# presumably built like test_sampleid_index for the training rows; confirm.
print(train_sampleid_index)
print(len(train_sampleid_index))

In [None]:
# Sort every entry of train_sampleid_index in place.
# The loop previously called range() on the list itself, which raises
# TypeError; iterate over its positions instead.
for i, rows in enumerate(train_sampleid_index):
    train_sampleid_index[i] = sorted(rows)

In [None]:
# Average the combined training features over all rows sharing a sample id.
# NOTE(review): this cell was left unfinished (it did not parse). It is
# completed by analogy with the test-set averaging cell — confirm this is the
# intended computation.
# assumes train_sampleid_index[i] lists the training rows that share row i's
# sample id, with one entry per training row — TODO confirm
train_features_matrix = np.asarray(combined_train_features)
combined_train_features_sm = np.zeros(train_features_matrix.shape)
for i, rows in enumerate(train_sampleid_index):
    combined_train_features_sm[i] = train_features_matrix[rows].mean(axis=0)

Ridge Classifier

In [22]:
# Ridge classifier baseline on the unscaled combined features.
clf = RidgeClassifier(class_weight='balanced', random_state=0).fit(
    combined_train_features, train_labels_bags
)
# Plain accuracy on the validation set, i.e. the same number clf.score() reports.
score = accuracy_score(validation_labels_bags, clf.predict(combined_validation_features))
print(score)

0.8433647570703409


Logistic Regression

In [23]:
#0-1 normalization
# The scaler is fitted on the training features only; the fitted scaler is
# then applied to both the training and the validation features.
scalar = MinMaxScaler().fit(combined_train_features)
combined_train_features_norm = scalar.transform(combined_train_features)
combined_validation_features_norm = scalar.transform(combined_validation_features)

In [24]:
# Logistic regression (C = 10) on the scaled features in *_norm.
clf = LogisticRegression(
    C=10,
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
).fit(combined_train_features_norm, train_labels_bags)
# Plain accuracy on the validation set, i.e. the same number clf.score() reports.
score = accuracy_score(validation_labels_bags, clf.predict(combined_validation_features_norm))
print(score)

0.8658448150833937


In [28]:
# Predict on the normalized validation features with the current `clf` and
# report the mean class accuracy.
# NOTE(review): mean_class_acc is not defined in the visible cells; from its
# name it presumably averages per-class accuracy — confirm.
preds = clf.predict(combined_validation_features_norm)
mean_acc = mean_class_acc(preds, validation_labels_bags)
print("Mean class accuracy:", mean_acc)

Mean class accuracy: 0.8383420298030367


without scaling

In [None]:
# Logistic regression (C = 10) on the raw, unscaled combined features.
clf = LogisticRegression(
    C=10,
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
).fit(combined_train_features, train_labels_bags)
preds = clf.predict(combined_validation_features)
# Plain accuracy of the predictions, i.e. the same number clf.score() reports.
score = accuracy_score(validation_labels_bags, preds)
print(score)
mean_acc = mean_class_acc(preds, validation_labels_bags)
print("Mean class accuracy:", mean_acc)

SGDClassifier

In [29]:
# Standardize, then fit a linear SGD classifier; the pipeline applies the same
# scaling automatically at prediction time.
clf_sgd = make_pipeline(
    StandardScaler(),
    SGDClassifier(max_iter=10000, tol=1e-4, class_weight='balanced', random_state=0),
).fit(combined_train_features, train_labels_bags)
preds = clf_sgd.predict(combined_validation_features)
# Plain accuracy of the predictions, i.e. the same number clf_sgd.score() reports.
score = accuracy_score(validation_labels_bags, preds)
print("Average class accuracy score:", score)
mean_acc = mean_class_acc(preds, validation_labels_bags)
print("Mean class accuracy:", mean_acc)

Average class accuracy score: 0.7984046410442349
Mean class accuracy: 0.7704766605556338


Random Forest

In [None]:
# evaluate random forest algorithm for classification
from sklearn.ensemble import RandomForestClassifier

# define the model
# random_state is fixed so the forest (bootstrap samples and feature
# sub-sampling) is reproducible, consistent with the other models in this
# notebook, which all use random_state=0.
model_RF = RandomForestClassifier(n_estimators = 100, class_weight = 'balanced', random_state=0)
model_RF.fit(combined_train_features_norm, train_labels_bags)
preds = model_RF.predict(combined_validation_features_norm)
# evaluate the model
# Accuracy is computed from the predictions already made instead of
# predicting a second time through model_RF.score().
score = accuracy_score(validation_labels_bags, preds)
print('Accuracy: %.3f' % (score))
mean_acc = mean_class_acc(preds, validation_labels_bags)
print("Mean class accuracy:", mean_acc)

Apply MISVM in one-vs-all fashion

In [None]:
# One-vs-all MISVM with a linear kernel, C = 1.0 and at most 50 iterations.
# (The previous header mentioned "threshold 0.3 and cost 10", which did not
# match the parameters actually used below.)
# NOTE(review): `misvm`, `train_labels_bags`, `train_unique_labels`,
# `train_unique_labels_count` and `validation_features_parts_bags` are not
# defined in the visible cells — confirm they exist before running.
clf = misvm.MISVM(kernel='linear', C=1.0, max_iters=50)
# Despite its name this is not a confusion matrix: column j stores the
# validation outputs of the classifier trained for class j.
conf_matrix = np.zeros((len(validation_labels_bags), train_unique_labels_count))
for j in range(train_unique_labels_count):
    # Bags of the current class are positives, all other bags are negatives.
    # Both index arrays are in ascending order, so the training order is
    # deterministic (a set difference does not guarantee ordering).
    is_positive = np.isin(train_labels_bags, train_unique_labels[j]).ravel()
    positive_labels_index = np.flatnonzero(is_positive)
    negative_labels_index = np.flatnonzero(~is_positive)
    positive_labels_embedding = train_features_parts_bags[positive_labels_index]
    negative_labels_embedding = train_features_parts_bags[negative_labels_index]
    positive_labels = np.ones(len(positive_labels_index))
    negative_labels = -np.ones(len(negative_labels_index))
    # Positives first, then negatives, for both the bags and their +1/-1 labels.
    train_embeddings_one = np.vstack((positive_labels_embedding, negative_labels_embedding))
    train_labels_one = np.concatenate((positive_labels, negative_labels))
    # The same classifier object is refitted for every class.
    clf.fit(train_embeddings_one, train_labels_one)
    preds = clf.predict(validation_features_parts_bags)
    print(preds[0:5])
    conf_matrix[:,j] = preds

0
[   0    0    0 ... 1012 1012 1012]
[0 1 2]
3
(3, 9, 384)
(7846, 9, 384)
(3,)
(7846,)
Non-random start...


In [None]:
# For every validation bag pick the class whose column in conf_matrix holds
# the largest value.
best_columns = np.argmax(conf_matrix, axis=1)
predictions = np.zeros(len(validation_labels_bags))
predictions[:] = np.asarray(train_unique_labels)[best_columns]