In [1]:
#Load packages
import pandas as pd
import numpy as np
import scipy.io as sio
import random
import scipy
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import cross_val_score

In [2]:
#This function is used to encode labels since labels are categorical.
def encode_labels(labels):
    """Map categorical labels to integer codes.

    Parameters
    ----------
    labels : sequence of class labels to encode.

    Returns
    -------
    (encoded_labels, le) : the integer-encoded labels and the fitted
        LabelEncoder, so the same mapping can be reused on other splits
        and inverted later.
    """
    le = LabelEncoder()
    # fit_transform learns the label -> code mapping and applies it in one call.
    return le.fit_transform(labels), le

def decode_labels(encoded_predict_labels, le):
    """Translate integer-encoded predictions back to their original labels.

    Parameters
    ----------
    encoded_predict_labels : encoded class ids to decode.
    le : fitted encoder exposing ``inverse_transform``.

    Returns
    -------
    Whatever ``le.inverse_transform`` returns for the given codes.
    """
    return le.inverse_transform(encoded_predict_labels)

#Mean class accuracy
def mean_class_acc(predictions, true_labels):
    """Mean per-class accuracy (macro-averaged recall).

    For every class that occurs in ``true_labels``, compute the fraction of
    its samples that were predicted correctly, then average those fractions
    with equal weight per class.

    Classes that appear only in ``predictions`` are ignored: they have no
    true samples, so their recall is undefined (0/0) and including them
    would turn the whole mean into NaN.

    Parameters
    ----------
    predictions : array-like of predicted class ids, one per sample.
    true_labels : array-like of ground-truth class ids, same length.

    Returns
    -------
    float : mean of the per-class accuracies.

    Raises
    ------
    ValueError : if the inputs are empty or differ in length.
    """
    true_arr = np.asarray(true_labels).ravel()
    pred_arr = np.asarray(predictions).ravel()

    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            "predictions and true_labels must have the same length "
            "(got %d and %d)" % (pred_arr.size, true_arr.size)
        )
    if true_arr.size == 0:
        raise ValueError("mean_class_acc requires at least one sample")

    # class_index[k] is the position of true_arr[k] among the sorted unique
    # true classes, so bincount can aggregate per class.
    classes, class_index = np.unique(true_arr, return_inverse=True)
    class_index = np.asarray(class_index).ravel()
    n_classes = len(classes)

    support = np.bincount(class_index, minlength=n_classes)
    correct = np.bincount(
        class_index,
        weights=(pred_arr == true_arr).astype(float),
        minlength=n_classes,
    )

    # support is >= 1 for every entry because classes come from true_arr itself.
    return float(np.mean(correct / support))

In [3]:
#Load data
# Read both MATLAB files: whole-image features and per-tile ("parts") features.
whole, parts = (
    sio.loadmat(mat_file)
    for mat_file in ('whole_wolabels.mat', 'parts_wolabels.mat')
)

In [4]:
#whole
# Each class-id / image-id entry read from the .mat dict is itself indexable;
# element [0] of every entry is taken as the actual value.

#train
train_classid = np.squeeze(whole['train_classid'])
train_class_labels = [entry[0] for entry in train_classid]
train_features = whole['train_feats']
train_imid = whole['train_imgid']
train_imgid = np.squeeze([entry[0] for entry in train_imid])
train_sampleid = whole['train_sampleid']

#validation
validation_classid = np.squeeze(whole['val_classid'])
validation_class_labels = [entry[0] for entry in validation_classid]
validation_features = whole['val_feats']
validation_imid = whole['val_imgid']
validation_imgid = np.squeeze([entry[0] for entry in validation_imid])
validation_sampleid = whole['val_sampleid']

#test
test_features = whole['test_feats']
test_imid = whole['test_imgid']
test_imgid = np.squeeze([entry[0] for entry in test_imid])
test_sampleid = whole['test_sampleid']

#encoded train labels
# The encoder is fitted on the training labels only and reused for every other split.
train_labels, le = encode_labels(train_class_labels)
train_unique_labels = sorted(np.unique(train_labels))
train_unique_labels_count = len(train_unique_labels)
print(len(train_labels))

#encoded validation labels
validation_labels = le.transform(validation_class_labels)
validation_unique_labels = sorted(np.unique(validation_labels))
validation_unique_labels_count = len(validation_unique_labels)
print(len(validation_labels))

7849
1379


In [5]:
#parts
# Same extraction as for the whole-image data, applied to the per-tile file.
# Labels are encoded with the encoder `le` that was fitted on the whole-image
# training labels, so class ids agree between the two feature sets.

#train
train_classid_parts = np.squeeze(parts['train_classid'])
train_class_labels_parts = [entry[0] for entry in train_classid_parts]
train_features_parts = parts['train_feats']
train_imid_parts = parts['train_imgid']
train_imgid_parts = np.squeeze([entry[0] for entry in train_imid_parts])
train_sampleid_parts = parts['train_sampleid']
train_tileid_parts = parts['train_tileid']
train_labels_parts = le.transform(train_class_labels_parts)
print(len(train_labels_parts))

#validation
validation_classid_parts = np.squeeze(parts['val_classid'])
validation_class_labels_parts = [entry[0] for entry in validation_classid_parts]
validation_features_parts = parts['val_feats']
validation_imid_parts = parts['val_imgid']
validation_imgid_parts = np.squeeze([entry[0] for entry in validation_imid_parts])
validation_sampleid_parts = parts['val_sampleid']
validation_tileid_parts = parts['val_tileid']
validation_labels_parts = le.transform(validation_class_labels_parts)
print(len(validation_labels_parts))

#test
test_features_parts = parts['test_feats']
test_imid_parts = parts['test_imgid']
test_imgid_parts = np.squeeze([entry[0] for entry in test_imid_parts])
test_sampleid_parts = parts['test_sampleid']
test_tileid_parts = parts['test_tileid']

70641
12411


In [8]:
# Group the flat tile-feature matrices into bags of 9 rows each.
# NOTE(review): this assumes every 9 consecutive rows belong to the same image
# and that bags follow the row order of the whole-image features - confirm
# against the tile/image ids.
train_n, d = train_features_parts.shape
print(train_n, d)
train_features_parts_bags = train_features_parts.reshape(train_n // 9, 9, d)
print(train_features_parts_bags.shape)

validation_n, d = validation_features_parts.shape
print(validation_n, d)
validation_features_parts_bags = validation_features_parts.reshape(validation_n // 9, 9, d)
print(validation_features_parts_bags.shape)

70641 384
(7849, 9, 384)
12411 384
(1379, 9, 384)


In [9]:
# For every image, flatten the whole-image feature and its bag of 9 tile
# features into one row: whole feature first, then the tiles in bag order.
combined_train_features = np.array([
    np.concatenate((train_features[idx], train_features_parts_bags[idx]), axis=None)
    for idx in range(len(train_features_parts_bags))
])
print(combined_train_features.shape)

combined_validation_features = np.array([
    np.concatenate((validation_features[idx], validation_features_parts_bags[idx]), axis=None)
    for idx in range(len(validation_features_parts_bags))
])
print(combined_validation_features.shape)

(7849, 3840)
(1379, 3840)


In [10]:
#0-1 normalization
# Column-wise min/max are learned from the training rows only; validation is
# mapped with those same statistics.
scaler = MinMaxScaler().fit(combined_train_features)
combined_train_features_norm = scaler.transform(combined_train_features)
combined_validation_features_norm = scaler.transform(combined_validation_features)

Converting into bag representation: each image becomes 10 instances (the whole-image feature followed by its 9 tile features)

In [11]:
# Split every normalized row back into 10 equal-width instances:
# index 0 is the whole-image feature, indices 1..9 are the tiles.
# This cell rebinds the *_norm arrays, so it can only be run once per kernel
# session after the normalization cell.
train_n, d = combined_train_features_norm.shape
print(train_n, d)
combined_train_features_norm = combined_train_features_norm.reshape(train_n, 10, d // 10)
print(combined_train_features_norm.shape)

val_n, d = combined_validation_features_norm.shape
print(val_n, d)
combined_validation_features_norm = combined_validation_features_norm.reshape(val_n, 10, d // 10)
print(combined_validation_features_norm.shape)

7849 3840
(7849, 10, 384)
1379 3840
(1379, 10, 384)


Training on each TILE to predict on the corresponding validation tile

whole feature

In [19]:
# Build the training prediction table in a single, re-runnable cell.
#
# pred_table_train[:, j] holds the class ids predicted by the classifier that
# was trained on tile j+1 (tiles sit at indices 1..9 of the bag axis; index 0
# is the whole-image feature).
#
# The loop index runs 1..9 while the table has columns 0..8, so predictions
# are written to column i-1. Writing to column i overruns the table on the
# last tile and leaves column 0 empty.
pred_table_train = np.zeros((len(train_labels), 9))

for i in range(1, pred_table_train.shape[1] + 1):
    print("Part:", i)
    clf_lr = LogisticRegression(C = 10, class_weight = 'balanced', solver = 'liblinear', fit_intercept = True, random_state=0)
    clf_lr.fit(combined_train_features_norm[:,i,:], train_labels)
    # NOTE(review): the tile classifier is scored and applied on index 0 (the
    # whole-image feature) of the training set, whereas the validation table is
    # built from predictions on the tile's own features. Confirm this
    # asymmetry is intended before relying on the stacked model.
    score = clf_lr.score(combined_train_features_norm[:,0,:], train_labels)
    print(score)
    preds = clf_lr.predict(combined_train_features_norm[:,0,:])
    pred_table_train[:, i-1] = preds
    mean_acc = mean_class_acc(preds, train_labels)
    print("Mean class accuracy:", mean_acc)

# Independent snapshot of the finished table (a real copy, not an alias),
# kept under this name for any later cell that refers to it.
pred_table_copy = pred_table_train.copy()

Mean class accuracy: 0.26452873987367626


In [22]:
# Show the training prediction table (one column of predicted class ids per tile classifier).
print(pred_table_train)

[[1010.  816.    0. ...  251. 1010.  251.]
 [   0.    0.    0. ... 1009.  814.  257.]
 [ 764.    0.    0. ...  251.  251.  251.]
 ...
 [ 922. 1012.  922. ...  874.   95.  333.]
 [ 559. 1012. 1012. ... 1012.  802. 1012.]
 [1012. 1012.  559. ...  817.  803.  524.]]


In [29]:
# One row per validation sample, one column per tile classifier (9 tiles).
pred_table_val = np.zeros(shape=(len(validation_labels), 9))

In [31]:
# Train one classifier per tile (bag indices 1..9) and evaluate it on the
# same tile of the validation set; column i-1 of pred_table_val stores the
# predicted class ids for tile i.
for i in range(1,10):
    print("Part:", i)
    tile_train = combined_train_features_norm[:,i,:]
    tile_val = combined_validation_features_norm[:,i,:]

    clf_lr = LogisticRegression(C = 10, class_weight = 'balanced', solver = 'liblinear', fit_intercept = True, random_state=0)
    clf_lr.fit(tile_train, train_labels)

    # Predict once and derive plain accuracy from those predictions.
    preds = clf_lr.predict(tile_val)
    score = accuracy_score(validation_labels, preds)
    print(score)

    pred_table_val[:, i-1] = preds
    mean_acc = mean_class_acc(preds, validation_labels)
    print("Mean class accuracy:", mean_acc)

Part: 1
0.7686729514140682
Mean class accuracy: 0.7318737366614957
Part: 2
0.7070340826686005
Mean class accuracy: 0.6567080336576882
Part: 3
0.7585206671501088
Mean class accuracy: 0.7193437690969773
Part: 4
0.7940536620739667
Mean class accuracy: 0.7563037653363419
Part: 5
0.7686729514140682
Mean class accuracy: 0.7275231514125885
Part: 6
0.794778825235678
Mean class accuracy: 0.7584191228317585
Part: 7
0.6990572878897752
Mean class accuracy: 0.6482865604287122
Part: 8
0.6446700507614214
Mean class accuracy: 0.5872396935082029
Part: 9
0.6765772298767223
Mean class accuracy: 0.6230127391529167


Run experiment with TILES predictions as features

First append tiles predictions to whole features and normalize

In [32]:
# Append the 9 tile-prediction columns to the whole-image features.
# NOTE(review): the appended columns are label-encoded class ids; scaling them
# as continuous values imposes an arbitrary ordering on the classes - consider
# whether an encoding such as one-hot is more appropriate.
train_appended_features = np.concatenate((train_features, pred_table_train), axis=1)
print(train_appended_features.shape)
val_appended_features = np.concatenate((validation_features, pred_table_val), axis=1)
print(val_appended_features.shape)

#0-1 normalization
# Statistics come from the training rows only, so validation values may fall
# outside [0, 1].
scaler = MinMaxScaler().fit(train_appended_features)
train_appended_features_norm = scaler.transform(train_appended_features)
val_appended_features_norm = scaler.transform(val_appended_features)

(7849, 393)
(1379, 393)


In [33]:
# Inspect the normalized training and validation matrices, one after the other.
print(train_appended_features_norm, val_appended_features_norm, sep='\n')

[[0.6342371  0.49304666 0.46219741 ... 0.24578791 1.         0.24578791]
 [0.64828488 0.54959035 0.49255834 ... 0.99702676 0.80594059 0.25173439]
 [0.68417868 0.4996972  0.58441029 ... 0.24578791 0.24851485 0.24578791]
 ...
 [0.33295999 0.68719526 0.42383084 ... 0.86323092 0.09405941 0.32705649]
 [0.48816728 0.65571606 0.33013516 ... 1.         0.79405941 1.        ]
 [0.45321441 0.73079081 0.25541552 ... 0.80673935 0.7950495  0.51635282]]
[[ 6.85525586e-01  5.19314883e-01  4.42941858e-01 ... -2.97324083e-03
   0.00000000e+00 -2.97324083e-03]
 [ 1.60238809e-01  3.84508565e-01  4.06438963e-01 ...  4.29137760e-01
   9.90099010e-04 -1.98216056e-03]
 [ 5.79529740e-01  5.89208812e-01  5.22164147e-01 ... -9.91080278e-04
   1.98019802e-03 -9.91080278e-04]
 ...
 [ 5.15960979e-01  5.87168024e-01  4.41510171e-01 ...  9.98017839e-01
   1.00000000e+00  9.98017839e-01]
 [ 6.61474527e-01  4.85201138e-01  3.77441706e-01 ...  9.99008920e-01
   1.00099010e+00  9.99008920e-01]
 [ 4.00949751e-01  7.38362

Run logistic regression with liblinear solver

In [34]:
# Fit on whole-image features + tile predictions, then report plain accuracy
# and mean per-class accuracy on the validation split.
clf_lr = LogisticRegression(C = 10, class_weight = 'balanced', solver = 'liblinear', fit_intercept = True, random_state=0)
clf_lr.fit(train_appended_features_norm, train_labels)

# Predict once and reuse the predictions for both metrics.
preds = clf_lr.predict(val_appended_features_norm)
score = accuracy_score(validation_labels, preds)
print(score)

mean_acc = mean_class_acc(preds, validation_labels)
print("Mean class accuracy:", mean_acc)

0.7962291515591008
Mean class accuracy: 0.7583462605180277


In [35]:
# Start again from an all-zero validation prediction table (9 tile columns).
pred_table_val = np.zeros(shape=(len(validation_labels), 9))

weights for tiles:
    
