In [1]:
#Load packages
import pandas as pd
import numpy as np
import scipy.io as sio
import random
import scipy
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, Lasso
from sklearn.model_selection import cross_val_score

In [2]:
#This function is used to encode labels since labels are categorical.
def encode_labels(labels):
    """Fit a ``LabelEncoder`` on ``labels`` and encode those same labels.

    Args:
        labels: Sequence of categorical class labels.

    Returns:
        Tuple ``(encoded_labels, le)``: the encoded labels and the fitted
        encoder, so the caller can transform or decode further labels with it.
    """
    le = LabelEncoder().fit(labels)
    return le.transform(labels), le

def decode_labels(encoded_predict_labels, le):
    """Map encoded labels back to their original values.

    Args:
        encoded_predict_labels: Encoded labels to decode.
        le: Fitted encoder exposing ``inverse_transform``.

    Returns:
        The result of ``le.inverse_transform(encoded_predict_labels)``.
    """
    return le.inverse_transform(encoded_predict_labels)

#Mean class accuracy
def mean_class_acc(predictions, true_labels):
    """Return the mean per-class accuracy (per-class recall, averaged over classes).

    For every class that occurs in ``true_labels`` the fraction of its samples
    that were predicted correctly is computed; those fractions are then
    averaged with equal weight per class.

    A class that only appears in ``predictions`` has no true samples and hence
    no defined accuracy. Such classes are left out of the average instead of
    producing a 0/0 division that would turn the whole result into NaN.

    Args:
        predictions: 1-D sequence of predicted labels.
        true_labels: 1-D sequence of ground-truth labels, same length.

    Returns:
        Mean of the per-class accuracies (a NumPy float).

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    predictions = np.asarray(predictions).ravel()
    true_labels = np.asarray(true_labels).ravel()
    if predictions.shape != true_labels.shape:
        raise ValueError(
            "predictions and true_labels must have the same length, got %d and %d"
            % (predictions.size, true_labels.size)
        )
    if true_labels.size == 0:
        raise ValueError("mean_class_acc needs at least one sample")

    # class_of_sample[k] is the position of true_labels[k] inside `classes`;
    # support[c] is the number of true samples of class c (always >= 1).
    classes, class_of_sample, support = np.unique(
        true_labels, return_inverse=True, return_counts=True
    )
    # Count the correctly predicted samples of each class.
    hits = np.bincount(class_of_sample[predictions == true_labels], minlength=len(classes))
    acc = hits / support

    return sum(acc)/len(acc)

In [3]:
#Load data
# File names are resolved relative to the notebook's working directory.
WHOLE_MAT, PARTS_MAT = 'whole_wolabels.mat', 'parts_wolabels.mat'
whole = sio.loadmat(WHOLE_MAT)
parts = sio.loadmat(PARTS_MAT)

In [4]:
#whole
# Unpack the `whole` archive split by split. Class ids are read for the train
# and validation splits only; none are read for the test split.
# For class ids and image ids the first element of every entry is kept.

#train
train_classid = np.squeeze(whole['train_classid'])
train_class_labels = [entry[0] for entry in train_classid]
train_features = whole['train_feats']
train_imid = whole['train_imgid']
train_imgid = np.squeeze([entry[0] for entry in train_imid])
train_sampleid = whole['train_sampleid']

#validation
validation_classid = np.squeeze(whole['val_classid'])
validation_class_labels = [entry[0] for entry in validation_classid]
validation_features = whole['val_feats']
validation_imid = whole['val_imgid']
validation_imgid = np.squeeze([entry[0] for entry in validation_imid])
validation_sampleid = whole['val_sampleid']

#test
test_features = whole['test_feats']
test_imid = whole['test_imgid']
test_imgid = np.squeeze([entry[0] for entry in test_imid])
test_sampleid = whole['test_sampleid']

In [5]:
#whole statistics
# Every tuple is the argument list of one print() call; ("\n",) reproduces the
# blank separator that follows the train and validation sections.
whole_report = [
    #train
    ("Train image ids:", Counter(train_imgid)),
    ("Train sample id shape:", train_sampleid.shape),
    ("Train features shape:", train_features.shape),
    ("\n",),
    #validation
    ("Validation image ids:", Counter(validation_imgid)),
    ("Validation sample id shape:", validation_sampleid.shape),
    ("Validation features shape:", validation_features.shape),
    ("\n",),
    #test
    ("Test image ids:", Counter(test_imgid)),
    ("Test sample id shape:", test_sampleid.shape),
    ("Test features shape:", test_features.shape),
]
for report_line in whole_report:
    print(*report_line)

Train image ids: Counter({0: 7550, 1: 297, 2: 2})
Train sample id shape: (7849, 1)
Train features shape: (7849, 384)


Validation image ids: Counter({0: 1329, 1: 50})
Validation sample id shape: (1379, 1)
Validation features shape: (1379, 384)


Test image ids: Counter({0: 1407, 1: 766, 2: 189, 3: 32, 4: 30, 5: 30})
Test sample id shape: (2454, 1)
Test features shape: (2454, 384)


In [6]:
#parts
# Unpack the `parts` archive split by split. It has the same fields as the
# `whole` archive plus a tile id array per split. Class ids are read for the
# train and validation splits only.

#train
train_classid_parts = np.squeeze(parts['train_classid'])
train_class_labels_parts = [entry[0] for entry in train_classid_parts]
train_features_parts = parts['train_feats']
train_imid_parts = parts['train_imgid']
train_imgid_parts = np.squeeze([entry[0] for entry in train_imid_parts])
train_sampleid_parts = parts['train_sampleid']
train_tileid_parts = parts['train_tileid']

#validation
validation_classid_parts = np.squeeze(parts['val_classid'])
validation_class_labels_parts = [entry[0] for entry in validation_classid_parts]
validation_features_parts = parts['val_feats']
validation_imid_parts = parts['val_imgid']
validation_imgid_parts = np.squeeze([entry[0] for entry in validation_imid_parts])
validation_sampleid_parts = parts['val_sampleid']
validation_tileid_parts = parts['val_tileid']

#test
test_features_parts = parts['test_feats']
test_imid_parts = parts['test_imgid']
test_imgid_parts = np.squeeze([entry[0] for entry in test_imid_parts])
test_sampleid_parts = parts['test_sampleid']
test_tileid_parts = parts['test_tileid']

In [7]:
#parts statistics

# Distinct class labels per labelled split (there are no test labels to count).
train_unique_labels_parts = sorted(np.unique(train_class_labels_parts))
train_unique_labels_count_parts = len(train_unique_labels_parts)
validation_unique_labels_parts = sorted(np.unique(validation_class_labels_parts))
validation_unique_labels_count_parts = len(validation_unique_labels_parts)

# Every tuple is the argument list of one print() call; ("\n",) reproduces the
# blank separator that follows the train and validation sections.
parts_report = [
    #train
    ("Parts Train image ids:", Counter(train_imgid_parts)),
    ("Parts Train sample id shape:", train_sampleid_parts.shape),
    ("Parts Train tile id shape:", train_tileid_parts.shape),
    ("Parts Train features shape:", train_features_parts.shape),
    ("Parts Train unique labels count:", train_unique_labels_count_parts),
    ("\n",),
    #validation
    ("Parts Validation image ids:", Counter(validation_imgid_parts)),
    ("Parts Validation sample id shape:", validation_sampleid_parts.shape),
    ("Parts Validation tile id shape:", validation_tileid_parts.shape),
    ("Parts Validation features shape:", validation_features_parts.shape),
    ("Parts Validation unique labels count:", validation_unique_labels_count_parts),
    ("\n",),
    #test
    ("Parts Test image ids:", Counter(test_imgid_parts)),
    ("Parts Test sample id shape:", test_sampleid_parts.shape),
    ("Parts Test tile id shape:", test_tileid_parts.shape),
    ("Parts Test features shape:", test_features_parts.shape),
]
for report_line in parts_report:
    print(*report_line)

Parts Train image ids: Counter({0: 67950, 1: 2673, 2: 18})
Parts Train sample id shape: (70641, 1)
Parts Train tile id shape: (70641, 2)
Parts Train features shape: (70641, 384)
Parts Train unique labels count: 1013


Parts Validation image ids: Counter({0: 11961, 1: 450})
Parts Validation sample id shape: (12411, 1)
Parts Validation tile id shape: (12411, 2)
Parts Validation features shape: (12411, 384)
Parts Validation unique labels count: 1013


Parts Test image ids: Counter({0: 12663, 1: 6894, 2: 1701, 3: 288, 4: 270, 5: 270})
Parts Test sample id shape: (22086, 1)
Parts Test tile id shape: (22086, 2)
Parts Test features shape: (22086, 384)


Parts data

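Converting into bag representation: every 9 consecutive tile rows (assumed to belong to the same image) are flattened into one row of 9 × 384 = 3456 features.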

In [8]:
# Flatten every 9 consecutive tile rows into a single bag row of 9*d features.
# NOTE(review): this assumes the 9 tiles of one image are stored contiguously
# and in the same image order as the whole-image arrays -- confirm against the
# sample/tile ids.
train_n, d = train_features_parts.shape
print(train_n, d)
train_features_parts_bags = np.reshape(train_features_parts, (train_n // 9, 9 * d))
print(train_features_parts_bags.shape)

val_n, d = validation_features_parts.shape
print(val_n, d)
validation_features_parts_bags = np.reshape(validation_features_parts, (val_n // 9, 9 * d))
print(validation_features_parts_bags.shape)

70641 384
(7849, 3456)
12411 384
(1379, 3456)


In [10]:
#encoded train labels
# The encoder is fitted on the training labels only.
train_labels_bags, le_parts = encode_labels(train_class_labels)
train_unique_labels = list(np.unique(train_labels_bags))  # np.unique already returns sorted values
train_unique_labels_count = len(train_unique_labels)

#encoded validation labels
# Validation labels go through the encoder fitted on the training labels.
validation_labels_bags = le_parts.transform(validation_class_labels)
validation_unique_labels = list(np.unique(validation_labels_bags))
validation_unique_labels_count = len(validation_unique_labels)

print(len(train_labels_bags))
print("Train unique labels count:", train_unique_labels_count)
print(len(validation_labels_bags))
print("Validation unique labels count:", validation_unique_labels_count)

7849
Train unique labels count: 1013
1379
Validation unique labels count: 1013


In [11]:
# Row i of the combined features is the whole-image vector of row i followed by
# the flattened parts bag of row i (axis=None flattens both before joining).
combined_train_features = [
    np.concatenate((train_features[row], parts_bag), axis=None)
    for row, parts_bag in enumerate(train_features_parts_bags)
]
print(np.array(combined_train_features).shape)

combined_validation_features = [
    np.concatenate((validation_features[row], parts_bag), axis=None)
    for row, parts_bag in enumerate(validation_features_parts_bags)
]
print(np.array(combined_validation_features).shape)

(7849, 3840)
(1379, 3840)


In [12]:
#0-1 normalization
# The scaler is fitted on the training features only and then applied to both
# splits, so validation is scaled with the training fit.
scalar = MinMaxScaler().fit(combined_train_features)
combined_train_features_norm = scalar.transform(combined_train_features)
combined_validation_features_norm = scalar.transform(combined_validation_features)

Averaging by sample id: each row is replaced by the mean of all rows that share its sample id.

In [13]:
# Drop the singleton axis of the (n, 1) sample-id arrays so each can be indexed per row.
train_sampleid_list, validation_sampleid_list = (
    np.squeeze(sample_ids) for sample_ids in (train_sampleid, validation_sampleid)
)

In [18]:
# Build, for every training row, the list of rows that share its sample id.
# Rows are grouped in one pass with a dict instead of comparing every pair of
# rows, which needed nc*nc Python-level comparisons.
nc, dc = np.shape(combined_train_features_norm)  # np.shape avoids copying the matrix
assert len(train_sampleid_list) == nc, "expected exactly one sample id per training row"
combined_train_features_sm = np.zeros((nc, dc))

rows_by_sampleid = {}
for row, sample_id in enumerate(train_sampleid_list):
    rows_by_sampleid.setdefault(sample_id, []).append(row)

# train_sampleid_index[i] holds, in ascending order, the indices of all rows
# whose sample id equals that of row i (row i included). Each row gets its own
# list object, as before.
train_sampleid_index = [
    list(rows_by_sampleid[sample_id]) for sample_id in train_sampleid_list
]

In [19]:
# Replace every training row by the mean of all rows that share its sample id.
# Relies on `dc`, `train_sampleid_index` and `combined_train_features_sm` from
# the previous cell.
for row, members in enumerate(train_sampleid_index):
    running_total = np.zeros(dc)
    for member in members:
        running_total += combined_train_features_norm[member]
    combined_train_features_sm[row] = running_total / len(members)

print(combined_train_features_sm.shape)

(7849, 3840)


In [20]:
# Build, for every validation row, the list of rows that share its sample id.
# Rows are grouped in one pass with a dict instead of comparing every pair of
# rows, which needed nc*nc Python-level comparisons.
# Note: this rebinds `nc` and `dc` to the validation matrix dimensions.
nc, dc = np.shape(combined_validation_features_norm)  # np.shape avoids copying the matrix
assert len(validation_sampleid_list) == nc, "expected exactly one sample id per validation row"
combined_validation_features_sm = np.zeros((nc, dc))

rows_by_sampleid = {}
for row, sample_id in enumerate(validation_sampleid_list):
    rows_by_sampleid.setdefault(sample_id, []).append(row)

# validation_sampleid_index[i] holds, in ascending order, the indices of all
# rows whose sample id equals that of row i (row i included). Each row gets its
# own list object, as before.
validation_sampleid_index = [
    list(rows_by_sampleid[sample_id]) for sample_id in validation_sampleid_list
]

In [21]:
# Replace every validation row by the mean of all rows that share its sample id.
# Relies on `dc`, `validation_sampleid_index` and
# `combined_validation_features_sm` from the previous cell.
for row, members in enumerate(validation_sampleid_index):
    running_total = np.zeros(dc)
    for member in members:
        running_total += combined_validation_features_norm[member]
    combined_validation_features_sm[row] = running_total / len(members)

print(combined_validation_features_sm.shape)

(1379, 3840)


Logistic regression

In [22]:
# Fit on the sample-averaged training features, then evaluate on validation.
clf = LogisticRegression(C=10, solver='liblinear', class_weight='balanced', random_state=0)
clf.fit(combined_train_features_sm, train_labels_bags)

# Predict once and derive both metrics from the same predictions;
# clf.score(X, y) is the accuracy of clf.predict(X) against y.
preds = clf.predict(combined_validation_features_sm)
score = accuracy_score(validation_labels_bags, preds)
print(score)
mean_acc = mean_class_acc(preds, validation_labels_bags)
print("Mean class accuracy:", mean_acc)

0.8651196519216824
Mean class accuracy: 0.8435693132139331
