# COMP448 - Medical Image Analysis - Project
## Şebnem Demirtaş - 76813
## Mete Erdoğan - 69666


# - Extreme Learning Machines with PCA
#### - Resnet18
#### - Resnet50
#### - DINO ViT-B/16

# - Evaluations of Other Models
####      - Resnet18 - Trained From Scratch
####      - Resnet18 - Trained after Transfer Learning Initialization
####      - Resnet18 - Fine-Tuning the last Linear layer
####      - DINO ViT-B/16 - Fine-Tuning with Linear Classifier

In [1]:
from __future__ import print_function, division
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import copy
import shutil
import math
import random
from tqdm import tqdm
from PIL import Image

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import TensorDataset, DataLoader, Subset
import torchvision
from torchvision import datasets, models, transforms
from torchvision.models import resnet50, ResNet50_Weights, resnet18, ResNet18_Weights, resnet101, ResNet101_Weights
from torchvision.models import vit_l_16, ViT_L_16_Weights, vit_b_16, ViT_B_16_Weights

In [4]:
from collections import namedtuple, Counter
import torch.nn.functional as F
from torch.nn.functional import relu
from torch.utils.data import TensorDataset, DataLoader, random_split, ConcatDataset
from sklearn.model_selection import KFold

In [5]:
from IPython.core.debugger import Pdb
import sys
import cv2 as cv
import pickle
import collections

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
random_seed = 448
torch.manual_seed(random_seed)

<torch._C.Generator at 0x2b5d45c1c2d0>

### - If the datasets are created and saved, start by loading them from pickle files here:

In [7]:
with open("train_data.pickle", 'rb') as file:
    loaded_data1 = pickle.load(file)
    train_dataset = loaded_data1
    
with open("test_data.pickle", 'rb') as file:
    loaded_data2 = pickle.load(file)
    test_dataset = loaded_data2


In [8]:
train_dataloader2 = DataLoader(train_dataset, batch_size=1, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [9]:
def calculate_metrics_multiclass(gt_masks, pred_masks, num_classes=3):
    gt_masks = gt_masks.float()
    pred_masks = pred_masks.float()
    metrics = torch.zeros((num_classes, 4))

    for c in range(num_classes):
        gt_class = (gt_masks == c).float()
        pred_class = (pred_masks == c).float()
        tp = torch.sum(gt_class * pred_class)
        tn = torch.sum((1 - gt_class) * (1 - pred_class))
        fp = torch.sum((1 - gt_class) * pred_class)
        fn = torch.sum(gt_class * (1 - pred_class))
        accuracy = (tp + tn) / (tp + tn + fp + fn)

        precision = tp / (tp + fp) if (tp + fp) > 0 else torch.tensor(0.0)
        recall = tp / (tp + fn) if (tp + fn) > 0 else torch.tensor(0.0)
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else torch.tensor(0.0)
        metrics[c] = torch.tensor([accuracy.item(), precision.item(), recall.item(), f1_score.item()])

    acc = torch.sum(gt_masks == pred_masks) / len(pred_masks)
    return acc, metrics

In [10]:
def return_model(model_num = 0):
    if(model_num == 4):
        model = models.vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
        print("learning rate: ", lr)
        for param in model.parameters():
            param.requires_grad = False
        model.heads.head = nn.Linear(768, 3, bias=True)
        return model
    
    if(model_num == 3):
        dino = torch.hub.load('facebookresearch/dino:main', 'dino_vits16')
        model = nn.Sequential(collections.OrderedDict([
          ('dino', dino),
          ('last', nn.Linear(384, 3, bias=True)),
        ]))
        for param in model.parameters():
            param.requires_grad = False
        model.last = nn.Linear(384, 3, bias=True)
        return model
    
    if(model_num == 2):
        model = models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
        for param in model.parameters():
            param.requires_grad = False
        model.fc = nn.Linear(512, 3, bias=True) 
        return model
        
    if(model_num == 1):
        return models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
    
    else:
        return models.resnet18()

In [43]:
def evaluate_ELM_classifier(train_dataloader2, test_dataloader, X_data, V_best, Wout_best, model):
    # Evaluate on Train and Test sets
    all_labels = []
    all_preds = []
    ct = 0
    for k, (y, x) in enumerate(train_dataloader2):  
        ct+=1
        yn = X_data[k,:].to(device)
        result = yn @ V_best.to(device) @ Wout_best
        pred = torch.argmax(result, dim=0).squeeze()
        all_preds.extend([pred.cpu()])
        all_labels.extend([x[0].cpu()])
    all_preds_tensor = torch.tensor(all_preds)
    all_labels_tensor = torch.tensor(all_labels)
    acc, metrics = calculate_metrics_multiclass(all_labels_tensor, all_preds_tensor)
    print(f'Average Metrics over train dataset:')
    print(f'Accuracy: {torch.mean(metrics[:, 0]):.4f}')
    print(f'Precision: {torch.mean(metrics[:, 1]):.4f}')
    print(f'Recall: {torch.mean(metrics[:, 2]):.4f}')
    print(f'F1 Score: {torch.mean(metrics[:, 3]):.4f}')
    for c in range(metrics.shape[0]):
        print(f'\nClass {c} Metrics')
        print(f'Accuracy: {metrics[c, 0]:.4f}')
        print(f'Precision: {metrics[c, 1]:.4f}')
        print(f'Recall: {metrics[c, 2]:.4f}')
        print(f'F1 Score: {metrics[c, 3]:.4f}')

    print()
    print()

    all_labels = []
    all_preds = []
    ct = 0
    for k, (y, x) in enumerate(test_dataloader):  
        ct+=1
        yn = model(y.to(device).repeat(1, 3, 1, 1))
        result = yn @ V_best.to(device) @ Wout_best
        pred = torch.argmax(result, dim=1).squeeze()
        all_preds.extend([pred.cpu()])
        all_labels.extend([x[0].cpu()])
    all_preds_tensor = torch.tensor(all_preds)
    all_labels_tensor = torch.tensor(all_labels)
    acc, metrics = calculate_metrics_multiclass(all_labels_tensor, all_preds_tensor)
    print(f'Average Metrics over test dataset:')
    print(f'Accuracy: {torch.mean(metrics[:, 0]):.4f}')
    print(f'Precision: {torch.mean(metrics[:, 1]):.4f}')
    print(f'Recall: {torch.mean(metrics[:, 2]):.4f}')
    print(f'F1 Score: {torch.mean(metrics[:, 3]):.4f}')
    for c in range(metrics.shape[0]):
        print(f'\nClass {c} Metrics')
        print(f'Accuracy: {metrics[c, 0]:.4f}')
        print(f'Precision: {metrics[c, 1]:.4f}')
        print(f'Recall: {metrics[c, 2]:.4f}')
        print(f'F1 Score: {metrics[c, 3]:.4f}')
    
    return all_preds_tensor


def evaluate_model_x(d1, d2, model):
    datatype = torch.float64
    model.eval()
    print("")
    sets = ["train", "test"]
    for idx, dataset in enumerate([d1, d2]):
        ct = 0 
        all_labels = []
        all_preds = []
        for inputs, labels in dataset:
            ct+=1
            inputs = inputs.to(device).repeat(1, 3, 1, 1)
            labels = labels.to(device)
            with torch.set_grad_enabled(False):
                outputs = model(inputs)
                _, preds = torch.max(outputs, dim=1)
                all_preds.extend(preds.cpu())
                all_labels.extend(labels.cpu())
                #Pdb().set_trace() 
        all_preds_tensor = torch.tensor(all_preds)
        all_labels_tensor = torch.tensor(all_labels)
        acc, metrics = calculate_metrics_multiclass(all_labels_tensor, all_preds_tensor)
        print(f'Average Metrics over {sets[idx]} dataset:')
        print(f'Accuracy: {torch.mean(metrics[:, 0]):.4f}')
        print(f'Precision: {torch.mean(metrics[:, 1]):.4f}')
        print(f'Recall: {torch.mean(metrics[:, 2]):.4f}')
        print(f'F1 Score: {torch.mean(metrics[:, 3]):.4f}')

        for c in range(metrics.shape[0]):
            print(f'\nClass {c} Metrics')
            print(f'Accuracy: {metrics[c, 0]:.4f}')
            print(f'Precision: {metrics[c, 1]:.4f}')
            print(f'Recall: {metrics[c, 2]:.4f}')
            print(f'F1 Score: {metrics[c, 3]:.4f}')
        print()
        print()
    return all_preds_tensor

# DINO - ViT-b16

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.hub.load('facebookresearch/dino:main', 'dino_vits16').to(device)
torch.set_grad_enabled(False)
hidden = 384

X_data = torch.zeros((len(train_dataset), hidden), requires_grad=False)
b_data = torch.zeros((len(train_dataset), 3), requires_grad=False)

for k, (y, x) in enumerate(train_dataloader2):
    if(k % 250 == 0):
        print(k)
    X_data[k, :] = model(y.to(device).repeat(1, 3, 1, 1))
    xn = torch.zeros(1, 3).to(device)
    xn[:, x] = 1
    b_data[k, :] = xn

Using cache found in /kuacc/users/merdogan18/.cache/torch/hub/facebookresearch_dino_main


0
250
500
750
1000
1250
1500
1750
2000
2250
2500
2750
3000
3250
3500
3750
4000
4250
4500
4750
5000


In [153]:
column_means.shape

torch.Size([5216])

In [154]:
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
best_f1 = 0
best_model = None
best_q = None
q_values = [10, 50, 100, 150, 200, 384]  # List of q values to try

column_means = torch.mean(X_data, axis=1, keepdim=True)
X_zero_mean = X_data - column_means


for q in q_values:
    print(f"- number of pca features: {q}")
    fold_metrics = []

    for fold, (train_ids, val_ids) in enumerate(kf.split(X_zero_mean)):
        print(f'Fold {fold + 1} with PCA q={q}')
        
        X_train, X_val = X_zero_mean[train_ids], X_zero_mean[val_ids]
        b_train, b_val = b_data[train_ids], b_data[val_ids]
        U, E, V = torch.pca_lowrank(X_train, q=q, center=True, niter=5)
        X_train_pca = X_train @ V
        Wout = torch.linalg.pinv(X_train_pca) @ b_train
        Wout = Wout.to(device)

        all_labels_val = []
        all_preds_val = []
        V = V.to(device)
        for i in range(X_val.shape[0]):
            yn = X_val[i, :].to(device)
            result = yn @ V @ Wout
            pred = torch.argmax(result, dim=0).squeeze()
            all_preds_val.extend([pred.cpu()])
            all_labels_val.extend([torch.argmax(b_val[i,:]).cpu()])

        all_preds_val_tensor = torch.tensor(all_preds_val)
        all_labels_val_tensor = torch.tensor(all_labels_val)
        acc_val, class_metrics = calculate_metrics_multiclass(all_labels_val_tensor, all_preds_val_tensor)
        fold_metrics.append(class_metrics)
    
    fold_metrics_tensor = torch.stack(fold_metrics)
    mean_metrics = torch.mean(fold_metrics_tensor, dim=0)
    std_metrics = torch.std(fold_metrics_tensor, dim=0)

    print(f'Average Metrics over {num_folds} folds:')
    print(f'Accuracy: {torch.mean(mean_metrics[:, 0]):.4f} ± {torch.mean(std_metrics[:, 0]):.4f}')
    print(f'Precision: {torch.mean(mean_metrics[:, 1]):.4f} ± {torch.mean(std_metrics[:, 1]):.4f}')
    print(f'Recall: {torch.mean(mean_metrics[:, 2]):.4f} ± {torch.mean(std_metrics[:, 2]):.4f}')
    print(f'F1 Score: {torch.mean(mean_metrics[:, 3]):.4f} ± {torch.mean(std_metrics[:, 3]):.4f}')

    for c in range(mean_metrics.shape[0]):
        print(f'\nClass {c} Metrics over {num_folds} folds:')
        print(f'Accuracy: {mean_metrics[c, 0]:.4f} ± {std_metrics[c, 0]:.4f}')
        print(f'Precision: {mean_metrics[c, 1]:.4f} ± {std_metrics[c, 1]:.4f}')
        print(f'Recall: {mean_metrics[c, 2]:.4f} ± {std_metrics[c, 2]:.4f}')
        print(f'F1 Score: {mean_metrics[c, 3]:.4f} ± {std_metrics[c, 3]:.4f}')
    print("----------------------------------------------------")
    print()
    avg_f1 = torch.mean(mean_metrics[:, 3])
    if avg_f1 > best_f1:
        best_f1 = avg_f1
        best_q = q
        best_model = (V, Wout)

torch.set_grad_enabled(True)
print(f"Best PCA q: {best_q} with F1 score: {best_f1:0.4f}")

# The best model components are in best_model
V_best, Wout_best = best_model
torch.save((V_best, Wout_best), 'best_model.pth')

- number of pca features: 10
Fold 1 with PCA q=10
Fold 2 with PCA q=10
Fold 3 with PCA q=10
Fold 4 with PCA q=10
Fold 5 with PCA q=10
Average Metrics over 5 folds:
Accuracy: 0.8319 ± 0.0112
Precision: 0.7294 ± 0.0303
Recall: 0.7431 ± 0.0214
F1 Score: 0.7311 ± 0.0204

Class 0 Metrics over 5 folds:
Accuracy: 0.9170 ± 0.0123
Precision: 0.7776 ± 0.0269
Recall: 0.9487 ± 0.0117
F1 Score: 0.8545 ± 0.0195

Class 1 Metrics over 5 folds:
Accuracy: 0.7966 ± 0.0119
Precision: 0.6289 ± 0.0445
Recall: 0.5153 ± 0.0392
F1 Score: 0.5656 ± 0.0344

Class 2 Metrics over 5 folds:
Accuracy: 0.7822 ± 0.0094
Precision: 0.7816 ± 0.0195
Recall: 0.7652 ± 0.0134
F1 Score: 0.7731 ± 0.0074
----------------------------------------------------

- number of pca features: 50
Fold 1 with PCA q=50
Fold 2 with PCA q=50
Fold 3 with PCA q=50
Fold 4 with PCA q=50
Fold 5 with PCA q=50
Average Metrics over 5 folds:
Accuracy: 0.8645 ± 0.0077
Precision: 0.7889 ± 0.0133
Recall: 0.7815 ± 0.0152
F1 Score: 0.7793 ± 0.0087

Class 0 M

In [155]:
all_preds_tensor1 = evaluate_ELM_classifier(train_dataloader2, test_dataloader, X_zero_mean, V_best, Wout_best, model)

Average Metrics over train dataset:
Accuracy: 0.8961
Precision: 0.8422
Recall: 0.8340
F1 Score: 0.8359

Class 0 Metrics
Accuracy: 0.9783
Precision: 0.9348
Recall: 0.9843
F1 Score: 0.9590

Class 1 Metrics
Accuracy: 0.8549
Precision: 0.7602
Recall: 0.6387
F1 Score: 0.6941

Class 2 Metrics
Accuracy: 0.8551
Precision: 0.8317
Recall: 0.8791
F1 Score: 0.8547


Average Metrics over test dataset:
Accuracy: 0.9028
Precision: 0.8549
Recall: 0.8449
F1 Score: 0.8427

Class 0 Metrics
Accuracy: 0.9054
Precision: 0.9834
Recall: 0.7607
F1 Score: 0.8578

Class 1 Metrics
Accuracy: 0.8894
Precision: 0.7548
Recall: 0.7905
F1 Score: 0.7723

Class 2 Metrics
Accuracy: 0.9135
Precision: 0.8264
Recall: 0.9835
F1 Score: 0.8981


# ResNet-18

In [160]:
hidden = 512
model2 = models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
model2.fc = nn.Identity()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model2 = model2.to(device)

for param in model2.parameters():
    param.requires_grad = False
torch.set_grad_enabled(False)

X_data2 = torch.zeros((len(train_dataset), hidden), requires_grad=False)
b_data2 = torch.zeros((len(train_dataset), 3), requires_grad=False)

for k, (y, x) in enumerate(train_dataloader2):
    if k % 250 == 0:
        print(k)
    y = y.to(device).repeat(1, 3, 1, 1)
    X_data2[k, :] = model2(y)
    
    xn = torch.zeros(1, 3).to(device)
    xn[:, x] = 1
    b_data2[k, :] = xn

0
250
500
750
1000
1250
1500
1750
2000
2250
2500
2750
3000
3250
3500
3750
4000
4250
4500
4750
5000


In [168]:
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
best_f1 = 0
best_model = None
best_q = None
q_values = [10, 50, 100, 150, 200, 400, 512]  # List of q values to try
column_means2 = torch.mean(X_data2, axis=1, keepdim=True)
X2_zero_mean = X_data2 - column_means2

for q in q_values:
    print(f"- number of pca features: {q}")
    fold_metrics = []

    for fold, (train_ids, val_ids) in enumerate(kf.split(X2_zero_mean)):
        print(f'Fold {fold + 1} with PCA q={q}')
        
        X_train, X_val = X2_zero_mean[train_ids], X2_zero_mean[val_ids]
        b_train, b_val = b_data2[train_ids], b_data2[val_ids]
        U, E, V = torch.pca_lowrank(X_train, q=q, center=True, niter=5)
        X_train_pca = X_train @ V
        Wout = torch.linalg.pinv(X_train_pca) @ b_train
        Wout = Wout.to(device)

        all_labels_val = []
        all_preds_val = []
        V = V.to(device)
        for i in range(X_val.shape[0]):
            yn = X_val[i, :].to(device)
            result = yn @ V @ Wout
            pred = torch.argmax(result, dim=0).squeeze()
            all_preds_val.extend([pred.cpu()])
            all_labels_val.extend([torch.argmax(b_val[i,:]).cpu()])

        all_preds_val_tensor = torch.tensor(all_preds_val)
        all_labels_val_tensor = torch.tensor(all_labels_val)
        acc_val, class_metrics = calculate_metrics_multiclass(all_labels_val_tensor, all_preds_val_tensor)
        fold_metrics.append(class_metrics)
    
    fold_metrics_tensor = torch.stack(fold_metrics)
    mean_metrics = torch.mean(fold_metrics_tensor, dim=0)
    std_metrics = torch.std(fold_metrics_tensor, dim=0)

    print(f'Average Metrics over {num_folds} folds:')
    print(f'Accuracy: {torch.mean(mean_metrics[:, 0]):.4f} ± {torch.mean(std_metrics[:, 0]):.4f}')
    print(f'Precision: {torch.mean(mean_metrics[:, 1]):.4f} ± {torch.mean(std_metrics[:, 1]):.4f}')
    print(f'Recall: {torch.mean(mean_metrics[:, 2]):.4f} ± {torch.mean(std_metrics[:, 2]):.4f}')
    print(f'F1 Score: {torch.mean(mean_metrics[:, 3]):.4f} ± {torch.mean(std_metrics[:, 3]):.4f}')

    for c in range(mean_metrics.shape[0]):
        print(f'\nClass {c} Metrics over {num_folds} folds:')
        print(f'Accuracy: {mean_metrics[c, 0]:.4f} ± {std_metrics[c, 0]:.4f}')
        print(f'Precision: {mean_metrics[c, 1]:.4f} ± {std_metrics[c, 1]:.4f}')
        print(f'Recall: {mean_metrics[c, 2]:.4f} ± {std_metrics[c, 2]:.4f}')
        print(f'F1 Score: {mean_metrics[c, 3]:.4f} ± {std_metrics[c, 3]:.4f}')
    print("----------------------------------------------------")
    print()
    avg_f1 = torch.mean(mean_metrics[:, 3])
    if avg_f1 > best_f1:
        best_f1 = avg_f1
        best_q = q
        best_model = (V, Wout)

torch.set_grad_enabled(True)
print(f"Best PCA q: {best_q} with F1 score: {best_f1:0.4f}")

# The best model components are in best_model
V_best, Wout_best = best_model
torch.save((V_best, Wout_best), 'best_model.pth')

- number of pca features: 10
Fold 1 with PCA q=10
Fold 2 with PCA q=10
Fold 3 with PCA q=10
Fold 4 with PCA q=10
Fold 5 with PCA q=10
Average Metrics over 5 folds:
Accuracy: 0.7763 ± 0.0190
Precision: 0.6652 ± 0.0621
Recall: 0.6111 ± 0.0368
F1 Score: 0.5716 ± 0.0556

Class 0 Metrics over 5 folds:
Accuracy: 0.8499 ± 0.0197
Precision: 0.6654 ± 0.0518
Recall: 0.8382 ± 0.0127
F1 Score: 0.7411 ± 0.0356

Class 1 Metrics over 5 folds:
Accuracy: 0.7559 ± 0.0184
Precision: 0.6610 ± 0.1057
Recall: 0.1451 ± 0.0772
F1 Score: 0.2252 ± 0.1123

Class 2 Metrics over 5 folds:
Accuracy: 0.7232 ± 0.0190
Precision: 0.6692 ± 0.0289
Recall: 0.8500 ± 0.0205
F1 Score: 0.7484 ± 0.0189
----------------------------------------------------

- number of pca features: 50
Fold 1 with PCA q=50
Fold 2 with PCA q=50
Fold 3 with PCA q=50
Fold 4 with PCA q=50
Fold 5 with PCA q=50
Average Metrics over 5 folds:
Accuracy: 0.7947 ± 0.0107
Precision: 0.6844 ± 0.0253
Recall: 0.6479 ± 0.0189
F1 Score: 0.6361 ± 0.0145

Class 0 M

In [169]:
all_preds_tensor2 = evaluate_ELM_classifier(train_dataloader2, test_dataloader, X2_zero_mean, V_best, Wout_best, model2)

Average Metrics over train dataset:
Accuracy: 0.8099
Precision: 0.7068
Recall: 0.6756
F1 Score: 0.6685

Class 0 Metrics
Accuracy: 0.8934
Precision: 0.7587
Recall: 0.8583
F1 Score: 0.8055

Class 1 Metrics
Accuracy: 0.7812
Precision: 0.6560
Recall: 0.3190
F1 Score: 0.4292

Class 2 Metrics
Accuracy: 0.7552
Precision: 0.7057
Recall: 0.8494
F1 Score: 0.7709


Average Metrics over test dataset:
Accuracy: 0.7906
Precision: 0.7070
Recall: 0.6523
F1 Score: 0.6569

Class 0 Metrics
Accuracy: 0.8093
Precision: 0.8286
Recall: 0.6197
F1 Score: 0.7090

Class 1 Metrics
Accuracy: 0.8157
Precision: 0.6737
Recall: 0.4324
F1 Score: 0.5267

Class 2 Metrics
Accuracy: 0.7468
Precision: 0.6186
Recall: 0.9050
F1 Score: 0.7349


# ResNet-50

In [170]:
hidden = 2048
model3 = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
model3.fc = nn.Identity()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model3 = model3.to(device)

for param in model3.parameters():
    param.requires_grad = False
torch.set_grad_enabled(False)

X_data3 = torch.zeros((len(train_dataset), hidden), requires_grad=False)
b_data3 = torch.zeros((len(train_dataset), 3), requires_grad=False)

for k, (y, x) in enumerate(train_dataloader2):
    if k % 250 == 0:
        print(k)
    y = y.to(device).repeat(1, 3, 1, 1)
    X_data3[k, :] = model3(y)
    
    xn = torch.zeros(1, 3).to(device)
    xn[:, x] = 1
    b_data3[k, :] = xn

0
250
500
750
1000
1250
1500
1750
2000
2250
2500
2750
3000
3250
3500
3750
4000
4250
4500
4750
5000


In [171]:
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=448)
best_f1 = 0
best_model = None
best_q = None
q_values = [10, 50, 100, 500, 750, 1000, 1500, 2048]  # List of q values to try
column_means3 = torch.mean(X_data3, axis=1, keepdim=True)
X3_zero_mean = X_data3 - column_means3

for q in q_values:
    print(f"- number of pca features: {q}")
    fold_metrics = []

    for fold, (train_ids, val_ids) in enumerate(kf.split(X3_zero_mean)):
        print(f'Fold {fold + 1} with PCA q={q}')
        
        X_train, X_val = X3_zero_mean[train_ids], X3_zero_mean[val_ids]
        b_train, b_val = b_data3[train_ids], b_data3[val_ids]
        U, E, V = torch.pca_lowrank(X_train, q=q, center=True, niter=5)
        X_train_pca = X_train @ V
        Wout = torch.linalg.pinv(X_train_pca) @ b_train
        Wout = Wout.to(device)

        all_labels_val = []
        all_preds_val = []
        V = V.to(device)
        for i in range(X_val.shape[0]):
            yn = X_val[i, :].to(device)
            result = yn @ V @ Wout
            pred = torch.argmax(result, dim=0).squeeze()
            all_preds_val.extend([pred.cpu()])
            all_labels_val.extend([torch.argmax(b_val[i,:]).cpu()])

        all_preds_val_tensor = torch.tensor(all_preds_val)
        all_labels_val_tensor = torch.tensor(all_labels_val)
        acc_val, class_metrics = calculate_metrics_multiclass(all_labels_val_tensor, all_preds_val_tensor)
        fold_metrics.append(class_metrics)
    
    fold_metrics_tensor = torch.stack(fold_metrics)
    mean_metrics = torch.mean(fold_metrics_tensor, dim=0)
    std_metrics = torch.std(fold_metrics_tensor, dim=0)

    print(f'Average Metrics over {num_folds} folds:')
    print(f'Accuracy: {torch.mean(mean_metrics[:, 0]):.4f} ± {torch.mean(std_metrics[:, 0]):.4f}')
    print(f'Precision: {torch.mean(mean_metrics[:, 1]):.4f} ± {torch.mean(std_metrics[:, 1]):.4f}')
    print(f'Recall: {torch.mean(mean_metrics[:, 2]):.4f} ± {torch.mean(std_metrics[:, 2]):.4f}')
    print(f'F1 Score: {torch.mean(mean_metrics[:, 3]):.4f} ± {torch.mean(std_metrics[:, 3]):.4f}')

    for c in range(mean_metrics.shape[0]):
        print(f'\nClass {c} Metrics over {num_folds} folds:')
        print(f'Accuracy: {mean_metrics[c, 0]:.4f} ± {std_metrics[c, 0]:.4f}')
        print(f'Precision: {mean_metrics[c, 1]:.4f} ± {std_metrics[c, 1]:.4f}')
        print(f'Recall: {mean_metrics[c, 2]:.4f} ± {std_metrics[c, 2]:.4f}')
        print(f'F1 Score: {mean_metrics[c, 3]:.4f} ± {std_metrics[c, 3]:.4f}')
    print("----------------------------------------------------")
    print()
    avg_f1 = torch.mean(mean_metrics[:, 3])
    if avg_f1 > best_f1:
        best_f1 = avg_f1
        best_q = q
        best_model = (V, Wout)

torch.set_grad_enabled(True)
print(f"Best PCA q: {best_q} with F1 score: {best_f1:0.4f}")

# The best model components are in best_model
V_best, Wout_best = best_model
torch.save((V_best, Wout_best), 'best_model.pth')

- number of pca features: 10
Fold 1 with PCA q=10
Fold 2 with PCA q=10
Fold 3 with PCA q=10
Fold 4 with PCA q=10
Fold 5 with PCA q=10
Average Metrics over 5 folds:
Accuracy: 0.7887 ± 0.0190
Precision: 0.6765 ± 0.0432
Recall: 0.6391 ± 0.0310
F1 Score: 0.6173 ± 0.0336

Class 0 Metrics over 5 folds:
Accuracy: 0.8595 ± 0.0178
Precision: 0.6828 ± 0.0393
Recall: 0.8485 ± 0.0235
F1 Score: 0.7564 ± 0.0311

Class 1 Metrics over 5 folds:
Accuracy: 0.7697 ± 0.0199
Precision: 0.6577 ± 0.0570
Recall: 0.2330 ± 0.0466
F1 Score: 0.3409 ± 0.0484

Class 2 Metrics over 5 folds:
Accuracy: 0.7370 ± 0.0192
Precision: 0.6890 ± 0.0334
Recall: 0.8357 ± 0.0229
F1 Score: 0.7547 ± 0.0214
----------------------------------------------------

- number of pca features: 50
Fold 1 with PCA q=50
Fold 2 with PCA q=50
Fold 3 with PCA q=50
Fold 4 with PCA q=50
Fold 5 with PCA q=50
Average Metrics over 5 folds:
Accuracy: 0.8239 ± 0.0130
Precision: 0.7308 ± 0.0336
Recall: 0.7057 ± 0.0172
F1 Score: 0.7007 ± 0.0191

Class 0 M

In [172]:
all_preds_tensor3 = evaluate_ELM_classifier(train_dataloader2, test_dataloader, X3_zero_mean, V_best, Wout_best, model3)

Average Metrics over train dataset:
Accuracy: 0.8645
Precision: 0.7910
Recall: 0.7787
F1 Score: 0.7799

Class 0 Metrics
Accuracy: 0.9434
Precision: 0.8607
Recall: 0.9306
F1 Score: 0.8943

Class 1 Metrics
Accuracy: 0.8286
Precision: 0.7213
Recall: 0.5465
F1 Score: 0.6218

Class 2 Metrics
Accuracy: 0.8215
Precision: 0.7910
Recall: 0.8589
F1 Score: 0.8236


Average Metrics over test dataset:
Accuracy: 0.8184
Precision: 0.7409
Recall: 0.7159
F1 Score: 0.7137

Class 0 Metrics
Accuracy: 0.8189
Precision: 0.8854
Recall: 0.5940
F1 Score: 0.7110

Class 1 Metrics
Accuracy: 0.8349
Precision: 0.6531
Recall: 0.6486
F1 Score: 0.6508

Class 2 Metrics
Accuracy: 0.8013
Precision: 0.6844
Recall: 0.9050
F1 Score: 0.7794


# Evaluate saved models

In [179]:
with open("loggers/resnet_from_scratch/resnet_from_scratch.pkl", 'rb') as f:
    best_model = pickle.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_model = best_model.to(device)
best_model.eval()
all_preds_tensor1x = evaluate_model_x(train_dataloader2, test_dataloader, best_model)


Average Metrics over train dataset:
Accuracy: 0.9751
Precision: 0.9636
Recall: 0.9599
F1 Score: 0.9617

Class 0 Metrics
Accuracy: 0.9956
Precision: 0.9911
Recall: 0.9918
F1 Score: 0.9914

Class 1 Metrics
Accuracy: 0.9636
Precision: 0.9412
Recall: 0.9160
F1 Score: 0.9284

Class 2 Metrics
Accuracy: 0.9661
Precision: 0.9587
Recall: 0.9719
F1 Score: 0.9653


Average Metrics over test dataset:
Accuracy: 0.8120
Precision: 0.7471
Recall: 0.7120
F1 Score: 0.6974

Class 0 Metrics
Accuracy: 0.8157
Precision: 0.9837
Recall: 0.5171
F1 Score: 0.6779

Class 1 Metrics
Accuracy: 0.7612
Precision: 0.4976
Recall: 0.6892
F1 Score: 0.5779

Class 2 Metrics
Accuracy: 0.8590
Precision: 0.7601
Recall: 0.9298
F1 Score: 0.8364




In [174]:
with open("loggers/resnet_transferlearn/resnet_transferlearn.pkl", 'rb') as f:
    best_model = pickle.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_model = best_model.to(device)
best_model.eval()
all_preds_tensor2x = evaluate_model_x(train_dataloader2, test_dataloader, best_model)


Average Metrics over train dataset:
Accuracy: 0.9822
Precision: 0.9738
Recall: 0.9731
F1 Score: 0.9734

Class 0 Metrics
Accuracy: 0.9988
Precision: 0.9963
Recall: 0.9993
F1 Score: 0.9978

Class 1 Metrics
Accuracy: 0.9741
Precision: 0.9535
Recall: 0.9457
F1 Score: 0.9496

Class 2 Metrics
Accuracy: 0.9737
Precision: 0.9716
Recall: 0.9743
F1 Score: 0.9730


Average Metrics over test dataset:
Accuracy: 0.8376
Precision: 0.7756
Recall: 0.7465
F1 Score: 0.7354

Class 0 Metrics
Accuracy: 0.8365
Precision: 0.9925
Recall: 0.5684
F1 Score: 0.7228

Class 1 Metrics
Accuracy: 0.8061
Precision: 0.5754
Recall: 0.6959
F1 Score: 0.6300

Class 2 Metrics
Accuracy: 0.8702
Precision: 0.7588
Recall: 0.9752
F1 Score: 0.8535




In [175]:
with open("loggers/resnet_finetuning/resnet_finetuning.pkl", 'rb') as f:
    best_model = pickle.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_model = best_model.to(device)
best_model.eval()
all_preds_tensor3x = evaluate_model_x(train_dataloader2, test_dataloader, best_model)


Average Metrics over train dataset:
Accuracy: 0.8612
Precision: 0.7822
Recall: 0.7913
F1 Score: 0.7863

Class 0 Metrics
Accuracy: 0.9517
Precision: 0.8794
Recall: 0.9411
F1 Score: 0.9092

Class 1 Metrics
Accuracy: 0.8184
Precision: 0.6508
Recall: 0.6387
F1 Score: 0.6447

Class 2 Metrics
Accuracy: 0.8135
Precision: 0.8163
Recall: 0.7941
F1 Score: 0.8050


Average Metrics over test dataset:
Accuracy: 0.8600
Precision: 0.7937
Recall: 0.7935
F1 Score: 0.7782

Class 0 Metrics
Accuracy: 0.8397
Precision: 0.9241
Recall: 0.6239
F1 Score: 0.7449

Class 1 Metrics
Accuracy: 0.8413
Precision: 0.6244
Recall: 0.8311
F1 Score: 0.7130

Class 2 Metrics
Accuracy: 0.8990
Precision: 0.8327
Recall: 0.9256
F1 Score: 0.8767




In [176]:
with open("loggers/dino_finetuning/dino_finetuning.pkl", 'rb') as f:
    best_model = pickle.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_model = best_model.to(device)
best_model.eval()
all_preds_tensor4x = evaluate_model_x(train_dataloader2, test_dataloader, best_model)


Average Metrics over train dataset:
Accuracy: 0.8912
Precision: 0.8405
Recall: 0.8257
F1 Score: 0.8319

Class 0 Metrics
Accuracy: 0.9837
Precision: 0.9772
Recall: 0.9590
F1 Score: 0.9680

Class 1 Metrics
Accuracy: 0.8459
Precision: 0.7275
Recall: 0.6431
F1 Score: 0.6827

Class 2 Metrics
Accuracy: 0.8441
Precision: 0.8167
Recall: 0.8751
F1 Score: 0.8449


Average Metrics over test dataset:
Accuracy: 0.8429
Precision: 0.7868
Recall: 0.7653
F1 Score: 0.7469

Class 0 Metrics
Accuracy: 0.8205
Precision: 0.9766
Recall: 0.5342
F1 Score: 0.6906

Class 1 Metrics
Accuracy: 0.8333
Precision: 0.6158
Recall: 0.7905
F1 Score: 0.6923

Class 2 Metrics
Accuracy: 0.8750
Precision: 0.7680
Recall: 0.9711
F1 Score: 0.8577




# Calculating Pairwise McNemar-Bowker Test for 7 models

In [177]:
all_labels = []
for k, (y, x) in enumerate(test_dataloader):  
    all_labels.extend([x[0].cpu()])
all_labels_tensor = torch.tensor(all_labels)

In [140]:
import numpy as np
from scipy.stats import chi2
from sklearn.metrics import confusion_matrix

# Assuming you have the true labels and predictions from 7 classifiers
true_labels = all_labels_tensor.numpy()  # True labels of the test set
predictions = [all_preds_tensor1.numpy(),
               all_preds_tensor2.numpy(),
               all_preds_tensor3.numpy(),
               all_preds_tensor1x.numpy(),
               all_preds_tensor2x.numpy(),
               all_preds_tensor3x.numpy(),
               all_preds_tensor4x.numpy()]

# McNemar-Bowker test function
def mcnemar_bowker_test(conf_matrix):
    k = conf_matrix.shape[0]
    chi2_statistic = 0
    for i in range(k):
        for j in range(i + 1, k):
            chi2_statistic += (conf_matrix[i, j] - conf_matrix[j, i])**2 / (conf_matrix[i, j] + conf_matrix[j, i] + 1e-9)
    
    p_value = 1 - chi2.cdf(chi2_statistic, df=k * (k - 1) / 2)
    return {'chi2_statistic': chi2_statistic, 'p_value': p_value}

# Generate pairwise confusion matrices and perform McNemar-Bowker test
num_classifiers = len(predictions)
results = {}

for i in range(num_classifiers):
    for j in range(i + 1, num_classifiers):
        pairwise_matrix = np.zeros((3, 3)) 
        for true_label, pred_i, pred_j in zip(true_labels, predictions[i], predictions[j]):
            pairwise_matrix[pred_i, pred_j] += 1  
        test_result = mcnemar_bowker_test(pairwise_matrix)
        results[(i + 1, j + 1)] = test_result

# Print the results
for pair, result in results.items():
    print(f"Comparison between Classifier {pair[0]} and Classifier {pair[1]}: Chi2 Statistic = {result['chi2_statistic']:.3f}, P-value = {result['p_value']:.3f}")

Comparison between Classifier 1 and Classifier 2: Chi2 Statistic = 20.418, P-value = 0.000
Comparison between Classifier 1 and Classifier 3: Chi2 Statistic = 10.746, P-value = 0.013
Comparison between Classifier 1 and Classifier 4: Chi2 Statistic = 53.319, P-value = 0.000
Comparison between Classifier 1 and Classifier 5: Chi2 Statistic = 41.503, P-value = 0.000
Comparison between Classifier 1 and Classifier 6: Chi2 Statistic = 30.145, P-value = 0.000
Comparison between Classifier 1 and Classifier 7: Chi2 Statistic = 48.464, P-value = 0.000
Comparison between Classifier 2 and Classifier 3: Chi2 Statistic = 6.710, P-value = 0.082
Comparison between Classifier 2 and Classifier 4: Chi2 Statistic = 44.416, P-value = 0.000
Comparison between Classifier 2 and Classifier 5: Chi2 Statistic = 29.108, P-value = 0.000
Comparison between Classifier 2 and Classifier 6: Chi2 Statistic = 48.372, P-value = 0.000
Comparison between Classifier 2 and Classifier 7: Chi2 Statistic = 35.427, P-value = 0.000
