<a href="https://colab.research.google.com/github/mayarali/carcinoma_classification/blob/fatih/OxML_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import csv
import copy
import pandas as pd
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import io, models, transforms
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:

class CustomDataset(Dataset):
    """Image dataset driven by a DataFrame of sample ids and labels.

    Every row of ``df`` is one sample. The ``id`` column (and, outside the
    ``'test'`` phase, the ``aug`` column) determines which PNG file inside
    ``img_folder`` is opened; the ``malignant`` column provides the label.
    """

    def __init__(self, img_folder, df, phase='test', transform=None):
        """Remember the configuration; no file is opened here.

        Args:
            img_folder: Directory holding the ``img_*.png`` files.
            df: DataFrame with ``id`` and ``malignant`` columns, plus an
                ``aug`` column when ``phase`` is not ``'test'``.
            phase: ``'test'`` loads ``img_<id>.png``; any other value loads
                ``img_<id>_<aug>.png``.
            transform: Optional callable applied to the opened PIL image.
        """
        self.df = df
        self.phase = phase
        self.img_folder = img_folder
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx: Positional row index (a tensor index is converted first).

        Returns:
            Tuple ``(img, label, pid)``: the (optionally transformed) image,
            the ``malignant`` value plus one, and the row's ``id``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        pid = self.df.id.iloc[idx]
        if self.phase == 'test':
            fname = f"img_{pid}.png"
        else:
            # Non-test phases address one specific augmented copy of the image.
            fname = f"img_{pid}_{self.df.aug.iloc[idx]}.png"
        img = Image.open(os.path.join(self.img_folder, fname))

        # Labels are stored shifted by +1.
        # NOTE(review): presumably the raw values start at -1 -- confirm.
        label = self.df.malignant.iloc[idx] + 1

        if self.transform:
            img = self.transform(img)
        return img, label, pid

class AdaptiveMarginLoss:
    """Triplet hinge loss whose margin is supplied on every call."""

    def __call__(self, a, p, n, m):
        """Return the mean of ``max(d(a, p) - d(a, n) + m, 0)``.

        Args:
            a: Anchor embeddings.
            p: Positive embeddings, compared against the anchors.
            n: Negative embeddings, compared against the anchors.
            m: Margin added to the distance difference.
                NOTE(review): presumably a scalar or a per-sample tensor
                broadcastable to the distances -- confirm with callers.

        Returns:
            Scalar tensor: the hinge term averaged over all elements.
        """
        l2 = nn.PairwiseDistance(p=2)
        violation = l2(a, p) - l2(a, n) + m
        # Keep only positive violations; everything else contributes zero.
        hinge = torch.where(violation > 0, violation, torch.zeros_like(violation))
        return hinge.mean()

In [None]:
# torchvision constructors for every backbone that is evaluated, keyed by the
# short name used in printed output and in the results file.
_backbone_builders = {
    'resnet18': models.resnet18,
    'resnet50': models.resnet50,
    'vgg16': models.vgg16_bn,
    'mobilenet': models.mobilenet_v2,
    'googlenet': models.googlenet,
    'densenet': models.densenet161,
    'shufflenet': models.shufflenet_v2_x1_0,
    'efficientnet': models.efficientnet_v2_s,
}

# Instantiate each backbone with its default pretrained weights. Building the
# models here is what triggers the checkpoint downloads.
model_dict = {
    name: build(weights='DEFAULT')
    for name, build in _backbone_builders.items()
}

# scikit-learn classifiers fitted on the extracted features.
# Note: the 'xgboost' key maps to sklearn's GradientBoostingClassifier,
# not to the XGBoost library.
clf_dict = {
    'knn': KNeighborsClassifier(n_neighbors=5),
    'random forest': RandomForestClassifier(),
    'svm': SVC(),
    'naive bayes': GaussianNB(),
    'adaboost': AdaBoostClassifier(),
    'xgboost': GradientBoostingClassifier(),
}

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 194MB/s]
Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 118MB/s] 
Downloading: "https://download.pytorch.org/models/vgg16_bn-6c64b313.pth" to /root/.cache/torch/hub/checkpoints/vgg16_bn-6c64b313.pth
100%|██████████| 528M/528M [00:05<00:00, 108MB/s] 
Downloading: "https://download.pytorch.org/models/mobilenet_v2-7ebf99e0.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-7ebf99e0.pth
100%|██████████| 13.6M/13.6M [00:00<00:00, 108MB/s]
Downloading: "https://download.pytorch.org/models/googlenet-1378be20.pth" to /root/.cache/torch/hub/checkpoints/googlenet-1378be20.pth
100%|██████████| 49.7M/49.7M [00:00<00:00, 206MB/s]
Downloading: "https://download.pytorch.org/models/densenet161-8d4

In [None]:
# Benchmark every (backbone, classifier) pair: extract pooled features with
# each backbone in ``model_dict``, score each classifier in ``clf_dict`` with
# stratified k-fold cross-validation, and append the mean micro-F1 to a CSV.

# Initial values only; both names are rebound by the loops below.
model_name = 'resnet18'
clf_name = 'knn'
torch.manual_seed(1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
root = '/content/drive/MyDrive/OxML/MLx Cases/data/'
df = pd.read_csv('/content/drive/MyDrive/OxML/MLx Cases/data/labels.csv')
resultpath = '/content/drive/MyDrive/OxML/MLx Cases/results.csv'
# Single source of truth for the fold count (used for splitting and averaging).
n_splits = 5

# Per-channel mean / std passed to Normalize.
# NOTE(review): presumably statistics of this image set -- confirm.
transform_list = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.7855, 0.6791, 0.8600),
                         (0.2116, 0.2516, 0.1184)),
])
dataset = CustomDataset(root, df, transform=transform_list)

dataloader = DataLoader(dataset, batch_size=1, shuffle=False)


for model_name in model_dict:
    print(model_name)
    model = model_dict[model_name]
    # Drop the backbone's last child module and append global average pooling
    # so every architecture yields an (N, C, 1, 1) feature map.
    model = torch.nn.Sequential(*list(model.children())[:-1], nn.AdaptiveAvgPool2d(1))
    model.to(device)
    model.eval()

    # Collect per-batch results and concatenate once; calling torch.cat inside
    # the loop re-copies everything accumulated so far on every iteration.
    label_chunks, feature_chunks, id_chunks = [], [], []
    for images, label, pid in dataloader:
        with torch.no_grad():
            outputs = model(images.to(device))
        label_chunks.append(label)
        feature_chunks.append(outputs)
        id_chunks.append(pid)
    labels = torch.cat(label_chunks)
    features = torch.cat(feature_chunks).squeeze()
    ids = torch.cat(id_chunks)

    # scikit-learn works on host arrays; convert once instead of once per fold.
    X = features.cpu().numpy()
    y = labels.cpu().numpy()

    for clf_name in clf_dict:
        print(clf_name)

        skf = StratifiedKFold(n_splits=n_splits)
        f1 = 0
        for train_index, val_index in skf.split(X, y):
            # fit() re-trains the shared estimator from scratch on each fold.
            clf = clf_dict[clf_name]
            clf.fit(X[train_index], y[train_index])
            preds = clf.predict(X[val_index])
            f1 += f1_score(y[val_index], preds, average='micro')
        mean_f1 = f1 / n_splits
        print('F1:', mean_f1)

        # Decide about the header before open(..., 'a') creates the file.
        write_header = (not os.path.isfile(resultpath)
                        or os.path.getsize(resultpath) == 0)
        with open(resultpath, 'a', newline='') as csvfile:
            fieldnames = ['model', 'classifier', 'f1']
            # The writer must exist before writeheader() is called; the
            # previous order raised NameError when the file was missing.
            logger = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if write_header:
                logger.writeheader()
            logger.writerow({'model': model_name,
                             'classifier': clf_name,
                             'f1': mean_f1})



resnet18
knn
F1: 0.2576923076923077
random forest
F1: 0.5641025641025641
svm
F1: 0.5807692307692308
naive bayes
F1: 0.5974358974358974
adaboost
F1: 0.4346153846153847
xgboost
F1: 0.5153846153846153
resnet50
knn
F1: 0.30256410256410254
random forest
F1: 0.5641025641025641
svm
F1: 0.5807692307692308
naive bayes
F1: 0.614102564102564
adaboost
F1: 0.5025641025641026
xgboost
F1: 0.46794871794871795
vgg16
knn
F1: 0.3076923076923077
random forest
F1: 0.5346153846153846
svm
F1: 0.5807692307692308
naive bayes
F1: 0.40384615384615385
adaboost
F1: 0.44743589743589746
xgboost
F1: 0.49871794871794883
mobilenet
knn
F1: 0.2435897435897436
random forest
F1: 0.5807692307692308
svm
F1: 0.5807692307692308
naive bayes
F1: 0.5807692307692308
adaboost
F1: 0.4525641025641026
xgboost
F1: 0.4346153846153847
googlenet
knn
F1: 0.4987179487179487
random forest
F1: 0.5807692307692308
svm
F1: 0.5807692307692308
naive bayes
F1: 0.5807692307692308
adaboost
F1: 0.4705128205128205
xgboost
F1: 0.4551282051282051
densene

In [None]:
# Build the submission: extract features for the labelled and the test images
# with one backbone, fit one classifier on the labelled features, and write
# the test predictions to ``resultpath``.
model_name = 'resnet50'
clf_name = 'naive bayes'
torch.manual_seed(1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
root = '/content/drive/MyDrive/OxML/MLx Cases/data/'
train_df = pd.read_csv('/content/drive/MyDrive/OxML/MLx Cases/data/labels.csv')
test_df = pd.read_csv('/content/drive/MyDrive/OxML/MLx Cases/data/test.csv')
resultpath = '/content/drive/MyDrive/OxML/MLx Cases/submission.csv'

# Per-channel mean / std passed to Normalize.
# NOTE(review): presumably statistics of this image set -- confirm.
transform_list = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.7855, 0.6791, 0.8600),
                         (0.2116, 0.2516, 0.1184)),
])

# Both splits use phase='test', i.e. the plain ``img_<id>.png`` files.
dataset = {'train': CustomDataset(root, train_df, transform=transform_list, phase='test'),
           'test': CustomDataset(root, test_df, transform=transform_list, phase='test')}

dataloader = {x: DataLoader(dataset[x], batch_size=1, shuffle=False) for x in ['train', 'test']}

model = model_dict[model_name]
# Drop the backbone's last child module and append global average pooling so
# the output is an (N, C, 1, 1) feature map.
model = torch.nn.Sequential(*list(model.children())[:-1], nn.AdaptiveAvgPool2d(1))
model.to(device)
model.eval()

labels, features, ids = {}, {}, {}
for phase in ['train', 'test']:
    # Collect per-batch results and concatenate once after the loop.
    label_chunks, feature_chunks = [], []
    ids[phase] = []

    for images, label, pid in dataloader[phase]:
        with torch.no_grad():
            outputs = model(images.to(device))
        label_chunks.append(label)
        feature_chunks.append(outputs)
        # batch_size is 1, so each id tensor holds exactly one element.
        ids[phase].append(pid.item())
    labels[phase] = torch.cat(label_chunks)
    # flatten(start_dim=1) keeps the sample axis even when a split contains a
    # single image; squeeze() would collapse it to a 1-D vector.
    features[phase] = torch.cat(feature_chunks).flatten(start_dim=1)

clf = clf_dict[clf_name]
clf.fit(features['train'].cpu(), labels['train'].cpu())
# The dataset shifts labels by +1; undo that shift for the submission file.
preds = clf.predict(features['test'].cpu()).astype(int) - 1
out_df = pd.DataFrame({'id': ids['test'], 'malignant': preds})
# Write to the configured path; the previous code ignored ``resultpath`` and
# wrote 'submission.csv' into the current working directory.
out_df.to_csv(resultpath, index=False)

