<a href="https://colab.research.google.com/github/mayarali/carcinoma_classification/blob/fatih/OxML_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from pprint import pprint


In [2]:
# Configs: dataset location, derived file paths and the global random seed.
data_root = '/esat/smcdata/users/kkontras/Image_Dataset/no_backup/OxML/'
# Both files live directly inside data_root.
label_path = os.path.join(data_root, 'labels.csv')
submission_file = os.path.join(data_root, 'submission.csv')
seed = 1

In [3]:

class OxML_Dataset(Dataset):
    """Map-style dataset that loads PNG images listed in a dataframe.

    The dataframe must expose an ``id`` and a ``malignant`` column; an ``aug``
    column is additionally read whenever ``mode`` is anything other than
    ``'test'``.
    """

    def __init__(self, img_folder, dataset, mode='test', transform=None):
        """Store the configuration; no file is touched until indexing.

        Args:
            img_folder: Directory containing the ``img_*.png`` files.
            dataset: Dataframe with one row per sample.
            mode: ``'test'`` selects ``img_<id>.png``; any other value selects
                ``img_<id>_<aug>.png``.
            transform: Optional callable applied to the opened PIL image.
        """
        self.img_folder = img_folder
        self.dataset = dataset
        self.mode = mode
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the backing dataframe."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(image, label, sample_id)`` for the row at position ``idx``.

        The label is the row's ``malignant`` value shifted by +1.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample_id = self.dataset.id.iloc[idx]

        # Build the file name according to the naming scheme of the mode.
        if self.mode == 'test':
            file_name = f"img_{sample_id}.png"
        else:
            file_name = f"img_{sample_id}_{self.dataset.aug.iloc[idx]}.png"

        img = Image.open(os.path.join(self.img_folder, file_name))
        if self.transform:
            img = self.transform(img)

        label = self.dataset.malignant.iloc[idx] + 1
        return img, label, sample_id


In [4]:
# Pretrained torchvision backbones, keyed by the short name used in the results.
# Every network is instantiated eagerly, in the order listed below, from a
# (key, constructor, weights argument) table.
model_dict = {
    name: build(weights=weights)
    for name, build, weights in (
        ('resnet18', models.resnet18, models.ResNet18_Weights.DEFAULT),
        ('resnet50', models.resnet50, models.ResNet50_Weights.DEFAULT),
        ('vgg16', models.vgg16_bn, models.VGG16_BN_Weights.DEFAULT),
        ('mobilenet', models.mobilenet_v2, 'DEFAULT'),
        ('googlenet', models.googlenet, 'DEFAULT'),
        ('densenet', models.densenet161, 'DEFAULT'),
        ('shufflenet', models.shufflenet_v2_x1_0, 'DEFAULT'),
        ('efficientnet', models.efficientnet_v2_s, 'DEFAULT'),
    )
}
# Classical classifiers fitted on the extracted deep features, keyed by the
# name used in the results table. The randomized estimators receive the global
# `seed` so that repeated runs of the notebook produce the same scores.
clf_dict = {
    'knn': KNeighborsClassifier(n_neighbors=5),
    'random forest': RandomForestClassifier(random_state=seed),
    'svm': SVC(),
    'naive bayes': GaussianNB(),
    'adaboost': AdaBoostClassifier(random_state=seed),
    # NOTE: despite the key, this is scikit-learn's GradientBoostingClassifier,
    # not the XGBoost library.
    'xgboost': GradientBoostingClassifier(random_state=seed),
}


In [None]:

from collections import defaultdict

# Benchmark every (backbone, classifier) pair: extract features from the
# labelled images with each pretrained backbone, then score each classifier on
# those features with stratified k-fold cross-validation (micro-averaged F1).
torch.manual_seed(seed)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): the constants look like per-channel mean/std of this dataset;
# confirm where they were computed.
transform_list = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.7855, 0.6791, 0.8600),
                         (0.2116, 0.2516, 0.1184)),
])

train_ids_labels = pd.read_csv(label_path)
dataset = OxML_Dataset(img_folder=data_root, dataset=train_ids_labels, transform=transform_list)

# One image per batch; the transform does no resizing, so presumably the
# images are not guaranteed to share a size (TODO confirm before batching).
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

n_splits = 5
results = defaultdict(dict)

for model_name in model_dict:

    # Drop the last child module (the classification head) and pool whatever
    # remains down to one value per channel.
    model = model_dict[model_name]
    model = torch.nn.Sequential(*list(model.children())[:-1], nn.AdaptiveAvgPool2d(1))
    model.to(device)
    # Inference mode: without this, BatchNorm normalises with per-batch
    # statistics (and updates its running stats) and Dropout stays active, so
    # the extracted features are noisy and not reproducible.
    model.eval()

    label_batches, feature_batches, id_batches = [], [], []
    with torch.no_grad():
        for images, batch_labels, batch_ids in dataloader:
            outputs = model(images.to(device))
            # flatten(1) keeps the batch axis even for a single sample,
            # unlike squeeze().
            feature_batches.append(outputs.flatten(1).cpu())
            label_batches.append(batch_labels)
            id_batches.append(batch_ids)

    # Concatenate once instead of growing a tensor inside the loop.
    labels = torch.cat(label_batches)
    features = torch.cat(feature_batches)
    ids = torch.cat(id_batches)

    features_np = features.numpy()
    labels_np = labels.numpy()

    for clf_name in clf_dict:
        skf = StratifiedKFold(n_splits=n_splits)
        f1_total = 0
        for train_index, val_index in skf.split(features_np, labels_np):
            # fit() re-trains the shared estimator from scratch on every fold.
            clf = clf_dict[clf_name]
            clf.fit(features_np[train_index], labels_np[train_index])
            preds = clf.predict(features_np[val_index])
            f1_total += f1_score(labels_np[val_index], preds, average='micro')
        # Mean F1 over the folds.
        results[model_name][clf_name] = round(f1_total / n_splits, 4)
pprint(dict(results))

In [None]:
# Collect the ids of all images in data_root that have no entry in the label
# file; these form the test set. Each gets the placeholder label -2.
all_files = os.listdir(data_root)
train_ids_labels = pd.read_csv(label_path)

# Build the lookup once instead of converting the column for every file.
train_ids = set(train_ids_labels.id.tolist())

test_ids = []
for file_name in all_files:
    # Expected shape: "<prefix>_<id>[_...].<ext>"; anything else is skipped.
    parts = file_name.split(".")[0].split("_")
    if len(parts) < 2 or not parts[1].isnumeric():
        continue
    try:
        number_patient = int(parts[1])
    except ValueError:
        # isnumeric() accepts some characters int() cannot parse.
        continue
    if number_patient not in train_ids:
        test_ids.append([number_patient, -2])
test_ids = pd.DataFrame(test_ids, columns=['id', "malignant"])

In [None]:

# Final pipeline: extract features with the chosen backbone for the labelled
# and the unlabelled images, fit the chosen classifier on the labelled ones and
# write the predictions for the unlabelled ones to the submission file.
model_name = 'resnet50'
clf_name = 'naive bayes'
torch.manual_seed(seed)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_ids_labels = pd.read_csv(label_path)

# NOTE(review): the constants look like per-channel mean/std of this dataset;
# confirm where they were computed.
transform_list = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.7855, 0.6791, 0.8600),
                         (0.2116, 0.2516, 0.1184)),
])

# Both splits use mode='test', i.e. the plain "img_<id>.png" files.
dataset = {
    'train': OxML_Dataset(data_root, dataset=train_ids_labels, transform=transform_list, mode='test'),
    'test': OxML_Dataset(data_root, test_ids, transform=transform_list, mode='test'),
}

dataloader = {x: DataLoader(dataset[x], batch_size=1, shuffle=False) for x in ['train', 'test']}

# Drop the last child module (the classification head) and pool whatever
# remains down to one value per channel.
model = model_dict[model_name]
model = torch.nn.Sequential(*list(model.children())[:-1], nn.AdaptiveAvgPool2d(1))
model.to(device)
# Inference mode: without this, BatchNorm normalises with per-batch statistics
# (and updates its running stats) and Dropout stays active, so train and test
# features would be noisy and not reproducible.
model.eval()

# Get the features from the model
labels, features, ids = {}, {}, {}
with torch.no_grad():
    for mode in ['train', 'test']:
        label_batches, feature_batches = [], []
        ids[mode] = []

        for images, batch_labels, batch_ids in dataloader[mode]:
            outputs = model(images.to(device))
            # flatten(1) keeps the batch axis even when a split holds a single
            # image, unlike squeeze().
            feature_batches.append(outputs.flatten(1).cpu())
            label_batches.append(batch_labels)
            ids[mode].extend(batch_ids.tolist())

        # Concatenate once instead of growing a tensor inside the loop.
        labels[mode] = torch.cat(label_batches)
        features[mode] = torch.cat(feature_batches)

clf = clf_dict[clf_name]
clf.fit(features['train'].numpy(), labels['train'].numpy())
# The dataset shifts labels by +1; undo that to make them compatible with the
# submission protocol.
preds = clf.predict(features['test'].numpy()).astype(int) - 1
out_df = pd.DataFrame({'id': ids['test'], 'malignant': preds})
out_df.to_csv(submission_file, index=False)


In [None]:
# Bare expression: in a notebook cell this displays the submission dataframe.
out_df