In [1]:
import os
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from sklearn.naive_bayes import GaussianNB

In [2]:
# Configs
data_root = "./data/oxml-carinoma-classification"  # This data_root should contain the images in .png format and the label.csv
submission_file = "./submission.csv"
label_path = f"{data_root}/labels.csv"
seed = 1

In [3]:
class OxMLDataset(Dataset):
    def __init__(self, img_folder, dataset, mode="test", transform=None):
        self.img_folder = img_folder
        self.transform = transform
        self.dataset = dataset
        self.mode = mode

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Create img path
        if self.mode == "test":
            img_path = os.path.join(
                self.img_folder, f"img_{self.dataset.id.iloc[idx]}.png"
            )
        else:
            img_path = os.path.join(
                self.img_folder,
                f"img_{self.dataset.id.iloc[idx]}_{self.dataset.aug.iloc[idx]}.png",
            )

        img = Image.open(img_path)
        if self.transform:
            img = self.transform(img)

        label = self.dataset.malignant.iloc[idx] + 1
        id = self.dataset.id.iloc[idx]

        return img, label, id

    def __len__(self):
        return self.dataset.shape[0]

In [4]:
all_files = os.listdir(data_root)
train_ids_labels = pd.read_csv(label_path)

test_ids = []
for i in all_files:
    try:
        if i.split(".")[0].split("_")[1].isnumeric():
            number_patient = int(i.split(".")[0].split("_")[1])
            if number_patient not in list(train_ids_labels.id):
                test_ids.append([number_patient, -2])
    except:
        continue
test_ids = pd.DataFrame(test_ids, columns=["id", "malignant"])

In [5]:
torch.manual_seed(seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

train_ids_labels = pd.read_csv(label_path)

transform_list = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.7855, 0.6791, 0.8600), (0.2116, 0.2516, 0.1184)),
    ]
)

dataset = {
    "train": OxMLDataset(
        data_root, dataset=train_ids_labels, transform=transform_list, mode="test"
    ),
    "test": OxMLDataset(data_root, test_ids, transform=transform_list, mode="test"),
}

dataloader = {
    x: DataLoader(dataset[x], batch_size=1, shuffle=False) for x in ["train", "test"]
}

model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
model = torch.nn.Sequential(*list(model.children())[:-1], nn.AdaptiveAvgPool2d(1))
model.to(device)

# Get the features from the model
labels, features, ids = {}, {}, {}
for mode in ["train", "test"]:
    labels[mode] = torch.Tensor().to(device)
    features[mode] = torch.Tensor().to(device)
    ids[mode] = []

    for data in dataloader[mode]:
        input, label, id = data[0].to(device), data[1].to(device), data[2].to(device)
        with torch.no_grad():
            outputs = model(input)
        labels[mode] = torch.cat((labels[mode], label))
        features[mode] = torch.cat((features[mode], outputs))
        ids[mode].append(id.item())
    features[mode] = features[mode].squeeze()

clf = GaussianNB()
clf.fit(features["train"].cpu(), labels["train"].cpu())
# Make them compatible with the submission protocol by substr -1
preds = clf.predict(features["test"].cpu()).astype(int) - 1
out_df = pd.DataFrame({"id": ids["test"], "malignant": preds})
out_df.to_csv(submission_file, index=False)

Using cuda device


In [6]:
out_df

Unnamed: 0,id,malignant
0,103601,-1
1,105480,-1
2,118847,-1
3,125877,-1
4,133778,-1
...,...,...
119,968389,-1
120,97549,-1
121,976505,-1
122,996288,-1
