# LOO (leave-one-out over pathological findings)

## import libraries

In [1]:
import os
import pickle
import random
from glob import glob
from time import sleep

import numpy as np
import pandas as pd
import timm
import torch
import torch.nn as nn
import torch.optim as optim
from albumentations import (CenterCrop, Compose, HorizontalFlip, Normalize,
                            RandomCrop, VerticalFlip)
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader, RandomSampler
from tqdm import tqdm

from clmodel.dataset import CLModelDataset
from clmodel.evaluate import Metrics, macro_auroc, macro_balanced_accuracy
#from clmodel.model import Frozen
from clmodel.train import train_loop
from clmodel.utils import AverageValue, Logger, fix_seed

## set seeds

In [2]:
# Notebook-wide seed. `fix_seed` is the project helper imported from
# clmodel.utils; presumably it seeds the Python/NumPy/torch RNGs — confirm there.
# `seed` is reused below when the DataLoader generator is created.
seed = 124
fix_seed(seed)


def seed_worker(worker_id):
    """Seed NumPy and ``random`` from torch's initial seed, reduced to 32 bits.

    Used as ``worker_init_fn`` for the training-cache DataLoader in this
    notebook. ``worker_id`` is accepted to satisfy that callback signature
    but is not used.
    """
    seed_32bit = torch.initial_seed() % (1 << 32)
    random.seed(seed_32bit)
    np.random.seed(seed_32bit)


# Dedicated torch generator seeded with the notebook seed; handed to the
# training-cache DataLoader via its `generator` argument.
# `manual_seed` returns the generator itself, so it can be chained.
g = torch.Generator().manual_seed(seed)
g


<torch._C.Generator at 0x7f0e46da1eb0>

## load data

In [3]:
# Table of image paths, fold assignment and per-finding scores used below.
train_val_csv = "../../data/TGGATEs/processed/train_val_for_model_training.csv"
df = pd.read_csv(train_val_csv)


In [4]:
# Rewrite the storage prefix in the image paths from "HDD" to "extHDD1".
# The negative lookbehind skips any "HDD" that is already preceded by "ext",
# so re-running this cell leaves the paths unchanged instead of compounding
# the substitution ("extHDD1" -> "extextHDD11").
df["path"] = df["path"].str.replace(r"(?<!ext)HDD", "extHDD1", regex=True)

In [5]:
# Names of the pathological-finding columns: positions 3..10 of the table.
ft_list = df.columns[3:11].tolist()
ft_list


['Proliferation, bile duct',
 'Ground glass appearance',
 'Increased mitosis',
 'Inclusion body, intracytoplasmic',
 'Deposit, pigment',
 'Single cell necrosis',
 'Vacuolization, cytoplasmic',
 'Swelling']

## image preprocessing

In [6]:
image_size = 512

# No cropping here: the cache-building cells in this notebook pass
# RandomCrop / CenterCrop(image_size, image_size) to the dataset instead.
train_steps = [
    HorizontalFlip(p=0.5),
    VerticalFlip(p=0.5),
    Normalize(),
    ToTensorV2(),
]
valid_steps = [Normalize(), ToTensorV2()]

# Training: random flips, then normalisation and tensor conversion.
tr_transform = Compose(train_steps)
# Validation: normalisation and tensor conversion only.
vl_transform = Compose(valid_steps)


## create image_dict

In [7]:
# Number of passes over the random-crop loader that are cached below.
# (The training cells assign their own n_epochs before using it.)
n_epochs = 5

# Fold 2 is held out for validation; every other fold is used for training.
tr = df[df["fold"] != 2]
vl = df[df["fold"] == 2]

# path -> array of finding scores, in ft_list order.
ft_dict = {v[0]: v[1:] for v in df[["path"] + ft_list].to_numpy()}

train_dataset = CLModelDataset(
    tr["path"].values, None, transform=RandomCrop(image_size, image_size)
)
# Each pass draws len(train_dataset) // 10 images with replacement.
train_loader = DataLoader(
    train_dataset,
    num_workers=4,
    batch_size=16,
    sampler=RandomSampler(
        train_dataset, num_samples=len(train_dataset) // 10, replacement=True
    ),
    pin_memory=True,
    drop_last=True,
    worker_init_fn=seed_worker,
    generator=g,
)

# Cache every sampled crop in memory under a synthetic key and record its
# labels. `count` keeps running so the validation cell can continue the keys.
new_tr_rows = []
image_dict = {}
count = 0
for epoch in range(n_epochs):
    for x, pathes in tqdm(train_loader):
        # Iterate over the actual batch contents rather than a hard-coded 16,
        # so this loop cannot drift out of sync with the loader's batch_size.
        for image, path in zip(x, pathes):
            key = f"dummy/dummy/{count}"
            image_dict[key] = image.numpy()
            new_tr_rows.append([key] + list(ft_dict[path]))
            count += 1
new_tr = pd.DataFrame(new_tr_rows, columns=["path"] + ft_list)


  0%|          | 1/321 [00:20<1:49:13, 20.48s/it]

In [None]:
# Cache a centre crop of every validation image, continuing the synthetic
# key numbering (`count`) where the training cache stopped.
valid_dataset = CLModelDataset(
    vl["path"].values, None, transform=CenterCrop(image_size, image_size)
)
valid_loader = DataLoader(
    valid_dataset,
    num_workers=4,
    batch_size=32,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

valid_rows = []
for x, pathes in tqdm(valid_loader):
    for image, path in zip(x, pathes):
        key = f"dummy/dummy/{count}"
        image_dict[key] = image.numpy()
        valid_rows.append([key] + list(ft_dict[path]))
        count += 1
new_vl = pd.DataFrame(valid_rows, columns=["path"] + ft_list)


100%|██████████| 536/536 [1:19:13<00:00,  8.87s/it]


In [None]:
# Display the cached validation table: synthetic image keys plus one score
# column per finding.
new_vl

Unnamed: 0,path,"Proliferation, bile duct",Ground glass appearance,Increased mitosis,"Inclusion body, intracytoplasmic","Deposit, pigment",Single cell necrosis,"Vacuolization, cytoplasmic",Swelling
0,dummy/dummy/25680,0.000000,0.999631,0.000000,0.0,0.0,0.0,0.0,0.000000
1,dummy/dummy/25681,0.000000,0.999841,0.000000,0.0,0.0,0.0,0.0,0.000000
2,dummy/dummy/25682,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
3,dummy/dummy/25683,0.999986,0.000000,0.000000,0.0,0.0,0.0,0.0,0.998901
4,dummy/dummy/25684,0.000000,0.000000,0.978140,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...
17116,dummy/dummy/42796,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
17117,dummy/dummy/42797,0.938293,0.000000,0.000000,0.0,0.0,0.0,0.0,0.971595
17118,dummy/dummy/42798,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
17119,dummy/dummy/42799,0.000000,0.000000,0.908885,0.0,0.0,0.0,0.0,0.000000


In [None]:
# Keep at most len(vl) rows and exactly the path + finding columns.
new_vl = new_vl.iloc[: len(vl)].reindex(columns=["path"] + ft_list)


## train

In [None]:
class FrozenResNet50Model(nn.Module):
    """"resnetaa50" split into a frozen backbone and a trainable head.

    The last ``depth`` parts of the network are moved out of ``self.model``
    into ``self.head`` and replaced there by ``nn.Identity``. Only the
    parameters that remain in ``self.model`` are frozen, so the moved parts
    and the classifier stay trainable.

    Args:
        depth: 0..5. 1-4 move ``layer4`` .. ``layer1`` (deepest first); 5
            additionally moves the stem (conv1, bn1, act1, maxpool).
            0 moves only the global pooling module.
        num_classes: output width of the linear classifier; ``0`` (or less)
            replaces the classifier with ``nn.Identity``.
    """

    # Residual stages in the order they are moved as ``depth`` grows.
    _STAGE_NAMES = ("layer4", "layer3", "layer2", "layer1")
    # Stem modules moved together at depth 5. timm's ResNet names the pooling
    # module ``maxpool``; the previous ``max_pool`` raised AttributeError.
    _STEM_NAMES = ("conv1", "bn1", "act1", "maxpool")

    def __init__(self, depth, num_classes):
        assert 0 <= depth <= 5
        super().__init__()
        self.model = timm.create_model("resnetaa50", num_classes=0, pretrained=True)

        layer_list = []
        for threshold, stage_name in enumerate(self._STAGE_NAMES, start=1):
            if depth >= threshold:
                # Prepend so the head keeps the original forward order.
                layer_list.insert(0, self._take(stage_name))
        if depth >= 5:
            layer_list = [self._take(name) for name in self._STEM_NAMES] + layer_list

        # Pooling always runs in the head, after any moved stages.
        global_pool = self._take("global_pool")
        self.head = nn.Sequential(*layer_list, global_pool)

        if num_classes > 0:
            # 2048 is assumed to be the pooled feature width of resnetaa50 —
            # TODO confirm (self.model.num_features may expose it).
            self.classifier = nn.Linear(2048, num_classes)
        else:
            self.classifier = nn.Identity()

        # Freeze whatever is still inside the backbone.
        for params in self.model.parameters():
            params.requires_grad = False

    def _take(self, name):
        """Remove submodule ``name`` from the backbone and return it."""
        module = getattr(self.model, name)
        setattr(self.model, name, nn.Identity())
        return module

    def train(self, train_flag=True):
        """Set the training mode, keeping the frozen backbone in eval mode.

        Returns ``self`` like ``nn.Module.train``, so ``model.train()`` and
        ``model.eval()`` can be chained or assigned.
        """
        super().train(train_flag)
        self.model.eval()
        return self

    def forward(self, x):
        """Return classifier outputs for the input batch ``x``."""
        return self.classifier(self.forward_features(x))

    def forward_features(self, x):
        """Return the head output (features before the classifier)."""
        x = self.model(x)
        x = self.head(x)
        return x

In [None]:
# Leave-one-finding-out sweep: for each finding, train on the remaining
# findings and repeat the run for every freeze depth.
for ft in ft_list:
    print(f"==============={ft}=============")
    loo_ft_list = [name for name in ft_list if name != ft]

    # Labels are the cached finding scores binarised at 0.5.
    # `length` / `cache_mode` are CLModelDataset options defined in
    # clmodel.dataset — see there for their exact meaning.
    train_dataset = CLModelDataset(
        new_tr["path"].values,
        new_tr[loo_ft_list].values >= 0.5,
        image_dict,
        transform=tr_transform,
        length=len(tr) // 10,
        cache_mode=True,
    )
    valid_dataset = CLModelDataset(
        new_vl["path"].values,
        new_vl[loo_ft_list].values >= 0.5,
        image_dict,
        transform=vl_transform,
    )

    train_loader = DataLoader(
        train_dataset,
        num_workers=4,
        batch_size=16,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )
    valid_loader = DataLoader(
        valid_dataset,
        num_workers=4,
        batch_size=32,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )

    criterion = nn.BCEWithLogitsLoss()

    n_epochs = 5

    out_dir = f"../../outputs/230305TGGATEs_ft_loo_{ft}_seed124_epoch5_resnet"
    # makedirs instead of shelling out to `mkdir`: no shell interpolation of
    # the finding name, and re-running the cell is not an error.
    os.makedirs(out_dir, exist_ok=True)

    for depth in range(6):
        print(f"=================Depth {depth}===================")
        if depth >= 5:
            # Depth 5 trains the whole network: plain timm model, nothing frozen.
            model = timm.create_model(
                "resnetaa50", pretrained=True, num_classes=len(loo_ft_list)
            )
        else:
            model = FrozenResNet50Model(depth, len(loo_ft_list))

        model.to("cuda")
        optimizer = optim.Adam(model.parameters(), lr=5e-4)
        # NOTE(review): T_max=10 while n_epochs=5 — if train_loop steps the
        # scheduler once per epoch, only half a cosine period is used. Confirm.
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 10, 1e-6)

        metrics = [
            Metrics("macro AUROC", macro_auroc, "+"),
            Metrics("macro balanced accuracy", macro_balanced_accuracy, "+"),
        ]
        run_dir = f"{out_dir}/{depth}"
        os.makedirs(run_dir, exist_ok=True)
        res = train_loop(
            model,
            train_loader,
            valid_loader,
            0,  # meaning defined by clmodel.train.train_loop — TODO confirm
            criterion,
            optimizer,
            "cuda",
            n_epochs,
            scheduler,
            metrics,
            run_dir,
            f"resnet50aa_freeze{depth}",
            preprocess=lambda x: x.sigmoid(),
            verbose=100,
            logger=Logger(),
        )

        with open(f"{run_dir}/result.pickle", "wb") as f:
            pickle.dump(res, f)
        # Pause between runs (kept from the original workflow).
        sleep(100)

Epoch 1
Step: 1/321 Loss: 0.7019 Elapsed time 2.7 Rest time 859.4
Step: 101/321 Loss: 0.4102 Elapsed time 6.6 Rest time 14.4
Step: 201/321 Loss: 0.3429 Elapsed time 10.5 Rest time 6.3
Step: 301/321 Loss: 0.3137 Elapsed time 14.4 Rest time 1.0
Step: 321/321 Loss: 0.3097 Elapsed time 15.2 Rest time 0.0
Step: 1/536 Loss: 0.2653 Elapsed time 0.6 Rest time 327.5
Step: 101/536 Loss: 0.2448 Elapsed time 13.2 Rest time 56.9
Step: 201/536 Loss: 0.2405 Elapsed time 25.8 Rest time 43.0
Step: 301/536 Loss: 0.2403 Elapsed time 38.4 Rest time 30.0
Step: 401/536 Loss: 0.2395 Elapsed time 51.2 Rest time 17.2
Step: 501/536 Loss: 0.2396 Elapsed time 63.9 Rest time 4.5
Step: 536/536 Loss: 0.2397 Elapsed time 68.2 Rest time 0.0
macro AUROC : 0.5596
This is best macro AUROC.
saved model.
macro balanced accuracy : 0.5
This is best macro balanced accuracy.
saved model.
loss : 0.2397
This is best loss.
saved model.
Epoch 2
Step: 1/321 Loss: 0.2346 Elapsed time 0.3 Rest time 106.5
Step: 101/321 Loss: 0.2428 El

## train with all the pathological findings

In [None]:
# Reference run on the full finding set: same depth sweep as the
# leave-one-out cell, but with every column of ft_list as a target.
# Labels are the cached finding scores binarised at 0.5.
train_dataset = CLModelDataset(
    new_tr["path"].values,
    new_tr[ft_list].values >= 0.5,
    image_dict,
    transform=tr_transform,
    length=len(tr) // 10,
    cache_mode=True,
)
valid_dataset = CLModelDataset(
    new_vl["path"].values,
    new_vl[ft_list].values >= 0.5,
    image_dict,
    transform=vl_transform,
)

train_loader = DataLoader(
    train_dataset,
    num_workers=4,
    batch_size=16,
    shuffle=False,
    pin_memory=True,
    drop_last=False,
)
valid_loader = DataLoader(
    valid_dataset,
    num_workers=4,
    batch_size=32,
    shuffle=False,
    pin_memory=True,
    drop_last=False,
)

criterion = nn.BCEWithLogitsLoss()

n_epochs = 5

out_dir = "../../outputs/230305TGGATEs_model_seed124_epoch5_resnet"
# makedirs instead of shelling out to `mkdir`: re-running is not an error
# and missing parent directories are created.
os.makedirs(out_dir, exist_ok=True)

for depth in range(6):
    print(f"=================Depth {depth}===================")
    if depth >= 5:
        # Depth 5 trains the whole network: plain timm model, nothing frozen.
        model = timm.create_model(
            "resnetaa50", pretrained=True, num_classes=len(ft_list)
        )
    else:
        model = FrozenResNet50Model(depth, len(ft_list))

    model.to("cuda")
    optimizer = optim.Adam(model.parameters(), lr=5e-4)
    # NOTE(review): T_max=10 while n_epochs=5 — if train_loop steps the
    # scheduler once per epoch, only half a cosine period is used. Confirm.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 10, 1e-6)

    metrics = [
        Metrics("macro AUROC", macro_auroc, "+"),
        Metrics("macro balanced accuracy", macro_balanced_accuracy, "+"),
    ]
    run_dir = f"{out_dir}/{depth}"
    os.makedirs(run_dir, exist_ok=True)
    res = train_loop(
        model,
        train_loader,
        valid_loader,
        0,  # meaning defined by clmodel.train.train_loop — TODO confirm
        criterion,
        optimizer,
        "cuda",
        n_epochs,
        scheduler,
        metrics,
        run_dir,
        # NOTE(review): label says "effnetb4" but the model is resnetaa50
        # (the leave-one-out cell uses "resnet50aa_freeze"). Left unchanged
        # because it may determine saved file names — confirm, then rename.
        f"effnetb4_freeze{depth}",
        preprocess=lambda x: x.sigmoid(),
        verbose=100,
        logger=Logger(),
    )

    with open(f"{run_dir}/result.pickle", "wb") as f:
        pickle.dump(res, f)
    # Pause between runs (kept from the original workflow).
    sleep(100)

Epoch 1
Step: 1/321 Loss: 0.6885 Elapsed time 0.4 Rest time 122.7
Step: 101/321 Loss: 0.4117 Elapsed time 4.3 Rest time 9.4
Step: 201/321 Loss: 0.3469 Elapsed time 8.2 Rest time 4.9
Step: 301/321 Loss: 0.3195 Elapsed time 12.1 Rest time 0.8
Step: 321/321 Loss: 0.3158 Elapsed time 12.9 Rest time 0.0
Step: 1/536 Loss: 0.2607 Elapsed time 0.5 Rest time 282.7
Step: 101/536 Loss: 0.2404 Elapsed time 13.0 Rest time 56.0
Step: 201/536 Loss: 0.2373 Elapsed time 25.3 Rest time 42.2
Step: 301/536 Loss: 0.2375 Elapsed time 37.7 Rest time 29.4
Step: 401/536 Loss: 0.2374 Elapsed time 50.1 Rest time 16.9
Step: 501/536 Loss: 0.2371 Elapsed time 62.4 Rest time 4.4
Step: 536/536 Loss: 0.2371 Elapsed time 66.6 Rest time 0.0
macro AUROC : 0.578
This is best macro AUROC.
saved model.
macro balanced accuracy : 0.5
This is best macro balanced accuracy.
saved model.
loss : 0.2371
This is best loss.
saved model.
Epoch 2
Step: 1/321 Loss: 0.2695 Elapsed time 0.4 Rest time 125.8
Step: 101/321 Loss: 0.2518 Elaps