# LOO: leave-one-finding-out training

## import libraries

In [1]:
import os
import pickle
import random
from glob import glob
from time import sleep

import numpy as np
import pandas as pd
import timm
import torch
import torch.nn as nn
import torch.optim as optim
from albumentations import (CenterCrop, Compose, HorizontalFlip, Normalize,
                            RandomCrop, VerticalFlip)
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader, RandomSampler
from tqdm import tqdm

from clmodel.dataset import CLModelDataset
from clmodel.evaluate import Metrics, macro_auroc, macro_balanced_accuracy
from clmodel.model import FrozenEffnetB4Model
from clmodel.train import train_loop
from clmodel.utils import AverageValue, Logger, fix_seed

## set seeds

In [2]:
# Single seed for the notebook: passed to fix_seed() here and to the
# torch.Generator created further down in this cell.
seed = 124
fix_seed(seed)  # helper from clmodel.utils; presumably seeds the global RNGs (confirm there)


def seed_worker(worker_id):
    """Re-seed NumPy and Python's ``random`` for a DataLoader worker.

    The seed is ``torch.initial_seed()`` reduced modulo 2**32. ``worker_id``
    is accepted because it is the argument a ``worker_init_fn`` receives, but
    it is not used.
    """
    reduced_seed = torch.initial_seed() % (1 << 32)
    random.seed(reduced_seed)
    np.random.seed(reduced_seed)


# Dedicated generator seeded with the notebook seed; it is handed to the
# caching DataLoader below via `generator=g`.
g = torch.Generator()
g.manual_seed(seed)  # last expression of the cell, so the Generator repr is displayed


<torch._C.Generator at 0x7f62f8abaef0>

## load data

In [3]:
# CSV providing the image paths, the finding columns and the fold assignment
# that the cells below read from `df`.
train_val_csv = "../../data/TGGATEs/processed/train_val_for_model_training.csv"
df = pd.read_csv(train_val_csv)


In [4]:
# Rewrite the "HDD" component of every stored path to "extHDD1".
# The negative lookbehind skips occurrences already preceded by "ext", so
# re-running this cell is a no-op instead of producing "extextHDD11".
df["path"] = df["path"].str.replace(r"(?<!ext)HDD", "extHDD1", regex=True)

In [5]:
# Columns 3..10 of the table are the finding columns used as targets below.
ft_list = df.columns[3:11].tolist()
ft_list


['Proliferation, bile duct',
 'Ground glass appearance',
 'Increased mitosis',
 'Inclusion body, intracytoplasmic',
 'Deposit, pigment',
 'Single cell necrosis',
 'Vacuolization, cytoplasmic',
 'Swelling']

## image preprocessing

In [6]:
image_size = 512  # crop size used when the image cache is built

# Training pipeline: random horizontal/vertical flips, then normalisation and
# conversion to a tensor. No crop here; RandomCrop is applied at caching time.
tr_transform = Compose(
    [HorizontalFlip(p=0.5), VerticalFlip(p=0.5), Normalize(), ToTensorV2()]
)

# Validation pipeline: normalisation and tensor conversion only. CenterCrop is
# likewise applied when the validation images are cached.
vl_transform = Compose([Normalize(), ToTensorV2()])


## create image_dict

In [7]:
n_epochs = 5  # number of sampling passes made while filling the training cache

# Fold 2 is held out for validation; every other fold is used for training.
tr = df[df["fold"] != 2]
vl = df[df["fold"] == 2]

# Map each image path to the array of its finding scores (ft_list order).
ft_dict = {v[0]: v[1:] for v in df[["path"] + ft_list].to_numpy()}

# No labels are passed (second argument is None); the loader yields a batch of
# random crops together with the source paths they were cut from.
train_dataset = CLModelDataset(
    tr["path"].values, None, transform=RandomCrop(image_size, image_size)
)
train_loader = DataLoader(
    train_dataset,
    num_workers=4,
    batch_size=16,
    # Each pass draws a tenth of the dataset size, with replacement.
    sampler=RandomSampler(
        train_dataset, num_samples=len(train_dataset) // 10, replacement=True
    ),
    pin_memory=True,
    drop_last=True,
    worker_init_fn=seed_worker,
    generator=g,
)

# Every cached crop gets a synthetic key "dummy/dummy/<count>"; new_tr pairs
# that key with the finding scores of the image the crop came from.
new_tr = []
image_dict = {}
count = 0
for epoch in range(n_epochs):
    for x, pathes in tqdm(train_loader):
        # Iterate over the batch that was actually delivered instead of a
        # hard-coded 16, so this loop stays correct if batch_size or
        # drop_last is changed on the loader above.
        for image, src_path in zip(x, pathes):
            key = f"dummy/dummy/{count}"
            image_dict[key] = image.numpy()
            new_tr.append([key] + list(ft_dict[src_path]))
            count += 1
    # Optional checkpoint of the cache after each pass:
    # with open("../../temp/cache_image_dict_new_tr.pickle", "wb") as f:
    #     pickle.dump((image_dict, new_tr), f)
new_tr = pd.DataFrame(new_tr, columns=["path"] + ft_list)


100%|██████████| 321/321 [23:56<00:00,  4.47s/it]
100%|██████████| 321/321 [21:32<00:00,  4.03s/it]
100%|██████████| 321/321 [19:24<00:00,  3.63s/it]
100%|██████████| 321/321 [18:08<00:00,  3.39s/it]
100%|██████████| 321/321 [18:07<00:00,  3.39s/it]


In [8]:
# Validation images are centre-cropped once and appended to the same
# image_dict, continuing the running `count` used for the training crops.
valid_dataset = CLModelDataset(
    vl["path"].values, None, transform=CenterCrop(image_size, image_size)
)
valid_loader = DataLoader(
    valid_dataset,
    num_workers=4,
    batch_size=32,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

new_vl = []
for x, pathes in tqdm(valid_loader):
    for image, src_path in zip(x, pathes):
        key = f"dummy/dummy/{count}"
        image_dict[key] = image.numpy()
        new_vl.append([key] + list(ft_dict[src_path]))
        count += 1
new_vl = pd.DataFrame(new_vl, columns=["path"] + ft_list)


100%|██████████| 536/536 [1:20:54<00:00,  9.06s/it]


In [10]:
# Display the cached validation table: synthetic cache key plus finding scores.
new_vl

Unnamed: 0,path,"Proliferation, bile duct",Ground glass appearance,Increased mitosis,"Inclusion body, intracytoplasmic","Deposit, pigment",Single cell necrosis,"Vacuolization, cytoplasmic",Swelling
0,dummy/dummy/25680,0.000000,0.999631,0.000000,0.0,0.0,0.0,0.0,0.000000
1,dummy/dummy/25681,0.000000,0.999841,0.000000,0.0,0.0,0.0,0.0,0.000000
2,dummy/dummy/25682,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
3,dummy/dummy/25683,0.999986,0.000000,0.000000,0.0,0.0,0.0,0.0,0.998901
4,dummy/dummy/25684,0.000000,0.000000,0.978140,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...
17116,dummy/dummy/42796,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
17117,dummy/dummy/42797,0.938293,0.000000,0.000000,0.0,0.0,0.0,0.0,0.971595
17118,dummy/dummy/42798,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
17119,dummy/dummy/42799,0.000000,0.000000,0.908885,0.0,0.0,0.0,0.0,0.000000


In [9]:
# Keep at most the first len(vl) rows, restricted to the path + finding columns.
# NOTE(review): this cell carries execution count 9 but is placed after the
# cell numbered 10, so the display above was produced after this trim ran;
# under Restart & Run All the order is reversed. Confirm the intended position.
new_vl = pd.DataFrame(new_vl[:len(vl)], columns=["path"] + ft_list)


## train

In [11]:
# Leave-one-out sweep: for every finding, train on the remaining findings at
# each value of `depth` and store the result under a per-finding directory.
for ft in ft_list:
    print(f"==============={ft}=============")
    # Targets for this round: every finding except the one left out.
    loo_ft_list = [f for f in ft_list if f != ft]
    train_dataset = CLModelDataset(
        new_tr["path"].values,
        new_tr[loo_ft_list].values >= 0.5,  # binarise the finding scores at 0.5
        image_dict,
        transform=tr_transform,
        # NOTE(review): semantics of length/cache_mode live in CLModelDataset;
        # presumably they define the per-epoch sample count over the cache.
        length=len(tr) // 10,
        cache_mode=True,
    )
    valid_dataset = CLModelDataset(
        new_vl["path"].values,
        new_vl[loo_ft_list].values >= 0.5,
        image_dict,
        transform=vl_transform,
    )

    train_loader = DataLoader(
        train_dataset,
        num_workers=4,
        batch_size=16,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )
    valid_loader = DataLoader(
        valid_dataset,
        num_workers=4,
        batch_size=32,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )

    criterion = nn.BCEWithLogitsLoss()

    n_epochs = 5

    out_dir = f"../../outputs/230305TGGATEs_ft_loo_{ft}_seed124_epoch5"
    # Create the directory from Python rather than via a shell `mkdir`: the
    # finding name is never interpreted by a shell, missing parents are
    # created, and an already existing directory is not an error.
    os.makedirs(out_dir, exist_ok=True)

    # depth 0..7 use FrozenEffnetB4Model(depth, ...); depth 8 uses the plain
    # pretrained timm EfficientNet-B4. range(9) never yields 9, so the former
    # `depth == 9` (10-epoch) branch was unreachable and has been removed.
    for depth in range(9):
        print(f"=================Depth {depth}===================")
        if depth >= 8:
            model = timm.create_model(
                "tf_efficientnet_b4_ns", pretrained=True, num_classes=len(loo_ft_list)
            )
        else:
            model = FrozenEffnetB4Model(depth, len(loo_ft_list))

        model.to("cuda")
        optimizer = optim.Adam(model.parameters(), lr=5e-4)
        # NOTE(review): T_max is 10 while n_epochs is 5; how often the
        # scheduler is stepped is decided inside train_loop. Confirm intended.
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 10, 1e-6)

        metrics = [
            Metrics("macro AUROC", macro_auroc, "+"),
            Metrics("macro balanced accuracy", macro_balanced_accuracy, "+"),
        ]
        depth_dir = f"{out_dir}/{depth}"
        os.makedirs(depth_dir, exist_ok=True)
        res = train_loop(
            model,
            train_loader,
            valid_loader,
            0,
            criterion,
            optimizer,
            "cuda",
            n_epochs,
            scheduler,
            metrics,
            depth_dir,
            f"effnetb4_freeze{depth}",
            # The model emits logits (the loss is BCEWithLogitsLoss); sigmoid is
            # supplied as `preprocess`, presumably applied before the metrics.
            preprocess=lambda x: x.sigmoid(),
            verbose=100,
            logger=Logger(),
        )

        with open(f"{depth_dir}/result.pickle", "wb") as f:
            pickle.dump(res, f)
        # Pause between runs; presumably to let the GPU settle (confirm).
        sleep(100)

Epoch 1
Step: 1/321 Loss: 0.6926 Elapsed time 2.2 Rest time 699.9
Step: 101/321 Loss: 0.2831 Elapsed time 9.1 Rest time 19.9
Step: 201/321 Loss: 0.2429 Elapsed time 16.6 Rest time 9.9
Step: 301/321 Loss: 0.2263 Elapsed time 24.1 Rest time 1.6
Step: 321/321 Loss: 0.2239 Elapsed time 25.6 Rest time 0.0
Step: 1/536 Loss: 0.2486 Elapsed time 0.8 Rest time 408.9
Step: 101/536 Loss: 0.2105 Elapsed time 15.7 Rest time 67.7
Step: 201/536 Loss: 0.2071 Elapsed time 30.8 Rest time 51.3
Step: 301/536 Loss: 0.2079 Elapsed time 45.9 Rest time 35.8
Step: 401/536 Loss: 0.2088 Elapsed time 60.4 Rest time 20.3
Step: 501/536 Loss: 0.2082 Elapsed time 75.1 Rest time 5.2
Step: 536/536 Loss: 0.2085 Elapsed time 80.3 Rest time 0.0
macro AUROC : 0.8409
This is best macro AUROC.
saved model.
macro balanced accuracy : 0.5836
This is best macro balanced accuracy.
saved model.
loss : 0.2085
This is best loss.
saved model.
Epoch 2
Step: 1/321 Loss: 0.1433 Elapsed time 0.3 Rest time 110.7
Step: 101/321 Loss: 0.1855

## train with all pathological findings

In [12]:
# Reference run: same depth sweep as the leave-one-out cell, but with every
# finding in ft_list used as a target.
train_dataset = CLModelDataset(
    new_tr["path"].values,
    new_tr[ft_list].values >= 0.5,  # binarise the finding scores at 0.5
    image_dict,
    transform=tr_transform,
    # NOTE(review): semantics of length/cache_mode live in CLModelDataset;
    # presumably they define the per-epoch sample count over the cache.
    length=len(tr) // 10,
    cache_mode=True,
)
valid_dataset = CLModelDataset(
    new_vl["path"].values,
    new_vl[ft_list].values >= 0.5,
    image_dict,
    transform=vl_transform,
)

train_loader = DataLoader(
    train_dataset,
    num_workers=4,
    batch_size=16,
    shuffle=False,
    pin_memory=True,
    drop_last=False,
)
valid_loader = DataLoader(
    valid_dataset,
    num_workers=4,
    batch_size=32,
    shuffle=False,
    pin_memory=True,
    drop_last=False,
)

criterion = nn.BCEWithLogitsLoss()

n_epochs = 5

out_dir = "../../outputs/230305TGGATEs_model_seed124_epoch5"
# Create the directory from Python rather than via a shell `mkdir`: missing
# parents are created and an already existing directory is not an error.
os.makedirs(out_dir, exist_ok=True)

# depth 0..7 use FrozenEffnetB4Model(depth, ...); depth 8 uses the plain
# pretrained timm EfficientNet-B4. range(9) never yields 9, so the former
# `depth == 9` (10-epoch) branch was unreachable and has been removed.
for depth in range(9):
    print(f"=================Depth {depth}===================")
    if depth >= 8:
        model = timm.create_model(
            "tf_efficientnet_b4_ns", pretrained=True, num_classes=len(ft_list)
        )
    else:
        model = FrozenEffnetB4Model(depth, len(ft_list))

    model.to("cuda")
    optimizer = optim.Adam(model.parameters(), lr=5e-4)
    # NOTE(review): T_max is 10 while n_epochs is 5; how often the scheduler
    # is stepped is decided inside train_loop. Confirm intended.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 10, 1e-6)

    metrics = [
        Metrics("macro AUROC", macro_auroc, "+"),
        Metrics("macro balanced accuracy", macro_balanced_accuracy, "+"),
    ]
    depth_dir = f"{out_dir}/{depth}"
    os.makedirs(depth_dir, exist_ok=True)
    res = train_loop(
        model,
        train_loader,
        valid_loader,
        0,
        criterion,
        optimizer,
        "cuda",
        n_epochs,
        scheduler,
        metrics,
        depth_dir,
        f"effnetb4_freeze{depth}",
        # The model emits logits (the loss is BCEWithLogitsLoss); sigmoid is
        # supplied as `preprocess`, presumably applied before the metrics.
        preprocess=lambda x: x.sigmoid(),
        verbose=100,
        logger=Logger(),
    )

    with open(f"{depth_dir}/result.pickle", "wb") as f:
        pickle.dump(res, f)
    # Pause between runs; presumably to let the GPU settle (confirm).
    sleep(100)

Epoch 1
Step: 1/321 Loss: 0.7065 Elapsed time 0.4 Rest time 126.3
Step: 101/321 Loss: 0.2877 Elapsed time 5.2 Rest time 11.3
Step: 201/321 Loss: 0.2483 Elapsed time 10.0 Rest time 5.9
Step: 301/321 Loss: 0.2318 Elapsed time 14.7 Rest time 1.0
Step: 321/321 Loss: 0.2291 Elapsed time 15.6 Rest time 0.0
Step: 1/536 Loss: 0.2223 Elapsed time 0.5 Rest time 268.5
Step: 101/536 Loss: 0.2023 Elapsed time 15.0 Rest time 64.6
Step: 201/536 Loss: 0.1997 Elapsed time 29.6 Rest time 49.3
Step: 301/536 Loss: 0.2005 Elapsed time 44.1 Rest time 34.4
Step: 401/536 Loss: 0.2013 Elapsed time 58.7 Rest time 19.7
Step: 501/536 Loss: 0.2007 Elapsed time 73.3 Rest time 5.1
Step: 536/536 Loss: 0.2008 Elapsed time 78.3 Rest time 0.0
macro AUROC : 0.8363
This is best macro AUROC.
saved model.
macro balanced accuracy : 0.5917
This is best macro balanced accuracy.
saved model.
loss : 0.2008
This is best loss.
saved model.
Epoch 2
Step: 1/321 Loss: 0.1570 Elapsed time 0.4 Rest time 122.9
Step: 101/321 Loss: 0.1897