# LOO Validation

## Import libraries

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import timm
import torch
import torch.nn as nn
import pickle
from albumentations import CenterCrop, Compose, Normalize
from albumentations.pytorch import ToTensorV2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score,
    balanced_accuracy_score,
    matthews_corrcoef,
    roc_auc_score,
)
from sklearn.model_selection import GroupKFold
from torch.utils.data import DataLoader
from tqdm import tqdm

from toxreprcnn.dataset import ToxReprCNNDataset
from toxreprcnn.data_split import RepeatedStratifiedGroupKFold
from toxreprcnn.model import EffnetB4ModelMO, FrozenEffnetB4ModelMO
from toxreprcnn.utils import fix_seed

# Base directory prefix used to build the dataset CSV path below.
root = ".."

## load dataset

In [2]:
# Per-image table of test samples; later cells read its "path" and "EG"
# columns as well as the finding-type label columns.
test_csv_path = f"{root}/../data/TGGATEs/processed/test_for_finding_types.csv"
test_df = pd.read_csv(test_csv_path)

In [4]:
# Columns 3..10 of the table are taken as the finding-type label columns.
ft_list = test_df.columns[3:11].tolist()
ft_list

['Proliferation, bile duct',
 'Ground glass appearance',
 'Increased mitosis',
 'Inclusion body, intracytoplasmic',
 'Deposit, pigment',
 'Single cell necrosis',
 'Vacuolization, cytoplasmic',
 'Swelling']

## fix seed

In [5]:
# Seed value; it is also interpolated into the output file names written by later cells.
seed = 123
# fix_seed comes from toxreprcnn.utils; presumably it seeds the RNGs used here — TODO confirm.
fix_seed(seed)

## define image preprocessing

In [6]:
# Side length (in pixels) of the square center crop applied to every image.
image_size = 512

# Evaluation-time pipeline: center crop, Normalize() with its library defaults, then tensor conversion.
vl_transform = Compose([CenterCrop(image_size, image_size), Normalize(), ToTensorV2()])

In [7]:
# Image paths and binarised (>= 0.5) finding-type labels for every test sample.
test_paths = test_df["path"].values
test_labels = test_df[ft_list].values >= 0.5
test_dataset = ToxReprCNNDataset(test_paths, test_labels, transform=vl_transform)

# Fixed order and no dropped batch, so extracted features line up row-for-row with test_df.
loader_options = dict(
    batch_size=32,
    drop_last=False,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)
test_loader = DataLoader(test_dataset, **loader_options)

# Control feature extractor: pretrained EfficientNet-B4 created with num_classes=0.
model_control = timm.create_model(
    "tf_efficientnet_b4_ns",
    pretrained=True,
    num_classes=0,
)
# nn.Module.eval() and .to() both return the module itself, so the calls can be chained.
model_control = model_control.eval().to("cuda")

# Build one group of 9 models per left-out finding type, followed by a final
# "whole" group whose models were trained on every finding type.
# Within a group, indices 0..7 are FrozenEffnetB4ModelMO variants and index 8
# is the EffnetB4ModelMO instance.
n_frozen_variants = 8

models_list = []
for ft in tqdm(ft_list + ["whole"]):
    if ft != "whole":
        save_dir = f"../../outputs/230305TGGATEs_ft_loo_{ft}_seed123_epoch5"
        # Leave-one-out models were built with one output fewer.
        num_classes = len(ft_list) - 1
    else:
        save_dir = "../../outputs/230305TGGATEs_model_seed123_epoch5"
        num_classes = len(ft_list)

    models = [FrozenEffnetB4ModelMO(i, num_classes) for i in range(n_frozen_variants)]
    models.append(EffnetB4ModelMO(num_classes=num_classes))

    for i, model in enumerate(models):
        # map_location="cpu": the module is still on the CPU at this point, so
        # restore the checkpoint there instead of on whichever device it was
        # saved from; the whole model is moved to the GPU afterwards.
        state_dict = torch.load(
            f"{save_dir}/{i}/effnetb4_freeze{i}_fold0_best_loss.pth",
            map_location="cpu",
        )
        if i < n_frozen_variants:
            model.load_state_dict(state_dict)
            # Swap the classifier for an identity module once the weights are loaded.
            model.classifier = nn.Identity()
        else:
            # For the EffnetB4ModelMO instance the checkpoint is loaded into
            # its inner `.model` attribute.
            model.model.load_state_dict(state_dict)
            model.model.classifier = nn.Identity()
        model.to("cuda")
        model.eval()
    models_list.append(models)

 11%|█         | 1/9 [00:11<01:28, 11.01s/it]

In [8]:
# features_control: one array per batch from the control model.
# features[group][model][layer]: one array per batch, where `group` indexes
# ft_list + ["whole"], `model` the 9 models of that group and `layer` the
# position of the item in that model's output sequence.
n_groups = len(ft_list) + 1
features_control = []
features = [[[[] for _ in range(9)] for _ in range(9)] for _ in range(n_groups)]

with torch.no_grad():
    for batch, _ in tqdm(test_loader):
        batch = batch.to("cuda")
        features_control.append(model_control(batch).to("cpu").numpy())
        for group_idx in range(n_groups):
            for model_idx in range(9):
                layer_outputs = models_list[group_idx][model_idx](batch)
                for layer_idx, layer_out in enumerate(layer_outputs):
                    features[group_idx][model_idx][layer_idx].append(
                        layer_out.to("cpu").numpy()
                    )

100%|██████████| 1178/1178 [3:49:13<00:00, 11.68s/it] 


## save features

In [9]:
import pickle

# Cache the extracted features so the multi-hour extraction need not be repeated.
# The file name carries the same "230308" prefix that the following cell reads
# back; previously the cache was written to an un-prefixed name that nothing
# in the notebook loads.
features_cache_path = f"../../outputs/230308ft_validation_features_seed{seed}.pickle"
with open(features_cache_path, "wb") as f:
    pickle.dump({"features": features, "features_control": features_control}, f)

In [None]:
# Reload the cached features. The pickle holds a dict with the keys
# "features" and "features_control".
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# caches produced by this notebook.
cache_path = f"../../outputs/230308ft_validation_features_seed{seed}.pickle"
with open(cache_path, "rb") as cache_file:
    cached = pickle.load(cache_file)

features, features_control = cached["features"], cached["features_control"]

In [None]:
def eval(gt, pr):
    """Score predicted probabilities against ground-truth labels.

    NOTE(review): this name shadows the built-in ``eval`` for the rest of the
    notebook; it is kept because ``logreg`` calls it by this name.

    Args:
        gt: Ground-truth labels; binarised with ``>= 0.5``.
        pr: Predicted scores; thresholded at 0.5 for the label-based metrics.

    Returns:
        A list ``[AUROC, MCC, balanced accuracy, AP]``, where AP is the mean of
        the average precision for the positive class (scored with ``pr``) and
        for the negative class (scored with ``-pr``).
    """
    is_positive = gt >= 0.5
    predicted_positive = pr >= 0.5

    auroc = roc_auc_score(is_positive, pr)
    mcc = matthews_corrcoef(is_positive, predicted_positive)
    balanced_acc = balanced_accuracy_score(is_positive, predicted_positive)
    ap_positive = average_precision_score(is_positive, pr)
    ap_negative = average_precision_score(gt < 0.5, -pr)

    return [auroc, mcc, balanced_acc, (ap_positive + ap_negative) / 2]


def logreg(X, y, ft_i, train_idx, valid_idx, max_iter=100, n_jobs=8):
    """Fit a logistic regression for one finding type and score it on the validation split.

    Args:
        X: Feature matrix indexed by sample along the first axis.
        y: Label matrix (samples x finding types); binarised with ``>= 0.5``.
        ft_i: Column index in ``y`` of the finding type to predict.
        train_idx: Sample indices used for fitting.
        valid_idx: Sample indices used for scoring.
        max_iter: Iteration limit passed to ``LogisticRegression``. The default
            matches the previously hard-coded value; raise it if the solver
            reports that the iteration limit was reached.
        n_jobs: ``n_jobs`` passed to ``LogisticRegression``; the default matches
            the previously hard-coded value.

    Returns:
        ``np.ndarray`` of shape ``(1, 4)`` holding the metrics returned by
        ``eval`` for the validation predictions.
    """
    X_train = X[train_idx]
    X_valid = X[valid_idx]
    y_train = y[train_idx][:, ft_i] >= 0.5
    y_valid = y[valid_idx][:, ft_i] >= 0.5

    lr = LogisticRegression(max_iter=max_iter, n_jobs=n_jobs)
    lr.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the positive (True) class.
    y_preds = lr.predict_proba(X_valid)[:, 1]

    # Callers index the result as [0][k] / [0, k] and concatenate along axis 0,
    # so keep the leading axis of length 1.
    return np.array([eval(y_valid, y_preds)])

In [None]:
# Single train/validation split, stratified on the binarised finding labels
# and grouped by the "EG" column.
rsgkf = RepeatedStratifiedGroupKFold(n_splits=5, random_state=42)
labels_binary = (test_df[ft_list].values >= 0.5).astype(np.int64)
# features_control is a list with one array per batch; concatenate it so that
# X has one row per sample, like the labels and groups passed next to it.
spl = rsgkf.split(
    np.concatenate(features_control),
    labels_binary,
    test_df["EG"].values,
)
# Only the first split produced by the generator is used.
train_idx, valid_idx = next(spl)

In [None]:
# Positive-sample count per finding type in the training split.
train_positive = test_df[ft_list].values[train_idx] >= 0.5
train_positive.sum(axis=0)

array([3523, 2750, 3240,  296,  403, 2995, 3879, 1168])

In [None]:
# Positive-sample count per finding type in the validation split.
valid_positive = test_df[ft_list].values[valid_idx] >= 0.5
valid_positive.sum(axis=0)

array([1199, 1091,  208,  200,  200,  701,  100, 1987])

In [None]:
# Leave-one-out evaluation. For every finding type, res_dict[ft] is a list of
# (1, 4) score arrays:
#   [0]     control-model features,
#   [1:-1]  the 9 x 9 (model, layer) feature sets of that finding's model group,
#   [-1]    baseline that uses the other findings' label columns as features.

# Loop-invariant inputs, computed once instead of once per finding type.
labels = test_df[ft_list].values
control_X = np.concatenate(features_control)

res_dict = {}
for ft_i, ft in enumerate(ft_list):
    print(ft)
    results = [logreg(control_X, labels, ft_i, train_idx, valid_idx)]

    for frozen in range(9):
        for depth in tqdm(range(9)):
            results.append(
                logreg(
                    np.concatenate(features[ft_i][frozen][depth]),
                    labels,
                    ft_i,
                    train_idx,
                    valid_idx,
                )
            )

    # Baseline: predict this finding from the raw values of all other findings.
    other_fts = [name for i, name in enumerate(ft_list) if i != ft_i]
    results.append(
        logreg(test_df[other_fts].to_numpy(), labels, ft_i, train_idx, valid_idx)
    )

    res_dict[ft] = results

Proliferation, bile duct


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Ground glass appearance


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Increased mitosis


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Inclusion body, intracytoplasmic


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Deposit, pigment


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Single cell necrosis


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Vacuolization, cytoplasmic


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Swelling


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [None]:
# Row labels (models) and column labels (layers) of the per-finding score grid.
model_name = ["Control", "Head", *[f"Block {7-i}" for i in range(7)], "Full"]
layer_name = ["Stem", *[f"Block {i+1}" for i in range(7)], "Head"]
metric_names = ["AUROC", "MCC", "Balanced Accuracy", "AP"]

data = []
for ft in ft_list:
    results = res_dict[ft]
    # results[1:-1] holds the 81 (model, layer) fits; lay them out as a 9 x 9 grid of 4 metrics.
    res = np.concatenate(results[1:-1]).reshape(9, 9, 4)
    # The "Control" row starts as a copy of the first model's row; only its
    # last layer entry is replaced by the control-feature scores (results[0]).
    control_row = res[0:1, :, :].copy()
    control_row[0, -1, :] = results[0][0]
    res = np.concatenate([control_row, res])

    for i, model_label in enumerate(model_name):
        for j, layer_label in enumerate(layer_name):
            data.append((model_label, layer_label, ft, *res[i, j]))

    # Final entry of results: the baseline fitted on the other findings' labels.
    data.append(("Pathological Findings", "-", ft, *results[-1][0]))

score_df = pd.DataFrame(
    data,
    columns=["model", "layer", "pathological findings", *metric_names],
)
score_df.to_csv(
    f"../../outputs/results/230721finding_type_validation_loo_seed{seed}.csv",
    index=False,
)

In [16]:
# Display the score table assembled above; the rendered output elides the middle rows.
score_df

Unnamed: 0,model,layer,pathological findings,AUROC,MCC,Balanced Accuracy,AP
0,Control,Stem,"Proliferation, bile duct",0.896421,0.230107,0.579122,0.464976
1,Control,Block 1,"Proliferation, bile duct",0.938158,0.638737,0.839545,0.581823
2,Control,Block 2,"Proliferation, bile duct",0.968936,0.782962,0.950758,0.817201
3,Control,Block 3,"Proliferation, bile duct",0.972933,0.766453,0.930717,0.852656
4,Control,Block 4,"Proliferation, bile duct",0.972741,0.768363,0.932572,0.850484
...,...,...,...,...,...,...,...
723,Full,Block 5,Swelling,0.896128,0.716423,0.802823,0.840848
724,Full,Block 6,Swelling,0.876552,0.673697,0.780981,0.807422
725,Full,Block 7,Swelling,0.906462,0.724940,0.809520,0.848972
726,Full,Head,Swelling,0.904169,0.717028,0.798454,0.855862


## ToxReprCNN scores

In [21]:
# Same evaluation for the "whole" model group, i.e. the last group in
# ft_list + ["whole"]. res_dict[f"whole_{ft}"] is a list of (1, 4) score arrays:
#   [0]   control-model features,
#   [1:]  the 9 x 9 (model, layer) feature sets of the "whole" group.
# No findings baseline is appended here.

# Loop-invariant inputs, computed once instead of once per finding type.
labels = test_df[ft_list].values
control_X = np.concatenate(features_control)
whole_idx = len(ft_list)  # position of "whole" in ft_list + ["whole"]

for ft_i, ft in enumerate(ft_list):
    results = [logreg(control_X, labels, ft_i, train_idx, valid_idx)]
    for frozen in range(9):
        for depth in tqdm(range(9)):
            results.append(
                logreg(
                    np.concatenate(features[whole_idx][frozen][depth]),
                    labels,
                    ft_i,
                    train_idx,
                    valid_idx,
                )
            )
    res_dict[f"whole_{ft}"] = results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [22]:
# Assemble the score table for the "whole" model group and write it to CSV.
data = []
model_name = ["Control"] + ["Head"] + [f"Block {7-i}" for i in range(7)] + ["Full"]
layer_name = ["Stem"] + [f"Block {i+1}" for i in range(7)] + ["Head"]

for ft in ft_list:
    results = res_dict[f"whole_{ft}"]
    # results[0] is the control fit; results[1:] are the 81 (model, layer) fits.
    res = np.concatenate(results[1:]).reshape(9, 9, 4)
    # The "Control" row starts as a copy of the first model's row; only its
    # last layer entry is replaced by the control-feature scores.
    r = res[0:1, :, :].copy()
    r[0, -1, :] = results[0][0]
    res = np.concatenate([r, res])
    for i in range(10):
        for j in range(9):
            data.append(
                (
                    model_name[i],
                    layer_name[j],
                    ft,
                    res[i, j, 0],
                    res[i, j, 1],
                    res[i, j, 2],
                    res[i, j, 3],
                )
            )
    # Bug fix: the "whole_*" result lists carry no findings baseline, so
    # results[-1] is the last (model, layer) fit and was being written out
    # mislabelled as "Pathological Findings". The baseline is fitted on the
    # other findings' label columns only, so it does not depend on the model
    # group; take it from the leave-one-out results, where it is the last entry.
    baseline = res_dict[ft][-1]
    data.append(
        (
            "Pathological Findings",
            "-",
            ft,
            baseline[0, 0],
            baseline[0, 1],
            baseline[0, 2],
            baseline[0, 3],
        )
    )

score_df = pd.DataFrame(
    data,
    columns=[
        "model",
        "layer",
        "pathological findings",
        "AUROC",
        "MCC",
        "Balanced Accuracy",
        "AP",
    ],
)
score_df.to_csv(
    f"../../outputs/results/finding_type_validation_seed{seed}.csv",
    index=False,
)