# Early-to-late Prediction

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import timm
import torch
import torch.nn as nn
from albumentations import CenterCrop, Compose, Normalize
from albumentations.pytorch import ToTensorV2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (average_precision_score, balanced_accuracy_score,
                             matthews_corrcoef, roc_auc_score)
from sklearn.model_selection import GroupKFold
from torch.utils.data import DataLoader
from tqdm import tqdm
from glob import glob

from toxreprcnn.dataset import ToxReprCNNDataset
from toxreprcnn.data_split import RepeatedStratifiedGroupKFold
from toxreprcnn.model import EffnetB4ModelMO, FrozenEffnetB4ModelMO
from toxreprcnn.utils import fix_seed

root = ".."

In [2]:
info = pd.read_csv(f"{root}/data/TGGATEs/processed/info.csv")
train = pd.read_csv(f"{root}/data/TGGATEs/processed/train.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
test_info = info[~info["COMPOUND_NAME"].isin(train["COMPOUND_NAME"])]
test_info = test_info[(test_info["DOSE"]>0) & (test_info["SACRI_PERIOD"].isin(["4 day", "8 day", "15 day", "29 day"]))]

In [4]:
test_tiles = []

for f in tqdm(test_info["FILE"].values):
    for p in glob(f"/mnt/local/extHDD1/TGGATE/tiles/{f}/*.tiff"):
        test_tiles.append(p)
len(test_tiles)

100%|██████████| 1588/1588 [00:00<00:00, 5560.73it/s]


141193

In [5]:
seed = 123
fix_seed(seed)

In [6]:
image_size = 512

vl_transform = Compose([CenterCrop(image_size, image_size), Normalize(), ToTensorV2()])


In [7]:
test_dataset = ToxReprCNNDataset(
    test_tiles, [0]*len(test_tiles), transform=vl_transform
)

test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    drop_last=False,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

model_control = timm.create_model("tf_efficientnet_b4_ns", pretrained=True, num_classes=0)
model_control.eval()
model_control.to("cuda")

ft_list = [None]*8

models_multiseed = []
for seed in range(123,128):
    save_dir = f"{root}/outputs/TGGATEs_model_seed{seed}"
    models = [FrozenEffnetB4ModelMO(i, len(ft_list)) for i in range(8)] + [EffnetB4ModelMO(num_classes=len(ft_list))]
    for i, model in enumerate(models):
        if i <= 7:
            model.load_state_dict(
                torch.load(f"{save_dir}/{i}/effnetb4_freeze{i}_fold0_best_loss.pth")
            )
            model.classifier = nn.Identity()
        else:
            model.model.load_state_dict(
                torch.load(f"{save_dir}/{i}/effnetb4_freeze{i}_fold0_best_loss.pth")
            )
            model.model.classifier = nn.Identity()
        model.to("cuda")
        model.eval()
    models_multiseed.append(models)

In [8]:
features_control = []
features = [[[[] for k in range(9)] for i in range(9)] for seed in range(5)]
with torch.no_grad():
    for im, _ in tqdm(test_loader):
        im = im.to("cuda")
        outputs = model_control(im)
        features_control.append(outputs.to("cpu").numpy())
        for seed in range(5):
            for j in range(9):
                outputs = models_multiseed[seed][j](im)
                for k, f in enumerate(outputs):
                    features[seed][j][k].append(f.to("cpu").numpy())


100%|██████████| 4413/4413 [8:02:57<00:00,  6.57s/it]  


In [10]:
for seed in range(5):
    print(seed)
    with open(f"/mnt/local/extHDD2/data/TGGATE/230310prognosis_features_seed{123+seed}.pickle", "wb") as f:
        pickle.dump({"features" : features[seed],
                    "features_control" : features_control,
                    "test_tiles" : test_tiles}, f)

0
1
2
3
4


In [8]:
with open("../../outputs/230310prognosis_features_multiseed.pickle", "rb") as f:
    features = pickle.load(f)

EOFError: Ran out of input

In [None]:
data = features
features = data["features"]
features_control = data["features_control"]
test_tiles = data["test_tiles"]

In [11]:
early_info = test_info[test_info["SACRI_PERIOD"].isin(["4 day", "8 day"])]
late_info = test_info[test_info["SACRI_PERIOD"].isin(["15 day", "29 day"])]

In [12]:
ft_all_list = ['Accumulation, foam cell', 'Adenoma, hepatocellular',
       'Alteration, cytoplasmic', 'Alteration, nuclear',
       'Altered hepatocellular foci', 'Anisonucleosis', 'Atrophy',
       'Atypia, nuclear', 'Bacterium', 'Cellular foci',
       'Cellular infiltration', 'Cellular infiltration, mononuclear cell',
       'Cellular infiltration, neutrophil', 'Change, acidophilic',
       'Change, basophilic', 'Change, eosinophilic', 'Congestion', 'Cyst',
       'DEAD', 'Degeneration', 'Degeneration, acidophilic, eosinophilic',
       'Degeneration, fatty', 'Degeneration, granular',
       'Degeneration, granular, eosinophilic', 'Degeneration, hydropic',
       'Degeneration, vacuolar', 'Deposit, glycogen', 'Deposit, hemosiderin',
       'Deposit, lipid', 'Deposit, pigment', 'Dilatation', 'Disarrangement',
       'Ectopic tissue', 'Edema', 'Fibrosis', 'Giant cell', 'Granuloma',
       'Ground glass appearance', 'Hematopoiesis, extramedullary',
       'Hemorrhage', 'Hyperplasia', 'Hypertrophy',
       'Inclusion body, intracytoplasmic', 'Increased mitosis', 'Inflammation',
       'Inflammation, foreign body', 'Inflammation, suppurative', 'Lesion,NOS',
       'Microgranuloma', 'Mineralization', 'Necrosis', 'Necrosis, fibrinoid',
       'Nodule, hepatodiaphragmatic', 'Phagocytosis', 'Proliferation',
       'Proliferation, Kupffer cell', 'Proliferation, bile duct',
       'Proliferation, oval cell', 'Pyknosis', 'Scar', 'Single cell necrosis',
       'Swelling', 'Thrombus', 'Vacuolization, cytoplasmic',
       'Vacuolization, nuclear']

In [13]:
compound_label = {}
for com in late_info["COMPOUND_NAME"].unique():
    compound_label[com] = late_info[late_info["COMPOUND_NAME"] == com][ft_all_list].mean().to_numpy() > 0

early_label = np.array([compound_label[com] for com in early_info["COMPOUND_NAME"].to_numpy()])

for i, ft in enumerate(ft_all_list):
    early_info["early_" + ft] = early_label[:, i]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
def eval(gt, pr):
    return [
        roc_auc_score(gt >= 0.5, pr),
        matthews_corrcoef(gt >= 0.5, pr >= 0.5),
        balanced_accuracy_score(gt >= 0.5, pr >= 0.5),
        (average_precision_score(gt >= 0.5, pr) + average_precision_score(gt < 0.5, -pr))/2,
    ]


def prognosis_test(features, test_tiles):
    ret = []
    if features is not None:
        wsi = [tile.split("/")[-2] for tile in test_tiles]
        df = pd.DataFrame(features)
        df["FILE"] = wsi
        late_features = pd.merge(late_info, df, on="FILE", how="inner")
        late_features = late_features.groupby("FILE").mean()
        X_train = late_features[range(features.shape[1])].to_numpy()
        early_features = pd.merge(early_info, df, on="FILE", how="inner")
        early_features = early_features.groupby("FILE").mean()
        X_test = early_features[range(features.shape[1])].to_numpy()
        lr_list = []
        for ft in ft_all_list:
            y_train = late_features[ft].to_numpy()
            if y_train.sum() == 0:
                ret.append([np.nan]*4)
                lr_list.append(None)
                continue
            lr = LogisticRegression(max_iter=10000)
            lr.fit(X_train, y_train)
            y_test = early_features["early_" + ft].to_numpy()
            if y_test.sum() == 0:
                ret.append([np.nan]*4)
                lr_list.append(None)
                continue
            y_preds = lr.predict_proba(X_test)[:, 1]
            ret.append(eval(y_test, y_preds))
            lr_list.append(lr)
        return ret, lr_list
    else:
        for ft in ft_all_list:
            y_test = early_info["early_" + ft].to_numpy(dtype=np.int64)
            if y_test.sum() == 0:
                ret.append([np.nan]*4)
                continue
            y_preds = early_info[ft].to_numpy(dtype=np.int64)
            ret.append(eval(y_test, y_preds))
        return ret

In [15]:

result = [[[0 for j in range(9)] for i in range(10)] for seed in range(5)]
lrs = [[[0 for j in range(9)] for i in range(10)] for seed in range(5)]
for seed in range(5):
    features_temp = [[features[seed][0][i] for i in range(8)] + [features_control]] + features[seed]
    for i in tqdm(range(10)):
        for j in range(9):
            result[seed][i][j], lrs[seed][i][j] = prognosis_test(np.concatenate(features_temp[i][j]), test_tiles)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_L

In [16]:
result_ft = prognosis_test(None, test_tiles)

In [15]:
result = np.array(result)

In [18]:
for seed in range(5):
    model_name = ["Control"] + ["Head"] + [f"Block {7-i}" for i in range(7)] + ["Full"]
    layer_name = ["Stem"] + [f"Block {i+1}" for i in range(7)] + ["Head"]
    rec = []
    for i in range(10):
        for j in range(9):
            for k in range(len(ft_all_list)):
                if np.isnan(result[seed][i][j][k][0]):
                    continue
                rec.append([model_name[i], layer_name[j], ft_all_list[k]] + [result[seed][i][j][k][0], result[seed][i][j][k][3]])
    for k in range(len(ft_all_list)):
        if np.isnan(result_ft[k][0]):
            continue
        rec.append(["Pathological Findings", "-", ft_all_list[k]] + [result_ft[k][0], result_ft[k][3]])
    raw_df = pd.DataFrame(rec, columns=["model", "layer", "finding type", "AUROC", "AP"])
    raw_df.to_csv(f"{root}/outputs/prognosis_result_{seed+123}.csv", index=False)