# Preprocessing of finding information

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Project root; the data folders below are derived from it.
root = "/workspace/tggate"

# Seed value (intended for control sampling).
seed = 24771

# Data folders: raw inputs and processed outputs under <root>/data.
ORIGINAL_DIR = "/".join([root, "data", "original"])
PROCESSED_DIR = "/".join([root, "data", "processed"])

In [3]:
# load
# NOTE(review): the saved cell output shows a warning pointing at the first
# read_csv call below - possibly a mixed-dtype warning; confirm and pin dtypes if so.
df = pd.read_csv(f"{root}/data/tggate_info_ext.csv")
df_info=pd.read_csv(f"{ORIGINAL_DIR}/finding_information.csv")
# One finding name per line; the context manager closes the file handle
# (the previous bare open() inside the comprehension never closed it).
with open(f"{PROCESSED_DIR}/finding_lst.txt", "r") as f_finding:
    lst_finding = [line.rstrip() for line in f_finding]

  df = pd.read_csv(f"{root}/data/tggate_info_ext.csv")


In [4]:
# preprocessing
# Treat "-" cells as missing, then keep only rows that carry an Open TG-GATEs name.
df_info = df_info.replace("-", np.nan).dropna(subset=["Open TG-GATEs"])
finding_tggate = list(set(df_info["Open TG-GATEs"]))

# Print every requested finding that has no counterpart in the information table.
for finding in lst_finding:
    if finding in finding_tggate:
        continue
    print(finding)


Bacterium
Scar
DEAD
Lesion,NOS


* These findings have no matching `Open TG-GATEs` entry in `finding_information.csv`, so they are dropped in the conversion below.

In [5]:
def df_conv(df_finding, dict_conv):
    """Rename finding columns through ``dict_conv`` and merge columns that share a name.

    Parameters
    ----------
    df_finding : pandas.DataFrame
        One column per original finding name, one row per sample.
    dict_conv : dict
        Maps an original column name to its converted name. Columns that are
        absent from the mapping, or mapped to a missing value, are dropped.

    Returns
    -------
    pandas.DataFrame
        One column per converted name (sorted by the groupby), holding the
        row-wise maximum over all original columns mapped to that name.
        The row index of ``df_finding`` is kept as is; the input is not modified.
    """
    # Converted name for every column; NaN marks a column without a mapping.
    targets = pd.Series(
        [dict_conv.get(col, np.nan) for col in df_finding.columns],
        dtype=object,
    )
    keep = targets.notna().to_numpy()

    # Select by boolean mask instead of a temporary "INDEX" helper column:
    # the helper clobbered a sample whose row label is "INDEX" and turned the
    # sample index into object dtype.
    df_res = df_finding.loc[:, keep].copy()
    df_res.columns = targets[keep].tolist()

    # Group the (transposed) columns by their new name and take the maximum.
    return df_res.T.groupby(level=0).max().T

# Lookup tables: Open TG-GATEs finding name -> Category1 / Category3 label.
tggate_names = df_info["Open TG-GATEs"]
dict_conv_cat1 = dict(zip(tggate_names, df_info["Category1"]))
dict_conv_cat3 = dict(zip(tggate_names, df_info["Category3"]))

# Finding columns to convert, and the sample-information columns to carry along.
df_temp = df[lst_finding]
info_cols = ["COMPOUND_NAME", "DOSE_LEVEL", "SACRI_PERIOD", "FOLD"]

# Side-by-side: information columns, Category1 columns, Category3 columns.
parts = [
    df[info_cols],
    df_conv(df_temp, dict_conv_cat1),
    df_conv(df_temp, dict_conv_cat3),
]
df_res = pd.concat(parts, axis=1)

In [6]:
# Write the converted table without the row index, then preview the first rows.
converted_path = f"{PROCESSED_DIR}/finding_converted.csv"
df_res.to_csv(converted_path, index=False)
df_res.head()

Unnamed: 0,COMPOUND_NAME,DOSE_LEVEL,SACRI_PERIOD,FOLD,Billary Change,Hepatocellular Degeneration,"Hepatocellular Injury, and Death",Hepatocellular Responses,Inflammation,Other,...,"Necrosis, fibrinoid","Nodule, hepatodiaphragmatic",Nucleic Alteration,"Proliferation, Kupffer cell","Proliferation, bile duct","Proliferation, oval cell",Pyknosis,Single cell necrosis,Thrombus,"Vacuolization, nuclear"
0,acetaminophen,Middle,9 hr,2,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,acetaminophen,Middle,9 hr,2,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,acetaminophen,Middle,24 hr,4,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,acetaminophen,Middle,24 hr,4,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,acetaminophen,Middle,24 hr,4,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Evaluation

In [5]:
import sys
import os

import numpy as np
import pandas as pd

# Folders: the feature directory handed to evaluate() and the project root.
folder_feature = "/workspace/HDD3/TGGATEs/feature"
root = "/workspace/tggate"

sys.path.append(f"{root}/src/SelfSupervisedLearningPathology")
from evaluate.tggatefold import ClassificationFold

# Settings passed to evaluate() as `params` alongside pred_method="logistic_regression".
lr_params = dict(
    penalty="l2",
    C=100,
    random_state=24771,
    class_weight="balanced",
    max_iter=10000,
)

# Layer indices iterated over by the evaluation loops.
lst_layer = [5, 4]

In [8]:
# For each layer: evaluate the pretrained/224 features twice on the same
# ClassificationFold instance (num_patch=512, then num_patch=None) and pickle each result.
for layer in lst_layer:
    dat = ClassificationFold()
    # Keyword arguments shared by both evaluate() calls of this layer.
    shared = dict(
        folder=f"{folder_feature}/pretrained/224",
        name="layer",
        layer=layer,
        pretrained=True,
        n_fold=5,
        wsi=True,
        strategy="max",
        random_state=24771,
        convertz=True,
        compression=False,
        pred_method="logistic_regression",
        params=lr_params,
        finding=True,
    )

    lst_res = dat.evaluate(num_patch=512, **shared)
    pd.to_pickle(lst_res, f"/workspace/tggate/result/finding/upper/pretrained_512_layer{layer}.pickle")

    lst_res = dat.evaluate(num_patch=None, **shared)
    pd.to_pickle(lst_res, f"/workspace/tggate/result/finding/upper/pretrained_all_layer{layer}.pickle")

In [6]:
# Same as the pretrained/224 evaluation, but with delete_sample=True for the
# "Control" and "Low" entries of lst_delete_conc; results go to *_delete.pickle.
for layer in lst_layer:
    dat = ClassificationFold()
    # Keyword arguments shared by both evaluate() calls of this layer.
    shared = dict(
        folder=f"{folder_feature}/pretrained/224",
        name="layer",
        layer=layer,
        pretrained=True,
        n_fold=5,
        wsi=True,
        strategy="max",
        random_state=24771,
        convertz=True,
        compression=False,
        pred_method="logistic_regression",
        params=lr_params,
        finding=True,
        delete_sample=True,
        lst_delete_conc=["Control", "Low"],
    )

    lst_res = dat.evaluate(num_patch=512, **shared)
    pd.to_pickle(lst_res, f"/workspace/tggate/result/finding/upper/pretrained_512_layer{layer}_delete.pickle")

    lst_res = dat.evaluate(num_patch=None, **shared)
    pd.to_pickle(lst_res, f"/workspace/tggate/result/finding/upper/pretrained_all_layer{layer}_delete.pickle")

In [7]:
# For each layer: evaluate the "concat_layer" features from the research folder
# (wsi=False), first on all samples and then with delete_sample=True.
# A fresh ClassificationFold instance is used for each of the two runs.
for layer in lst_layer:
    # Keyword arguments shared by both evaluate() calls of this layer.
    shared = dict(
        folder=f"{folder_feature}/research",
        name="concat_layer",
        layer=layer,
        pretrained=True,
        n_fold=5,
        wsi=False,
        strategy="max",
        random_state=24771,
        convertz=True,
        compression=False,
        pred_method="logistic_regression",
        params=lr_params,
        finding=True,
    )

    dat = ClassificationFold()
    lst_res = dat.evaluate(**shared)
    pd.to_pickle(lst_res, f"/workspace/tggate/result/finding/pretrained_all_concat{layer}.pickle")

    dat = ClassificationFold()
    lst_res = dat.evaluate(
        delete_sample=True,
        lst_delete_conc=["Control", "Low"],
        **shared,
    )
    pd.to_pickle(lst_res, f"/workspace/tggate/result/finding/pretrained_all_concat{layer}_delete.pickle")

In [None]:
# For each layer: evaluate the features in the "fold" folder (pretrained=False)
# with num_patch=512 and with num_patch=None, pickling each result.
for layer in lst_layer:
    dat = ClassificationFold()
    # Keyword arguments shared by both evaluate() calls of this layer.
    shared = dict(
        folder=f"{folder_feature}/fold",
        name="layer",
        layer=layer,
        pretrained=False,
        n_fold=5,
        wsi=True,
        strategy="max",
        random_state=24771,
        convertz=True,
        compression=False,
        pred_method="logistic_regression",
        params=lr_params,
        finding=True,
    )

    lst_res = dat.evaluate(num_patch=512, **shared)
    pd.to_pickle(lst_res, f"/workspace/tggate/result/finding/upper/btfold_512_layer{layer}.pickle")

    lst_res = dat.evaluate(num_patch=None, **shared)
    pd.to_pickle(lst_res, f"/workspace/tggate/result/finding/upper/btfold_all_layer{layer}.pickle")

In [None]:
# "fold" features evaluated with sample deletion; results go to *_delete.pickle.
# Fix: delete_sample was False here although lst_delete_conc is supplied and the
# output files are named "_delete"; the parallel pretrained cell uses
# delete_sample=True, so this cell now does the same.
# NOTE(review): confirm against ClassificationFold.evaluate that delete_sample=True
# is what enables lst_delete_conc.
for layer in lst_layer:
    dat = ClassificationFold()
    # Keyword arguments shared by both evaluate() calls of this layer.
    shared = dict(
        folder=f"{folder_feature}/fold",
        name="layer",
        layer=layer,
        pretrained=False,
        n_fold=5,
        wsi=True,
        strategy="max",
        random_state=24771,
        convertz=True,
        compression=False,
        pred_method="logistic_regression",
        params=lr_params,
        finding=True,
        delete_sample=True,
        lst_delete_conc=["Control", "Low"],
    )

    lst_res = dat.evaluate(num_patch=512, **shared)
    pd.to_pickle(lst_res, f"/workspace/tggate/result/finding/upper/btfold_512_layer{layer}_delete.pickle")

    lst_res = dat.evaluate(num_patch=None, **shared)
    pd.to_pickle(lst_res, f"/workspace/tggate/result/finding/upper/btfold_all_layer{layer}_delete.pickle")

# Plot

In [14]:
# Reload a stored evaluation result (layer 4, all patches, with sample deletion).
result_path = f"/workspace/tggate/result/finding/upper/pretrained_all_layer4_delete.pickle"
res = pd.read_pickle(result_path)

In [15]:
# Stack the frames in `res` (presumably one per fold - confirm), average each
# metric per index label, and sort ascending by mAP.
# `axis=0` is the groupby default and passing `axis` is deprecated in recent
# pandas, so it is omitted; the result is unchanged.
pd.concat(res).groupby(level=0).mean().sort_values(by=["mAP"])

  pd.concat(res).groupby(level=0,axis=0).mean().sort_values(by=["mAP"])


Unnamed: 0,AUROC,AUPR,mAP,Accuracy,Balanced Accuracy
Inflammation,0.808772,0.358946,0.362183,0.784142,0.719025
Hepatocellular Responses,0.85658,0.442657,0.445109,0.862427,0.758151
Hepatocellular Degeneration,0.799793,0.475787,0.478353,0.764163,0.717143
Billary Change,0.920462,0.507811,0.509622,0.977999,0.759101
"Hepatocellular Injury, and Death",0.811808,0.514539,0.515959,0.770296,0.734582
Proliferative Lesions,0.837906,0.576811,0.578497,0.781396,0.75632
