# Data Split

## import library

In [1]:
import pickle
import random
from glob import glob

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

from toxreprcnn.data_split import RepeatedStratifiedGroupKFold


# Data split

In [3]:
root = ".."

with open(f"{root}/processed_data/210925single_ft_tile_predicts.pickle", "rb") as f:
    labels = pickle.load(f)


In [9]:
print(*sorted(list(labels.keys())), sep="\n")

Alteration, cytoplasmic
Alteration, nuclear
Atrophy
Atypia, nuclear
Cellular infiltration, mononuclear cell
Change, acidophilic
Change, basophilic
Change, eosinophilic
Cyst
Degeneration
Degeneration, acidophilic, eosinophilic
Degeneration, granular, eosinophilic
Degeneration, hydropic
Degeneration, vacuolar
Deposit, glycogen
Deposit, hemosiderin
Deposit, lipid
Deposit, pigment
Fibrosis
Ground glass appearance
Hematopoiesis, extramedullary
Hemorrhage
Hypertrophy
Inclusion body, intracytoplasmic
Increased mitosis
Inflammation
Inflammation, suppurative
Lesion,NOS
Microgranuloma
Mineralization
Necrosis
Nodule, hepatodiaphragmatic
Proliferation
Proliferation, Kupffer cell
Proliferation, bile duct
Proliferation, oval cell
Scar
Single cell necrosis
Swelling
Thrombus
Vacuolization, cytoplasmic
Vacuolization, nuclear


labelsは各finding typeが存在するWSIから抽出されたtileに関してWSSSで予測されたpathological finding存在確率を格納したデータ

finding typeをkeyとするdictで、各valueはtileのリストと存在確率のペアになっている

In [3]:
labels["Proliferation, oval cell"][0][:5]


['/data0/TGGATE/tiles/46789.svs/7.tiff',
 '/data0/TGGATE/tiles/46789.svs/47.tiff',
 '/data0/TGGATE/tiles/46789.svs/10.tiff',
 '/data0/TGGATE/tiles/46789.svs/89.tiff',
 '/data0/TGGATE/tiles/46789.svs/0.tiff']

In [4]:
labels["Proliferation, oval cell"][1][0][:5]


array([[0.99992156],
       [0.9815824 ],
       [0.929671  ],
       [0.99985194],
       [0.999483  ]], dtype=float32)

selected finding types

In [5]:
fts = """Proliferation, bile duct
Ground glass appearance
Increased mitosis
Inclusion body, intracytoplasmic
Deposit, pigment
Single cell necrosis
Vacuolization, cytoplasmic
Swelling"""
fts = fts.split("\n")


In [6]:
tile_set = set()

for ft in fts:
    tile_set |= set(labels[ft][0])
positive_set = set([t.split("/")[-2] for t in tile_set])


In [7]:
tile_root = "/data/TGGATE/tiles"

path_list = glob(f"{tile_root}/*.svs")
svs_path_dict = {path.split("/")[-1]: path for path in path_list}
info = pd.read_csv(f"{root}/../../info/info.csv")
info = info[info["SACRI_PERIOD"].isin(["4 day", "8 day", "15 day", "29 day"])]
whole_set = set(svs_path_dict.keys()) & set(info["FILE"])
positive_set = whole_set & positive_set
rest_set = whole_set - positive_set


  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
positive_list = list(positive_set)

positive_info = []

for wsi in tqdm(positive_list):
    for t in glob(svs_path_dict[wsi] + "/*.tiff"):
        positive_info.append((t, "/".join(t.split("/")[-2:]), t.split("/")[-2]))

df = pd.DataFrame(positive_info, columns=["path", "tilename", "FILE"])


100%|██████████| 585/585 [00:00<00:00, 3546.97it/s]


In [9]:
for ft in fts:
    dat = []

    data = list(zip(labels[ft][0], np.concatenate(labels[ft][1])))
    for p, l in data:
        tilename = "/".join(p.split("/")[-2:])
        dat.append([tilename, l[0]])
    # add label information
    df = pd.merge(
        df, pd.DataFrame(dat, columns=["tilename", ft]), on="tilename", how="left"
    )


In [10]:
for ft in fts:
    df.loc[df[ft].isnull(), ft] = 0
df = df[((df[fts] > 0) & (df[fts] < 0.5)).sum(axis=1) == 0]


In [11]:
df


Unnamed: 0,path,tilename,FILE,"Proliferation, bile duct",Ground glass appearance,Increased mitosis,"Inclusion body, intracytoplasmic","Deposit, pigment",Single cell necrosis,"Vacuolization, cytoplasmic",Swelling
0,/mnt/local/HDD/TGGATE/tiles/33958.svs/7.tiff,33958.svs/7.tiff,33958.svs,0.0,0.0,0.0,0.0,0.0,0.0,0.996757,0.000000
1,/mnt/local/HDD/TGGATE/tiles/33958.svs/47.tiff,33958.svs/47.tiff,33958.svs,0.0,0.0,0.0,0.0,0.0,0.0,0.998521,0.000000
2,/mnt/local/HDD/TGGATE/tiles/33958.svs/10.tiff,33958.svs/10.tiff,33958.svs,0.0,0.0,0.0,0.0,0.0,0.0,0.994510,0.000000
3,/mnt/local/HDD/TGGATE/tiles/33958.svs/89.tiff,33958.svs/89.tiff,33958.svs,0.0,0.0,0.0,0.0,0.0,0.0,0.997625,0.000000
4,/mnt/local/HDD/TGGATE/tiles/33958.svs/0.tiff,33958.svs/0.tiff,33958.svs,0.0,0.0,0.0,0.0,0.0,0.0,0.999719,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
58495,/mnt/local/HDD/TGGATE/tiles/25060.svs/72.tiff,25060.svs/72.tiff,25060.svs,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.986450
58496,/mnt/local/HDD/TGGATE/tiles/25060.svs/48.tiff,25060.svs/48.tiff,25060.svs,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.994683
58497,/mnt/local/HDD/TGGATE/tiles/25060.svs/93.tiff,25060.svs/93.tiff,25060.svs,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.940306
58498,/mnt/local/HDD/TGGATE/tiles/25060.svs/73.tiff,25060.svs/73.tiff,25060.svs,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.866732


In [12]:
negative_list = random.sample(rest_set, k=len(df["FILE"].unique()))

negative_info = []
for wsi in tqdm(negative_list):
    for t in glob(svs_path_dict[wsi] + "/*.tiff"):
        negative_info.append((t, "/".join(t.split("/")[-2:]), t.split("/")[-2]))
df_neg = pd.DataFrame(negative_info, columns=["path", "tilename", "FILE"])
for ft in fts:
    df_neg[ft] = 0


100%|██████████| 584/584 [00:03<00:00, 169.12it/s]


In [13]:
df = pd.concat([df, df_neg])


In [14]:
df = pd.merge(df, info[["EXP_ID", "GROUP_ID", "FILE"]], on="FILE", how="inner")
df["EG"] = df["EXP_ID"] * 100 + df["GROUP_ID"]

In [15]:
model_train_df = df


## Data Split


In [16]:
rsgf = RepeatedStratifiedGroupKFold(n_splits=5, random_state=123, n_repeats=100)
g = LabelEncoder().fit_transform(info["COMPOUND_NAME"].to_numpy())
y = info[fts].values.astype(np.int64)
X = info[["FILE"]].to_numpy()
fold = np.zeros(len(info), dtype=np.int64)

for i, (_, idx) in enumerate(rsgf.split(X, y, g)):
    fold[idx] = i
fold = LabelEncoder().fit_transform(fold)
for i in range(6):
    print(y[fold == i].sum(axis=0))
info["fold 1"] = fold


[25 54 27 11  9 66 23  3]
[49  0 39  3  6 10 66 22]
[72 57 41 15  8 12 15 65]
[ 0 94 51  1 15 15 22 27]
[ 15   3  49   0   2  27 100   0]
[0 0 0 0 0 0 0 0]


In [17]:
train1 = info[info["fold 1"] != 0]


In [18]:
spl = np.zeros(len(train1), dtype=np.int64)
spl[train1["SACRI_PERIOD"].isin(["15 day", "29 day"])] = 1
train1["SP_label"] = spl
train1["EG"] = train1["GROUP_ID"] + train1["EXP_ID"] * 100

g = LabelEncoder().fit_transform(train1["EG"].to_numpy())
y = train1[fts + ["SP_label"]].to_numpy().astype(np.int64)
X = train1[["FILE"]].to_numpy()
fold = np.zeros(len(train1), dtype=np.int64)

for i, (_, idx) in enumerate(rsgf.split(X, y, g)):
    fold[idx] = i
fold = LabelEncoder().fit_transform(fold)

for i in range(5):
    print(y[fold == i].sum(axis=0))
train1["fold 2"] = fold


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


[ 53  21  43   2   3  18  33  41 905]
[ 24  31  47   0   7   9  33  11 920]
[ 14  28  27   5  14  23  42  17 920]
[ 17  47  38  11   5   7  53  11 913]
[ 28  27  25   1   2   7  42  34 930]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [19]:
train2 = train1[train1["fold 2"] != 0]
df_train, df_test = (
    model_train_df[model_train_df["FILE"].isin(train2["FILE"])],
    model_train_df[~model_train_df["FILE"].isin(train2["FILE"])],
)


In [20]:
gkf = GroupKFold(n_splits=4)

df_train = df_train.sample(frac=1.0)
X = df_train["path"].to_numpy()
y = df_train[fts].to_numpy()
g = df_train["EG"].to_numpy()
fold = np.zeros(len(df_train), dtype=np.int64)

for i, (_, idx) in enumerate(gkf.split(X, y, g)):
    fold[idx] = i
df_train["fold"] = fold
for i in range(4):
    print(y[fold == i].sum(axis=0), (fold == i).sum())


[1014.30326253 1764.41105086 1562.044451      0.          996.43894494
  636.48902392 2498.13666964 1231.07411551] 17121
[1.32425001e+03 1.85360680e+03 1.92004739e+03 0.00000000e+00
 5.63792229e-01 1.01190046e+03 2.54373950e+03 2.20449438e+03] 17121
[ 793.87658972 1154.38245654 1360.54008907  158.96599692  496.35570306
  497.47269809 2310.47177023 1813.83811706] 17121
[2278.36805439 1124.56491148  871.0816865   192.65701479  750.27161199
 1058.77266049 2552.09886622  580.8174988 ] 17121


In [21]:
out_dir = "../../data/TGGATEs/processed"

df_train.to_csv(f"{out_dir}/train_val_for_model_training.csv", index=False)
df_test.to_csv(f"{out_dir}/test_for_finding_types.csv", index=False)
train2.to_csv(f"{out_dir}/train.csv", index=False)