# Preprocessing

1. Organizing data
2. Extracting patches

## 1. Organizing data

### Input files

- ```data/TGGATEs/orig/open_tggates_individual.csv```
- ```data/TGGATEs/orig/open_tggates_pathological_image.csv```
- ```data/TGGATEs/orig/open_tggates_pathology.csv```
- ```data/TGGATEs/orig/tggates_in_vivo_compound_info.txt```
- ```data/TGGATEs/orig/compound_table.db```

You can obtain the data respectively from

- http://togodb.org/db/open_tggates_individual
- http://togodb.org/db/open_tggates_pathological_image
- http://togodb.org/db/open_tggates_pathology
- https://toxico.nibiohn.go.jp/english/datalist.html


### Output files

- ```data/TGGATEs/processed/info.csv```
- ```data/TGGATEs/processed/ft_list.txt```
- ```data/TGGATEs/processed/compound_list.txt```

In [1]:
!pwd

/workspace/ToxRepresentatonCNN/ipynb


In [2]:
import sys

root = ".."
sys.path.append(root)


In [3]:
import toxreprcnn.preprocess
import os
from collections import defaultdict
from tqdm import tqdm
import openslide
import sqlite3
import pandas as pd
import numpy as np
from collections import defaultdict

In [4]:
root = ".."
orig_dir = f"{root}/data/TGGATEs/orig"
processed_dir = f"{root}/data/TGGATEs/processed"


ind_df = pd.read_csv(
    f"{orig_dir}/open_tggates_individual.csv", encoding="shift_jis")
image_df = pd.read_csv(
    f"{orig_dir}/open_tggates_pathological_image.csv", encoding="shift_jis"
)
path_df = pd.read_csv(
    f"{orig_dir}/open_tggates_pathology.csv", encoding="shift_jis")
compound_df = pd.read_csv(
    f"{orig_dir}/tggates_in_vivo_compound_info.txt", encoding="shift_jis", sep="\t"
)

display(ind_df.head())
display(image_df.head())
display(path_df.head())
display(compound_df.head())


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,exp_id,group_id,individual_id,indv_id,compound_name,compound_abbr,compound_no,species,test_type,sin_rep_type,...,organ_wt_link,bw_link,fc_link,has_liver_pathology,liver_pathology_link,has_kidney_pathology,kidney_pathology_link,liver_pathological_image_link,kidney_pathological_image_link,id
0,40,1,1,40011,acetaminophen,APAP,1,Rat,in vivo,Single,...,40011.0,/list?exp_id=0040&group_id=01&individual_id=1,,False,,False,,/list?exp_id=0040&group_id=01&individual_id=1&...,/list?exp_id=0040&group_id=01&individual_id=1&...,
1,40,1,2,40012,acetaminophen,APAP,1,Rat,in vivo,Single,...,40012.0,/list?exp_id=0040&group_id=01&individual_id=2,,False,,False,,/list?exp_id=0040&group_id=01&individual_id=2&...,/list?exp_id=0040&group_id=01&individual_id=2&...,
2,40,1,3,40013,acetaminophen,APAP,1,Rat,in vivo,Single,...,40013.0,/list?exp_id=0040&group_id=01&individual_id=3,,False,,False,,/list?exp_id=0040&group_id=01&individual_id=3&...,/list?exp_id=0040&group_id=01&individual_id=3&...,
3,40,1,4,40014,acetaminophen,APAP,1,Rat,in vivo,Single,...,40014.0,/list?exp_id=0040&group_id=01&individual_id=4,,False,,False,,/list?exp_id=0040&group_id=01&individual_id=4&...,/list?exp_id=0040&group_id=01&individual_id=4&...,
4,40,1,5,40015,acetaminophen,APAP,1,Rat,in vivo,Single,...,40015.0,/list?exp_id=0040&group_id=01&individual_id=5,,False,,False,,/list?exp_id=0040&group_id=01&individual_id=5&...,/list?exp_id=0040&group_id=01&individual_id=5&...,


Unnamed: 0,EXP_ID,GROUP_ID,INDIVIDUAL_ID,COMPOUND_NAME,ORGAN,FILE_LOCATION,CAPTURE_NO,SPECIES,SINGLE_REPEAT_TYPE,ADMINISTRATION_ROUTE_TYPE,ANIMAL_AGE(week),SACRIFICE_PERIOD,DOSE,DOSE_UNIT
0,40,1,1,acetaminophen,Liver,ftp://ftp.biosciencedbc.jp/archive/open-tggate...,1/1,Rat,Single,Gavage,6,3 hr,0,mg/kg
1,40,1,1,acetaminophen,Kidney,ftp://ftp.biosciencedbc.jp/archive/open-tggate...,1/1,Rat,Single,Gavage,6,3 hr,0,mg/kg
2,40,1,2,acetaminophen,Liver,ftp://ftp.biosciencedbc.jp/archive/open-tggate...,1/1,Rat,Single,Gavage,6,3 hr,0,mg/kg
3,40,1,2,acetaminophen,Kidney,ftp://ftp.biosciencedbc.jp/archive/open-tggate...,1/1,Rat,Single,Gavage,6,3 hr,0,mg/kg
4,40,1,3,acetaminophen,Liver,ftp://ftp.biosciencedbc.jp/archive/open-tggate...,1/1,Rat,Single,Gavage,6,3 hr,0,mg/kg


Unnamed: 0,BARCODE,EXP_ID,GROUP_ID,INDIVIDUAL_ID,COMPOUND_NAME,DOSE_LEVEL,SACRIFICE_PERIOD,ORGAN,FINDING_TYPE,TOPOGRAPHY_TYPE,GRADE_TYPE,SP_FLG
0,No ChipData,698,1,1,1% cholesterol + 0.25% sodium cholate,Control,4 day,Liver,"Deposit, glycogen",Peripheral,slight,t
1,No ChipData,698,2,5,1% cholesterol + 0.25% sodium cholate,Control,8 day,Liver,"Deposit, glycogen",Peripheral,slight,t
2,003017906001,698,5,1,1% cholesterol + 0.25% sodium cholate,High,4 day,Liver,"Degeneration, fatty",Peripheral,slight,f
3,No ChipData,698,5,2,1% cholesterol + 0.25% sodium cholate,High,4 day,Liver,"Degeneration, fatty",Peripheral,slight,f
4,003017906002,698,5,3,1% cholesterol + 0.25% sodium cholate,High,4 day,Liver,"Degeneration, fatty",Peripheral,slight,f


Unnamed: 0,COMPOUND_NAME,VEHICLE,Administration route,ORGAN,Single dose Low (mg/kg),Single dose Middle (mg/kg),Single dose High (mg/kg),Repeat dose Low (mg/kg),Repeat dose Middle (mg/kg),Repeat dose High (mg/kg)
0,acetaminophen,0.5% MC,Gavage,Liver,300,600,1000,300,600,1000
1,acetaminophen,0.5% MC,Gavage,Kidney,300,600,1000,300,600,1000
2,isoniazid,0.5% MC,Gavage,Liver,200,600,2000,50,100,200
3,carbon tetrachloride,corn oil,Gavage,Liver,30,100,300,30,100,300
4,phenobarbital,0.5% MC,Gavage,Liver,100,150,300,10,30,100


### Set new individual IDs for each rat

In [5]:
ind_df.columns = map(lambda x: x.upper(), ind_df.columns)

image_df = image_df[image_df["ORGAN"] == "Liver"]
path_df = path_df[path_df["ORGAN"] == "Liver"]
compound_df = compound_df[compound_df["ORGAN"] == "Liver"]

image_df["INDV_ID"] = (
    image_df["EXP_ID"] * 1000 + image_df["GROUP_ID"] *
    10 + image_df["INDIVIDUAL_ID"]
)

path_df["INDV_ID"] = (
    path_df["EXP_ID"] * 1000 + path_df["GROUP_ID"] *
    10 + path_df["INDIVIDUAL_ID"]
)

ind_df["COMPOUND_NAME"] = ind_df["COMPOUND_NAME"].where(
    ind_df["COMPOUND_NAME"] != "TNFﾎｱ", "TNFα"
)


### Finding Types

In [6]:
path_df["FINDING_TYPE"].head()

0      Deposit, glycogen
1      Deposit, glycogen
2    Degeneration, fatty
3    Degeneration, fatty
4    Degeneration, fatty
Name: FINDING_TYPE, dtype: object

In [7]:
ft_list = list(set(path_df["FINDING_TYPE"].dropna()))
len(ft_list)


65

TGGATEs liver pathological images contain 65 pathological finding types.

In [8]:
# saving the finding type list
open(f"{processed_dir}/ft_list.txt", "w").write("\n".join(ft_list) + "\n")


1199

Transforming the pathological finding type information into one-hot vectors.

In [9]:
one_hot = pd.get_dummies(
    path_df[["INDV_ID", "FINDING_TYPE"]].dropna(),
    columns=["FINDING_TYPE"],
    prefix="",
    prefix_sep="",
)
one_hot = one_hot.groupby("INDV_ID").max()
one_hot.head()

Unnamed: 0_level_0,"Accumulation, foam cell","Adenoma, hepatocellular","Alteration, cytoplasmic","Alteration, nuclear",Altered hepatocellular foci,Anisonucleosis,Atrophy,"Atypia, nuclear",Bacterium,Cellular foci,...,"Proliferation, Kupffer cell","Proliferation, bile duct","Proliferation, oval cell",Pyknosis,Scar,Single cell necrosis,Swelling,Thrombus,"Vacuolization, cytoplasmic","Vacuolization, nuclear"
INDV_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
40181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40185,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40191,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40192,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40193,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Merging the pathological finding type information with the individual rat data.

In [10]:
ind_ft_df = pd.merge(ind_df, one_hot, on="INDV_ID", how="left").fillna(0)
ind_ft_df.head()

Unnamed: 0,EXP_ID,GROUP_ID,INDIVIDUAL_ID,INDV_ID,COMPOUND_NAME,COMPOUND_ABBR,COMPOUND_NO,SPECIES,TEST_TYPE,SIN_REP_TYPE,...,"Proliferation, Kupffer cell","Proliferation, bile duct","Proliferation, oval cell",Pyknosis,Scar,Single cell necrosis,Swelling,Thrombus,"Vacuolization, cytoplasmic","Vacuolization, nuclear"
0,40,1,1,40011,acetaminophen,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,40,1,2,40012,acetaminophen,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,40,1,3,40013,acetaminophen,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40,1,4,40014,acetaminophen,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,40,1,5,40015,acetaminophen,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Compounds

In [11]:
ind_ft_df["COMPOUND_NAME"].head()


0    acetaminophen
1    acetaminophen
2    acetaminophen
3    acetaminophen
4    acetaminophen
Name: COMPOUND_NAME, dtype: object

Merging the compound information with the individual rat data.

In [12]:
ind_ft_com_df = pd.merge(
    ind_ft_df, compound_df[["COMPOUND_NAME", "VEHICLE"]], on="COMPOUND_NAME", how="left"
)
ind_ft_com_df.head()

Unnamed: 0,EXP_ID,GROUP_ID,INDIVIDUAL_ID,INDV_ID,COMPOUND_NAME,COMPOUND_ABBR,COMPOUND_NO,SPECIES,TEST_TYPE,SIN_REP_TYPE,...,"Proliferation, bile duct","Proliferation, oval cell",Pyknosis,Scar,Single cell necrosis,Swelling,Thrombus,"Vacuolization, cytoplasmic","Vacuolization, nuclear",VEHICLE
0,40,1,1,40011,acetaminophen,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5% MC
1,40,1,2,40012,acetaminophen,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5% MC
2,40,1,3,40013,acetaminophen,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5% MC
3,40,1,4,40014,acetaminophen,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5% MC
4,40,1,5,40015,acetaminophen,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5% MC


If the dose is zero, then the compound name is changed to the vehicle name.

In [13]:
ind_ft_com_df["COMPOUND_NAME"] = ind_ft_com_df["COMPOUND_NAME"].where(
    ind_ft_com_df["DOSE"] != 0, ind_ft_com_df["VEHICLE"]
)
ind_ft_com_df[ind_ft_com_df["DOSE"] == 0].head()

Unnamed: 0,EXP_ID,GROUP_ID,INDIVIDUAL_ID,INDV_ID,COMPOUND_NAME,COMPOUND_ABBR,COMPOUND_NO,SPECIES,TEST_TYPE,SIN_REP_TYPE,...,"Proliferation, bile duct","Proliferation, oval cell",Pyknosis,Scar,Single cell necrosis,Swelling,Thrombus,"Vacuolization, cytoplasmic","Vacuolization, nuclear",VEHICLE
0,40,1,1,40011,0.5% MC,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5% MC
1,40,1,2,40012,0.5% MC,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5% MC
2,40,1,3,40013,0.5% MC,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5% MC
3,40,1,4,40014,0.5% MC,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5% MC
4,40,1,5,40015,0.5% MC,APAP,1,Rat,in vivo,Single,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5% MC


The list of compounds.

In [14]:
compound_list = list(set(ind_ft_com_df["COMPOUND_NAME"]))
print(*compound_list, sep="\t")


nitrosodiethylamine	pemoline	phenacetin	methyltestosterone	ciprofloxacin	WY-14643	triazolam	gemfibrozil	hydroxyzine	2,4-dinitrophenol	propranolol	corn oil	imipramine	ranitidine	1% cholesterol + 0.25% sodium cholate	2-nitrofluorene	ajmaline	nicotinic acid	butylated hydroxyanisole	propylthiouracil	flutamide	colchicine	acetamidofluorene	clomipramine	diazepam	sulindac	tolbutamide	cimetidine	aspirin	moxisylyte	benzbromarone	phalloidin	desmopressin acetate	tannic acid	acetazolamide	caffeine	naproxen	ethionine	diclofenac	dexamethasone	carbamazepine	ethambutol	ibuprofen	bromoethylamine	methylene dianiline	5% glucose sol., 1% DMSO, 0.001N HCl	coumarin	bromobenzene	sulfasalazine	carboplatin	amitriptyline	imatinib, methanesulfonate salt	nitrofurantoin	chlormezanone	phenobarbital	0.5% BSA saline	meloxicam	griseofulvin	ketoconazole	chlorpromazine	methimazole	phenylbutazone	haloperidol	rifampicin	fluphenazine	dantrolene	tiopronin	diltiazem	acetamide	nifedipine	triamterene	simvastatin	carbon tetrachl

In [15]:
# saving the compound list
open(f"{processed_dir}/compound_list.txt",
     "w").write("\n".join(compound_list) + "\n")


2222

Transforming the compound information into one-hot vectors.

In [16]:
one_hot = pd.get_dummies(
    ind_ft_com_df[["INDV_ID", "COMPOUND_NAME"]], prefix="", prefix_sep=""
)
ind_ft_com_df = pd.merge(ind_ft_df, one_hot, on="INDV_ID", how="left")
ind_ft_com_df.head()

Unnamed: 0,EXP_ID,GROUP_ID,INDIVIDUAL_ID,INDV_ID,COMPOUND_NAME,COMPOUND_ABBR,COMPOUND_NO,SPECIES,TEST_TYPE,SIN_REP_TYPE,...,thioridazine,ticlopidine,tiopronin,tolbutamide,triamterene,triazolam,trimethadione,tunicamycin,valproic acid,vitamin A
0,40,1,1,40011,acetaminophen,APAP,1,Rat,in vivo,Single,...,0,0,0,0,0,0,0,0,0,0
1,40,1,2,40012,acetaminophen,APAP,1,Rat,in vivo,Single,...,0,0,0,0,0,0,0,0,0,0
2,40,1,3,40013,acetaminophen,APAP,1,Rat,in vivo,Single,...,0,0,0,0,0,0,0,0,0,0
3,40,1,4,40014,acetaminophen,APAP,1,Rat,in vivo,Single,...,0,0,0,0,0,0,0,0,0,0
4,40,1,5,40015,acetaminophen,APAP,1,Rat,in vivo,Single,...,0,0,0,0,0,0,0,0,0,0


### Images

The file location column includes the location from which you can download the pathological images.

In [17]:
image_df["FILE_LOCATION"].head()


0    ftp://ftp.biosciencedbc.jp/archive/open-tggate...
2    ftp://ftp.biosciencedbc.jp/archive/open-tggate...
4    ftp://ftp.biosciencedbc.jp/archive/open-tggate...
6    ftp://ftp.biosciencedbc.jp/archive/open-tggate...
8    ftp://ftp.biosciencedbc.jp/archive/open-tggate...
Name: FILE_LOCATION, dtype: object

Extracting the file names.

In [18]:
image_df["FILE"] = [f[-1] for f in image_df["FILE_LOCATION"].str.split("/")]
image_df["FILE"].head()


0    26761.svs
2    26765.svs
4    26770.svs
6    26774.svs
8    26778.svs
Name: FILE, dtype: object

Merging the image file information with the individual rat data.

In [19]:
info_df = pd.merge(
    ind_ft_com_df,
    image_df[["INDV_ID", "FILE", "FILE_LOCATION"]],
    on="INDV_ID",
    how="left",
)
info_df.head()

Unnamed: 0,EXP_ID,GROUP_ID,INDIVIDUAL_ID,INDV_ID,COMPOUND_NAME,COMPOUND_ABBR,COMPOUND_NO,SPECIES,TEST_TYPE,SIN_REP_TYPE,...,tiopronin,tolbutamide,triamterene,triazolam,trimethadione,tunicamycin,valproic acid,vitamin A,FILE,FILE_LOCATION
0,40,1,1,40011,acetaminophen,APAP,1,Rat,in vivo,Single,...,0,0,0,0,0,0,0,0,26761.svs,ftp://ftp.biosciencedbc.jp/archive/open-tggate...
1,40,1,2,40012,acetaminophen,APAP,1,Rat,in vivo,Single,...,0,0,0,0,0,0,0,0,26765.svs,ftp://ftp.biosciencedbc.jp/archive/open-tggate...
2,40,1,3,40013,acetaminophen,APAP,1,Rat,in vivo,Single,...,0,0,0,0,0,0,0,0,26770.svs,ftp://ftp.biosciencedbc.jp/archive/open-tggate...
3,40,1,4,40014,acetaminophen,APAP,1,Rat,in vivo,Single,...,0,0,0,0,0,0,0,0,26774.svs,ftp://ftp.biosciencedbc.jp/archive/open-tggate...
4,40,1,5,40015,acetaminophen,APAP,1,Rat,in vivo,Single,...,0,0,0,0,0,0,0,0,26778.svs,ftp://ftp.biosciencedbc.jp/archive/open-tggate...


In [20]:
info_df.to_csv(f"{processed_dir}/info.csv", index=False)


In [21]:
path = f"{root}/data/DrugBank/compound_table.db"

conn = sqlite3.connect(path)
conn.row_factory = sqlite3.Row
cur = conn.cursor()
cur.execute("select * from compound_table")
com_dict = cur.fetchall()

In [22]:
key_list = list(com_dict[0].keys())
key_dict = {k: i for i, k in enumerate(key_list)}

In [23]:
table = [[] for i in range(len(key_dict))]

for cd in com_dict:
    for k in cd.keys():
        table[key_dict[k]].append(cd[k])


In [24]:
df = pd.DataFrame(table, index=key_list).T


In [25]:
com_list = list(set(info_df["COMPOUND_NAME"]))
syn_dict = {}
for cid, syn in df[["compound_id", "synonym"]].values:
    for s in syn.split(";"):
        syn_dict[s.lower()] = cid


In [26]:
dd = defaultdict(list)

for com in com_list:
    if com in syn_dict:
        moa = df.loc[df["compound_id"] == syn_dict[com], "mechanism_of_action"].values[
            0
        ]
        if moa is not None:
            for m in moa.split(";"):
                dd[m].append(com)

In [27]:
for k, v in dd.items():
    print(k, len(v))

Unknown 5
Androgen Receptor agonist 2
Bacterial DNA gyrase inhibitor 1
GABA-A receptor 2
 anion channel positive allosteric modulator 2
Peroxisome proliferator-activated receptor alpha agonist 3
Histamine H1 receptor antagonist 2
Histamine H2 receptor antagonist 3
Thyroid peroxidase inhibitor 2
Type I iodothyronine deiodinase (Type-I 5'-deiodinase) (DIOI) (Type 1 DI) (5DI) inhibitor 1
Androgen Receptor antagonist 1
Tubulin inhibitor 2
Cyclooxygenase inhibitor 9
Sulfonylurea receptor 1, Kir6.2 blocker 3
Adrenergic receptor alpha antagonist 1
Solute carrier family 22 member 12 inhibitor 1
Vasopressin receptor agonist 1
Carbonic anhydrase I inhibitor 1
Carbonic anhydrase II inhibitor 1
Carbonic anhydrase IV inhibitor 1
Carbonic anhydrase XII inhibitor 1
Adenosine receptor antagonist 2
Glucocorticoid receptor agonist 1
Sodium channel alpha subunit blocker 2
Arachidonate 5-lipoxygenase inhibitor 1
DNA inhibitor 6
Cyclooxygenase-2 inhibitor 1
Cytochrome P450 51 inhibitor 1
D2-like dopamine r

#### Extract MoA with no less than 3 compounds

In [28]:
for k, v in dd.items():
    if len(v)>=3:
        print(f"MoA : {k}")
        print(*v)

MoA : Unknown
pemoline phenacetin coumarin chlormezanone bucetin
MoA : Peroxisome proliferator-activated receptor alpha agonist
gemfibrozil clofibrate fenofibrate
MoA : Histamine H2 receptor antagonist
ranitidine cimetidine famotidine
MoA : Cyclooxygenase inhibitor
sulindac aspirin naproxen diclofenac sulfasalazine phenylbutazone mefenamic acid indomethacin acetaminophen
MoA : Sulfonylurea receptor 1, Kir6.2 blocker
tolbutamide chlorpropamide glibenclamide
MoA : DNA inhibitor
carboplatin nitrofurantoin lomustine cyclophosphamide azathioprine nitrofurazone
MoA : Serotonin 2a (5-HT2a) receptor antagonist
chlorpromazine haloperidol thioridazine
MoA : Bacterial 70S ribosome inhibitor
erythromycin ethylsuccinate tetracycline chloramphenicol


In [29]:
moa_list = []
for k, v in dd.items():
    if len(v)>=3:
        if k!="Unknown":
            moa_list.append(k)
moa_dict = {k:i for i,k in enumerate(moa_list)}

In [30]:
com_set = set()
for moa in moa_list:
    com_set |= set(dd[moa])

In [31]:
extracted_com_list = list(com_set)
extracted_com_dict = {k:i for i,k in enumerate(extracted_com_list)}

In [32]:
matrix = np.zeros(shape = (len(extracted_com_list), len(moa_list)))
for moa in moa_list:
    for com in dd[moa]:
        matrix[extracted_com_dict[com], moa_dict[moa]] = 1

In [33]:
moa_df = pd.DataFrame(matrix, columns = moa_list, index=extracted_com_list)
moa_df.head()

Unnamed: 0,Peroxisome proliferator-activated receptor alpha agonist,Histamine H2 receptor antagonist,Cyclooxygenase inhibitor,"Sulfonylurea receptor 1, Kir6.2 blocker",DNA inhibitor,Serotonin 2a (5-HT2a) receptor antagonist,Bacterial 70S ribosome inhibitor
fenofibrate,1.0,0.0,0.0,0.0,0.0,0.0,0.0
diclofenac,0.0,0.0,1.0,0.0,0.0,0.0,0.0
lomustine,0.0,0.0,0.0,0.0,1.0,0.0,0.0
chlorpromazine,0.0,0.0,0.0,0.0,0.0,1.0,0.0
phenylbutazone,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [34]:
moa_df.to_csv(f"{processed_dir}/moa.csv")

## Extracting patches

- 100 patch/WSI
- patch : 1024x1024 pixels

In [35]:
files = info_df["FILE"].dropna().values

In [35]:
# patch_dir = "/data/TGGATEs/patches"
# wsi_dir = "/data/TGGATEs/WSI"

# for s in tqdm(files):
#     path = f"{wsi_dir}/{s}"
#     if not os.path.exists(f"{patch_dir}/{s}"):
#         os.system(f"mkdir {patch_dir}/{s}")
#         try:
#             toxreprcnn.preprocess.save_tiles(image_path=path, tile_size=1024, n_tiles=100, tile_dir=f"{patch_dir}/{s}")
#         except KeyboardInterrupt:
#             break
#         except:
#             print("Something is wrong with",s)