# extract CAS-RN and toxicity data from toxcast
----------
## extract tox21 ER or AR test from toxcast and extract CAS-RN and toxicity data 

In [1]:
import pandas as pd
from tqdm import tqdm

In [3]:
df = pd.read_csv("../../data/processed/toxcast/m5dat.tsv", sep="\t")

In [5]:
for rec in ["AR", "ERa", "ERb"]:
    for lig in ["Agonist", "Antagonist"]:
        set_assey = set()
        for i in range(len(df)):
            if "TOX21" in df["aenm"][i] and f"_{rec}_" in df["aenm"][i] and lig in df["aenm"][i]:
                set_assey.add(df["aenm"][i])

        before_tsv = []
        for i in tqdm(range(len(df))):
            if df["aenm"][i] in set_assey:
                col = []
                col.append(df["casn"][i])
                col.append(df["hitc"][i])
                before_tsv.append(col)
        
        cas_sev = dict()
        for i in range(len(before_tsv)):
            if before_tsv[i][0] == "nan":
                continue
            elif before_tsv[i][0] in cas_sev.keys():
                cas_sev[before_tsv[i][0]].add(before_tsv[i][1])
            elif before_tsv[i][0] not in cas_sev.keys():
                cas_sev[before_tsv[i][0]] = set()
                cas_sev[before_tsv[i][0]].add(before_tsv[i][1])

        tsv = []
        for cas in cas_sev.keys():
            col = []
            col.append(cas)
            if 1 in cas_sev[cas]:
                col.append(1)
            else:
                col.append(0)
            tsv.append(col)

        if lig == "Agonist":
            l = "ago"
        elif lig == "Antagonist":
            l = "anta"

        pd.DataFrame(tsv).to_csv(f"../../data/processed/tox21/tox21_{rec}_{l}.tsv", sep="\t", header=None, index=False)

        print(rec, lig, "DONE!")

100%|██████████| 3720594/3720594 [00:53<00:00, 70076.29it/s] 


AR Agonist DONE!


100%|██████████| 3720594/3720594 [01:04<00:00, 57420.81it/s] 


AR Antagonist DONE!


100%|██████████| 3720594/3720594 [00:43<00:00, 84624.97it/s] 


ERa Agonist DONE!


100%|██████████| 3720594/3720594 [00:34<00:00, 106629.71it/s]


ERa Antagonist DONE!


100%|██████████| 3720594/3720594 [00:34<00:00, 107601.38it/s]


ERb Agonist DONE!


100%|██████████| 3720594/3720594 [00:34<00:00, 106305.87it/s]

ERb Antagonist DONE!





## for validation test
--------
### Each jacvam validation test for endocrine disruption confirms the following different ligands.  
・jacvam_09_01 agonist    → ER (ERa + ERb) agonist  
・jacvam_09_01 antagonist → ER (ERa + ERb) antagonist  
・jacvam_09_02 agonist    → ERa agonist  
・jacvam_09_02 antagonist → ERa antagonist  
・jacvam_09_04 agonist    → AR agonist  
・jacvam_09_04 antagonist → AR antagonist  
・jacvam_09_05 agonist    → AR agonist  
・jacvam_09_06 antagonist → AR antagonist  
・jacvam_09_07            → ER (ERa + ERb) agonist and antagonist  

In [2]:
import os
import sys

current_dir = os.getcwd()
parent_parent_dir = os.path.dirname(os.path.dirname(current_dir))
src_dir = os.path.join(parent_parent_dir, 'src')
sys.path.append(src_dir)

from prep import prep_for_09

In [3]:
for test in ["01", "02", "04", "05"]:
    for lig in ["ago", "anta"]:
        prep_for_09(test, lig)
        print(test, lig, "DONE!")

prep_for_09("07")
print("07 DONE!")

../../data/processed/0901_ago/cas_sev.tsv is already exists!
01 ago DONE!
../../data/processed/0901_anta/cas_sev.tsv is already exists!
01 anta DONE!
../../data/processed/0902_ago/cas_sev.tsv is already exists!
02 ago DONE!
../../data/processed/0902_anta/cas_sev.tsv is already exists!
02 anta DONE!
../../data/processed/0904_ago/cas_sev.tsv is already exists!
04 ago DONE!
../../data/processed/0904_anta/cas_sev.tsv is already exists!
04 anta DONE!
../../data/processed/0905_ago/cas_sev.tsv is already exists!
05 ago DONE!
../../data/processed/0905_anta/cas_sev.tsv is already exists!
05 anta DONE!
../../data/processed/0907/cas_sev.tsv is already exists!
07 DONE!
