# make lookup table from whole raw data
-----
・use pubchempy and CASRN, get need property from PubChem

In [None]:
import os
import sys

current_dir = os.getcwd()
parent_parent_dir = os.path.dirname(os.path.dirname(current_dir))
src_dir = os.path.join(parent_parent_dir, 'src')
sys.path.append(src_dir)

import collections
import sqlite3
import time
from pathlib import Path

import numpy as np
import pandas as pd
import pubchempy as pcp
from prep import for_lookup, for_lookup_val, prep_pubchem_bycas
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors, rdMolDescriptors
from scipy.stats import norm
from sklearn.preprocessing import robust_scale
from tqdm import tqdm
from util import file_checker, pickle_dump, pickle_load, robust_z

In [3]:
if file_checker("../../data/processed/other/all_pubchem_data.tsv", False):
    print("not done yet!")

not done yet!


In [4]:
# use pubchempy for gathering data (canonical smiles, xlogp, and TPSA) for GA
# before using pubchempy, gather all CAS-RN in all data 

cas = []
for test_num in ["0901", "0902", "0904", "0905"]:
    for lig in ["ago", "anta"]:
        for file_name in ["for_GA", "validation"]:
            df = pd.read_csv(f"../../data/processed/{test_num}_{lig}/{file_name}.tsv", sep="\t", header=None)
            df = df.dropna()
            for i in range(len(df)):
                cas.append(df.iloc[i,0])

for test_num in ["0701", "0702", "0907", "1001", "1002"]:
    for file_name in ["for_GA", "validation"]:
        df = pd.read_csv(f"../../data/processed/{test_num}/{file_name}.tsv", sep="\t", header=None)
        df = df.dropna()
        for i in range(len(df)):
            cas.append(df.iloc[i,0])

cas = list(set(cas))

In [None]:
property = ['CanonicalSMILES', 'XLogP', 'TPSA']
all_data = prep_pubchem_bycas(cas, property)

100%|██████████| 14625/14625 [10:50:36<00:00,  2.67s/it] 


In [6]:
all_data_na = all_data[0].dropna()
all_data_na.to_csv("../../data/processed/other/all_pubchem_data.tsv", sep="\t", header=None, index=False)

In [8]:
# error means all CASRN that cannot get full data from pubchem
error = set(all_data[1]) | set(cas)
pickle_dump(error, "../../data/processed/other/all_pubchem_error.pickle")

In [15]:
# select pubchem data if a CASRN has multiple CIDs
all_data_na = all_data_na.reset_index()
all_data_na = all_data_na.drop(columns=["index"])

all_data_dict = dict()
for i in tqdm(range(len(all_data_na))):
    cas = all_data_na["CAS"][i]
    if cas not in all_data_dict.keys():
        all_data_dict[cas] = []
        for n in range(len(all_data_na.iloc[i])):
            all_data_dict[cas].append(all_data_na.iloc[i,n])
    else:
        continue

100%|██████████| 11422/11422 [00:03<00:00, 2948.97it/s]


In [16]:
df = pd.DataFrame.from_dict(all_data_dict, orient='index', columns=["CID","CAS","CanonicalSmiles","xlogp","tpsa"]).reset_index().drop(columns=["index"])
df.to_csv("../../data/processed/other/all_pubchem_data.tsv",sep="\t",index=False)

In [7]:
lookup_whole = for_lookup(all_data_na)

100%|██████████| 11422/11422 [09:33<00:00, 19.93it/s] 
100%|██████████| 11422/11422 [38:04<00:00,  5.00it/s] 


In [8]:
pickle_dump(lookup_whole, "../../data/processed/other/lookup_whole.pickle")

In [9]:
# make for_GA dataset 
for test_num in ["0901", "0902", "0904", "0905"]:
    for lig in ["ago", "anta"]:
        for file_name in ["for_GA"]:
            df = pd.read_csv(f"../../data/processed/{test_num}_{lig}/{file_name}.tsv", sep="\t", header=None)
            df = df.dropna()
            cas = []
            for i in range(len(df)):
                cas.append(df.iloc[i,0])
            use_data = all_data_na[all_data_na["CAS"].isin(cas)].reset_index().drop(columns=["index"])
            lookup = for_lookup(use_data)
            pickle_dump(lookup, f"../../data/processed/{test_num}_{lig}/{file_name}_lookup.pickle")

for test_num in ["0701", "0702", "0907", "1001", "1002"]:
    for file_name in ["for_GA"]:
        df = pd.read_csv(f"../../data/processed/{test_num}/{file_name}.tsv", sep="\t", header=None)
        df = df.dropna()
        cas = []
        for i in range(len(df)):
            cas.append(df.iloc[i,0])
        use_data = all_data_na[all_data_na["CAS"].isin(cas)].reset_index().drop(columns=["index"])
        lookup = for_lookup(use_data)
        pickle_dump(lookup, f"../../data/processed/{test_num}/{file_name}_lookup.pickle")

100%|██████████| 6737/6737 [04:22<00:00, 25.71it/s] 
100%|██████████| 6737/6737 [12:39<00:00,  8.87it/s] 
100%|██████████| 6753/6753 [03:33<00:00, 31.64it/s] 
100%|██████████| 6753/6753 [13:23<00:00,  8.41it/s] 
100%|██████████| 6724/6724 [03:31<00:00, 31.79it/s] 
100%|██████████| 6724/6724 [13:29<00:00,  8.31it/s] 
100%|██████████| 6760/6760 [03:35<00:00, 31.44it/s] 
100%|██████████| 6760/6760 [14:33<00:00,  7.74it/s]  
100%|██████████| 6772/6772 [03:24<00:00, 33.08it/s] 
100%|██████████| 6772/6772 [14:14<00:00,  7.93it/s]  
100%|██████████| 6770/6770 [03:15<00:00, 34.68it/s] 
100%|██████████| 6770/6770 [14:15<00:00,  7.92it/s]  
100%|██████████| 6766/6766 [03:19<00:00, 33.98it/s] 
100%|██████████| 6766/6766 [22:15<00:00,  5.07it/s]  
100%|██████████| 6768/6768 [05:47<00:00, 19.49it/s] 
100%|██████████| 6768/6768 [22:33<00:00,  5.00it/s]  
100%|██████████| 7040/7040 [06:19<00:00, 18.55it/s] 
100%|██████████| 7040/7040 [24:20<00:00,  4.82it/s]  
100%|██████████| 7043/7043 [03:32<00:00,

# if compound in validation dataset cannot make lookup, collect data from text in JACVAM

In [11]:
correct_cas = set(all_data_na["CAS"])

all_val_error_cas = set()
for test_num in ["0901", "0902", "0904", "0905"]:
    for lig in ["ago", "anta"]:
        val = pd.read_csv(f"../../data/processed/{test_num}_{lig}/validation.tsv", sep="\t", header=None)
        val_cas = set(val.iloc[:,0])
        val_error_cas = val_cas - correct_cas
        all_val_error_cas = all_val_error_cas | val_error_cas
        print(test_num, lig, val_error_cas)

for test_num in ["0701", "0702", "0907", "1001", "1002"]:
    val = pd.read_csv(f"../../data/processed/{test_num}/validation.tsv", sep="\t", header=None)
    val_cas = set(val.iloc[:,0])
    val_error_cas = val_cas - correct_cas
    all_val_error_cas = all_val_error_cas | val_error_cas
    print(test_num, val_error_cas)

0901 ago {'50-41-9'}
0901 anta {'82640-04-8'}
0902 ago {'50-41-9', '57-30-7', '1461-22-9'}
0902 anta {'82640-04-8', '50-41-9'}
0904 ago set()
0904 anta set()
0905 ago set()
0905 anta set()
0701 {'10108-64-2', '1910-42-5', '7789-12-0', '76-87-9', '554-13-2', '549-18-8', '318-98-9', '151-50-8', '7446-18-6', '7758-99-8', '152-11-4', '13410-01-0', '1330-20-7', '73791-47-6', '51-42-3', '10043-35-3', '7784-46-5', '7647-14-5', '614-39-1', '7487-94-7', '8007-59-8', '7447-40-7', '62-76-0', '7681-49-4', '1327-53-3'}
0702 {'107-64-2', '7758-98-7', '866-84-2', '25646-77-9', '10361-37-2', '9005-64-5', '3926-62-3', '68515-48-0', '557-05-1', '1314-13-2', '7778-80-5', '917-61-3', '12125-02-9', '7779-90-0'}
0907 {'84852-15-3', '2943-75-1'}
1001 {'82385-42-0', '147-24-0', '69-57-8', '115-09-3', '10043-35-3', '54-21-7', '7447-41-8'}
1002 {'147-24-0', '10043-35-3', '34381-68-5'}


## make cas and severity data only in whole dataset

In [36]:
for test_num in ["0901", "0902", "0904", "0905"]:
    for lig in ["ago", "anta"]:
        file_name = "for_GA"
        df = pd.read_csv(f"../../data/processed/{test_num}_{lig}/{file_name}.tsv", sep="\t", header=None)
        df = df.dropna()
        cas = []
        for i in range(len(df)):
            cas.append(df.iloc[i,0])
        use_cas =  set(all_data_na[all_data_na["CAS"].isin(cas)].reset_index().drop(columns=["index"])["CAS"])
        test_name = test_num + "_" + lig
        cas_tox = pd.read_csv(f"../../data/processed/{test_name}/cas_sev.tsv", sep="\t", header=None)
        cas_tox_use = []
        for i in range(len(cas_tox)):
            if cas_tox.iloc[i,0] in use_cas:
                cas_tox_use.append([cas_tox.iloc[i,0], cas_tox.iloc[i,1]])
        pd.DataFrame(cas_tox_use).to_csv(f"../../data/processed/{test_name}/cas_sev_use.tsv", sep="\t", header=None, index=False)

In [37]:
use_cas =  set(all_data_na[all_data_na["CAS"].isin(cas)].reset_index().drop(columns=["index"])["CAS"])
for test_num in ["0701", "0702", "0907", "1001", "1002"]:
    file_name = "for_GA"
    test_name = test_num
    df = pd.read_csv(f"../../data/processed/{test_num}/{file_name}.tsv", sep="\t", header=None)
    df = df.dropna()
    cas = []
    for i in range(len(df)):
        cas.append(df.iloc[i,0])
    use_cas =  set(all_data_na[all_data_na["CAS"].isin(cas)].reset_index().drop(columns=["index"])["CAS"])
    cas_tox = pd.read_csv(f"../../data/processed/{test_name}/cas_sev.tsv", sep="\t", header=None)
    cas_tox_use = []
    for i in range(len(cas_tox)):
        if cas_tox.iloc[i,0] in use_cas:
            cas_tox_use.append([cas_tox.iloc[i,0], cas_tox.iloc[i,1]])
    pd.DataFrame(cas_tox_use).to_csv(f"../../data/processed/{test_name}/cas_sev_use.tsv", sep="\t", header=None, index=False)

In [None]:
df = pd.read_csv("../../data/processed/other/all_pubchem_data.tsv", sep="\t")
df_dict = dict(zip(df.iloc[:,1], df.iloc[:,2]))

In [42]:
for test_num in ["0701", "0702", "0907", "1001", "1002"]:
    test_name = test_num
    cas_tox_use = pd.read_csv(f"../../data/processed/{test_name}/cas_sev_use.tsv", sep="\t", header=None)
    smiles = [df_dict.get(cas, None) for cas in cas_tox_use.iloc[:, 0]]
    cas_tox_use.loc[:,3] = smiles
    pd.DataFrame(cas_tox_use).to_csv(f"../../data/processed/{test_name}/cas_sev_use.tsv", sep="\t", header=None, index=False)

In [43]:
for test_num in ["0901", "0902", "0904", "0905"]:
    for lig in ["ago", "anta"]:
        test_name = test_num + "_" + lig
        cas_tox_use = pd.read_csv(f"../../data/processed/{test_name}/cas_sev_use.tsv", sep="\t", header=None)
        smiles = [df_dict.get(cas, None) for cas in cas_tox_use.iloc[:, 0]]
        cas_tox_use.loc[:,3] = smiles
        pd.DataFrame(cas_tox_use).to_csv(f"../../data/processed/{test_name}/cas_sev_use.tsv", sep="\t", header=None, index=False)

## get data about CAS-RN written below from PubChem or RDkit
  
0701 8007-59-8 (cas = 7681-52-9, xlogp is from rdkit)   
0701 1327-53-3 (cas = CAS-1327-53-3, xlogp is from rdkit)   
0701 1330-20-7 (cas = CAS-1330-20-7, xlogp is from rdkit)  
0702 68515-48-0 (cas = CAS-68515-48-0, xlogp is from rdkit)  
0702 9005-64-5 (cas = 1052273-76-3, xlogp is from rdkit)  
0907 84852-15-3 (cas = CAS-84852-15-3, xlogp is from rdkit)  

In [25]:
property = ['CanonicalSMILES', 'XLogP', 'TPSA']

for test_num in ["0901", "0902", "0904", "0905"]:
    for lig in ["ago", "anta"]:
        val = pd.read_csv(f"../../data/processed/{test_num}_{lig}/validation.tsv", sep="\t", header=None)
        tsv = []
        for i in tqdm(range(len(val))):
            if val.iloc[i,0] in all_val_error_cas:
                time.sleep(2)
                data = pcp.get_properties(property, val.iloc[i,0], "name", as_dataframe=True)
                data = data.reset_index()
                col = []
                col.append(val.iloc[i,0])
                col.append(val.iloc[i,1])
                try:
                    col.append(data["CanonicalSMILES"][0])
                    smiles = data["CanonicalSMILES"][0]
                except:
                    col.append("###")
                    smiles = "###"
                try:
                    col.append(data["XLogP"][0])
                except:
                    if smiles == "###":
                        col.append("###")
                    else:
                        mol = Chem.MolFromSmiles(smiles)
                        col.append(Descriptors.MolLogP(mol))
                try:
                    col.append(data["TPSA"][0])
                except:
                    if smiles == "###":
                        col.append("###")
                    else:
                        mol = Chem.MolFromSmiles(smiles)
                        col.append(rdMolDescriptors.CalcTPSA(mol))
                if "###" in col:
                    print(test_num, val.iloc[i,0])
                tsv.append(col)
            else:
                col = []
                col.append(val.iloc[i,0])
                col.append(val.iloc[i,1])
                for n in range(2, len(all_data_dict[val.iloc[i,0]])):
                    col.append(all_data_dict[val.iloc[i,0]][n])
                tsv.append(col)
        pd.DataFrame(tsv).to_csv(f"../../data/processed/{test_num}_{lig}/validation_pubchem.tsv", sep="\t", header=None, index=False)

        if len(tsv) != len(val):
            cas = [row[0] for row in tsv]
            print(set(val[0]) - set(cas))

for test_num in ["0701", "0702", "0907", "1001", "1002"]:
    val = pd.read_csv(f"../../data/processed/{test_num}/validation.tsv", sep="\t", header=None)
    tsv = []
    for i in tqdm(range(len(val))):
        if val.iloc[i,0] == "8007-59-8":
            val.iloc[i,0] = "7681-52-9"
            time.sleep(2)
            data = pcp.get_properties(property, val.iloc[i,0], "name", as_dataframe=True)
            data = data.reset_index()
            col = []
            col.append(val.iloc[i,0])
            col.append(val.iloc[i,1])
            try:
                col.append(data["CanonicalSMILES"][0])
                smiles = data["CanonicalSMILES"][0]
            except:
                col.append("###")
                smiles = "###"
            try:
                col.append(data["XLogP"][0])
            except:
                if smiles == "###":
                    col.append("###")
                else:
                    mol = Chem.MolFromSmiles(smiles)
                    col.append(Descriptors.MolLogP(mol))
            try:
                col.append(data["TPSA"][0])
            except:
                if smiles == "###":
                    col.append("###")
                else:
                    mol = Chem.MolFromSmiles(smiles)
                    col.append(rdMolDescriptors.CalcTPSA(mol))
            if "###" in col:
                print(test_num, val.iloc[i,0])
            tsv.append(col)
        elif val.iloc[i,0] == "1327-53-3":
            val.iloc[i,0] = "CAS-1327-53-3"
            time.sleep(2)
            data = pcp.get_properties(property, val.iloc[i,0], "name", as_dataframe=True)
            data = data.reset_index()
            col = []
            col.append(val.iloc[i,0])
            col.append(val.iloc[i,1])
            try:
                col.append(data["CanonicalSMILES"][0])
                smiles = data["CanonicalSMILES"][0]
            except:
                col.append("###")
                smiles = "###"
            try:
                col.append(data["XLogP"][0])
            except:
                if smiles == "###":
                    col.append("###")
                else:
                    mol = Chem.MolFromSmiles(smiles)
                    col.append(Descriptors.MolLogP(mol))
            try:
                col.append(data["TPSA"][0])
            except:
                if smiles == "###":
                    col.append("###")
                else:
                    mol = Chem.MolFromSmiles(smiles)
                    col.append(rdMolDescriptors.CalcTPSA(mol))
            if "###" in col:
                print(test_num, val.iloc[i,0])
            tsv.append(col)
        elif val.iloc[i,0] == "1330-20-7":
            val.iloc[i,0] =  "CAS-1330-20-7"
            time.sleep(2)
            data = pcp.get_properties(property, val.iloc[i,0], "name", as_dataframe=True)
            data = data.reset_index()
            col = []
            col.append(val.iloc[i,0])
            col.append(val.iloc[i,1])
            try:
                col.append(data["CanonicalSMILES"][0])
                smiles = data["CanonicalSMILES"][0]
            except:
                col.append("###")
                smiles = "###"
            try:
                col.append(data["XLogP"][0])
            except:
                if smiles == "###":
                    col.append("###")
                else:
                    mol = Chem.MolFromSmiles(smiles)
                    col.append(Descriptors.MolLogP(mol))
            try:
                col.append(data["TPSA"][0])
            except:
                if smiles == "###":
                    col.append("###")
                else:
                    mol = Chem.MolFromSmiles(smiles)
                    col.append(rdMolDescriptors.CalcTPSA(mol))
            if "###" in col:
                print(test_num, val.iloc[i,0])
            tsv.append(col)
        elif val.iloc[i,0] == "68515-48-0":
            val.iloc[i,0] = "CAS-68515-48-0"
            time.sleep(2)
            data = pcp.get_properties(property, val.iloc[i,0], "name", as_dataframe=True)
            data = data.reset_index()
            col = []
            col.append(val.iloc[i,0])
            col.append(val.iloc[i,1])
            try:
                col.append(data["CanonicalSMILES"][0])
                smiles = data["CanonicalSMILES"][0]
            except:
                col.append("###")
                smiles = "###"
            try:
                col.append(data["XLogP"][0])
            except:
                if smiles == "###":
                    col.append("###")
                else:
                    mol = Chem.MolFromSmiles(smiles)
                    col.append(Descriptors.MolLogP(mol))
            try:
                col.append(data["TPSA"][0])
            except:
                if smiles == "###":
                    col.append("###")
                else:
                    mol = Chem.MolFromSmiles(smiles)
                    col.append(rdMolDescriptors.CalcTPSA(mol))
            if "###" in col:
                print(test_num, val.iloc[i,0])
            tsv.append(col)
        elif val.iloc[i,0] == "9005-64-5":
            val.iloc[i,0] = "1052273-76-3"
            time.sleep(2)
            data = pcp.get_properties(property, val.iloc[i,0], "name", as_dataframe=True)
            data = data.reset_index()
            col = []
            col.append(val.iloc[i,0])
            col.append(val.iloc[i,1])
            try:
                col.append(data["CanonicalSMILES"][0])
                smiles = data["CanonicalSMILES"][0]
            except:
                col.append("###")
                smiles = "###"
            try:
                col.append(data["XLogP"][0])
            except:
                if smiles == "###":
                    col.append("###")
                else:
                    mol = Chem.MolFromSmiles(smiles)
                    col.append(Descriptors.MolLogP(mol))
            try:
                col.append(data["TPSA"][0])
            except:
                if smiles == "###":
                    col.append("###")
                else:
                    mol = Chem.MolFromSmiles(smiles)
                    col.append(rdMolDescriptors.CalcTPSA(mol))
            if "###" in col:
                print(test_num, val.iloc[i,0])
            tsv.append(col)
        elif val.iloc[i,0] == "84852-15-3":
            val.iloc[i,0]  = "CAS-84852-15-3"
            time.sleep(2)
            data = pcp.get_properties(property, val.iloc[i,0], "name", as_dataframe=True)
            data = data.reset_index()
            col = []
            col.append(val.iloc[i,0])
            col.append(val.iloc[i,1])
            try:
                col.append(data["CanonicalSMILES"][0])
                smiles = data["CanonicalSMILES"][0]
            except:
                col.append("###")
                smiles = "###"
            try:
                col.append(data["XLogP"][0])
            except:
                if smiles == "###":
                    col.append("###")
                else:
                    mol = Chem.MolFromSmiles(smiles)
                    col.append(Descriptors.MolLogP(mol))
            try:
                col.append(data["TPSA"][0])
            except:
                if smiles == "###":
                    col.append("###")
                else:
                    mol = Chem.MolFromSmiles(smiles)
                    col.append(rdMolDescriptors.CalcTPSA(mol))
            if "###" in col:
                print(test_num, val.iloc[i,0])
            tsv.append(col)
        elif val.iloc[i,0] in all_val_error_cas:
            time.sleep(2)
            data = pcp.get_properties(property, val.iloc[i,0], "name", as_dataframe=True)
            data = data.reset_index()
            col = []
            col.append(val.iloc[i,0])
            col.append(val.iloc[i,1])
            try:
                col.append(data["CanonicalSMILES"][0])
                smiles = data["CanonicalSMILES"][0]
            except:
                col.append("###")
                smiles = "###"
            try:
                col.append(data["XLogP"][0])
            except:
                if smiles == "###":
                    col.append("###")
                else:
                    mol = Chem.MolFromSmiles(smiles)
                    col.append(Descriptors.MolLogP(mol))
            try:
                col.append(data["TPSA"][0])
            except:
                if smiles == "###":
                    col.append("###")
                else:
                    mol = Chem.MolFromSmiles(smiles)
                    col.append(rdMolDescriptors.CalcTPSA(mol))
            if "###" in col:
                print(test_num, val.iloc[i,0])
            tsv.append(col)
        else:
            col = []
            col.append(val.iloc[i,0])
            col.append(val.iloc[i,1])
            for n in range(2, len(all_data_dict[val.iloc[i,0]])):
                col.append(all_data_dict[val.iloc[i,0]][n])
            tsv.append(col)
    pd.DataFrame(tsv).to_csv(f"../../data/processed/{test_num}/validation_pubchem.tsv", sep="\t", header=None, index=False)

    if len(tsv) != len(val):
        cas = [row[0] for row in tsv]
        print(set(val[0]) - set(cas))

  0%|          | 0/42 [00:00<?, ?it/s]

100%|██████████| 42/42 [00:02<00:00, 16.17it/s]
100%|██████████| 25/25 [00:02<00:00,  9.63it/s]
100%|██████████| 86/86 [00:07<00:00, 10.97it/s]
100%|██████████| 21/21 [00:05<00:00,  4.02it/s]
100%|██████████| 10/10 [00:00<00:00, 2734.23it/s]
100%|██████████| 10/10 [00:00<00:00, 2735.12it/s]
100%|██████████| 11/11 [00:00<00:00, 2871.56it/s]
100%|██████████| 9/9 [00:00<00:00, 2775.85it/s]
100%|██████████| 72/72 [01:06<00:00,  1.08it/s]
100%|██████████| 56/56 [00:37<00:00,  1.50it/s]
100%|██████████| 26/26 [00:05<00:00,  4.70it/s]
100%|██████████| 18/18 [00:19<00:00,  1.07s/it]
100%|██████████| 16/16 [00:08<00:00,  1.91it/s]


In [8]:
for test_num in ["0901", "0902", "0904", "0905"]:
    for lig in ["ago", "anta"]:
        val = pd.read_csv(f"../../data/processed/{test_num}_{lig}/validation_pubchem.tsv", sep="\t", header=None)
        lookup = for_lookup_val(val)
        pickle_dump(lookup, f"../../data/processed/{test_num}_{lig}/validation_lookup.pickle")

100%|██████████| 42/42 [00:00<00:00, 485.67it/s]
100%|██████████| 42/42 [00:00<00:00, 373.11it/s]
100%|██████████| 25/25 [00:00<00:00, 6279.27it/s]
100%|██████████| 25/25 [00:00<00:00, 278.46it/s]
100%|██████████| 86/86 [00:00<00:00, 716.45it/s]
100%|██████████| 86/86 [00:00<00:00, 266.52it/s]
100%|██████████| 21/21 [00:00<00:00, 14254.80it/s]
100%|██████████| 21/21 [00:00<00:00, 5425.01it/s]
100%|██████████| 10/10 [00:00<00:00, 17161.64it/s]
100%|██████████| 10/10 [00:00<00:00, 11122.52it/s]
100%|██████████| 10/10 [00:00<00:00, 18468.97it/s]
100%|██████████| 10/10 [00:00<00:00, 11087.24it/s]
100%|██████████| 11/11 [00:00<00:00, 17650.09it/s]
100%|██████████| 11/11 [00:00<00:00, 10045.14it/s]
100%|██████████| 9/9 [00:00<00:00, 25575.02it/s]
100%|██████████| 9/9 [00:00<00:00, 12106.71it/s]


In [9]:
for test_num in ["0701", "0702", "0907", "1001", "1002"]:
    val = pd.read_csv(f"../../data/processed/{test_num}/validation_pubchem.tsv", sep="\t", header=None)
    lookup = for_lookup_val(val)
    pickle_dump(lookup, f"../../data/processed/{test_num}/validation_lookup.pickle")

100%|██████████| 72/72 [00:00<00:00, 379.49it/s]
100%|██████████| 72/72 [00:00<00:00, 248.21it/s]
100%|██████████| 56/56 [00:00<00:00, 6357.41it/s]
100%|██████████| 56/56 [00:00<00:00, 299.79it/s]
100%|██████████| 26/26 [00:00<00:00, 12365.56it/s]
100%|██████████| 26/26 [00:00<00:00, 4191.56it/s]
100%|██████████| 18/18 [00:00<00:00, 19614.83it/s]
100%|██████████| 18/18 [00:00<00:00, 3189.32it/s]
100%|██████████| 16/16 [00:00<00:00, 9533.86it/s]
100%|██████████| 16/16 [00:00<00:00, 3614.81it/s]


# for dataset publication

In [22]:
import pandas as pd
from tqdm import tqdm

df_all = pd.read_csv("../../data/processed/other/all_pubchem_data.tsv", sep="\t")
df_all = df_all.drop_duplicates(subset=["CAS"])
all_data = dict()

for i in tqdm(range(len(df_all))):
    cas = df_all["CAS"][i]
    if cas not in all_data.keys():
        all_data[cas] = []
        for n in range(len(df_all.iloc[i])):
            if n == 0 or n == 1:
                continue
            all_data[cas].append(df_all.iloc[i,n])
    else:
        continue

  0%|          | 0/11422 [00:00<?, ?it/s]

100%|██████████| 11422/11422 [00:03<00:00, 3024.87it/s]


In [21]:
all_data["71-55-6"]

[np.int64(6278), 'CC(Cl)(Cl)Cl', np.float64(0.0), np.float64(2.4)]

In [None]:
for test_num in ["0901", "0902", "0904", "0905"]:
    for lig in ["ago", "anta"]:
        test_name = test_num + "_" + lig
        cas_tox_use = pd.read_csv(f"../../data/processed/{test_name}/cas_sev_use.tsv", sep="\t", header=None)
        val = pd.read_csv(f"../../data/processed/{test_name}/validation_pubchem.tsv", sep="\t", header=None)
        
        tsv = []
        for i in tqdm(range(len(cas_tox_use))):
            cas = cas_tox_use.iloc[i, 0]
            sev = cas_tox_use.iloc[i, 1]
            if cas in all_data.keys():
                col = [cas, sev] + all_data[cas]
            else:
                col = [cas, "###", "###", "###", "###"]
            tsv.append(col)

        for i in tqdm(range(len(val))):
            cas = val.iloc[i, 0]
            sev = val.iloc[i, 1]
            if cas in all_data.keys():
                col = [cas, sev] + all_data[cas]
            else:
                col = []
                for n in range(len(val.iloc[i])):
                    col.append(val.iloc[i, n])
            tsv.append(col)
        
        os.makedirs(f"../../data_validation_test_and_HTS/{test_name}", exist_ok=True)
        
        names = ["CAS", "severity", "CanonicalSmiles", "xlogp", "tpsa"]
        pd.DataFrame(tsv, columns=names).to_csv(f"../../data_validation_test_and_HTS/{test_name}/all_data.tsv", sep="\t", index=False)

        val_tsv = []
        for i in tqdm(range(len(val))):
            cas = val.iloc[i, 0]
            sev = val.iloc[i, 1]
            if cas in all_data.keys():
                col = [cas, sev] + all_data[cas]
            else:
                col = []
                for n in range(len(val.iloc[i])):
                    col.append(val.iloc[i, n])
            val_tsv.append(col)

        names = ["CAS", "severity", "CanonicalSmiles", "xlogp", "tpsa"]
        pd.DataFrame(val_tsv, columns=names).to_csv(f"../../data_validation_test_and_HTS/{test_name}/validation_data.tsv", sep="\t", index=False)

for test_num in ["0701", "0702", "0907", "1001", "1002"]:
    test_name = test_num
    cas_tox_use = pd.read_csv(f"../../data/processed/{test_name}/cas_sev_use.tsv", sep="\t", header=None)
    val = pd.read_csv(f"../../data/processed/{test_name}/validation_pubchem.tsv", sep="\t", header=None)
    
    tsv = []
    for i in tqdm(range(len(cas_tox_use))):
        cas = cas_tox_use.iloc[i, 0]
        sev = cas_tox_use.iloc[i, 1]
        if cas in all_data.keys():
            col = [cas, sev] + all_data[cas]
        else:
            col = [cas, "###", "###", "###", "###"]
        tsv.append(col)

    for i in tqdm(range(len(val))):
        cas = val.iloc[i, 0]
        sev = val.iloc[i, 1]
        if cas in all_data.keys():
            col = [cas, sev] + all_data[cas]
        else:
            col = []
            for n in range(len(val.iloc[i])):
                col.append(val.iloc[i, n])
        tsv.append(col)
    
    os.makedirs(f"../../data_validation_test_and_HTS/{test_name}", exist_ok=True)
    
    names = ["CAS", "severity", "CanonicalSmiles", "xlogp", "tpsa"]
    pd.DataFrame(tsv, columns=names).to_csv(f"../../data_validation_test_and_HTS/{test_name}/all_data.tsv", sep="\t", index=False)

    val_tsv = []
    for i in tqdm(range(len(val))):
        cas = val.iloc[i, 0]
        sev = val.iloc[i, 1]
        if cas in all_data.keys():
            col = [cas, sev] + all_data[cas]
        else:
            col = []
            for n in range(len(val.iloc[i])):
                col.append(val.iloc[i, n])
        val_tsv.append(col)

    names = ["CAS", "severity", "CanonicalSmiles", "xlogp", "tpsa"]
    pd.DataFrame(val_tsv, columns=names).to_csv(f"../../data_validation_test_and_HTS/{test_name}/validation_data.tsv", sep="\t", index=False)

print("All tasks completed successfully!")

100%|██████████| 6737/6737 [00:01<00:00, 5178.93it/s]
100%|██████████| 42/42 [00:00<00:00, 22161.37it/s]
100%|██████████| 42/42 [00:00<00:00, 22186.49it/s]
100%|██████████| 6753/6753 [00:01<00:00, 5222.59it/s]
100%|██████████| 25/25 [00:00<00:00, 265.36it/s]
100%|██████████| 25/25 [00:00<00:00, 10472.15it/s]
100%|██████████| 6724/6724 [00:01<00:00, 3553.19it/s]
100%|██████████| 86/86 [00:00<00:00, 22355.76it/s]
100%|██████████| 86/86 [00:00<00:00, 22423.86it/s]
100%|██████████| 6760/6760 [00:01<00:00, 4782.60it/s]
100%|██████████| 21/21 [00:00<00:00, 17083.08it/s]
100%|██████████| 21/21 [00:00<00:00, 8921.34it/s]
100%|██████████| 6772/6772 [00:00<00:00, 8346.55it/s]
100%|██████████| 10/10 [00:00<00:00, 22215.59it/s]
100%|██████████| 10/10 [00:00<00:00, 21204.77it/s]
100%|██████████| 6770/6770 [00:00<00:00, 8534.18it/s]
100%|██████████| 10/10 [00:00<00:00, 22180.35it/s]
100%|██████████| 10/10 [00:00<00:00, 21269.29it/s]
100%|██████████| 6766/6766 [00:00<00:00, 8376.08it/s]
100%|████████

All tasks completed successfully!





# For your own data

In [None]:
df = pd.read_csv(f"path2yourfile", sep="\t", header=None)
df = df.dropna()
cas = []
for i in range(len(df)):
    cas.append(df.iloc[i,0])

In [None]:
cas = list(set(cas))

property = ['CanonicalSMILES', 'XLogP', 'TPSA']
all_data = prep_pubchem_bycas(cas, property)

all_data_na = all_data[0].dropna()
all_data_na.to_csv("path2yourpubchemresultfile.tsv", sep="\t", header=None, index=False)

error = set(all_data[1]) | set(cas)
pickle_dump(error, "path2yourpubchemerrorfile.pickle")

In [None]:
all_data_na = all_data_na.reset_index()
all_data_na = all_data_na.drop(columns=["index"])

all_data_dict = dict()
for i in tqdm(range(len(all_data_na))):
    cas = all_data_na["CAS"][i]
    if cas not in all_data_dict.keys():
        all_data_dict[cas] = []
        for n in range(len(all_data_na.iloc[i])):
            all_data_dict[cas].append(all_data_na.iloc[i,n])
    else:
        continue

df = pd.DataFrame.from_dict(all_data_dict, orient='index', columns=["CID","CAS","CanonicalSmiles","xlogp","tpsa"]).reset_index().drop(columns=["index"])
df.to_csv("path2yourpubchemresulttsvfile.tsv",sep="\t",index=False)

In [None]:
lookup_whole = for_lookup(all_data_na)
pickle_dump(lookup_whole, "path2yourlookupfile.pickle")