In [1]:
from os import makedirs

from pandas import read_csv, concat

In [2]:
from assay_processing import (
    log_to_nanomol, 
    bin_assays, 
    delete_assays, 
    build_t2, 
    build_t3, 
    build_t4
)

---
## Read Data

In [3]:
# Load the three physicochemical source tables (solubility, logP, logD),
# in that order, from the data/ folder.
solubility_df, logP_df, logD_df = map(
    read_csv,
    (
        'data/physicochemical_solubility.csv',
        'data/physicochemical_logp.csv',
        'data/physicochemical_logd.csv',
    ),
)

---
## Reset Assay IDs

In [4]:
# Collects {'assay_id': ..., 'input_assay_id': ...} records that map each
# original assay ID to the ID it is replaced with below.
reference = []

In [5]:
# Replacement assay IDs: logD and solubility assays get one ID per distinct
# measurement condition, counted up from the base; all logP assays share one ID.
LOGD_ID_BASE = 2000000
SOLUBILITY_ID_BASE = 3000000
LOGP_ASSAY_ID = 4000000


def remap_assay_ids(frame, condition_column, id_base, records):
    """Give every distinct value of ``condition_column`` its own assay ID.

    The n-th distinct value (in order of first appearance) is assigned
    ``id_base + n``. ``frame['assay_id']`` is overwritten in place, and one
    ``{'assay_id': old, 'input_assay_id': new}`` dict is appended to
    ``records`` for each distinct old assay ID found under that value.

    Rows whose condition is NaN never satisfy the equality mask, so they keep
    their original assay ID and produce no record (the NaN entry still
    consumes one offset).

    Parameters
    ----------
    frame : DataFrame with an ``assay_id`` column and ``condition_column``.
    condition_column : name of the column whose distinct values define assays.
    id_base : first replacement ID to hand out.
    records : list that receives the old-to-new mapping dicts.
    """
    conditions = frame[condition_column]
    for offset, label in enumerate(conditions.unique()):
        rows = conditions == label
        new_id = id_base + offset
        # One record per distinct old ID; recording every row only adds
        # duplicates of the same mapping.
        records.extend(
            {'assay_id': old_id, 'input_assay_id': new_id}
            for old_id in frame.loc[rows, 'assay_id'].unique()
        )
        frame.loc[rows, 'assay_id'] = new_id


remap_assay_ids(logD_df, 'pH', LOGD_ID_BASE, reference)
remap_assay_ids(solubility_df, 'pH_solvent', SOLUBILITY_ID_BASE, reference)

# logP has no condition column here: every assay collapses into a single ID.
reference.extend(
    {'assay_id': old_id, 'input_assay_id': LOGP_ASSAY_ID}
    for old_id in logP_df.assay_id.unique()
)
logP_df['assay_id'] = LOGP_ASSAY_ID

In [6]:
from pickle import dump

# Persist the old-to-new assay ID records so the remapping can be traced later.
with open('reference_pc.pkl', 'wb') as reference_file:
    dump(reference, reference_file)

---
## All standard values must be in nM

In [7]:
# Run log_to_nanomol over the log-scale column of each table and store the
# result as 'standard_value'. Solubility reads from 'logS'; logD and logP
# overwrite their own 'standard_value', so re-running this cell would convert
# those two tables a second time.
for frame, log_column in (
    (solubility_df, 'logS'),
    (logD_df, 'standard_value'),
    (logP_df, 'standard_value'),
):
    frame['standard_value'] = frame[log_column].apply(log_to_nanomol)

In [8]:
# Keep only the column names that all three tables have in common.
columns = set(solubility_df.columns).intersection(
    logD_df.columns,
    logP_df.columns,
)

In [9]:
# Select the shared columns with a list rather than the `columns` set: pandas
# deprecated set indexers in 1.5 and rejects them from 2.0 on, and a set has no
# stable order. Following solubility_df's column order keeps the layout of the
# merged table deterministic across runs.
shared_columns = [name for name in solubility_df.columns if name in columns]
physico_chemical_df = concat(
    [frame[shared_columns] for frame in (solubility_df, logD_df, logP_df)]
)

In [10]:
# Save the merged physicochemical table, without the row index, into data/;
# it is read back from there as 'physico_chemical.csv' further down.
physico_chemical_df.to_csv(
    'data/physico_chemical.csv',
    index=False,
)

---

In [11]:
# Folder the input CSVs are read from, and folder the T2/T3/T4 tables go to.
input_directory, output_directory = 'data', 'deliverables'

In [12]:
# Input tables to combine, and their paths inside the input directory.
files = ['admet_tox.csv', 'binding_functional.csv', 'physico_chemical.csv']
file_paths = ['/'.join((input_directory, file_name)) for file_name in files]

In [13]:
# Read every input table (low_memory=False) and stack them into one frame
# without sorting the columns.
dfs = [
    read_csv(csv_path, low_memory=False)
    for csv_path in file_paths
]
df_ = concat(dfs, sort=False)

In [14]:
# bin_assays returns three values from the combined frame; the first two feed
# build_t3 / build_t4 below, and the third (assay IDs to drop) is passed to
# delete_assays together with the combined frame to produce `df`.
t3_data, t4_data, assay_ids_to_drop = bin_assays(df_)
df = delete_assays(assay_ids_to_drop, df_)

In [15]:
# Build the three deliverable tables: T2 from the frame returned by
# delete_assays, T3 and T4 from the corresponding outputs of bin_assays.
t2_df = build_t2(df)
t3_df = build_t3(t3_data)
t4_df = build_t4(t4_data)

In [16]:
# Preview the first 10 rows of the T2 table.
t2_df.head(10)

Unnamed: 0,input_compound_id,smiles
1,1203610,Clc1ccc(cc1)C(N2CCN(CC2)C(=O)Nc3ccccc3)c4cccnc4
2,1203845,CS(=O)(=O)N1CCN(CC1)C(c2ccc(Cl)cc2)c3cccnc3
3,1203966,Fc1cc(Cl)ccc1C(N2CCN(CC2)C(=O)C3CCCCC3)c4cccnc4
4,2176418,[O-][N+](=O)c1ccccc1S(=O)(=O)Nc2ccc(cc2)c3oc4c...
71,322485,Clc1cccc2c(Cl)cccc12
72,13540,Clc1ccc(cc1)c2ccccc2
75,1203507,CC(C)(C)C(=O)N1CCN(CC1)C(c2cncnc2)c3ccc(Cl)cc3F
76,2156569,COc1ccc(cc1)S(=O)(=O)Nc2ccc(cc2)c3oc4ccc(NS(=O...
84,13725,c1ccc(cc1)c2ccccc2
86,1618891,Cc1ccc(cc1)S(=O)(=O)Nc2ccc(cc2)c3nc4ccc(NS(=O)...


In [17]:
# Preview the first 10 rows of the T3 table.
t3_df.head(10)

Unnamed: 0,classification_task_id,input_assay_id,assay_type,target_id,threshold_column,threshold_value,threshold_operator,weight
0,1,942987,ADME,81135,standard_value,74973.328591,>=,1.0
1,2,1617702,PANEL,22221,standard_value,59000.0,>=,1.0
2,3,305428,ADME,12594,standard_value,61400.325732,>=,1.0
3,4,451490,ADME,80583,standard_value,123600.0,>=,1.0
4,5,762745,ADME,22224,standard_value,17280.0,>=,1.0
5,6,425249,ADME,80156,standard_value,3200.0,>=,1.0
6,7,1572745,ADME,22224,standard_value,148000.0,>=,1.0
7,8,458976,ADME,11365,standard_value,1148.912529,>=,1.0
8,9,649312,ADME,80548,standard_value,50000.0,>=,1.0
9,10,944913,ADME,81135,standard_value,57000.0,>=,1.0


In [18]:
# Preview the first 10 rows of the T4 table.
t4_df.head(10)

Unnamed: 0_level_0,input_compound_id,classification_task_id,class_label
chembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1203610,1203610,1,1
1203845,1203845,1,0
1203966,1203966,1,1
1203507,1203507,1,0
1203480,1203480,1,0
1203702,1203702,1,0
1203385,1203385,1,0
1203510,1203510,1,1
1203982,1203982,1,1
1203434,1203434,1,1


In [19]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing; exist_ok=True makes this a no-op when it already does.
makedirs(output_directory, exist_ok=True)

# Write each deliverable table as <output_directory>/T<n>.csv without the
# row index.
for table_name, table in (('T2', t2_df), ('T3', t3_df), ('T4', t4_df)):
    table.to_csv(output_directory + '/' + table_name + '.csv', index=False)