In [42]:
import pandas as pd
import polaris as po
import seaborn as sns
import yellowbrick
import rdkit.Chem as Chem
from admet_ai import ADMETModel
import numpy as np
import datamol as dm
import matplotlib.pyplot as plt

### Add in simple DM features 

In [43]:
# Base name (without extension) of the dataset CSV under ../data.
ds_path = 'train_admet_all'

df = pd.read_csv(f'../data/{ds_path}.csv')

# Index labels stored in the .npy file mark the training rows; every other
# row of `df` is labelled as validation.
train_ix = np.load('../data/train_split2_idx.npy')
df.loc[:, 'split'] = np.where(df.index.isin(train_ix), 'train', 'val').tolist()

### Add in simple features from RDKit 

In [44]:
def _preprocess(i, row):
    """Build the descriptor record for a single row of the dataset.

    The molecule is parsed from ``row['CXSMILES']``, passed through datamol's
    fix / sanitize / standardize steps, and then handed to
    ``dm.descriptors.compute_many_descriptors``.

    Args:
        i: Row label supplied by ``DataFrame.iterrows()``; not used here.
        row: Row providing the ``'CXSMILES'`` and ``'Molecule Name'`` fields.

    Returns:
        The descriptor mapping produced by datamol, with an additional
        ``'Molecule Name'`` entry copied from ``row``.
    """
    # Called on every invocation, before any molecule handling.
    dm.disable_rdkit_log()

    parsed = dm.to_mol(row['CXSMILES'], ordered=True)
    repaired = dm.fix_mol(parsed)
    sanitized = dm.sanitize_mol(repaired, sanifix=True, charge_neutral=False)

    standardize_opts = dict(
        disconnect_metals=False,
        normalize=True,
        reionize=True,
        uncharge=False,
        stereo=True,
    )
    standardized = dm.standardize_mol(sanitized, **standardize_opts)

    descriptors = dm.descriptors.compute_many_descriptors(standardized)
    # Keep the identifier alongside the descriptors so the record can be joined
    # back onto the source table.
    descriptors['Molecule Name'] = row['Molecule Name']
    return descriptors
    

# Run _preprocess over every (index, row) pair in parallel; each call returns
# one record holding the descriptors plus the 'Molecule Name' join key.
feat_records = dm.parallelized(
    _preprocess, df.iterrows(), arg_type="args", progress=True, total=len(df)
)
feats = pd.DataFrame(feat_records)

# validate='one_to_one' makes pandas raise a MergeError if 'Molecule Name' is
# duplicated on either side, instead of silently multiplying rows in the join.
df_all = pd.merge(left=df, right=feats, on='Molecule Name', validate='one_to_one')
df_all.shape


100%|██████████| 434/434 [00:06<00:00, 63.56it/s] 


(434, 41)

In [45]:
# Preview the first rows of the merged table (source columns + descriptors).
df_all.head()

Unnamed: 0,CXSMILES,HLM,KSOL,LogD,MDR1-MDCKII,MLM,Molecule Name,n_missing,in-vitro_MLM_bienta: CLint (Num) (uL/min/mg),in-vitro_MLM_bienta: CLint (Mod),...,sas,n_aliphatic_carbocycles,n_aliphatic_heterocyles,n_aliphatic_rings,n_aromatic_carbocycles,n_aromatic_heterocyles,n_aromatic_rings,n_saturated_carbocycles,n_saturated_heterocyles,n_saturated_rings
0,COC1=CC=CC(Cl)=C1NC(=O)N1CCC[C@H](C(N)=O)C1 |a...,10.0,400.0,0.3,2.0,10.0,ASAP-0032437,3,10.0,<,...,2.548208,0,1,1,1,0,1,0,1,1
1,O=C(NCC(F)F)[C@H](NC1=CC2=C(C=C1Br)CNC2)C1=CC(...,,333.0,2.9,0.2,,ASAP-0031915,2,,,...,3.486159,1,1,2,2,0,2,1,0,1
2,O=C(NCC(F)F)[C@H](NC1=CC=C2CNCC2=C1)C1=CC(Br)=...,,400.0,0.4,0.5,,ASAP-0031884,3,,,...,3.608114,0,1,1,2,1,3,0,0,0
3,NC(=O)[C@H]1CCCN(C(=O)CC2=CC=CC3=C2C=CO3)C1 |&...,10.0,376.0,1.0,8.5,10.0,ASAP-0031848,2,10.0,<,...,2.646367,0,1,1,1,1,2,0,1,1
4,CC1=CC(CC(=O)N2CCC[C@H](C(N)=O)C2)=CC=N1 |&1:11|,10.0,375.0,-0.3,0.9,10.0,ASAP-0031813,2,10.0,<,...,2.587385,0,1,1,0,1,1,0,1,1


## Log-transform training targets

In [46]:
# Lower bound applied before taking the logarithm, so values at or below zero
# map to log10(epsilon) rather than -inf / NaN.
epsilon = 1e-8

# Add a "Log<name>" column holding log10 of each floored raw target.
for col in ('MLM', 'HLM', 'KSOL', 'MDR1-MDCKII'):
    df_all.loc[:, f"Log{col}"] = np.log10(df_all[col].clip(lower=epsilon))

# Every column whose name starts with "Log" is treated as a target.
target_cols = df_all.filter(regex='^Log').columns.tolist()
df_all.shape

(434, 45)

## Feature selection

In [47]:
# Descriptor columns only: everything in `feats` except the join key.
X = df_all.loc[:, feats.columns.difference(['Molecule Name']).tolist()]

# Per-column standard deviation; columns with zero spread are discarded.
std = np.std(X.to_numpy(), axis=0)
predictors = X.columns[std > 0].tolist()
predictors

['clogp',
 'fsp3',
 'mw',
 'n_aliphatic_carbocycles',
 'n_aliphatic_heterocyles',
 'n_aliphatic_rings',
 'n_aromatic_carbocycles',
 'n_aromatic_heterocyles',
 'n_aromatic_rings',
 'n_heavy_atoms',
 'n_hetero_atoms',
 'n_lipinski_hba',
 'n_lipinski_hbd',
 'n_rings',
 'n_rotatable_bonds',
 'n_saturated_carbocycles',
 'n_saturated_heterocyles',
 'n_saturated_rings',
 'qed',
 'sas',
 'tpsa']

In [48]:
# Column order of the exported table: split label, predictors, targets, id.
allVars = ['split', *predictors, *target_cols, 'Molecule Name']
df_all[allVars].to_csv('train_admet_split2_features.csv', index=False)

In [49]:
def _r_name(name):
    """Return `name` with every '-' and ' ' replaced by '.'."""
    # NOTE(review): presumably mirrors how the column names look once the CSV
    # is read on the R side -- confirm against the R script.
    return name.replace('-', '.').replace(' ', '.')


def _r_vector(var_name, values):
    """Format `values` as the R assignment ``var_name <- c('a', 'b', ...)``."""
    # Built by concatenation rather than a nested-quote f-string: reusing the
    # enclosing quote character inside an f-string replacement field is only
    # valid syntax from Python 3.12 onwards.
    return f"{var_name} <- c('" + "', '".join(values) + "')"


allVars = [_r_name(x) for x in allVars]

imputeVars = [_r_name(x) for x in target_cols]
nonImputeVars = [_r_name(x) for x in ['split', 'Molecule Name'] + predictors]
predictors = [_r_name(x) for x in predictors]

# Emit the variable groups as R code.
print(_r_vector("allVars", allVars))
print(_r_vector("imputeVars", imputeVars))
print(_r_vector("nonImputeVars", nonImputeVars))
print()
print(_r_vector("predictors", predictors))

allVars <- c('split', 'clogp', 'fsp3', 'mw', 'n_aliphatic_carbocycles', 'n_aliphatic_heterocyles', 'n_aliphatic_rings', 'n_aromatic_carbocycles', 'n_aromatic_heterocyles', 'n_aromatic_rings', 'n_heavy_atoms', 'n_hetero_atoms', 'n_lipinski_hba', 'n_lipinski_hbd', 'n_rings', 'n_rotatable_bonds', 'n_saturated_carbocycles', 'n_saturated_heterocyles', 'n_saturated_rings', 'qed', 'sas', 'tpsa', 'LogD', 'LogMLM', 'LogHLM', 'LogKSOL', 'LogMDR1.MDCKII', 'Molecule.Name')
imputeVars <- c('LogD', 'LogMLM', 'LogHLM', 'LogKSOL', 'LogMDR1.MDCKII')
nonImputeVars <- c('split', 'Molecule.Name', 'clogp', 'fsp3', 'mw', 'n_aliphatic_carbocycles', 'n_aliphatic_heterocyles', 'n_aliphatic_rings', 'n_aromatic_carbocycles', 'n_aromatic_heterocyles', 'n_aromatic_rings', 'n_heavy_atoms', 'n_hetero_atoms', 'n_lipinski_hba', 'n_lipinski_hbd', 'n_rings', 'n_rotatable_bonds', 'n_saturated_carbocycles', 'n_saturated_heterocyles', 'n_saturated_rings', 'qed', 'sas', 'tpsa')

predictors <- c('clogp', 'fsp3', 'mw', 'n_ali