In [1]:
import os

import pandas as pd
import rdkit.Chem as Chem
from admet_ai import ADMETModel
from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer
from sklearn.preprocessing import RobustScaler

  from .autonotebook import tqdm as notebook_tqdm


This notebook creates the features for the model: predictions from ChemProp (via ADMET-AI) and the ChemBERTa-77M-MTR transformer embedding from HuggingFace. This is done in a separate notebook because the environment needed for the Ordinal Regression model is different from the one required for ADMET-AI.

In [2]:
# Root directory of the project data. Set the POLARIS_ADMET_DIR environment
# variable to point the notebook at a different checkout; when it is unset the
# original local path is used, so existing runs behave exactly as before.
proj_dir = os.environ.get(
    'POLARIS_ADMET_DIR',
    '/Users/robertarbon/Library/CloudStorage/GoogleDrive-robert.arbon@gmail.com/My Drive/Polaris_ASAP_competition/polaris_challenge/admet',
)

In [7]:
# R doesn't like hyphens or spaces in its dataframe names, so every frame read
# in this cell is given the same dot-separated column names.
r_safe_names = {'Molecule Name': 'Molecule.Name', 'LogMDR1-MDCKII': 'LogMDR1.MDCKII'}

# Non-imputed data - contains all the 'training' data from Polaris.
df_train = pd.read_csv(
    f'{proj_dir}/dm_features/ordinal_data_split_2/train_admet_split2_features.csv'
).rename(columns=r_safe_names)
# Test data - the 'test' set from Polaris.
df_test = pd.read_csv(f"{proj_dir}/data/test_admet_all.csv").rename(columns=r_safe_names)

# The SMILES column was removed during the imputation process, so it is
# recovered from the raw training file by molecule name.
df_smiles = pd.read_csv(f'{proj_dir}/data/train_admet_all.csv').rename(columns=r_safe_names)

# validate='many_to_one' makes pandas raise if a molecule name occurs more than
# once in the SMILES table; without it the left join would silently duplicate
# training rows.
df_train = df_train.merge(
    df_smiles.loc[:, ['Molecule.Name', 'CXSMILES']],
    on='Molecule.Name',
    how='left',
    validate='many_to_one',
)

# A left join leaves CXSMILES missing for names absent from the SMILES table.
# Fail here with the offending names rather than later inside RDKit.
missing_smiles = df_train['CXSMILES'].isna()
if missing_smiles.any():
    raise ValueError(
        'No CXSMILES found for molecules: '
        f"{df_train.loc[missing_smiles, 'Molecule.Name'].tolist()}"
    )

In [8]:
# Number of rows in the training and test frames, displayed as (train, test).
n_train_rows = df_train.shape[0]
n_test_rows = df_test.shape[0]
n_train_rows, n_test_rows

(434, 126)

In [9]:
# Count how many training rows fall into each value of the 'split' column.
rows_by_split = df_train.groupby('split')
rows_by_split['split'].count()

split
train    354
val       80
Name: split, dtype: int64

In [19]:
# Both pretrained featurisers are built once here and reused by features().
hf_transformer = PretrainedHFTransformer(
    kind="ChemBERTa-77M-MTR",
    notation='smiles',
    dtype=float,
)
cp_model = ADMETModel()

# Columns of the ADMET-AI prediction table that are kept as model features.
# The order below is the order in which they appear in the output files.
cp_cols = [
    'BBB_Martins',
    'Bioavailability_Ma',
    # CYP inhibition / substrate predictions
    'CYP1A2_Veith',
    'CYP2C19_Veith',
    'CYP2C9_Substrate_CarbonMangels',
    'CYP2C9_Veith',
    'CYP2D6_Substrate_CarbonMangels',
    'CYP2D6_Veith',
    'CYP3A4_Substrate_CarbonMangels',
    'CYP3A4_Veith',
    'PAMPA_NCATS',
    'Pgp_Broccatelli',
    'Caco2_Wang',
    'Clearance_Hepatocyte_AZ',
    'Clearance_Microsome_AZ',
    'Half_Life_Obach',
    'HydrationFreeEnergy_FreeSolv',
    'Lipophilicity_AstraZeneca',
    'PPBR_AZ',
    'Solubility_AqSolDB',
    'VDss_Lombardo',
]

def features(df):
    """Append ChemProp (ADMET-AI) predictions and ChemBERTa embeddings to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``CXSMILES`` column. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index holding, in order: the input
        columns, a ``SMILES`` column (RDKit ``MolToSmiles`` output), the
        ``cp_cols`` columns of ``cp_model.predict``, and the columns of the
        ``hf_transformer`` output (labelled 0, 1, ...).

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more of the ``CXSMILES`` strings.
    """
    # Work on a re-indexed copy: pd.concat(axis=1) aligns on the index, and both
    # feature tables below are numbered 0..n-1. A non-default input index would
    # otherwise produce misaligned, NaN-padded rows. Copying also keeps the
    # caller's frame free of the extra SMILES column.
    df = df.reset_index(drop=True)

    # chemprop features
    mols = [Chem.MolFromSmiles(cx) for cx in df['CXSMILES']]
    # MolFromSmiles returns None on a parse failure; report it explicitly
    # instead of letting MolToSmiles(None) raise an opaque error.
    unparsed = [cx for cx, mol in zip(df['CXSMILES'], mols) if mol is None]
    if unparsed:
        raise ValueError(
            f"RDKit could not parse {len(unparsed)} CXSMILES string(s), e.g. {unparsed[0]!r}"
        )
    df['SMILES'] = [Chem.MolToSmiles(mol) for mol in mols]
    cp_features = cp_model.predict(smiles=df['SMILES'].values)
    df = pd.concat([df, cp_features.loc[:, cp_cols].reset_index(drop=True)], axis=1)

    # Chemberta features
    # NOTE(review): the transformer is fed the raw CXSMILES rather than the
    # SMILES column computed above; kept as-is so the features are unchanged -
    # confirm this is intended.
    hf_features = pd.DataFrame(hf_transformer(df['CXSMILES'].values))
    df = pd.concat([df, hf_features], axis=1)
    return df



# Only the identifier columns (plus the split label for training data) are
# passed to features(); everything else in the output is generated there.
train_id_cols = ['split', 'Molecule.Name', 'CXSMILES']
test_id_cols = ['Molecule.Name', 'CXSMILES']

train_features = features(df_train.loc[:, train_id_cols])
test_features = features(df_test.loc[:, test_id_cols])

  vars(torch.load(path, map_location=lambda storage, loc: storage)["args"]),
  state = torch.load(path, map_location=lambda storage, loc: storage)


Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".
Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.b

  state = torch.load(path, map_location=lambda storage, loc: storage)


Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".
Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.b

SMILES to Mol: 100%|██████████| 434/434 [00:00<00:00, 1662399.94it/s]
Computing physchem properties: 100%|██████████| 434/434 [00:00<00:00, 577.99it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

In [20]:
# Write both feature tables to the current working directory, without the index.
for feature_table, csv_name in (
    (train_features, 'train_features_by_molecule_name.csv'),
    (test_features, 'test_features_by_molecule_name.csv'),
):
    feature_table.to_csv(csv_name, index=False)