#### Generate predictions for a new list of chemicals for Intrinsic Clearance

- Step 1: Identify substances of interest and their SMILES codes - Use KNIME to convert SMILES into a V2000 sdf file
- See the example KNIME workflow (.knwf file, generated in KNIME 3.7.2) in the models directory (httk/models)
- Step 2: Use sdf file to generate Pubchem and ToxPrint Fingerprints using KNIME and the Chemotyper
- Step 3: Use sdf file to generate OPERA descriptors (v2.6)



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
import glob

In [2]:
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.metrics import r2_score
import pickle

In [3]:
def normalizeDescriptors(X):
    """Standardise the columns of a descriptor table.

    A ``StandardScaler`` is fitted on ``X`` itself and then applied to it.

    Parameters
    ----------
    X : pandas.DataFrame
        Descriptor table; its index and column labels are carried over
        to the result.

    Returns
    -------
    pandas.DataFrame
        The scaled values, with the same index and columns as ``X``.
    """
    standardized = preprocessing.StandardScaler().fit_transform(X)
    return pd.DataFrame(standardized, index=X.index, columns=X.columns)

In [4]:
# Project directory layout. Every path is derived from a single project root
# so the notebook can run on another machine: set the HTTK_BASE_DIR
# environment variable to override the default location below.
# os.path.join(root, '') guarantees a trailing separator, because the cells
# below build file paths by plain string concatenation (e.g. processed_dir + name).
_httk_root = os.path.join(
    os.environ.get('HTTK_BASE_DIR', '/home/grace/Documents/python/httk/'), ''
)

raw_dir = _httk_root + 'data/raw/'
processed_dir = _httk_root + 'data/processed/'
interim_dir = _httk_root + 'data/interim/'
figures_dir = _httk_root + 'reports/figures/'
external_dir = _httk_root + 'data/external/'
models_dir = _httk_root + 'models/'

Load descriptors needed for intrinsic clearance (regression model)

Looks like per Table S6 this model only needs Pubchem and ToxPrint fingerprints. 

In [5]:
# Read the PubChem fingerprint table from the processed-data directory.
pubchem = pd.read_csv(filepath_or_buffer=processed_dir + 'Fub_Pubchem.csv')

In [6]:
# Preview the first rows of the PubChem fingerprint table.
pubchem.head()

Unnamed: 0,CASRN,bitvector0,bitvector1,bitvector2,bitvector3,bitvector4,bitvector5,bitvector6,bitvector7,bitvector8,...,bitvector871,bitvector872,bitvector873,bitvector874,bitvector875,bitvector876,bitvector877,bitvector878,bitvector879,bitvector880
0,94-74-6,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,148477-71-8,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,56-29-1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,153233-91-1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,96182-53-5,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Read the ToxPrint fingerprint workbook from the processed-data directory.
txps = pd.read_excel(io=processed_dir + 'ToxPrints.xlsx')


In [24]:
# Read the table listing the features used by the Clint regression model.
clint_features_reg = pd.read_csv(
    filepath_or_buffer=external_dir + 'Clint_Features_Regression.csv'
)

In [25]:
# The 'Fingerprints' cell of the first row holds the feature names as the text
# of a list, e.g. "['name1', 'name2', ...]". Turn it into a list of names.
_fingerprint_text = str(clint_features_reg.loc[0, 'Fingerprints']).strip()

# Remove only the list's own enclosing brackets. Stripping every '[' / ']'
# from the first and last entries would also corrupt feature names that
# themselves contain brackets.
if _fingerprint_text.startswith('['):
    _fingerprint_text = _fingerprint_text[1:]
if _fingerprint_text.endswith(']'):
    _fingerprint_text = _fingerprint_text[:-1]

# Split on commas and drop the quoting and padding around each name;
# empty entries (e.g. from an empty list) are discarded.
retain = [
    name
    for name in (
        piece.replace("'", "").replace(" ", "")
        for piece in _fingerprint_text.split(',')
    )
    if name
]

In [27]:
#retain

In [30]:
# Index the PubChem fingerprints by CASRN so they can be matched against the
# ToxPrint table by index further down. The check on the index name makes the
# cell safe to re-run: a second set_index('CASRN') would raise KeyError because
# the column has already been moved into the index.
if pubchem.index.name != 'CASRN':
    pubchem = pubchem.set_index('CASRN')

In [29]:
# Inspect the PubChem fingerprint frame (expected to be indexed by CASRN at this point).
pubchem

Unnamed: 0_level_0,bitvector0,bitvector1,bitvector2,bitvector3,bitvector4,bitvector5,bitvector6,bitvector7,bitvector8,bitvector9,...,bitvector871,bitvector872,bitvector873,bitvector874,bitvector875,bitvector876,bitvector877,bitvector878,bitvector879,bitvector880
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
94-74-6,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
148477-71-8,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
56-29-1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
153233-91-1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
96182-53-5,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251565-85-2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
95058-81-4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
222400-20-6,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
83015-26-3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Keep only the fingerprint columns and index the ToxPrint table by the
# 'INPUT' identifier column so it can be aligned with the PubChem table.
# Written without inplace=True and guarded so the cell can be re-run:
# errors='ignore' tolerates columns that were already dropped, and the
# index-name check skips set_index once 'INPUT' is already the index.
txps = txps.drop(columns=['DTXSID', 'PREFERRED_NAME'], errors='ignore')
if txps.index.name != 'INPUT':
    txps = txps.set_index('INPUT')

In [32]:
#txps

In [33]:
# Identifiers present in both fingerprint tables.
# Index.intersection is the explicit set operation; `index & index` was
# deprecated as a set operation and no longer intersects in pandas 2.x.
# It also yields a reproducible order, unlike going through set().
ids = list(txps.index.intersection(pubchem.index).unique())

In [34]:
# Restrict both fingerprint tables to the shared identifiers, in the same row order.
txps_, pubchem_ = (frame.loc[ids] for frame in (txps, pubchem))

In [35]:
# Place the PubChem and ToxPrint fingerprints side by side, one row per chemical.
descriptors = pd.concat(objs=[pubchem_, txps_], axis='columns')

In [36]:
# Check the dimensions (rows, columns) of the combined fingerprint table.
descriptors.shape

(1118, 1610)

In [37]:
# Keep only the fingerprint columns named in `retain`, in that order.
fingerprints_clintReg = descriptors[retain]

In [38]:
# Check the dimensions after restricting to the retained fingerprint columns.
fingerprints_clintReg.shape

(1118, 58)

Load saved model

In [39]:
# Load the saved intrinsic-clearance regression model.
# The context manager closes the file handle once the model is read.
# NOTE: unpickling can execute arbitrary code, so only load model files
# that come from a trusted source.
with open(models_dir + 'clintReg_rf.sav', 'rb') as model_file:
    clint_rf = pickle.load(model_file)

In [40]:
# Number of features the loaded model reports importances for; compare this
# with the column count of fingerprints_clintReg.
len(clint_rf.feature_importances_)

58

The model expects 58 features, which matches the 58 fingerprint columns selected above.

In [45]:
# Run the model on the selected fingerprints and back-transform the output by
# raising 10 to the predicted value (presumably the model was fitted on
# log10-scaled clearance -- confirm against the training notebook).
log_scale_predictions = clint_rf.predict(fingerprints_clintReg)
predicted_clint_rf = pd.DataFrame(
    data=10 ** log_scale_predictions,
    index=fingerprints_clintReg.index,
)

In [46]:
# Give the single prediction column a descriptive name.
predicted_clint_rf.columns = ['pred_clint_rf']

In [50]:
# Show the prediction for the chemical whose index label is 1007-28-9.
is_query_chemical = predicted_clint_rf.index == '1007-28-9'
predicted_clint_rf.loc[is_query_chemical]

Unnamed: 0,pred_clint_rf
1007-28-9,6.42073
