#### Making Fub predictions for the HTTK library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import openpyxl
import os
import sys

In [2]:
TOP = os.getcwd().replace('notebooks', '')

In [4]:
raw_dir = TOP + 'data/raw/'
interim_dir = TOP + 'data/interim/'
external_dir = TOP + 'data/external/'

In [3]:
models_dir = TOP + 'models/'
reports_dir = TOP + 'reports/'

In [5]:
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.metrics import r2_score
import pickle
import glob

In [6]:
def normalizeDescriptors(X):
    scaler = preprocessing.StandardScaler().fit(X)
    transformed = scaler.transform(X)
    x_norm = pd.DataFrame(transformed, index = X.index) 
    x_norm.columns = X.columns
    return(x_norm)


In [7]:
pubchem = pd.read_csv(interim_dir+'HTTK-Pubchem.csv')

In [8]:
pubchem.shape

(8792, 882)

In [9]:
pubchem.head()

Unnamed: 0,CAS,bitvector0,bitvector1,bitvector2,bitvector3,bitvector4,bitvector5,bitvector6,bitvector7,bitvector8,...,bitvector871,bitvector872,bitvector873,bitvector874,bitvector875,bitvector876,bitvector877,bitvector878,bitvector879,bitvector880
0,2971-36-0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,94-75-7,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,94-82-6,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,90-43-7,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1007-28-9,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
pubchem.set_index('CAS', inplace = True)

In [11]:
txps = pd.read_csv(interim_dir+'HTTK-TxPs.txt', sep = ';')

In [13]:
txps.drop(['M_COMPOUND_HISTORY_[STRING]', 'M_CORINA_SYMPHONY_ERRORS_[STRING]'], axis =1, inplace = True)

In [15]:
txps.set_index('M_NAME', inplace = True)

In [17]:
df_opera = pd.read_csv(interim_dir+'HTTK-OPERA.csv', index_col='MoleculeID')[['LogP_pred','pKa_a_pred', 'pKa_b_pred']]
df_opera

Unnamed: 0_level_0,LogP_pred,pKa_a_pred,pKa_b_pred
MoleculeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2971-36-0,4.621513,8.332265,
94-75-7,2.772283,2.419436,
94-82-6,3.381940,3.109999,
90-43-7,3.137555,9.349336,
1007-28-9,1.263031,,3.433594
...,...,...,...
75-01-4,1.005203,,
108341-18-0,-1.289582,,7.989363
115104-28-4,5.990367,3.763057,5.549423
1437319-51-1,5.721995,,5.834184


In [18]:
df_opera['pKa_pred']=df_opera[['pKa_a_pred','pKa_b_pred']].min(axis=1)


In [19]:
df_opera = df_opera[~df_opera.index.duplicated(keep='first')]

In [20]:
df_opera = df_opera.dropna(subset=['pKa_pred','LogP_pred']) #add1
df_opera.fillna(0, inplace=True) 

In [21]:
opera_scaler = pickle.load(open(models_dir+'opera_scaler_v2.sav', 'rb'))

In [22]:
# Normalize opera properties based on transformation scaler vector from the base models
opera_scaled = opera_scaler.transform(df_opera)
opera = pd.DataFrame(opera_scaled, index = df_opera.index) 
opera.columns = df_opera.columns
opera = opera[['pKa_pred','LogP_pred']]


In [23]:
desc = pd.read_csv(external_dir+'Human.Funbound.plasma_Features_v2.csv')

In [24]:
ids = list(set(pubchem.index & txps.index))
txps = txps.loc[ids]
pubchem = pubchem.loc[ids]
fingerprints = pd.concat([pubchem,txps ], axis =1)

In [25]:
retain = [str(val.replace("'", "").replace(" ", "")) for val in desc.loc[0,'Fingerprints'].split(',')]
retain[0] = retain[0].replace("[", "")
retain[len(retain)-1] = retain[len(retain)-1].replace("c]",'c')
fingerprints_fub = fingerprints.loc[:,retain]

In [26]:
opera_ = opera.loc[ids]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


In [27]:
descriptors = pd.concat([fingerprints_fub, opera_], axis=1).dropna(axis=0, how='any')

In [28]:
fub_rf = pickle.load(open(models_dir+'fub_rf_v2.sav', 'rb'))
fub_svr = pickle.load(open(models_dir+'fub_svr_v2.sav', 'rb'))

In [29]:
predicted_Fub = pd.DataFrame(1/(1+10**fub_rf.predict(descriptors)), descriptors.index )
predicted_Fub.columns = ['pred_Fub_rf']
predicted_Fub_2 = pd.DataFrame(1/(1+10**fub_svr.predict(descriptors)), descriptors.index )
predicted_Fub_2.columns = ['pred_Fub_svr']
predicted_Fub_all = pd.concat([predicted_Fub, predicted_Fub_2], axis = 1)
predicted_Fub_all['Consensus (SVM,RF)'] = predicted_Fub_all[['pred_Fub_svr', 'pred_Fub_rf']].mean(axis = 1)

predicted_Fub_all.head()

Unnamed: 0_level_0,pred_Fub_rf,pred_Fub_svr,"Consensus (SVM,RF)"
MoleculeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
68-11-1,0.757716,0.288604,0.52316
115-44-6,0.431735,0.515488,0.473611
693-98-1,0.876785,0.94376,0.910272
4784-40-1,0.100349,0.076452,0.0884
534-85-0,0.040982,0.11895,0.079966


In [34]:
predicted_Fub_all[predicted_Fub_all.index.isin(['100-01-6', '100-02-7', '100-10-7', '100-15-2', '100-19-6'])]

Unnamed: 0_level_0,pred_Fub_rf,pred_Fub_svr,"Consensus (SVM,RF)"
MoleculeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100-02-7,0.131728,0.095463,0.113595
100-10-7,0.310349,0.610963,0.460656
100-15-2,0.386404,0.17403,0.280217
100-01-6,0.388741,0.177836,0.283289
100-19-6,0.335527,0.329827,0.332677


In [30]:
check_df = pd.read_csv(reports_dir+'pradeep_FupClintPredictions_HTTK_chem_props.csv')

In [35]:
check_df.head()

Unnamed: 0,CASRN,Fub (SVR Prediction),Fub (RF Prediction),Fub (Consensus Prediction),Clint Prediction (Bin),Clint Prediction
0,100-01-6,0.177836,0.388741,0.283289,Medium,11.456444
1,100-02-7,0.095463,0.131728,0.113595,Medium,10.190809
2,100-10-7,0.610963,0.310349,0.460656,Medium,9.8062
3,100-15-2,0.17403,0.386404,0.280217,Medium,12.955616
4,100-19-6,0.329827,0.335527,0.332677,Medium,12.207164


In [36]:
check_df.shape

(6569, 6)

In [37]:
# checks out OK for Fub predictions