#### Making CLint classification predictions for the HTTK library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import openpyxl
import os
import sys

In [2]:
# Project root: the parent of the notebooks/ directory this notebook runs from.
# Only the final path component is stripped; str.replace('notebooks', '') would
# also delete 'notebooks' from any parent directory name. os.path.join(..., '')
# guarantees the trailing separator that later `TOP + 'data/...'` cells rely on.
_cwd = os.getcwd()
TOP = os.path.join(os.path.dirname(_cwd) if os.path.basename(_cwd) == 'notebooks' else _cwd, '')

In [3]:
# Data directories, built by plain concatenation onto the project root TOP.
raw_dir, interim_dir, external_dir = (
    TOP + subdir for subdir in ('data/raw/', 'data/interim/', 'data/external/')
)

In [4]:
# Locations of the pickled models and of the report files, relative to TOP.
models_dir, reports_dir = (TOP + subdir for subdir in ('models/', 'reports/'))

In [5]:
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.metrics import r2_score
import pickle
import glob

In [6]:
def normalizeDescriptors(X):
    """Standardize every column of X with a StandardScaler fitted on X itself.

    Parameters
    ----------
    X : pandas.DataFrame
        Descriptor table; its index and column labels are carried over.

    Returns
    -------
    pandas.DataFrame
        The scaled values, with the same index and columns as X.
    """
    scaled_values = preprocessing.StandardScaler().fit_transform(X)
    return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)


In [7]:
# Load the HTTK PubChem table from the interim data directory.
pubchem_path = interim_dir + 'HTTK-Pubchem.csv'
pubchem = pd.read_csv(pubchem_path)

In [8]:
# Use the 'CAS' column as the row index.
pubchem = pubchem.set_index('CAS')

In [9]:
# Load the semicolon-delimited HTTK TxPs table from the interim data directory.
txps_path = interim_dir + 'HTTK-TxPs.txt'
txps = pd.read_csv(txps_path, sep=';')

In [10]:
# Remove the two [STRING] columns so that only the remaining columns are kept.
string_columns = ['M_COMPOUND_HISTORY_[STRING]', 'M_CORINA_SYMPHONY_ERRORS_[STRING]']
txps = txps.drop(columns=string_columns)

In [11]:
# Use the 'M_NAME' column as the row index.
txps = txps.set_index('M_NAME')

In [12]:
# Read the OPERA predictions indexed by MoleculeID and keep only the logP and pKa columns.
opera_columns = ['LogP_pred', 'pKa_a_pred', 'pKa_b_pred']
df_opera = pd.read_csv(interim_dir + 'HTTK-OPERA.csv', index_col='MoleculeID')
df_opera = df_opera[opera_columns]
df_opera

Unnamed: 0_level_0,LogP_pred,pKa_a_pred,pKa_b_pred
MoleculeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2971-36-0,4.621513,8.332265,
94-75-7,2.772283,2.419436,
94-82-6,3.381940,3.109999,
90-43-7,3.137555,9.349336,
1007-28-9,1.263031,,3.433594
...,...,...,...
75-01-4,1.005203,,
108341-18-0,-1.289582,,7.989363
115104-28-4,5.990367,3.763057,5.549423
1437319-51-1,5.721995,,5.834184


In [13]:
# Single pKa column: the row-wise minimum of the two pKa prediction columns.
pka_columns = ['pKa_a_pred', 'pKa_b_pred']
df_opera['pKa_pred'] = df_opera.loc[:, pka_columns].min(axis=1)

In [14]:
# Keep only the first row for each repeated index label.
is_repeat = df_opera.index.duplicated(keep='first')
df_opera = df_opera.loc[~is_repeat]

In [15]:
# Drop rows lacking either pKa_pred or LogP_pred, then fill every remaining
# missing value with 0.
df_opera = df_opera.dropna(subset=['pKa_pred', 'LogP_pred']).fillna(0)

In [16]:
# Load the pickled scaler. The context manager closes the file handle, which the
# bare pickle.load(open(...)) form left open.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# trusted model artifacts.
with open(models_dir + 'opera_scaler_v2.sav', 'rb') as scaler_file:
    opera_scaler = pickle.load(scaler_file)

In [17]:
# Apply the scaler saved with the base models to the OPERA properties, then keep
# only the pKa and logP columns.
opera = pd.DataFrame(
    opera_scaler.transform(df_opera),
    index=df_opera.index,
    columns=df_opera.columns,
)
opera = opera.loc[:, ['pKa_pred', 'LogP_pred']]


In [18]:
# Identifiers present in both tables. Index.intersection is the explicit set
# operation: `index & index` is deprecated for this purpose (and is no longer a
# set operation in pandas 2), and list(set(...)) produced an arbitrary row order.
ids = list(pubchem.index.intersection(txps.index).unique())
txps = txps.loc[ids]
pubchem = pubchem.loc[ids]
# Place the two tables side by side, one row per shared identifier.
fingerprints = pd.concat([pubchem, txps], axis=1)

In [51]:
# External table whose 'Fingerprints' column is read in the next cell.
clint_features_path = external_dir + '1-s2.0-S2468111320300463-mmc13.csv'
clint_features_clas = pd.read_csv(clint_features_path)

In [52]:
# The first row's 'Fingerprints' cell is parsed as a comma-separated,
# bracket-enclosed list of quoted names: split on commas, then strip the quotes,
# the spaces and the enclosing brackets.
retain = [
    str(val.replace("'", "").replace(" ", ""))
    for val in clint_features_clas.loc[0, 'Fingerprints'].split(',')
]
retain[0] = retain[0].replace("[", "")
retain[-1] = retain[-1].replace("]", '')
fingerprints_clintClas = fingerprints.loc[:, retain]
# Set X vector for predictions
# sort=True pins the sorted row order explicitly: the two indexes are not
# aligned, so leaving `sort` unset triggers a pandas FutureWarning and makes the
# row order depend on the pandas version. dropna() removes rows missing from
# either table.
X_clintClas = pd.concat([fingerprints_clintClas, opera], axis=1, sort=True).dropna()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [53]:
# (rows, columns) of the feature matrix built above
X_clintClas.shape

(6569, 59)

In [54]:
# Display the feature matrix: retained fingerprint columns followed by pKa_pred and LogP_pred
X_clintClas

Unnamed: 0,bitvector2,bitvector12,bitvector14,bitvector15,bitvector19,bitvector20,bitvector33,bitvector37,bitvector143,bitvector179,...,bitvector688,bitvector696,bitvector697,bitvector698,bitvector712,chain:alkaneLinear_ethyl_C2(H_gt_1),chain:alkaneLinear_ethyl_C2_(connect_noZ_CN=4),chain:aromaticAlkane_Ph-C1_acyclic_connect_noDblBd,pKa_pred,LogP_pred
100-01-6,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.780964,-0.622044
100-02-7,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.297170,-0.247336
100-10-7,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.581221,-0.370013
100-15-2,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.915396,-0.279668
100-19-6,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.608819,-0.539233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NOCAS_48518,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.184283,0.708795
NOCAS_48522,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.536448,2.015039
NOCAS_48792,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.006903,-0.799142
NOCAS_48895,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.000753,0.493047


In [55]:
# Load the pickled Clint classifier. The context manager closes the file handle,
# which the bare pickle.load(open(...)) form left open.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# trusted model artifacts.
with open(models_dir + 'clintClas_svc.sav', 'rb') as model_file:
    clint_clas = pickle.load(model_file)

In [57]:
# Predict a class for every row of the feature matrix, reusing its index for the result.
predicted_Clint_cls = pd.DataFrame(
    data=clint_clas.predict(X_clintClas),
    index=X_clintClas.index,
)

In [59]:
# Give the single prediction column its label (the frame was built without column names).
predicted_Clint_cls.columns = ['Clint Prediction (Bin)']

In [60]:
# Display the predictions while they are still numeric class codes
predicted_Clint_cls

Unnamed: 0,Clint Prediction (Bin)
100-01-6,-2
100-02-7,-2
100-10-7,-2
100-15-2,-2
100-19-6,-2
...,...
NOCAS_48518,-2
NOCAS_48522,-2
NOCAS_48792,-2
NOCAS_48895,-2


In [61]:

# Translate the numeric class codes into labels in a single pass. The previous
# three .loc assignments wrote strings into the numeric column one code at a
# time, which recent pandas deprecates (setting values of an incompatible dtype).
# Values without an entry in the mapping pass through unchanged, as before.
clint_bin_labels = {-3: 'Low', -2: 'Medium', -1: 'High'}
predicted_Clint_cls['Clint Prediction (Bin)'] = predicted_Clint_cls['Clint Prediction (Bin)'].map(
    lambda code: clint_bin_labels.get(code, code)
)

In [62]:
# Show the labelled prediction column as a one-column frame.
predicted_Clint_cls.loc[:, ['Clint Prediction (Bin)']]

Unnamed: 0,Clint Prediction (Bin)
100-01-6,Medium
100-02-7,Medium
100-10-7,Medium
100-15-2,Medium
100-19-6,Medium
...,...
NOCAS_48518,Medium
NOCAS_48522,Medium
NOCAS_48792,Medium
NOCAS_48895,Medium


In [42]:
# Read the predictions file from the reports directory for comparison.
check_path = reports_dir + 'pradeep_FupClintPredictions_HTTK_chem_props.csv'
check_df = pd.read_csv(check_path)

In [43]:
# Display the loaded predictions file
check_df

Unnamed: 0,CASRN,Fub (SVR Prediction),Fub (RF Prediction),Fub (Consensus Prediction),Clint Prediction (Bin),Clint Prediction
0,100-01-6,0.177836,0.388741,0.283289,Medium,11.456444
1,100-02-7,0.095463,0.131728,0.113595,Medium,10.190809
2,100-10-7,0.610963,0.310349,0.460656,Medium,9.806200
3,100-15-2,0.174030,0.386404,0.280217,Medium,12.955616
4,100-19-6,0.329827,0.335527,0.332677,Medium,12.207164
...,...,...,...,...,...,...
6564,NOCAS_48518,0.001492,0.005965,0.003729,Medium,11.415380
6565,NOCAS_48522,0.007195,0.007716,0.007456,Medium,5.879028
6566,NOCAS_48792,0.102380,0.147277,0.124829,Medium,4.558198
6567,NOCAS_48895,0.523002,0.112102,0.317552,Medium,5.226937
