Making CLint classification predictions for the Tox21 library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import openpyxl
import os
import sys

In [2]:
# Resolve the project root. When the notebook is launched from a folder named
# `notebooks`, the root is that folder's parent; otherwise the working directory
# itself is used.
# Only the final path component is inspected: a blanket str.replace('notebooks', '')
# would also rewrite unrelated parts of the path that merely contain that word.
_cwd = os.getcwd()
_project_root = os.path.dirname(_cwd) if os.path.basename(_cwd) == 'notebooks' else _cwd
# Joining with '' guarantees the trailing separator that the later
# `TOP + 'data/...'` style concatenations rely on.
TOP = os.path.join(_project_root, '')

In [3]:
# Data and model folders, all located relative to the project root TOP.
# os.path.join only inserts a separator when TOP does not already end with one,
# so the result is unchanged for a TOP such as '/proj/' and is still a valid path
# for a TOP such as '/proj' (plain `+` would yield '/projdata/raw/').
# Each folder keeps its trailing '/' because later cells append file names with `+`.
raw_dir = os.path.join(TOP, 'data/raw/')
interim_dir = os.path.join(TOP, 'data/interim/')
external_dir = os.path.join(TOP, 'data/external/')
models_dir = os.path.join(TOP, 'models/')

In [4]:
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.metrics import r2_score
import pickle
import glob

In [5]:
def normalizeDescriptors(X):
    """Standardise every column of a descriptor table.

    A fresh ``StandardScaler`` is fitted on ``X`` itself and immediately applied
    to it, so the scaling statistics come from the data passed in.

    Parameters
    ----------
    X : pandas.DataFrame
        Descriptor table to standardise.

    Returns
    -------
    pandas.DataFrame
        The scaled values, carrying the same index and column labels as ``X``.
    """
    standardized = preprocessing.StandardScaler().fit_transform(X)
    return pd.DataFrame(standardized, index=X.index, columns=X.columns)

In [6]:
# Load the table stored in Tox21_pubchem.csv from the interim data folder.
pubchem = pd.read_csv(os.path.join(interim_dir, 'Tox21_pubchem.csv'))

In [7]:
# The identifier column was read under the placeholder header 'Unnamed: 0';
# give it the name 'dsstox_sid' so it can be used as the index.
pubchem = pubchem.rename(columns={'Unnamed: 0': 'dsstox_sid'})

In [8]:
# Index the table by 'dsstox_sid' so rows can be aligned by identifier.
pubchem = pubchem.set_index('dsstox_sid')

In [9]:
# Load the table stored in tox21_txps_all.csv from the interim data folder.
txps = pd.read_csv(os.path.join(interim_dir, 'tox21_txps_all.csv'))

In [10]:
# Index by the 'INPUT' identifier and discard the bookkeeping columns so that
# only the fingerprint columns remain.
txps = (
    txps
    .set_index('INPUT')
    .drop(columns=['Unnamed: 0', 'DTXSID', 'PREFERRED_NAME'])
)
txps.head()

Unnamed: 0_level_0,atom:element_main_group,atom:element_metal_group_I_II,atom:element_metal_group_III,atom:element_metal_metalloid,atom:element_metal_poor_metal,atom:element_metal_transistion_metal,atom:element_noble_gas,bond:C#N_cyano_acylcyanide,bond:C#N_cyano_cyanamide,bond:C#N_cyano_cyanohydrin,...,ring:polycycle_bicyclo_propene,ring:polycycle_spiro_[2.2]pentane,ring:polycycle_spiro_[2.5]octane,ring:polycycle_spiro_[4.5]decane,ring:polycycle_spiro_1_4-dioxaspiro[4.5]decane,ring:polycycle_tricyclo_[3.5.5]_cyclopropa[cd]pentalene,ring:polycycle_tricyclo_[3.7.7]bullvalene,ring:polycycle_tricyclo_[3.7.7]semibullvalene,ring:polycycle_tricyclo_adamantane,ring:polycycle_tricyclo_benzvalene
INPUT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DTXSID7020005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DTXSID2020006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DTXSID7020007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DTXSID7020009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DTXSID6020014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Keep only the identifiers present in BOTH tables, then place the two
# fingerprint sets side by side.
# Index.intersection is the explicit set operation; `index & index` was
# deprecated as a set operation and is evaluated element-wise by current pandas.
# It also yields a reproducible order, unlike list(set(...)), whose order for
# strings can change from one kernel session to the next.
ids = list(pubchem.index.intersection(txps.index))
txps = txps.loc[ids]
pubchem = pubchem.loc[ids]
fingerprints = pd.concat([pubchem, txps], axis=1)

In [12]:
# Rewrite the 'PubchemFP' part of each column name to 'bitvector';
# columns without that substring keep their name.
fingerprints = fingerprints.rename(
    columns=lambda label: label.replace('PubchemFP', 'bitvector')
)
fingerprints.head()

Unnamed: 0,bitvector0,bitvector1,bitvector2,bitvector3,bitvector4,bitvector5,bitvector6,bitvector7,bitvector8,bitvector9,...,ring:polycycle_bicyclo_propene,ring:polycycle_spiro_[2.2]pentane,ring:polycycle_spiro_[2.5]octane,ring:polycycle_spiro_[4.5]decane,ring:polycycle_spiro_1_4-dioxaspiro[4.5]decane,ring:polycycle_tricyclo_[3.5.5]_cyclopropa[cd]pentalene,ring:polycycle_tricyclo_[3.7.7]bullvalene,ring:polycycle_tricyclo_[3.7.7]semibullvalene,ring:polycycle_tricyclo_adamantane,ring:polycycle_tricyclo_benzvalene
DTXSID6044773,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
DTXSID6057746,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
DTXSID2028016,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
DTXSID8025670,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
DTXSID1023730,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Load OPERA_TOX21.csv from the interim data folder, indexed by 'dsstox_sid'.
df_opera = pd.read_csv(
    os.path.join(interim_dir, 'OPERA_TOX21.csv'),
    index_col='dsstox_sid',
)
df_opera

Unnamed: 0_level_0,Unnamed: 0,OPERA_LogP,OPERA_PKAA,OPERA_PKAB
dsstox_sid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DTXSID0020022,0,3.700590,,
DTXSID0020024,1,0.752342,,
DTXSID0020105,2,-2.164240,5.42029,
DTXSID0020107,3,-1.204540,7.56472,
DTXSID0020151,4,1.962760,,
...,...,...,...,...
DTXSID9057842,8398,2.523990,4.24546,
DTXSID9057844,8399,7.929440,,3.05035
DTXSID9057846,8400,2.994230,-1.62208,
DTXSID9057848,8401,3.289430,,4.54729


In [14]:
# Discard the 'Unnamed: 0' column (a plain row counter in the displayed table).
df_opera = df_opera.drop(columns=['Unnamed: 0'])

In [15]:
# Rename the OPERA columns to the names used in the rest of the notebook.
# Renaming by source name (rather than overwriting `.columns` positionally)
# cannot attach a label to the wrong column if the file's column order changes,
# and is a harmless no-op when the cell is run a second time.
df_opera = df_opera.rename(columns={
    'OPERA_LogP': 'LogP_pred',
    'OPERA_PKAA': 'pKa_a_pred',
    'OPERA_PKAB': 'pKa_b_pred',
})

In [16]:
# Single pKa descriptor: the row-wise minimum of the two pKa columns.
# min() skips missing values, so the result is NaN only when both are missing.
df_opera = df_opera.assign(
    pKa_pred=df_opera[['pKa_a_pred', 'pKa_b_pred']].min(axis=1)
)

In [17]:
# When an identifier occurs more than once in the index, keep its first row only.
is_repeated_id = df_opera.index.duplicated(keep='first')
df_opera = df_opera.loc[~is_repeated_id]

In [18]:
# Rows lacking either model input (pKa_pred or LogP_pred) are removed;
# any missing values left in the other columns are then replaced by 0.
df_opera = (
    df_opera
    .dropna(subset=['pKa_pred', 'LogP_pred'])
    .fillna(0)
)

In [19]:
# Load the pickled scaler object for the OPERA descriptors.
# The `with` block closes the file handle as soon as loading finishes
# (a bare open() inside pickle.load leaves it open).
# NOTE(review): pickle.load can execute arbitrary code; only load model files
# from a trusted source.
with open(os.path.join(models_dir, 'opera_scaler_v2.sav'), 'rb') as scaler_file:
    opera_scaler = pickle.load(scaler_file)

In [20]:
# Apply the previously fitted scaler (loaded from the models folder) to the OPERA
# table, re-attach the original row/column labels, and keep only the two
# descriptors that are later combined with the fingerprints.
opera_scaled = opera_scaler.transform(df_opera)
opera = pd.DataFrame(
    opera_scaled,
    index=df_opera.index,
    columns=df_opera.columns,
)[['pKa_pred', 'LogP_pred']]

In [21]:
# Preview the first rows of the scaled pKa_pred / LogP_pred table.
opera.head()

Unnamed: 0_level_0,pKa_pred,LogP_pred
dsstox_sid,Unnamed: 1_level_1,Unnamed: 2_level_1
DTXSID0020105,-0.278175,-2.392401
DTXSID0020107,0.472962,-1.899167
DTXSID0020236,-0.815912,-2.05848
DTXSID0020238,1.797584,-0.996638
DTXSID0020943,-0.895641,-0.524309


In [22]:
# Load the pickled classifier used for the Clint class predictions.
# The `with` block closes the file handle as soon as loading finishes
# (a bare open() inside pickle.load leaves it open).
# NOTE(review): pickle.load can execute arbitrary code; only load model files
# from a trusted source.
with open(os.path.join(models_dir, 'clintClas_svc.sav'), 'rb') as model_file:
    clint_clas = pickle.load(model_file)

In [23]:
# Show the class labels stored on the loaded classifier
# (the recorded output lists -3, -2 and -1).
clint_clas.classes_

array([-3, -2, -1], dtype=int32)

In [24]:
# Display the combined fingerprint table built from `pubchem` and `txps`.
fingerprints

Unnamed: 0,bitvector0,bitvector1,bitvector2,bitvector3,bitvector4,bitvector5,bitvector6,bitvector7,bitvector8,bitvector9,...,ring:polycycle_bicyclo_propene,ring:polycycle_spiro_[2.2]pentane,ring:polycycle_spiro_[2.5]octane,ring:polycycle_spiro_[4.5]decane,ring:polycycle_spiro_1_4-dioxaspiro[4.5]decane,ring:polycycle_tricyclo_[3.5.5]_cyclopropa[cd]pentalene,ring:polycycle_tricyclo_[3.7.7]bullvalene,ring:polycycle_tricyclo_[3.7.7]semibullvalene,ring:polycycle_tricyclo_adamantane,ring:polycycle_tricyclo_benzvalene
DTXSID6044773,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
DTXSID6057746,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
DTXSID2028016,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
DTXSID8025670,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
DTXSID1023730,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DTXSID2045656,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
DTXSID2048822,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
DTXSID7020687,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
DTXSID2021577,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Load the external CSV whose 'Fingerprints' column lists the features to keep.
clint_features_clas = pd.read_csv(
    os.path.join(external_dir, '1-s2.0-S2468111320300463-mmc13.csv')
)

In [27]:
# The 'Fingerprints' cell of the first row is parsed as a bracketed,
# comma-separated list of quoted feature names.
raw_retain = clint_features_clas.loc[0, 'Fingerprints'].strip()
# Remove only the OUTER list brackets. Some descriptor names contain '[' or ']'
# themselves (e.g. 'ring:polycycle_spiro_[2.2]pentane'), so deleting every bracket
# from the first/last entry would corrupt such a name.
if raw_retain.startswith('['):
    raw_retain = raw_retain[1:]
if raw_retain.endswith(']'):
    raw_retain = raw_retain[:-1]
retain = [name.strip().strip("'") for name in raw_retain.split(',')]

# Fail early and explicitly if a requested feature is absent, instead of relying
# on version-dependent behaviour of .loc with unknown labels.
missing = [name for name in retain if name not in fingerprints.columns]
if missing:
    raise KeyError('Fingerprint columns not found: {}'.format(missing))
fingerprints_clintClas = fingerprints.loc[:, retain]

# Set X vector for predictions: selected fingerprints followed by the scaled
# OPERA descriptors, keeping only rows that are complete in both.
# sort=True states the row ordering explicitly (sorted by identifier, as in the
# recorded output), which avoids the FutureWarning about the default changing.
X_clintClas = pd.concat([fingerprints_clintClas, opera], axis=1, sort=True).dropna()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [29]:
# Preview the first rows of the feature matrix passed to the classifier.
X_clintClas.head()

Unnamed: 0,bitvector2,bitvector12,bitvector14,bitvector15,bitvector19,bitvector20,bitvector33,bitvector37,bitvector143,bitvector179,...,bitvector688,bitvector696,bitvector697,bitvector698,bitvector712,chain:alkaneLinear_ethyl_C2(H_gt_1),chain:alkaneLinear_ethyl_C2_(connect_noZ_CN=4),chain:aromaticAlkane_Ph-C1_acyclic_connect_noDblBd,pKa_pred,LogP_pred
DTXSID0020020,0,0,1,0,1,0,0,0,0,1,...,1,1,0,1,0,0,0,1,0.150108,-0.709237
DTXSID0020070,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,-0.879707,-2.794995
DTXSID0020072,0,0,1,0,0,0,0,1,0,0,...,1,1,1,0,1,0,0,0,-0.621965,-1.498246
DTXSID0020074,1,0,1,0,1,0,0,0,0,1,...,1,1,1,1,0,0,0,0,-0.803666,-1.845464
DTXSID0020076,0,0,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,-0.70554,-1.753311


In [30]:
# Predict a class for every row of the feature matrix and keep the row
# identifiers as the index of the resulting single-column table.
predicted_Clint_cls = pd.DataFrame(
    data=clint_clas.predict(X_clintClas),
    index=X_clintClas.index,
)

In [31]:
# Give the single prediction column a descriptive header.
predicted_Clint_cls.columns = ['Clint Prediction (Bin)']

In [32]:

# Translate the numeric class codes into readable labels in one pass.
# Replacing the whole column at once avoids writing strings into an integer
# column through masked .loc assignment (a dtype-incompatible setitem that
# recent pandas versions warn about). Values that are not a known code, such as
# labels already converted by an earlier run of this cell, are left unchanged.
clint_bin_labels = {-3: 'Low', -2: 'Medium', -1: 'High'}
predicted_Clint_cls['Clint Prediction (Bin)'] = (
    predicted_Clint_cls['Clint Prediction (Bin)'].replace(clint_bin_labels)
)

In [33]:
# Display the labelled predictions (one row per identifier).
predicted_Clint_cls

Unnamed: 0,Clint Prediction (Bin)
DTXSID0020020,Medium
DTXSID0020070,Low
DTXSID0020072,Medium
DTXSID0020074,Low
DTXSID0020076,Medium
...,...
DTXSID9057844,High
DTXSID9057846,Low
DTXSID9057848,Medium
DTXSID9057898,Low


In [35]:
# Write the predictions to an Excel workbook using the openpyxl engine.
# The context manager saves and closes the workbook when the block exits, even if
# to_excel raises; it replaces the explicit writer.save() call, which no longer
# exists in pandas 2.x.
with pd.ExcelWriter(
    os.path.join(external_dir, 'Tox21_httk_Clint_cls_predictions.xlsx'),
    engine='openpyxl',
) as writer:
    predicted_Clint_cls.to_excel(writer, sheet_name='Tox21_Clint_cls_predictions')