Making CLint regression predictions for the Tox21 library

- Used Physprop collection from genra_dev_v5 to extract OPERA predictions
- Downloaded ToxPrints from the EPA CompTox Chemicals Dashboard
- Used PadelPy to compute PubChem fingerprints



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import openpyxl
import os
import sys

In [2]:
# Project root directory, always ending in a path separator so that the
# `TOP + 'data/...'` concatenations used for the directory constants stay valid.
# Only a trailing 'notebooks' path component is removed: a plain
# str.replace('notebooks', '') would also rewrite unrelated parts of the path
# and would leave no trailing separator when run from another directory.
TOP = os.getcwd()
if os.path.basename(TOP) == 'notebooks':
    TOP = os.path.dirname(TOP)
TOP = os.path.join(TOP, '')

In [3]:
# Project sub-directories, built from TOP (which must end in a path separator).
raw_dir = f'{TOP}data/raw/'
interim_dir = f'{TOP}data/interim/'
external_dir = f'{TOP}data/external/'
models_dir = f'{TOP}models/'

In [4]:
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.metrics import r2_score
import pickle
import glob

In [5]:
def normalizeDescriptors(X):
    """Scale the columns of ``X`` with a ``StandardScaler`` fitted on ``X`` itself.

    Parameters
    ----------
    X : pandas.DataFrame
        Descriptor table to scale.

    Returns
    -------
    pandas.DataFrame
        The scaled values, carrying the same index and column labels as ``X``.
    """
    fitted_scaler = preprocessing.StandardScaler().fit(X)
    scaled_values = fitted_scaler.transform(X)
    # Re-attach the original labels; the scaler output does not carry them here.
    scaled_frame = pd.DataFrame(scaled_values, index=X.index)
    scaled_frame.columns = X.columns
    return scaled_frame


In [6]:
# Load the PubChem fingerprint table from the interim data directory.
pubchem_csv_path = f'{interim_dir}Tox21_pubchem.csv'
pubchem = pd.read_csv(pubchem_csv_path)

In [7]:
# Give the CSV's unnamed first column a proper name
# (presumably it holds the DSSTox substance IDs - confirm against the file).
pubchem = pubchem.rename(columns={'Unnamed: 0': 'dsstox_sid'})

In [8]:
# Index the PubChem table by substance ID so it can be aligned with the ToxPrints.
pubchem = pubchem.set_index('dsstox_sid')

In [9]:
# Load the ToxPrint fingerprint table from the interim data directory.
txps_csv_path = f'{interim_dir}tox21_txps_all.csv'
txps = pd.read_csv(txps_csv_path)

In [10]:
# Preview the first rows of the raw ToxPrint table.
txps.head()

Unnamed: 0.1,Unnamed: 0,INPUT,DTXSID,PREFERRED_NAME,atom:element_main_group,atom:element_metal_group_I_II,atom:element_metal_group_III,atom:element_metal_metalloid,atom:element_metal_poor_metal,atom:element_metal_transistion_metal,...,ring:polycycle_bicyclo_propene,ring:polycycle_spiro_[2.2]pentane,ring:polycycle_spiro_[2.5]octane,ring:polycycle_spiro_[4.5]decane,ring:polycycle_spiro_1_4-dioxaspiro[4.5]decane,ring:polycycle_tricyclo_[3.5.5]_cyclopropa[cd]pentalene,ring:polycycle_tricyclo_[3.7.7]bullvalene,ring:polycycle_tricyclo_[3.7.7]semibullvalene,ring:polycycle_tricyclo_adamantane,ring:polycycle_tricyclo_benzvalene
0,0,DTXSID7020005,DTXSID7020005,Acetamide,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,DTXSID2020006,DTXSID2020006,Acetaminophen,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,DTXSID7020007,DTXSID7020007,Acetohexamide,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,DTXSID7020009,DTXSID7020009,Acetonitrile,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,DTXSID6020014,DTXSID6020014,Dehydroacetic acid,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Index by the 'INPUT' identifier and drop the non-fingerprint columns,
# leaving only the ToxPrint bits.
non_fingerprint_columns = ['Unnamed: 0', 'DTXSID', 'PREFERRED_NAME']
txps = txps.set_index('INPUT').drop(columns=non_fingerprint_columns)
txps.head()

Unnamed: 0_level_0,atom:element_main_group,atom:element_metal_group_I_II,atom:element_metal_group_III,atom:element_metal_metalloid,atom:element_metal_poor_metal,atom:element_metal_transistion_metal,atom:element_noble_gas,bond:C#N_cyano_acylcyanide,bond:C#N_cyano_cyanamide,bond:C#N_cyano_cyanohydrin,...,ring:polycycle_bicyclo_propene,ring:polycycle_spiro_[2.2]pentane,ring:polycycle_spiro_[2.5]octane,ring:polycycle_spiro_[4.5]decane,ring:polycycle_spiro_1_4-dioxaspiro[4.5]decane,ring:polycycle_tricyclo_[3.5.5]_cyclopropa[cd]pentalene,ring:polycycle_tricyclo_[3.7.7]bullvalene,ring:polycycle_tricyclo_[3.7.7]semibullvalene,ring:polycycle_tricyclo_adamantane,ring:polycycle_tricyclo_benzvalene
INPUT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DTXSID7020005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DTXSID2020006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DTXSID7020007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DTXSID7020009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DTXSID6020014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Load the table that lists the features used by the CLint regression model.
clint_features_path = f'{external_dir}Clint_Features_Regression.csv'
clint_features_reg = pd.read_csv(clint_features_path)

In [13]:
# Keep only the chemicals present in both fingerprint tables, then join the
# two tables column-wise.
# Index.intersection replaces `pubchem.index & txps.index`: using `&` as a set
# operation on an Index is deprecated and was removed in pandas 2.0. It also
# yields a deterministic row order, unlike the previous round-trip through set().
ids = list(pubchem.index.intersection(txps.index))
txps = txps.loc[ids]
pubchem = pubchem.loc[ids]
fingerprints = pd.concat([pubchem, txps], axis=1)

In [14]:
# Rename the PubChem bit columns from 'PubchemFP...' to 'bitvector...'
# (presumably the naming the regression model's feature list uses - confirm).
renamed_columns = fingerprints.columns.str.replace('PubchemFP', 'bitvector')
fingerprints.columns = renamed_columns

In [24]:
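# Disabled scratch check: list the rows where a ToxPrint column holds the '-' placeholder
# (presumably a chemical for which ToxPrints could not be generated).
#fingerprints[fingerprints['ring:polycycle_bicyclo_propene'] == '-']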

In [16]:
# The 'Fingerprints' cell of the first row appears to hold the selected feature
# names as a stringified list ("['name1', 'name2', ...]"); rebuild the list.
raw_features = clint_features_reg.loc[0, 'Fingerprints'].strip()
# Remove only the outer list brackets. Deleting every '[' / ']' from the first
# and last entries would corrupt feature names that contain brackets themselves
# (e.g. 'ring:polycycle_spiro_[2.2]pentane').
if raw_features.startswith('[') and raw_features.endswith(']'):
    raw_features = raw_features[1:-1]
# Drop the quotes and padding spaces around each name.
retain = [str(name.replace("'", "").replace(" ", "")) for name in raw_features.split(',')]

In [17]:
# Restrict the fingerprint table to the columns listed in `retain`, in that order.
fingerprints_clintReg = fingerprints[retain]

In [18]:
# (rows, columns) of the model input; the column count should equal the number of retained features.
fingerprints_clintReg.shape

(8604, 58)

In [28]:
# Drop chemicals whose fingerprint row contains the '-' placeholder
# (presumably ToxPrints that could not be generated - confirm against the source file).
# Every retained column is checked, not just 'bond:CN_amine_aliphatic_generic':
# a '-' left in any other column would reach the model and fail numeric conversion.
has_placeholder = fingerprints_clintReg.eq('-').any(axis=1)
fingerprints_clintReg = fingerprints_clintReg.loc[~has_placeholder]

In [19]:
# Load the trained CLint regression model.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that come from a trusted source.
# The `with` block closes the file handle, which the bare open() call leaked.
with open(models_dir + 'clintReg_rf.sav', 'rb') as model_file:
    clint_rf = pickle.load(model_file)

In [20]:
# Number of features the loaded model reports importances for; it should match
# the number of columns in fingerprints_clintReg.
len(clint_rf.feature_importances_)


58

In [29]:
# Predict with the loaded model and raise 10 to the prediction
# (presumably the model was trained on log10-transformed CLint - confirm).
model_output = clint_rf.predict(fingerprints_clintReg)
predicted_clint_rf = pd.DataFrame(10 ** model_output, index=fingerprints_clintReg.index)

In [30]:
# Label the single prediction column.
predicted_clint_rf.columns = pd.Index(['pred_clint_rf'])

In [31]:
# (rows, columns) of the prediction table: one row per chemical that passed the '-' filter.
predicted_clint_rf.shape

(8573, 1)

In [32]:
# Preview the first predictions.
predicted_clint_rf.head()

Unnamed: 0,pred_clint_rf
DTXSID2026943,7.663445
DTXSID5057622,8.432165
DTXSID7045948,5.197866
DTXSID0047408,19.148468
DTXSID1048887,9.326406


In [34]:
# Write the predictions to an Excel workbook using the openpyxl engine.
# The context manager saves and closes the workbook on exit; the explicit
# writer.save() call is deprecated and was removed in pandas 2.0.
# NOTE(review): the sheet name is 34 characters long, above Excel's 31-character
# limit; it is kept unchanged here because other code may read the sheet by name.
with pd.ExcelWriter(external_dir + 'Tox21_httk_CLint_reg_predictions.xlsx', engine='openpyxl') as writer:
    predicted_clint_rf.to_excel(writer, sheet_name='Tox21_Clint_regression_predictions')