#### Making CLint regression predictions for the HTTK library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import openpyxl
import os
import sys

In [2]:
# Resolve the project root. When the notebook runs from the project's
# `notebooks` folder the root is that folder's parent; otherwise the current
# working directory is used as-is. Only a trailing `notebooks` component is
# removed (a blanket str.replace would also mangle any other path component
# containing that word). The result always ends with a path separator because
# later cells build paths from TOP.
_cwd = os.getcwd()
TOP = os.path.join(os.path.dirname(_cwd) if os.path.basename(_cwd) == 'notebooks' else _cwd, '')

In [3]:
# Data folders under the project root. os.path.join inserts a separator after
# TOP when it is missing, and the trailing '' keeps a final separator so that
# `<dir> + filename` concatenation elsewhere keeps working.
raw_dir = os.path.join(TOP, 'data', 'raw', '')
interim_dir = os.path.join(TOP, 'data', 'interim', '')
external_dir = os.path.join(TOP, 'data', 'external', '')

In [4]:
# Model and report folders under the project root. os.path.join inserts a
# separator after TOP when it is missing, and the trailing '' keeps a final
# separator so that `<dir> + filename` concatenation elsewhere keeps working.
models_dir = os.path.join(TOP, 'models', '')
reports_dir = os.path.join(TOP, 'reports', '')

In [5]:
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.metrics import r2_score
import pickle
import glob

In [6]:
def normalizeDescriptors(X):
    """Return a standardized copy of the descriptor table *X*.

    A scikit-learn ``StandardScaler`` is fitted on *X* and applied to it.
    The scaled values are returned as a new DataFrame that carries the same
    index and column labels as *X*.
    """
    scaled_values = preprocessing.StandardScaler().fit_transform(X)
    return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

In [7]:
# Load the PubChem fingerprint table from the interim data folder.
pubchem = pd.read_csv(os.path.join(interim_dir, 'HTTK-Pubchem.csv'))

In [8]:
# Index the PubChem table by its 'CAS' column.
pubchem = pubchem.set_index('CAS')

In [9]:
# Load the semicolon-delimited TxPs fingerprint table from the interim data folder.
txps = pd.read_csv(os.path.join(interim_dir, 'HTTK-TxPs.txt'), sep=';')

In [10]:
# Remove the two string-valued bookkeeping columns so only the remaining columns are kept.
txps = txps.drop(columns=['M_COMPOUND_HISTORY_[STRING]', 'M_CORINA_SYMPHONY_ERRORS_[STRING]'])

In [11]:
# Index the TxPs table by its 'M_NAME' column.
txps = txps.set_index('M_NAME')

In [12]:
# Load the feature list for the Clint regression model from the external data folder.
clint_features_reg = pd.read_csv(os.path.join(external_dir, 'Clint_Features_Regression.csv'))

In [13]:
# Keep only the identifiers present in both fingerprint tables.
# Index.intersection is the supported set operation: using `&` between two
# Index objects for this purpose is deprecated and no longer performs a set
# intersection in pandas 2.x. It also gives a deterministic row order, which
# list(set(...)) did not.
ids = pubchem.index.intersection(txps.index)
txps = txps.loc[ids]
pubchem = pubchem.loc[ids]
# Place both fingerprint sets side by side, one row per shared identifier.
fingerprints = pd.concat([pubchem, txps], axis=1)

In [14]:
# Parse the 'Fingerprints' cell of the first row into a list of column names:
# split on commas, remove quotes and spaces from every entry, then remove the
# opening bracket from the first entry and the closing bracket from the last.
raw_names = clint_features_reg.loc[0, 'Fingerprints'].split(',')
retain = [str(name.replace("'", "").replace(" ", "")) for name in raw_names]
retain[0] = retain[0].replace("[", "")
retain[-1] = retain[-1].replace("]", '')

In [15]:
# Restrict the combined fingerprint table to the columns listed in `retain`.
fingerprints_clintReg = fingerprints[retain]

In [16]:
# Load the saved Clint regression model. The context manager closes the file
# handle once loading finishes.
# NOTE: unpickling can execute arbitrary code; only load model files that
# come from a trusted source.
with open(os.path.join(models_dir, 'clintReg_rf.sav'), 'rb') as model_file:
    clint_rf = pickle.load(model_file)

In [17]:
# Predict with the loaded model and raise 10 to the predicted values
# (presumably the model was trained on log10-transformed Clint -- confirm).
# The result is indexed like the input fingerprint table.
model_output = clint_rf.predict(fingerprints_clintReg)
predicted_clint_rf = pd.DataFrame(10 ** model_output, index=fingerprints_clintReg.index)

In [18]:
# Name the single prediction column.
predicted_clint_rf.columns = ['pred_clint_rf']

In [19]:
# Preview the first rows of the Clint predictions.
predicted_clint_rf.head()

Unnamed: 0,pred_clint_rf
10190-99-5,8.406022
93-55-0,16.160451
78967-07-4,8.592281
32329-20-7,10.37499
123-96-6,12.022457


In [20]:
# Load the previously reported predictions used as a reference for comparison.
check_df = pd.read_csv(os.path.join(reports_dir, 'pradeep_FupClintPredictions_HTTK_chem_props.csv'))

In [22]:
# CASRNs of the first rows of the reference table.
check_df['CASRN'].head().tolist()

['100-01-6', '100-02-7', '100-10-7', '100-15-2', '100-19-6']

In [24]:
# Preview the first rows of the reference prediction table.
check_df.head()

Unnamed: 0,CASRN,Fub (SVR Prediction),Fub (RF Prediction),Fub (Consensus Prediction),Clint Prediction (Bin),Clint Prediction
0,100-01-6,0.177836,0.388741,0.283289,Medium,11.456444
1,100-02-7,0.095463,0.131728,0.113595,Medium,10.190809
2,100-10-7,0.610963,0.310349,0.460656,Medium,9.8062
3,100-15-2,0.17403,0.386404,0.280217,Medium,12.955616
4,100-19-6,0.329827,0.335527,0.332677,Medium,12.207164


In [23]:
# Show the new predictions for the CASRNs in the first rows of the reference table.
reference_casrns = check_df.head().CASRN.tolist()
predicted_clint_rf.loc[predicted_clint_rf.index.isin(reference_casrns)]

Unnamed: 0,pred_clint_rf
100-02-7,10.190809
100-19-6,12.207164
100-01-6,11.456444
100-15-2,12.955616
100-10-7,9.8062


In [25]:
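# Checks out OK for the Clint regression: the predictions shown above match the 'Clint Prediction' values in check_df for the same five CASRNs.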