# Generate MACCS train/test data, and fit balanced random forest model

In [3]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import classification_report, balanced_accuracy_score

In [4]:
import pandas as pd
import numpy as np
import json
from rdkit.Chem import MolFromSmiles
from rdkit.Chem import MACCSkeys
import joblib

In [6]:
# Load the Ames table, discard the first CSV column, index the rows by 'casn',
# and keep only the label ('bioactive') and 'Smiles' columns.
ames_maccs_smiles = (
    pd.read_csv('ames_maccs_smiles_df.csv')
    .iloc[:, 1:]
    .set_index('casn')
    .loc[:, ['bioactive', 'Smiles']]
)

In [7]:
# Load the held-out test set and index it by its 'casrn' column.
testX = pd.read_csv('testX.csv').set_index('casrn')

In [137]:
# Display the labelled SMILES table (index: casn; columns: bioactive, Smiles).
ames_maccs_smiles

Unnamed: 0_level_0,bioactive,Smiles
casn,Unnamed: 1_level_1,Unnamed: 2_level_1
100-00-5,True,O=[N+]([O-])c1ccc(Cl)cc1
100-01-6,True,O=[N+]([O-])c1ccc(N)cc1
100-02-7,False,Oc1ccc([N+]([O-])=O)cc1
100-17-4,True,c1([N+]([O-])=O)ccc(OC)cc1
100-40-3,False,C=CC1CC=CCC1
...,...,...
99-59-2,True,O=[N+]([O-])c1ccc(OC)c(N)c1
99610-72-7,True,Oc1c(cc(cc1NCCO)[N+](=O)[O-])[N+](=O)[O-]
99788-75-7,True,OCC(CNc1c(C(=O)c(cccc2)c2C3=O)c3c(NCC(CO)O)cc1)O
999-81-5,False,C(CCl)[N+](C)(C)C.[Cl-]


Generate MACCS

In [8]:
def smiles_maccs(smiles_input: str) -> pd.DataFrame:
    '''
    description : compute the MACCS fingerprint of a single molecule
    input: smiles_input - SMILES string of the molecule
    output: one-row DataFrame with the bit columns maccs_0 ... maccs_166
            plus a 'Smiles' column holding the input string
    raises: ValueError if RDKit cannot parse the SMILES string
    '''

    macc_bits = 167
    maccs_fp_name = [f'maccs_{i}' for i in range(macc_bits)] # generic fingerprint column names

    mol= MolFromSmiles(smiles_input)  # generate mol object from SMILES
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here with a
        # clear message instead of an opaque error inside GenMACCSKeys.
        raise ValueError(f'could not parse SMILES: {smiles_input!r}')

    maccs_fp= MACCSkeys.GenMACCSKeys(mol)  # get MACCS fingerprint
    maccs_fp_bits = list(maccs_fp)

    # single row of numeric bits, labelled with the generic column names
    df_maccs = pd.DataFrame(data=[maccs_fp_bits], columns=maccs_fp_name)
    # keep the source SMILES next to its fingerprint
    df_maccs['Smiles']= smiles_input

    return df_maccs

In [10]:
# generate MACCS: one single-row frame per molecule, in the same order as ames_maccs_smiles
ames_smiles_maccs_list= [smiles_maccs(smiles_input= smile_inst) for smile_inst in ames_maccs_smiles.Smiles]
# Attach casn / bioactive by row position instead of merging on "Smiles": a merge on the
# SMILES string multiplies rows whenever two compounds share the same SMILES.
ames_maccs_features= pd.concat(ames_smiles_maccs_list, ignore_index=True)
ames_maccs_df= pd.concat(
    [ames_maccs_features, ames_maccs_smiles.reset_index().drop(columns='Smiles')],
    axis=1)

In [11]:
# Index the feature table by CAS number. Guarded so that re-running the cell is a
# no-op instead of a KeyError ('casn' is no longer a column after the first run).
if ames_maccs_df.index.name != 'casn':
    ames_maccs_df= ames_maccs_df.set_index('casn')

In [12]:
# Names of the MACCS feature columns used for modelling: maccs_1 ... maccs_166
# (the name for bit 0 is left out).
macc_bits = 167
maccs_fp_name = [f'maccs_{bit}' for bit in range(1, macc_bits)]

Split the data

In [13]:
# Rows whose casn index appears in the test-set index form the test split;
# every other row goes to training.
in_test_set = ames_maccs_df.index.isin(testX.index)
train = ames_maccs_df[~in_test_set]
test = ames_maccs_df[in_test_set]
# features = MACCS bit columns, target = the 'bioactive' column
trainX, trainy = train.filter(items=maccs_fp_name), train.filter(items=['bioactive'])
testX, testy = test.filter(items=maccs_fp_name), test.filter(items=['bioactive'])

Transform data to boolean

In [14]:
# Cast every feature and target frame to boolean dtype.
trainX, trainy, testX, testy = (
    frame.astype(bool) for frame in (trainX, trainy, testX, testy)
)


Initialize balanced random forest model

In [68]:
# Balanced random forest with its hyper-parameters spelled out one per line;
# random_state is fixed so repeated fits use the same seed.
rf_bal_model = BalancedRandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    max_features=120,
    min_samples_leaf=2,
    class_weight={0: 1, 1: 1.5},
    random_state=0,
)

In [None]:
# Fit on the boolean MACCS training matrix. This must run before the model is
# persisted further down: with the call commented out, a fresh top-to-bottom run
# dumps an unfitted estimator over the trained pickle.
rf_bal_model.fit(np.array(trainX), trainy.values.ravel())

In [16]:
# Load the model stored in rf_bal_model.pkl (the file must already exist on disk).
# NOTE(review): joblib.load unpickles the file - only load pickles from a trusted source.
rf_bal_model_loaded= joblib.load('rf_bal_model.pkl')

In [34]:
# Balanced accuracy of the loaded model on the training split.
train_pred = rf_bal_model_loaded.predict(np.array(trainX))
print(balanced_accuracy_score(np.array(trainy), train_pred))

0.9186521978454039


In [19]:
# Balanced accuracy of the loaded model on the held-out test split.
test_pred = rf_bal_model_loaded.predict(np.array(testX))
print(balanced_accuracy_score(np.array(testy), test_pred))

0.8992914979757085


Persist the model in pkl format

In [81]:
# Write rf_bal_model to rf_bal_model.pkl, replacing any existing file.
# NOTE(review): this persists rf_bal_model in whatever state it is in - make sure it has
# been fitted in this session, otherwise the pickle on disk is overwritten by an
# unfitted estimator.
joblib.dump(rf_bal_model, 'rf_bal_model.pkl')

['rf_bal_model.pkl']

Load and use the model

In [18]:
# Read the pickle back into rf_bal_model_loaded (same call as in the evaluation section above).
rf_bal_model_loaded= joblib.load('rf_bal_model.pkl')

Generate MACCS features for model to consume

In [36]:
def get_maccs_from_smiles(smiles_input: str) -> np.ndarray:
    '''
    description : compute the MACCS feature row that the model consumes for one molecule
    input: smiles_input - SMILES string of the molecule
    output: numpy array of shape (1, n_bits - 1) holding the MACCS bits without the first bit
    raises: ValueError if RDKit cannot parse the SMILES string
    '''
    mol= MolFromSmiles(smiles_input)  # generate mol object from SMILES
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here with a
        # clear message instead of an opaque error inside GenMACCSKeys.
        raise ValueError(f'could not parse SMILES: {smiles_input!r}')

    maccs_fp= MACCSkeys.GenMACCSKeys(mol)  # get MACCS fingerprint
    # drop the first bit (always zero), so the columns line up with the
    # maccs_1 ... maccs_166 feature names used for training
    maccs_fp_array= np.array(list(maccs_fp)[1:])
    # a single sample must be passed to the model as a 2-D array with one row
    maccs_fp_array_reshaped= maccs_fp_array.reshape(1,-1)
    return maccs_fp_array_reshaped

In [39]:
# Example call on the SMILES "CCC"; the result is a single-row 2-D array of bits.
get_maccs_from_smiles(smiles_input= "CCC")

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])

Run a prediction

In [38]:
# Predicted class probabilities for the SMILES "CCC" (one row, one column per class).
rf_bal_model_loaded.predict_proba(
    get_maccs_from_smiles(smiles_input= "CCC"))

array([[0.86732095, 0.13267905]])

In [109]:
# Hard class predictions for every row of the test feature matrix.
rf_bal_model_loaded.predict(np.array(testX))

array([ True,  True,  True,  True,  True, False,  True, False,  True,
       False, False,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False, False,  True,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False, False, False,  True,  True,
        True,  True, False,  True,  True,  True, False, False,  True,
       False,  True,  True,  True,  True, False,  True,  True, False,
        True,  True, False, False, False,  True,  True, False])

In [41]:
import requests

# NOTE(review): hard-coded address of the prediction endpoint - consider moving it to a
# configuration cell or an environment variable.
url = 'http://137.183.11.247:5000/predict'

In [42]:
# POST to the prediction endpoint. A timeout is set so the cell cannot hang forever
# if the service is unreachable.
# NOTE(review): no payload is sent; the recorded response is a 500 (IndexError) from
# the service - presumably it expects input data, confirm the request schema.
r = requests.post(url, timeout=10)


In [43]:
# Show the response object (its repr includes the HTTP status code).
r

<Response [500]>

In [44]:
# Dump the raw response body.
# NOTE(review): the recorded output is a Werkzeug debugger page that includes a SECRET
# value and a local filesystem path - clear this output before sharing the notebook.
print(r.content)

b'<!doctype html>\n<html lang=en>\n  <head>\n    <title>IndexError: list index out of range\n // Werkzeug Debugger</title>\n    <link rel="stylesheet" href="?__debugger__=yes&amp;cmd=resource&amp;f=style.css">\n    <link rel="shortcut icon"\n        href="?__debugger__=yes&amp;cmd=resource&amp;f=console.png">\n    <script src="?__debugger__=yes&amp;cmd=resource&amp;f=debugger.js"></script>\n    <script>\n      var CONSOLE_MODE = false,\n          EVALEX = true,\n          EVALEX_TRUSTED = false,\n          SECRET = "s3eZ7XQq0hEgK4JTuYhn";\n    </script>\n  </head>\n  <body style="background-color: #fff">\n    <div class="debugger">\n<h1>IndexError</h1>\n<div class="detail">\n  <p class="errormsg">IndexError: list index out of range\n</p>\n</div>\n<h2 class="traceback">Traceback <em>(most recent call last)</em></h2>\n<div class="traceback">\n  <h3></h3>\n  <ul><li><div class="frame" id="frame-139913879582272">\n  <h4>File <cite class="filename">"/home/mahmoud/anaconda3/envs/my-rdkit-env