In [1]:
import numpy as np
import pandas as pd
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, load_model
from keras_self_attention import SeqSelfAttention
import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so it also hides pandas warnings such
# as SettingWithCopyWarning and library deprecation notices — presumably added
# to keep the Keras/TensorFlow import quiet; consider narrowing it.
warnings.filterwarnings('ignore')

Using TensorFlow backend.


### Prepare Data

In [2]:
# The training SMILES are reloaded here only to rebuild the tokenizer: the LSTM
# model can encode new molecules only with the vocabulary fitted on this set.
dataset = pd.read_csv("../data/train_valid/training_set.csv", delimiter=",")

max_len = 100

# Multi-character SMILES tokens are rewritten to one character each, applied in
# this order (the chirality-tagged carbons simply become a plain "C").
# NOTE(review): "[C@H]" is not in the list; left untouched to match the original
# preprocessing — confirm this is what the model was trained with.
smiles_substitutions = [
    ("[nH]", "A"),
    ("Cl", "L"),
    ("Br", "R"),
    ("[C@]", "C"),
    ("[C@@]", "C"),
    ("[C@@H]", "C"),
]

X_train = []
for smiles in dataset.iloc[:, 0].tolist():
    for token, symbol in smiles_substitutions:
        smiles = smiles.replace(token, symbol)
    # Joining a string with " " puts a space between every character, so the
    # tokenizer treats each character as a separate word.
    X_train.append(" ".join(smiles))

# num_words reuses the max_len value (100) as the vocabulary cap.
tokenizer = Tokenizer(num_words=max_len)
tokenizer.fit_on_texts(X_train)

In [3]:
# load test file and define descriptors (you will need the file to have all descriptors)
test_file = 'blockers_sampled.csv' # load your test set here
dataset = pd.read_csv(test_file, delimiter=',')

X_test_rdkit = dataset.iloc[:,1:120] # change indices if needed
X_test_morganfp = dataset.iloc[:,120:1144] # change indices if needed

# Encode the SMILES from a plain Python list rather than by writing into
# dataset's `.values` array: that array can be a view onto `dataset`, in which
# case the in-place rewrite would leak the space-separated, substituted strings
# into the SMILES column that consensus_df (and the saved CSV) is built from.
smiles_substitutions = [
    ("[nH]", "A"),
    ("Cl", "L"),
    ("Br", "R"),
    ("[C@]", "C"),
    ("[C@@]", "C"),
    ("[C@@H]", "C"),
]

X_test_smi = []
for smiles in dataset.iloc[:, 0].tolist():  # change index if needed
    for token, symbol in smiles_substitutions:
        smiles = smiles.replace(token, symbol)
    # Joining a string with " " puts a space between every character, so the
    # tokenizer treats each character as a separate word.
    X_test_smi.append(" ".join(smiles))

# `tokenizer` and `max_len` come from the training-data cell above.
X_test_smi = tokenizer.texts_to_sequences(X_test_smi)
X_test_smi = pad_sequences(X_test_smi, maxlen=max_len, padding='post')

# df for saving consensus predictions
# Explicit copy: later cells add prediction columns to this frame, which must
# not be a slice of `dataset`.
consensus_df = dataset.iloc[:, 0:1].copy()
consensus_df.shape

(3281, 1)

### Random Forest Predictions

In [4]:
# best performing rf model was based on rdkit descriptors
# NOTE(security): pickle.load can execute arbitrary code; only load model files
# you trust.
# The context manager closes the file even if unpickling fails.
with open('rf_rdkit.pkl', 'rb') as pkl_file:
    model = pickle.load(pkl_file)

y_pred = model.predict(X_test_rdkit)
# Second column of predict_proba — presumably the positive (blocker) class;
# confirm against model.classes_. Rounded to 2 decimals for the report.
y_pred_prob = model.predict_proba(X_test_rdkit)[:, 1]
y_pred_prob = np.round(y_pred_prob, 2)

consensus_df['rf_pred'] = y_pred.tolist()
consensus_df['rf_pred_prob'] = y_pred_prob.tolist()

### XGBoost Predictions

In [5]:
# best performing xgb model was based on rdkit descriptors
# NOTE(security): pickle.load can execute arbitrary code; only load model files
# you trust.
# The context manager closes the file even if unpickling fails.
with open('xgb_rdkit.pkl', 'rb') as pkl_file:
    model = pickle.load(pkl_file)

y_pred = model.predict(X_test_rdkit)
# Second column of predict_proba — presumably the positive (blocker) class;
# confirm against model.classes_. Rounded to 2 decimals for the report.
y_pred_prob = model.predict_proba(X_test_rdkit)[:, 1]
y_pred_prob = np.round(y_pred_prob, 2)

consensus_df['xgb_pred'] = y_pred.tolist()
consensus_df['xgb_pred_prob'] = y_pred_prob.tolist()

### DNN Predictions

In [6]:
# best performing dnn model was based on  morgan fingerprints
# NOTE(security): pickle.load can execute arbitrary code; only load model files
# you trust.
# The context manager closes the file even if unpickling fails.
with open('dnn_morganfp.pkl', 'rb') as pkl_file:
    model = pickle.load(pkl_file)

# Flatten the model output to one score per row, then derive the hard label
# (rounded to 0 decimals) and the reported probability (rounded to 2 decimals).
predictions = np.array(model.predict(X_test_morganfp)).ravel()
y_pred = np.round(predictions, 0)
y_pred_prob = np.round(predictions, 2)

consensus_df['dnn_pred'] = y_pred.tolist()
consensus_df['dnn_pred_prob'] = y_pred_prob.tolist()

### LSTM Predictions

In [7]:
# the best performing lstm model was based on sequential self attention scheme
# The custom_objects mapping lets Keras rebuild the SeqSelfAttention layer
# stored in the .h5 file.
model = load_model("lstm_attn_smiles.h5", custom_objects=SeqSelfAttention.get_custom_objects())

# Flatten the model output to one score per row, then derive the hard label
# (rounded to 0 decimals) and the reported probability (rounded to 2 decimals).
predictions = np.array(model.predict(X_test_smi)).ravel()
y_pred = np.round(predictions, 0)
y_pred_prob = np.round(predictions, 2)

consensus_df['lstm_pred'] = y_pred.tolist()
consensus_df['lstm_pred_prob'] = y_pred_prob.tolist()

# No file handle is opened in this cell (load_model takes a path), so there is
# nothing to close here.

### Save

In [8]:
# Consensus probability = unweighted mean of the four models' (already rounded)
# probabilities; the consensus label is 1 when that mean reaches 0.5, else 0.
prob_columns = ['rf_pred_prob', 'xgb_pred_prob', 'dnn_pred_prob', 'lstm_pred_prob']
consensus_df['consensus_pred_prob'] = consensus_df[prob_columns].mean(axis=1)

reaches_threshold = consensus_df['consensus_pred_prob'] >= 0.5
consensus_df['consensus_pred'] = np.where(reaches_threshold, 1, 0)

# save to file
# index=None is falsy, so pandas omits the row-index column (same effect as
# index=False).
consensus_df.to_csv('blockers_sampled_predictions.csv', sep=',', index=None)
consensus_df.shape

(3281, 11)