In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import  Dense, Bidirectional, LSTM, Dropout,Input, Embedding,GRU
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Model ,Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Precision, AUC, binary_accuracy, TrueNegatives, TruePositives, FalseNegatives, FalsePositives
from helper_functions import smi_tokenizer, mcc_metric
from rdkit import Chem
from rdkit.Chem import PandasTools
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight

In [2]:
# Load the B3DB classification table (tab-separated) and preview its first row.
data = pd.read_csv('F:/bbb/data/B3DB/B3DB_classification.tsv', sep='\t')
data.iloc[:1]

Unnamed: 0,NO.,compound_name,IUPAC_name,SMILES,CID,logBB,BBB+/BBB-,Inchi,threshold,reference,group,comments
0,1,sulphasalazine,2-hydroxy-5-[[4-(pyridin-2-ylsulfamoyl)phenyl]...,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,5339.0,-2.69,BBB-,InChI=1S/C18H14N4O5S/c23-16-9-6-13(11-15(16)18...,,R2|R2|R25|R46|,A,


In [3]:
# Model input: the SMILES string of every compound.
X = data['SMILES']

# Binary target. LabelEncoder numbers the classes in sorted order, so (assuming the
# column only holds 'BBB+' and 'BBB-') 'BBB+' becomes 0 and 'BBB-' becomes 1.
# Every "positive-class" metric reported later therefore refers to label 1.
target = data['BBB+/BBB-'].values.tolist()
label_encoder = LabelEncoder().fit(target)
y_classification = label_encoder.transform(target)

In [4]:
# Tokenize every SMILES string with the project helper; one result per compound, in order.
smile_data = list(map(smi_tokenizer, X))

In [5]:
# Stage 1: hold out 20% of the data as the test split (fixed seed for repeatability).
xtrain, xtest, ytrain, ytest = train_test_split(
    smile_data,
    y_classification,
    test_size=0.2,
    random_state=42,
)
# Stage 2: carve the validation split out of the remaining 80%.
# 16% of 80% is 12.8% of the data, leaving about 67.2% for training.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    xtrain,
    ytrain,
    test_size=0.16,
    random_state=42,
)

In [6]:
# Total number of encoded labels (one per row of the loaded table).
len(y_classification)

7807

In [7]:
# Class balance: the distinct label codes and how many samples carry each one.
np.unique(y_classification, return_counts=True)

(array([0, 1], dtype=int64), array([4956, 2851], dtype=int64))

In [8]:
# Per-class weights that compensate for the class imbalance in the training split.
# 'balanced' weights each class by n_samples / (n_classes * class_count).
# Arguments are passed by keyword: newer scikit-learn releases make them keyword-only
# and reject the positional form.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(ytrain),
    y=ytrain,
)



In [9]:
# Show the current value of class_weights.
class_weights

array([0.79038577, 1.36092372])

In [10]:
# Keras expects class_weight as a {class_index: weight} mapping.
# The mapping is derived from the current training split rather than typed in by hand,
# so it cannot drift from the data when the split changes. It is recomputed here
# (not read from the existing class_weights variable) so re-running this cell is safe.
_classes = np.unique(ytrain)
_balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=_classes,
    y=ytrain,
)
class_weights = {
    int(label): float(weight)
    for label, weight in zip(_classes, _balanced_weights)
}

In [11]:
# Maps whitespace-separated token strings to integer ids.
# standardize=None switches off the layer's default lower-casing and punctuation
# stripping, so tokens are kept exactly as the tokenizer produced them.
# max_tokens=None leaves the vocabulary uncapped; output_sequence_length=None leaves
# the output length unfixed.
text_vectorizer = TextVectorization(
    standardize=None,
    max_tokens=None,
    output_sequence_length=None,
)

In [12]:
# Learn the token vocabulary from the tokenized SMILES strings.
# NOTE(review): smile_data is the full dataset, so the vocabulary is learned from the
# train, validation and test samples alike; adapt on xtrain only if strict isolation
# of the held-out splits matters.
text_vectorizer.adapt(smile_data)
# The learned token list; its length is used as the Embedding input_dim in the model.
word_in_vocab = text_vectorizer.get_vocabulary()
len(word_in_vocab)

67

In [13]:
# End-to-end classifier: a raw token string goes in, the probability of label 1 comes out.
# The input tensor is called `inputs` so the Python builtin `input` is not shadowed.
inputs = Input(shape=(1,), dtype='string')

# String -> integer token ids -> 128-dimensional token embeddings.
x = text_vectorizer(inputs)
x = Embedding(input_dim=len(word_in_vocab), output_dim=128)(x)

# Three stacked bidirectional LSTMs. Each inner layer gets its own name so the layers
# can be told apart in summaries and saved artifacts (all three were named 'lstm1').
# The first two return the full sequence; the last returns only its final state.
x = Bidirectional(LSTM(64, return_sequences=True, name='lstm1'))(x)
x = Bidirectional(LSTM(64, return_sequences=True, name='lstm2'))(x)
x = Dropout(0.2)(x)
x = Bidirectional(LSTM(64, return_sequences=False, name='lstm3'))(x)
x = Dropout(0.2)(x)

# Dense head ending in a single sigmoid unit for binary classification.
x = Dense(256, activation='relu')(x)
x = Dense(10, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)
model = Model(inputs=inputs, outputs=output)


In [14]:
def save(i):
    """Return the checkpoint path for the model run identified by ``i``.

    ``i`` is converted with ``str`` and inserted into the file name.
    """
    return f'F:/bbb/model/nlp/model_{i!s}.tf'

# Keep only the best model seen so far, judged by validation AUC (higher is better).
# save_weights_only=False stores the whole model, not just its weights.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    save(33),
    monitor='val_auc', verbose=1,
    save_best_only=True, save_weights_only=False, mode='max')
callbacks_list = [checkpoint]

# The AUC metric is named explicitly. Left unnamed, Keras auto-numbers new metric
# instances ('auc', then 'auc_1', ...) when this cell is re-run in the same kernel, and
# the checkpoint's 'val_auc' monitor would then no longer match any logged metric.
# The metric order is unchanged because model.evaluate() returns values positionally:
# [loss, binary_accuracy, auc, mcc_metric, precision, TN, TP, FN, FP].
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=BinaryCrossentropy(),
    metrics=[binary_accuracy, AUC(name='auc'), mcc_metric,
             Precision(), TrueNegatives(), TruePositives(), FalseNegatives(), FalsePositives()])

# Train with class weighting; keep the History object so per-epoch metrics stay
# available after training instead of being discarded as cell output.
history = model.fit(
    x=np.array(xtrain), y=np.array(ytrain),
    validation_data=(np.array(xvalid), np.array(yvalid)),
    epochs=60, batch_size=32,
    callbacks=callbacks_list, class_weight=class_weights)


Epoch 1/60
Epoch 00001: val_auc improved from -inf to 0.84196, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 2/60
Epoch 00002: val_auc improved from 0.84196 to 0.84921, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 3/60
Epoch 00003: val_auc improved from 0.84921 to 0.87295, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 4/60
Epoch 00004: val_auc improved from 0.87295 to 0.87616, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 5/60
Epoch 00005: val_auc improved from 0.87616 to 0.87979, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 6/60
Epoch 00006: val_auc did not improve from 0.87979
Epoch 7/60
Epoch 00007: val_auc did not improve from 0.87979
Epoch 8/60
Epoch 00008: val_auc improved from 0.87979 to 0.88954, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 9/60
Epoch 00009: val_auc did not improve from 0.88954
Epoch 10/60
Epoch 00010: val_auc improved from 0.88954 to 0.89941, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 11/60
Epoch 00011: val_auc improved from 0.89941 to 0.90780, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 12/60
Epoch 00012: val_auc did not improve from 0.90780
Epoch 13/60
Epoch 00013: val_auc did not improve from 0.90780
Epoch 14/60
Epoch 00014: val_auc did not improve from 0.90780
Epoch 15/60
Epoch 00015: val_auc improved from 0.90780 to 0.92005, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 16/60
Epoch 00016: val_auc improved from 0.92005 to 0.92132, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 17/60
Epoch 00017: val_auc did not improve from 0.92132
Epoch 18/60
Epoch 00018: val_auc did not improve from 0.92132
Epoch 19/60
Epoch 00019: val_auc did not improve from 0.92132
Epoch 20/60
Epoch 00020: val_auc improved from 0.92132 to 0.93329, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 21/60
Epoch 00021: val_auc did not improve from 0.93329
Epoch 22/60
Epoch 00022: val_auc did not improve from 0.93329
Epoch 23/60
Epoch 00023: val_auc did not improve from 0.93329
Epoch 24/60
Epoch 00024: val_auc did not improve from 0.93329
Epoch 25/60
Epoch 00025: val_auc did not improve from 0.93329
Epoch 26/60
Epoch 00026: val_auc did not improve from 0.93329
Epoch 27/60
Epoch 00027: val_auc did not improve from 0.93329
Epoch 28/60
Epoch 00028: val_auc did not improve from 0.93329
Epoch 29/60
Epoch 00029: val_auc improved from 0.93329 to 0.93457, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 30/60
Epoch 00030: val_auc did not improve from 0.93457
Epoch 31/60
Epoch 00031: val_auc did not improve from 0.93457
Epoch 32/60
Epoch 00032: val_auc did not improve from 0.93457
Epoch 33/60
Epoch 00033: val_auc did not improve from 0.93457
Epoch 34/60
Epoch 00034: val_auc improved from 0.93457 to 0.93590, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 35/60
Epoch 00035: val_auc improved from 0.93590 to 0.93916, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 36/60
Epoch 00036: val_auc did not improve from 0.93916
Epoch 37/60
Epoch 00037: val_auc did not improve from 0.93916
Epoch 38/60
Epoch 00038: val_auc did not improve from 0.93916
Epoch 39/60
Epoch 00039: val_auc did not improve from 0.93916
Epoch 40/60
Epoch 00040: val_auc did not improve from 0.93916
Epoch 41/60
Epoch 00041: val_auc did not improve from 0.93916
Epoch 42/60
Epoch 00042: val_auc did not improve from 0.93916
Epoch 43/60
Epoch 00043: val_auc did not improve from 0.93916
Epoch 44/60
Epoch 00044: val_auc did not improve from 0.93916
Epoch 45/60
Epoch 00045: val_auc did not improve from 0.93916
Epoch 46/60
Epoch 00046: val_auc did not improve from 0.93916
Epoch 47/60
Epoch 00047: val_auc did not improve from 0.93916
Epoch 48/60
Epoch 00048: val_auc did not improve from 0.93916
Epoch 49/60
Epoch 00049: val_auc did not improve from 0.93916
Epoch 50/60
Epoch 00050: val_auc did not improve from 0.93916
Epoch 51/60
Epoch 00051: val_auc did not improve from 0.93916
Epoch 52

<keras.callbacks.History at 0x1f250fd8670>

In [15]:
# Restore the weights from the checkpoint path used during training, then score the
# held-out test split. evaluate() returns the loss followed by the compiled metrics in
# order: binary_accuracy, AUC, mcc_metric, Precision, TN, TP, FN, FP.
model.load_weights(filepath=save(33))
model.evaluate(x=np.array(xtest), y=np.array(ytest))



[0.4952695071697235,
 0.8405889868736267,
 0.9156013131141663,
 0.6659770011901855,
 0.7392638325691223,
 831.0,
 482.0,
 79.0,
 170.0]

In [16]:
# Score the validation split with the same metric list.
# The checkpoint was chosen by validation AUC, so these figures are optimistic compared
# with the test split.
model.evaluate(x=np.array(xvalid), y=np.array(yvalid))



[0.3782069683074951,
 0.8700000047683716,
 0.9391647577285767,
 0.7307599186897278,
 0.7934508919715881,
 555.0,
 315.0,
 48.0,
 82.0]

In [17]:
# Record the TensorFlow version this notebook was run with.
tf.__version__

'2.7.0'