In [1]:
# Record the GPU, driver and CUDA version visible on this machine before training.
!nvidia-smi

Tue Feb 15 07:14:19 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 462.30       Driver Version: 462.30       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 165... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   48C    P8     2W /  N/A |    134MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import  Dense, Bidirectional, LSTM, Dropout,Input, Embedding
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Model 
from tensorflow.keras.metrics import Precision, AUC, binary_accuracy, TrueNegatives, TruePositives, FalseNegatives, FalsePositives
from helper_functions import mcc_metric
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from transformers import AutoTokenizer

In [3]:
# Load the B3DB classification table (tab-separated) and preview its first row.
# NOTE(review): absolute, machine-specific path - consider a configurable DATA_DIR.
b3db_tsv_path = 'F:/bbb/data/B3DB/B3DB_classification.tsv'
data = pd.read_csv(b3db_tsv_path, sep='\t')
data.head(1)

Unnamed: 0,NO.,compound_name,IUPAC_name,SMILES,CID,logBB,BBB+/BBB-,Inchi,threshold,reference,group,comments
0,1,sulphasalazine,2-hydroxy-5-[[4-(pyridin-2-ylsulfamoyl)phenyl]...,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,5339.0,-2.69,BBB-,InChI=1S/C18H14N4O5S/c23-16-9-6-13(11-15(16)18...,,R2|R2|R25|R46|,A,


In [4]:
# Model input: the raw SMILES string of every compound.
X = data["SMILES"]

# Binary target. LabelEncoder assigns integer codes in sorted label order, so
# 'BBB+' becomes 0 and 'BBB-' becomes 1: the "positive" class seen by
# Precision / TruePositives etc. is therefore BBB-.
target = data['BBB+/BBB-'].tolist()
label_encoder = LabelEncoder().fit(target)
y_classification = label_encoder.transform(target)

In [5]:
# Pretrained ChemBERTa tokenizer; only its vocabulary / tokenisation is used here.
# `from_pt` is a flag of the TF *model* loaders (convert PyTorch weights); a
# tokenizer has no weights, so the flag was merely forwarded as an unused init
# kwarg and has been dropped.
tokenizer = AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MLM')

In [6]:
# Tokenise every SMILES string and keep only the token ids.
# padding=True pads all rows to the longest sequence, so the result is rectangular;
# the attention mask produced by the tokenizer is discarded.
smiles_encoding = tokenizer(X.tolist(), truncation=True, padding=True)
smiles_data = smiles_encoding['input_ids']

In [7]:
# Two-stage hold-out split with a fixed seed for repeatability:
#   1) 20 % of all rows are set aside as the test set;
#   2) 16 % of the remaining rows become the validation set
#      (0.16 * 0.80 = 12.8 % of all rows), the rest is used for training.
SPLIT_SEED = 42
TEST_FRACTION = 0.2
VALID_FRACTION = 0.16

xtrain, xtest, ytrain, ytest = train_test_split(
    smiles_data, y_classification,
    test_size=TEST_FRACTION, random_state=SPLIT_SEED,
)
xtrain, xvalid, ytrain, yvalid = train_test_split(
    xtrain, ytrain,
    test_size=VALID_FRACTION, random_state=SPLIT_SEED,
)

In [8]:
# Total number of labelled compounds (before any train/validation/test split).
len(y_classification)

7807

In [9]:
# Class balance of the encoded labels: returns (unique codes, count of each code).
np.unique(y_classification, return_counts=True)

(array([0, 1], dtype=int64), array([4956, 2851], dtype=int64))

In [10]:
# "Balanced" weights, n_samples / (n_classes * count(class)), computed on the
# training labels only. `classes` and `y` are keyword-only in current
# scikit-learn (positional use was deprecated in 0.24 and raises from 1.0).
# The result is an array ordered like np.unique(ytrain).
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(ytrain),
    y=ytrain)



In [11]:
# Keras expects `class_weight` as a {class_index: weight} dict.
# Derive it from the *current* training labels instead of pasting numbers from
# an earlier run: the previous literals (0.78950695 / 1.36353712) equal
# 6245/(2*3955) and 6245/(2*2290), i.e. they had been computed on the
# 6245-row train+validation pool, not on the final training split.
# Recomputing here also keeps the cell idempotent when it is re-run.
_train_classes = np.unique(ytrain)
_balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=_train_classes,
    y=ytrain)
class_weights = {
    int(label): float(weight)
    for label, weight in zip(_train_classes, _balanced_weights)
}

In [12]:
# Token -> id mapping of the tokenizer; its size is what the model cell passes
# to the Embedding layer as input_dim.
word_in_vocab = tokenizer.vocab
len(word_in_vocab)

593

In [13]:
# Length of one tokenised training example; the tokenizer was called with
# padding=True, so this is the padded sequence length fed to the model.
np.array(xtrain[0]).shape

(244,)

In [14]:
# --- Stacked bidirectional-LSTM classifier over token-id sequences -----------
# Sequence length is read from the padded data rather than hard-coded, so the
# model still matches the inputs if the tokenizer or dataset changes.
sequence_length = len(xtrain[0])

# Named `inputs` (not `input`) so the Python builtin is not shadowed for the
# rest of the kernel session.
inputs = Input(shape=(sequence_length,))

# Learned 128-d embedding per token id. No mask is configured, so padded
# positions are processed by the LSTMs like any other token.
x = Embedding(input_dim=len(word_in_vocab), output_dim=128)(inputs)

# Three BiLSTM layers (64 units per direction). Each gets its own name; the
# original copy-pasted 'lstm1' onto all three. The first two return the full
# sequence so the next recurrent layer can consume it; the last one returns
# only its final state.
x = Bidirectional(LSTM(64, return_sequences=True, name='lstm1'))(x)
x = Bidirectional(LSTM(64, return_sequences=True, name='lstm2'))(x)
x = Dropout(0.2)(x)
x = Bidirectional(LSTM(64, return_sequences=False, name='lstm3'))(x)
x = Dropout(0.2)(x)

# Dense head ending in a single sigmoid unit: probability of class 1.
x = Dense(256, activation='relu')(x)
x = Dense(10, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)
model = Model(inputs=inputs, outputs=output)


In [15]:
def save(i):
    """Return the checkpoint path used for the model with run identifier ``i``.

    The identifier is interpolated into a fixed directory/file pattern,
    e.g. ``save(33)`` -> ``'F:/bbb/model/nlp/model_33.tf'``.
    """
    return f'F:/bbb/model/nlp/model_{i}.tf'

# Keep only the checkpoint with the highest validation AUC (full model is saved,
# not just the weights).
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    save(33),
    monitor='val_auc', verbose=1,
    save_best_only=True, save_weights_only=False, mode='max')
callbacks_list = [checkpoint]

# The AUC metric is named explicitly: Keras auto-numbers unnamed metric
# instances ('auc', 'auc_1', ...) within a session, so re-running this cell
# would otherwise log 'val_auc_1' and the checkpoint monitor above would never
# find 'val_auc'.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=BinaryCrossentropy(),
    metrics=[binary_accuracy, AUC(name='auc'), mcc_metric,
             Precision(), TrueNegatives(), TruePositives(),
             FalseNegatives(), FalsePositives()])

# class_weight re-weights the loss per class to compensate for the label imbalance.
model.fit(
    x=np.array(xtrain), y=np.array(ytrain),
    validation_data=(np.array(xvalid), np.array(yvalid)),
    epochs=60, batch_size=32,
    callbacks=callbacks_list, class_weight=class_weights)


Epoch 1/60
Epoch 00001: val_auc improved from -inf to 0.84680, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 2/60
Epoch 00002: val_auc improved from 0.84680 to 0.84814, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 3/60
Epoch 00003: val_auc improved from 0.84814 to 0.85681, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 4/60
Epoch 00004: val_auc improved from 0.85681 to 0.85865, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 5/60
Epoch 00005: val_auc improved from 0.85865 to 0.87870, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 6/60
Epoch 00006: val_auc improved from 0.87870 to 0.88336, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 7/60
Epoch 00007: val_auc did not improve from 0.88336
Epoch 8/60
Epoch 00008: val_auc improved from 0.88336 to 0.89205, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 9/60
Epoch 00009: val_auc did not improve from 0.89205
Epoch 10/60
Epoch 00010: val_auc improved from 0.89205 to 0.90013, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 11/60
Epoch 00011: val_auc did not improve from 0.90013
Epoch 12/60
Epoch 00012: val_auc improved from 0.90013 to 0.90335, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 13/60
Epoch 00013: val_auc did not improve from 0.90335
Epoch 14/60
Epoch 00014: val_auc improved from 0.90335 to 0.91003, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 15/60
Epoch 00015: val_auc did not improve from 0.91003
Epoch 16/60
Epoch 00016: val_auc did not improve from 0.91003
Epoch 17/60
Epoch 00017: val_auc improved from 0.91003 to 0.91690, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 18/60
Epoch 00018: val_auc did not improve from 0.91690
Epoch 19/60
Epoch 00019: val_auc improved from 0.91690 to 0.92026, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 20/60
Epoch 00020: val_auc did not improve from 0.92026
Epoch 21/60
Epoch 00021: val_auc improved from 0.92026 to 0.92511, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 22/60
Epoch 00022: val_auc improved from 0.92511 to 0.92783, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 23/60
Epoch 00023: val_auc did not improve from 0.92783
Epoch 24/60
Epoch 00024: val_auc did not improve from 0.92783
Epoch 25/60
Epoch 00025: val_auc did not improve from 0.92783
Epoch 26/60
Epoch 00026: val_auc did not improve from 0.92783
Epoch 27/60
Epoch 00027: val_auc improved from 0.92783 to 0.93589, saving model to F:/bbb/model/nlp\model_33.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33.tf\assets


Epoch 28/60
Epoch 00028: val_auc did not improve from 0.93589
Epoch 29/60
Epoch 00029: val_auc did not improve from 0.93589
Epoch 30/60
Epoch 00030: val_auc did not improve from 0.93589
Epoch 31/60
Epoch 00031: val_auc did not improve from 0.93589
Epoch 32/60
Epoch 00032: val_auc did not improve from 0.93589
Epoch 33/60
Epoch 00033: val_auc did not improve from 0.93589
Epoch 34/60
Epoch 00034: val_auc did not improve from 0.93589
Epoch 35/60
Epoch 00035: val_auc did not improve from 0.93589
Epoch 36/60
Epoch 00036: val_auc did not improve from 0.93589
Epoch 37/60
Epoch 00037: val_auc did not improve from 0.93589
Epoch 38/60
Epoch 00038: val_auc did not improve from 0.93589
Epoch 39/60
Epoch 00039: val_auc did not improve from 0.93589
Epoch 40/60
Epoch 00040: val_auc did not improve from 0.93589
Epoch 41/60
Epoch 00041: val_auc did not improve from 0.93589
Epoch 42/60
Epoch 00042: val_auc did not improve from 0.93589
Epoch 43/60
Epoch 00043: val_auc did not improve from 0.93589
Epoch 44

<keras.callbacks.History at 0x1575ed294c0>

In [16]:
# Restore the best checkpoint written during training, then score the held-out
# test set. evaluate() returns the loss followed by the compiled metrics in the
# order they were passed to compile().
best_checkpoint_path = save(33)
model.load_weights(best_checkpoint_path)

test_inputs = np.array(xtest)
test_labels = np.array(ytest)
model.evaluate(test_inputs, test_labels)



[0.4268687963485718,
 0.8201024532318115,
 0.9147198796272278,
 0.6360712051391602,
 0.7034883499145508,
 797.0,
 484.0,
 77.0,
 204.0]

In [17]:
# Score the validation set with the currently loaded weights, for comparison
# with the test-set numbers.
valid_inputs = np.array(xvalid)
valid_labels = np.array(yvalid)
model.evaluate(valid_inputs, valid_labels)



[0.35811635851860046,
 0.8429999947547913,
 0.9358888268470764,
 0.6884199976921082,
 0.7309417128562927,
 517.0,
 326.0,
 37.0,
 120.0]

In [18]:
# Record the TensorFlow version the results above were produced with.
tf.__version__

'2.7.0'