In [1]:
# Print the NVIDIA driver status table (driver/CUDA version, visible GPU, memory use).
!nvidia-smi

Tue Feb 15 09:56:47 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 462.30       Driver Version: 462.30       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 165... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   51C    P8     3W /  N/A |    134MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import  Dense, Bidirectional, LSTM, Dropout,Input, Embedding
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Model 
from tensorflow.keras.metrics import Precision, AUC, binary_accuracy, TrueNegatives, TruePositives, FalseNegatives, FalsePositives
from helper_functions import mcc_metric
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from transformers import AutoTokenizer

In [3]:
# Read the tab-separated B3DB classification table and preview its first row.
b3db_tsv_path = 'F:/bbb/data/B3DB/B3DB_classification.tsv'
data = pd.read_csv(b3db_tsv_path, sep='\t')
data.head(1)

Unnamed: 0,NO.,compound_name,IUPAC_name,SMILES,CID,logBB,BBB+/BBB-,Inchi,threshold,reference,group,comments
0,1,sulphasalazine,2-hydroxy-5-[[4-(pyridin-2-ylsulfamoyl)phenyl]...,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,5339.0,-2.69,BBB-,InChI=1S/C18H14N4O5S/c23-16-9-6-13(11-15(16)18...,,R2|R2|R25|R46|,A,


In [4]:
# Rows without an IUPAC name cannot be tokenized, so drop them first.
data_new = data.dropna(subset=['IUPAC_name'])
X = data_new['IUPAC_name']

# Turn the string labels into integers. LabelEncoder numbers the classes in
# sorted order, so 'BBB+' becomes 0 and 'BBB-' becomes 1; the "positive"
# class seen by the Keras metrics later on is therefore 'BBB-'.
label_encoder = LabelEncoder()
target = data_new['BBB+/BBB-'].tolist()
y_classification = label_encoder.fit_transform(target)

In [5]:
# Load the tokenizer published with the IUPAC BERT checkpoint.
# `from_pt` was dropped: it is an option for loading model weights from a
# PyTorch checkpoint and has no meaning for a tokenizer.
tokenizer = AutoTokenizer.from_pretrained('gumgo91/IUPAC_BERT')

In [6]:
# Tokenize every IUPAC name into a fixed-width row of token ids.
# padding='max_length' (rather than padding=True, which only pads to the
# longest sample in the batch) guarantees each row is exactly 256 ids wide,
# which is the width the model's Input layer is declared with.
iupac_encoding = tokenizer(list(X), truncation=True, padding='max_length', max_length=256)
# Only the token ids are used downstream; the attention mask is discarded.
iupac_data = iupac_encoding['input_ids']

In [7]:
# Two-stage split with a fixed seed (train_test_split is imported in the
# imports cell at the top of the notebook, so it is not re-imported here):
#   1. hold out 20% of all samples as the test set;
#   2. take 16% of the remaining 80% (12.8% of the total) as the validation
#      set, leaving 67.2% of the total for training.
xtrain, xtest, ytrain, ytest = train_test_split(
    iupac_data, y_classification, test_size=0.2, random_state=42
)
xtrain, xvalid, ytrain, yvalid = train_test_split(
    xtrain, ytrain, test_size=0.16, random_state=42
)

In [8]:
# Number of labelled samples left after dropping rows without an IUPAC name.
len(y_classification)

6170

In [9]:
# Class balance check: the distinct encoded labels and how many samples carry each.
np.unique(y_classification, return_counts=True)

(array([0, 1], dtype=int64), array([3992, 2178], dtype=int64))

In [10]:
# Per-class weights that counteract the class imbalance in the training labels.
# Arguments are passed by keyword: scikit-learn deprecated the positional form
# in 0.24 and rejects it from 1.0 onwards.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(ytrain),
    y=ytrain,
)



In [11]:
# Token -> id mapping exposed by the tokenizer; display how many entries it has.
# Its length is what the Embedding layer below uses as input_dim.
word_in_vocab = tokenizer.vocab
len(word_in_vocab)

1500

In [12]:
# Sanity check: shape of a single tokenized training sample.
np.shape(xtrain[0])

(256,)

In [13]:
# Sequence classifier over tokenized IUPAC names:
# embedding -> three stacked bidirectional LSTMs -> dense head -> sigmoid.
# The input tensor is called `inputs` so the Python builtin `input` is not
# shadowed for the rest of the notebook session.
inputs = Input(shape=(256,))  # one row of 256 token ids per sample

# input_dim is the tokenizer vocabulary size; each token id maps to a 256-d vector.
x = Embedding(input_dim=len(word_in_vocab), output_dim=256)(inputs)

# Each recurrent layer gets its own name (all three used to be called 'lstm1',
# a copy-paste leftover that made the layers indistinguishable by name).
x = Bidirectional(LSTM(64, return_sequences=True, name='lstm1'))(x)
x = Bidirectional(LSTM(64, return_sequences=True, name='lstm2'))(x)
x = Dropout(0.2)(x)
# The last recurrent layer emits only its final output, giving one vector per sample.
x = Bidirectional(LSTM(64, return_sequences=False, name='lstm3'))(x)
x = Dropout(0.2)(x)

x = Dense(256, activation='relu')(x)
x = Dense(10, activation='relu')(x)
# Single sigmoid unit: probability of the class encoded as 1.
output = Dense(1, activation='sigmoid')(x)
model = Model(inputs=inputs, outputs=output)


In [14]:
# Display the per-class weights computed from the training labels.
class_weights

array([0.77582335, 1.4063772 ])

In [15]:
# Keras expects class weights as a {class_index: weight} mapping. Build it from
# the array computed earlier instead of pasting rounded numbers from a previous
# run's output, so it stays correct if the data or the split changes.
# compute_class_weight returns the weights in the order of the sorted class
# labels (0, 1, ...), so the array position is the class index.
# The isinstance guard keeps this cell safe to re-run once the name already
# refers to the dict.
if not isinstance(class_weights, dict):
    class_weights = {
        class_index: float(weight)
        for class_index, weight in enumerate(class_weights)
    }
def save(i):
    """Return the model checkpoint path whose file name embeds the identifier ``i``."""
    return f'F:/bbb/model/nlp/model_{i}iupac.tf'

# Keep only the best model seen during training, judged by validation AUC.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    save(33),
    monitor='val_auc',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='max',
)
callbacks_list = [checkpoint]

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=BinaryCrossentropy(),
    metrics=[
        binary_accuracy,
        # Pin the metric name. An unnamed AUC() is auto-numbered ('auc',
        # 'auc_1', ...) each time one is created in the same session, so on a
        # re-run of this cell the checkpoint's 'val_auc' monitor would no
        # longer match any logged metric and nothing would be saved.
        AUC(name='auc'),
        mcc_metric,
        Precision(),
        TrueNegatives(),
        TruePositives(),
        FalseNegatives(),
        FalsePositives(),
    ],
)

# Train with class weights so the minority class contributes proportionally
# more to the loss; the fit call stays last so its History is the cell output.
model.fit(
    x=np.array(xtrain),
    y=np.array(ytrain),
    validation_data=(np.array(xvalid), np.array(yvalid)),
    epochs=60,
    batch_size=32,
    callbacks=callbacks_list,
    class_weight=class_weights,
)

Epoch 1/60
Epoch 00001: val_auc improved from -inf to 0.86767, saving model to F:/bbb/model/nlp\model_33iupac.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33iupac.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33iupac.tf\assets


Epoch 2/60
Epoch 00002: val_auc improved from 0.86767 to 0.89508, saving model to F:/bbb/model/nlp\model_33iupac.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33iupac.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33iupac.tf\assets


Epoch 3/60
Epoch 00003: val_auc improved from 0.89508 to 0.89968, saving model to F:/bbb/model/nlp\model_33iupac.tf




INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33iupac.tf\assets


INFO:tensorflow:Assets written to: F:/bbb/model/nlp\model_33iupac.tf\assets


Epoch 4/60
Epoch 00004: val_auc did not improve from 0.89968
Epoch 5/60
Epoch 00005: val_auc did not improve from 0.89968
Epoch 6/60
Epoch 00006: val_auc did not improve from 0.89968
Epoch 7/60
Epoch 00007: val_auc did not improve from 0.89968
Epoch 8/60
Epoch 00008: val_auc did not improve from 0.89968
Epoch 9/60
Epoch 00009: val_auc did not improve from 0.89968
Epoch 10/60
Epoch 00010: val_auc did not improve from 0.89968
Epoch 11/60
Epoch 00011: val_auc did not improve from 0.89968
Epoch 12/60
Epoch 00012: val_auc did not improve from 0.89968
Epoch 13/60
Epoch 00013: val_auc did not improve from 0.89968
Epoch 14/60
Epoch 00014: val_auc did not improve from 0.89968
Epoch 15/60
Epoch 00015: val_auc did not improve from 0.89968
Epoch 16/60
Epoch 00016: val_auc did not improve from 0.89968
Epoch 17/60
Epoch 00017: val_auc did not improve from 0.89968
Epoch 18/60
Epoch 00018: val_auc did not improve from 0.89968
Epoch 19/60
Epoch 00019: val_auc did not improve from 0.89968
Epoch 20/60
Ep

<keras.callbacks.History at 0x1d0bff294c0>

In [16]:
# Restore the best checkpoint written during training, then score the held-out
# test split. The returned list follows the compile order: loss,
# binary_accuracy, AUC, mcc_metric, precision, true negatives, true positives,
# false negatives, false positives.
best_checkpoint_path = save(33)
model.load_weights(best_checkpoint_path)
model.evaluate(x=np.array(xtest), y=np.array(ytest))



[0.38693147897720337,
 0.8119935393333435,
 0.9018661379814148,
 0.6096577048301697,
 0.699999988079071,
 645.0,
 357.0,
 79.0,
 153.0]

In [17]:
# Score the validation split with the currently loaded weights; the values come
# back in the same order as the metrics passed to compile (loss first).
model.evaluate(x=np.array(xvalid), y=np.array(yvalid))



[0.3851379454135895,
 0.8113924264907837,
 0.8996754884719849,
 0.600176215171814,
 0.690095841884613,
 425.0,
 216.0,
 52.0,
 97.0]