In [1]:
!pip install transformers

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from transformers import TFAutoModel, RobertaTokenizer
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint

In [3]:
# Read the three data splits and the label table, in that order.
train, valid, test, labels = map(
    pd.read_csv,
    (
        '../input/druginteraction/train.csv',
        '../input/druginteraction/valid.csv',
        '../input/druginteraction/test.csv',
        '../input/druginteraction/labels.csv',
    ),
)

In [4]:
# Token length passed as `max_length` when sequences are encoded in `tokenize`.
seq_len = 128

# Row counts of the three splits.
num_sample_train, num_sample_valid, num_sample_test = map(len, (train, valid, test))

# From every split, pull out the two drug columns and the label column.
xtrain_drug1, xtrain_drug2, xtrain_y = (train[col] for col in ('Drug1', 'Drug2', 'label'))

xtest_drug1, xtest_drug2, xtest_y = (test[col] for col in ('Drug1', 'Drug2', 'label'))

xvalid_drug1, xvalid_drug2, xvalid_y = (valid[col] for col in ('Drug1', 'Drug2', 'label'))

In [5]:
# Load the tokenizer published with the same checkpoint the encoders use.
tokenizer_checkpoint = 'DeepChem/ChemBERTa-77M-MLM'
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_checkpoint)

In [6]:
def tokenize(sentences, tokenizer, max_length=None):
    """Encode strings into fixed-length token-id and attention-mask arrays.

    Args:
        sentences: Iterable of strings; each one is encoded independently.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_length: Length every sequence is padded or truncated to. When
            omitted, the notebook-level ``seq_len`` is used.

    Returns:
        A ``(input_ids, attention_masks)`` tuple of ``int32`` NumPy arrays
        holding one entry per input sentence.
    """
    if max_length is None:
        max_length = seq_len

    input_ids, input_masks = [], []
    for sentence in sentences:
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
        # and `truncation=True` requests the cut-off at `max_length` explicitly,
        # so every row has the same length and the arrays stay rectangular.
        inputs = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
        )
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])

    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32')

In [7]:
# Encode straight from the source DataFrames rather than from the `x*_drug*`
# names: those names are rebound to token-id arrays right here, so reading
# them back as input would break this cell when it is executed a second time.
xtrain_drug1, xtrain_drug1_mask = tokenize(list(train['Drug1']), tokenizer=tokenizer)
xtrain_drug2, xtrain_drug2_mask = tokenize(list(train['Drug2']), tokenizer=tokenizer)

xvalid_drug1, xvalid_drug1_mask = tokenize(list(valid['Drug1']), tokenizer=tokenizer)
xvalid_drug2, xvalid_drug2_mask = tokenize(list(valid['Drug2']), tokenizer=tokenizer)

xtest_drug1, xtest_drug1_mask = tokenize(list(test['Drug1']), tokenizer=tokenizer)
xtest_drug2, xtest_drug2_mask = tokenize(list(test['Drug2']), tokenizer=tokenizer)

In [8]:
# Size of the classifier output: one unit per row of the labels table.
output_shape = len(labels)
# Size of the second axis of the encoded training token-id array.
input_shape = xtrain_drug1.shape[1]

In [9]:
# Two independent encoder instances (one per drug input), both initialised
# from the same checkpoint and converted from its PyTorch weights.
Drug1_model, Drug2_model = (
    TFAutoModel.from_pretrained('DeepChem/ChemBERTa-77M-MLM', from_pt=True)
    for _ in range(2)
)

In [10]:
def model(input_shape, output_shape):
    """Assemble the two-branch drug-pair classifier.

    Each drug is encoded by its own pretrained encoder (``Drug1_model`` /
    ``Drug2_model``) and passed through a small dense stack; the two branches
    are then concatenated and fed to a dense classification head.

    Args:
        input_shape: Length of the token-id and attention-mask sequences.
        output_shape: Number of units in the final softmax layer.

    Returns:
        An uncompiled Keras ``Model`` whose inputs are, in order,
        ``input_ids_drug1``, ``input_ids_drug2``, ``input_mask_drug1`` and
        ``input_mask_drug2``.
    """
    def relu_stack(tensor, widths):
        # Chain one ReLU-activated Dense layer per entry of `widths`.
        for units in widths:
            tensor = layers.Dense(units, activation='relu')(tensor)
        return tensor

    ids_first = layers.Input(shape=(input_shape,), dtype='int32', name="input_ids_drug1")
    ids_second = layers.Input(shape=(input_shape,), dtype='int32', name="input_ids_drug2")

    mask_first = layers.Input(shape=(input_shape,), name="input_mask_drug1")
    mask_second = layers.Input(shape=(input_shape,), name="input_mask_drug2")

    # Element 1 of each encoder's output is used as the per-drug embedding.
    # NOTE(review): presumably this is the pooled output; the checkpoint name
    # suggests an MLM checkpoint, which may not ship trained pooler weights.
    # Confirm against the loading warnings.
    embedding_first = Drug1_model([ids_first, mask_first])[1]
    embedding_second = Drug2_model([ids_second, mask_second])[1]

    # Per-drug branches, built in the same order for both drugs.
    branch_first = relu_stack(embedding_first, (1024, 512))
    branch_second = relu_stack(embedding_second, (1024, 512))

    # Joint head over the concatenated branches.
    merged = layers.Concatenate()([branch_first, branch_second])
    hidden = relu_stack(merged, (1024, 512, 256))
    probabilities = layers.Dense(output_shape, activation='softmax')(hidden)

    return Model(
        inputs=[ids_first, ids_second, mask_first, mask_second],
        outputs=probabilities,
    )

In [11]:
model_train = model(input_shape, output_shape)

# Freeze both pretrained encoders so that only the dense layers are trained.
# The encoders are referenced directly instead of through positions in
# `model_train.layers`: positional indices would silently point at different
# layers if the architecture in `model` were ever changed.
Drug1_model.trainable = False
Drug2_model.trainable = False

model_train.summary()

In [12]:
# Draw the layer graph of the assembled network; the result is the cell output.
plot_model(model=model_train)

In [13]:
# Labels are passed as integer class ids, hence the "sparse" loss and metric.
loss = keras.losses.SparseCategoricalCrossentropy()
# Named 'accuracy' so the validation value is reported as 'val_accuracy'.
metrics = keras.metrics.SparseCategoricalAccuracy(name='accuracy')
optimizer = keras.optimizers.Adam(learning_rate=1e-3)

model_train.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)

In [14]:
def save(i):
    """Return the checkpoint path ``./model<i>.tf`` for run identifier ``i``."""
    return f'./model{i!s}.tf'
# Write the full model (not just weights) to `save(1)` whenever the monitored
# validation accuracy reaches a new maximum.
best_model = ModelCheckpoint(
    filepath=save(1),
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
)

# Halve the learning rate when val_loss has not improved by more than 1e-5
# for 10 epochs, with a floor of 1e-6.
rlr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=10,
    min_delta=1e-5,
    min_lr=1e-6,
    verbose=1,
)


In [15]:
# Input order must match the `inputs` list of the model: ids first, then masks.
train_inputs = [xtrain_drug1, xtrain_drug2, xtrain_drug1_mask, xtrain_drug2_mask]
valid_inputs = [xvalid_drug1, xvalid_drug2, xvalid_drug1_mask, xvalid_drug2_mask]

history = model_train.fit(
    x=train_inputs,
    y=xtrain_y,
    validation_data=(valid_inputs, xvalid_y),
    batch_size=256,
    epochs=100,
    callbacks=[rlr, best_model],
)

In [18]:
# Restore the weights stored at the path the checkpoint callback wrote to.
best_checkpoint_path = save(1)
model_train.load_weights(best_checkpoint_path)

In [20]:
# Loss and accuracy on the held-out test split (same input order as training).
test_inputs = [xtest_drug1, xtest_drug2, xtest_drug1_mask, xtest_drug2_mask]
model_train.evaluate(x=test_inputs, y=xtest_y)

In [21]:
# Loss and accuracy on the validation split (same input order as training).
valid_eval_inputs = [xvalid_drug1, xvalid_drug2, xvalid_drug1_mask, xvalid_drug2_mask]
model_train.evaluate(x=valid_eval_inputs, y=xvalid_y)

In [22]:
import shutil

# Zip the checkpoint written by the `best_model` callback into `model.zip`.
# The source path comes from `save(1)` rather than a repeated literal, so it
# cannot drift from the path the callback actually used.
shutil.make_archive('model', 'zip', save(1))