In [1]:
# import required packages and modules
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pickle
# report the TensorFlow and Keras versions in use, one per line
print(tf.__version__, keras.__version__, sep='\n')

Using TensorFlow backend.


1.15.0
2.3.1


In [2]:
# notebook-wide hyperparameters
WORD_EMBEDDING_SIZE = 768  # feature dimension of every time step in the model input
MAX_LEN = 64               # number of time steps in the model input; LSTM width is MAX_LEN / 8
BATCH_SIZE = 32            # mini-batch size handed to model.fit
NR_EPOCHS = 200            # number of epochs handed to model.fit

In [3]:
# import embeddings and labels from pickle
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load files that were produced by a trusted source.
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened inside a ``with`` block so the handle is released even
    when ``pickle.load`` raises.
    """
    with open(path, 'rb') as infile:
        return pickle.load(infile)


y_train = _load_pickle('y_train_file')
train_embeddings = _load_pickle('train_embeddings_file')

valid_embeddings = _load_pickle('valid_embeddings_file')
y_valid = _load_pickle('y_valid_file')

test_embeddings = _load_pickle('test_embeddings_file')
y_test = _load_pickle('y_test_file')

In [4]:
# read data from excel
# split the data into train, validation and testing
df = pd.read_excel("All_Questions_V1.xlsx", sheet_name='data')
print(df.head(1))
relation = df['Relation']
# 20% of the rows are held out for testing, then 10% of the remainder for
# validation; both splits are stratified on the relation label and use a fixed
# random_state so the partition is the same on every run.
rest, test = train_test_split(df, test_size=0.2, random_state=0,
                              stratify=relation)
train, valid = train_test_split(rest, test_size=0.1, random_state=0,
                                stratify=rest['Relation'])
train_size, test_size, validation_size = len(train), len(test), len(valid)
print(f'Train:{train_size}, Test: {test_size}, Validation: {validation_size}')

# The embeddings are loaded from pickle files instead of being computed from
# these frames, so stop early if the split sizes do not match them.
# NOTE(review): equal sizes do not prove that row order matches - confirm the
# pickles were produced from this exact split.
assert len(train_embeddings) == train_size, "train embeddings do not match the train split size"
assert len(valid_embeddings) == validation_size, "validation embeddings do not match the validation split size"
assert len(test_embeddings) == test_size, "test embeddings do not match the test split size"

   SlNo                                  Question Relation          NER_Tag  \
0     1  what are the brand names of Metipranolol    brand  O O O O O O B-E   

   Q_Len  T_Len       Subject                          Subject_URI  \
0      7      7  Metipranolol  http://bio2rdf.org/drugbank:DB01214   

                                   Relation_URI  
0  http://bio2rdf.org/drugbank_vocabulary:brand  
Train:406, Test: 114, Validation: 46


In [5]:
# function to convert labels to numerical values
# return numerical label and encoder which will be used for decoding later
def process_labels(df, encoder=None):
    """Encode the ``Relation`` column of ``df`` as integer class ids.

    Args:
        df: DataFrame holding a ``Relation`` column with the class names.
            It is only read, never modified.
        encoder: Optional already-fitted encoder exposing ``transform``.
            When given, it is reused so the ids agree with the ones produced
            when that encoder was fitted. When ``None`` (the default, and the
            original behaviour) a new ``LabelEncoder`` is fitted on ``df``.

    Returns:
        A ``(labels, encoder)`` tuple: the array of integer ids, one per row
        of ``df``, and the encoder that produced them (needed later to turn
        ids back into class names).
    """
    if encoder is None:
        encoder = LabelEncoder()
        labels = encoder.fit_transform(df['Relation'])
    else:
        # Reuse the existing mapping instead of fitting a new, possibly
        # different one on this subset of the data.
        labels = encoder.transform(df['Relation'])
    return labels, encoder

In [6]:
# process labels of training, validation and testing dataset
# The model is trained on the ids produced by the training encoder, so the
# validation and test labels must use that same mapping. Fitting a separate
# encoder per split would silently shift the ids whenever a split is missing
# a relation; transform() instead raises on a relation unseen during fitting.
train_labels, train_encoder = process_labels(train)
valid_encoder = test_encoder = train_encoder  # kept so later cells can still use these names
valid_labels = train_encoder.transform(valid['Relation'])
test_labels = train_encoder.transform(test['Relation'])

In [7]:
# define Bi-LSTM classifier model
# One softmax unit per relation known to the training encoder, so the output
# layer always agrees with the label ids instead of relying on a literal count.
NUM_CLASSES = len(train_encoder.classes_)
# Names other than `input` are used so the Python builtin is not shadowed
# for the rest of the notebook session.
model_input = Input(shape=(MAX_LEN, WORD_EMBEDDING_SIZE))
lstm_features = Bidirectional(LSTM(units=MAX_LEN // 8,
                                   dropout=0.5,
                                   recurrent_dropout=0.5))(model_input)
class_probs = Dense(NUM_CLASSES, activation="softmax")(lstm_features)
model = Model(model_input, class_probs)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [8]:
# configure training: Adam optimizer, sparse categorical cross-entropy loss
# and accuracy as the reported evaluation metric
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# show the layer-by-layer overview of the compiled model
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 64, 768)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 16)                49728     
_________________________________________________________________
dense_1 (Dense)              (None, 37)                629       
Total params: 50,357
Trainable params: 50,357
Non-trainable params: 0
_________________________________________________________________


In [9]:
# train the model on the training data and validate with the validation data
history = model.fit(
    x=np.asarray(train_embeddings),
    y=np.asarray(train_labels),
    validation_data=(np.asarray(valid_embeddings), np.asarray(valid_labels)),
    epochs=NR_EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=2,
)


Train on 406 samples, validate on 46 samples
Epoch 1/200
 - 1s - loss: 3.6093 - accuracy: 0.0320 - val_loss: 3.5791 - val_accuracy: 0.0217
Epoch 2/200
 - 1s - loss: 3.5830 - accuracy: 0.0591 - val_loss: 3.5609 - val_accuracy: 0.0652
Epoch 3/200
 - 1s - loss: 3.5537 - accuracy: 0.0813 - val_loss: 3.5255 - val_accuracy: 0.0435
Epoch 4/200
 - 1s - loss: 3.5079 - accuracy: 0.0714 - val_loss: 3.4718 - val_accuracy: 0.1304
Epoch 5/200
 - 1s - loss: 3.4604 - accuracy: 0.1059 - val_loss: 3.4243 - val_accuracy: 0.1087
Epoch 6/200
 - 1s - loss: 3.3856 - accuracy: 0.1158 - val_loss: 3.3747 - val_accuracy: 0.1522
Epoch 7/200
 - 1s - loss: 3.3078 - accuracy: 0.1552 - val_loss: 3.3091 - val_accuracy: 0.1957
Epoch 8/200
 - 1s - loss: 3.2728 - accuracy: 0.1626 - val_loss: 3.2494 - val_accuracy: 0.1957
Epoch 9/200
 - 1s - loss: 3.2046 - accuracy: 0.1675 - val_loss: 3.2006 - val_accuracy: 0.1957
Epoch 10/200
 - 1s - loss: 3.1467 - accuracy: 0.2365 - val_loss: 3.1409 - val_accuracy: 0.2174
Epoch 11/200


Epoch 85/200
 - 1s - loss: 1.2118 - accuracy: 0.7833 - val_loss: 1.3798 - val_accuracy: 0.7174
Epoch 86/200
 - 1s - loss: 1.1618 - accuracy: 0.8079 - val_loss: 1.3997 - val_accuracy: 0.6739
Epoch 87/200
 - 1s - loss: 1.1789 - accuracy: 0.7980 - val_loss: 1.3413 - val_accuracy: 0.7174
Epoch 88/200
 - 1s - loss: 1.1442 - accuracy: 0.8079 - val_loss: 1.3592 - val_accuracy: 0.6739
Epoch 89/200
 - 1s - loss: 1.1414 - accuracy: 0.8054 - val_loss: 1.3428 - val_accuracy: 0.6957
Epoch 90/200
 - 1s - loss: 1.1310 - accuracy: 0.8227 - val_loss: 1.3191 - val_accuracy: 0.7174
Epoch 91/200
 - 1s - loss: 1.1326 - accuracy: 0.8079 - val_loss: 1.3215 - val_accuracy: 0.7174
Epoch 92/200
 - 1s - loss: 1.0844 - accuracy: 0.8448 - val_loss: 1.3061 - val_accuracy: 0.7174
Epoch 93/200
 - 1s - loss: 1.0553 - accuracy: 0.8473 - val_loss: 1.2963 - val_accuracy: 0.6739
Epoch 94/200
 - 1s - loss: 1.0893 - accuracy: 0.8202 - val_loss: 1.2932 - val_accuracy: 0.6957
Epoch 95/200
 - 1s - loss: 1.0963 - accuracy: 0.84

Epoch 171/200
 - 1s - loss: 0.5425 - accuracy: 0.9286 - val_loss: 1.0250 - val_accuracy: 0.7609
Epoch 172/200
 - 1s - loss: 0.5753 - accuracy: 0.9187 - val_loss: 1.0671 - val_accuracy: 0.7174
Epoch 173/200
 - 1s - loss: 0.6126 - accuracy: 0.8966 - val_loss: 1.0665 - val_accuracy: 0.7174
Epoch 174/200
 - 1s - loss: 0.5776 - accuracy: 0.9360 - val_loss: 1.0898 - val_accuracy: 0.6957
Epoch 175/200
 - 1s - loss: 0.5538 - accuracy: 0.9138 - val_loss: 1.0571 - val_accuracy: 0.7174
Epoch 176/200
 - 1s - loss: 0.5726 - accuracy: 0.9039 - val_loss: 1.0642 - val_accuracy: 0.7174
Epoch 177/200
 - 1s - loss: 0.5520 - accuracy: 0.9286 - val_loss: 1.0638 - val_accuracy: 0.7174
Epoch 178/200
 - 1s - loss: 0.5616 - accuracy: 0.9236 - val_loss: 1.0528 - val_accuracy: 0.7174
Epoch 179/200
 - 1s - loss: 0.5149 - accuracy: 0.9458 - val_loss: 1.0508 - val_accuracy: 0.7174
Epoch 180/200
 - 1s - loss: 0.5796 - accuracy: 0.9236 - val_loss: 1.0396 - val_accuracy: 0.6957
Epoch 181/200
 - 1s - loss: 0.5788 - acc

In [10]:
# run the model on the validation embeddings; each row of the result holds
# the softmax scores over the relation classes
valid_pred = model.predict(x=np.asarray(valid_embeddings), verbose=1)



In [11]:
# run the model on the testing embeddings; each row of the result holds
# the softmax scores over the relation classes
test_pred = model.predict(x=np.asarray(test_embeddings), verbose=1)



In [12]:
# function to convert the predictions to numeric labels
def pred_to_num_labels(pred_list):
    """Return, for every score vector in ``pred_list``, the index of its largest entry.

    Args:
        pred_list: iterable of per-sample score vectors.

    Returns:
        A list with one ``np.argmax`` result per element of ``pred_list``,
        in the same order.
    """
    return [np.argmax(scores) for scores in pred_list]

In [13]:
# check one prediction of validation dataset
p_valid_labels = pred_to_num_labels(valid_pred)
print(p_valid_labels[0])
print(valid_labels[0])
# The model was fitted on ids from train_encoder, so its output indices must
# be decoded with that encoder rather than one fitted on another split.
print(train_encoder.inverse_transform([p_valid_labels[0]]))
print(valid['Question'].iloc[0])

36
36
['volume-of-distribution']
what is the volume of distribution for Coagulation factor VIIa


In [14]:
# check one prediction of testing dataset
p_test_labels = pred_to_num_labels(test_pred)
print(p_test_labels[0])
print(test_labels[0])
# The model was fitted on ids from train_encoder, so its output indices must
# be decoded with that encoder rather than one fitted on another split.
print(train_encoder.inverse_transform([p_test_labels[0]]))
print(test['Question'].iloc[0])

23
23
['patent']
Nitroglycerin is patented under which number


In [15]:
# calculate and print validation and testing dataset accuracy
# accuracy_score takes (y_true, y_pred): ground-truth ids first, predictions second
print(f'Validation Accuracy: {accuracy_score(valid_labels, p_valid_labels)}')
print(f'Test Accuracy: {accuracy_score(test_labels, p_test_labels)}')

Validation Accuracy: 0.6739130434782609
Test Accuracy: 0.6842105263157895
