# TRAINING A DISEASE MARKER WITH NCBI-DISEASE DATASET 
This notebook trains models that mark which words in a text are disease mentions.\
A planned next step is to identify the branch (category) of each disease.

**Import necessary libraries**


In [None]:
from datasets import list_datasets, load_dataset
import pandas as pd
import tensorflow as tf
import nltk as nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras import Model
from tensorflow.keras.layers import Dot, Embedding, Flatten
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

**Load the training, validation and test datasets, and download the NLTK stopword list**

In [None]:
# Fetch the three predefined splits of the NCBI disease corpus in one pass.
ncbi_train, ncbi_validation, ncbi_test = (
    load_dataset("ncbi_disease", split=split_name)
    for split_name in ("train", "validation", "test")
)

# Download the NLTK stopword corpus used by the token-filtering step.
nltk.download("stopwords")


**Convert the datasets into pandas dataframes. Merging the training and validation datasets and re-splitting them with a specific ratio is optional and currently disabled, so the predefined splits are used**

In [None]:
# Materialize each dataset split as a pandas DataFrame.
train_df = ncbi_train.to_pandas()

valid_df = ncbi_validation.to_pandas()
# Disabled alternative: merge train + validation, shuffle, then re-split 80/20.
# NOTE(review): DataFrame.append was removed in pandas 2.0; use pd.concat if
# this is ever re-enabled.
# train_df = train_df.append(valid_df)
# train_df = train_df.sample(frac=1)
# train_df, valid_df = train_test_split(train_df, test_size=0.2)

test_df = ncbi_test.to_pandas()
# Do not truncate long column values when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

**Define the 2d to 1d converter function**

In [None]:
def conv2d_to_1d(tokens,labels):
    """Flatten nested token and label sequences into two flat lists.

    Args:
        tokens: Iterable of iterables of tokens (one inner iterable per row).
        labels: Iterable of iterables of labels, flattened independently of
            ``tokens`` in the same row-by-row order.

    Returns:
        A ``(token_set, label_set)`` tuple of flat lists that preserve the
        original ordering.
    """
    flat_tokens = [word for row in tokens for word in row]
    flat_labels = [tag for row in labels for tag in row]
    return flat_tokens, flat_labels


**Define the function that removes stopword and punctuation tokens together with their labels**

In [None]:

from string import punctuation

stop = stopwords.words('english')

def remove_stopwords_punctuations(tokens, labels):
    """Drop stopword and punctuation tokens together with their labels.

    Each token is converted with ``str``, stripped and lower-cased before it
    is compared against the module-level stopword list ``stop`` and
    ``string.punctuation``. Kept tokens are returned in that normalized form.

    Args:
        tokens: Sequence of tokens.
        labels: Indexable sequence of labels parallel to ``tokens``.

    Returns:
        A ``(new_tokens, new_labels)`` tuple of lists holding only the
        surviving tokens and their labels, in the original order.
    """
    # Build a set once so each stopword lookup is O(1) instead of a linear
    # scan of the whole list for every token.
    stop_words = set(stop)
    new_tokens = []
    new_labels = []
    for index, raw_token in enumerate(tokens):
        formatted_token = str(raw_token).strip().lower()
        # Substring test: a token counts as punctuation when it occurs inside
        # the ``punctuation`` string (this also drops empty tokens).
        if formatted_token in stop_words or formatted_token in punctuation:
            continue
        new_tokens.append(formatted_token)
        new_labels.append(labels[index])

    return new_tokens, new_labels
   


**Define the vocab size and embedding dimensions for the data shape**

- Convert the data from 2D to 1D arrays so that it can be sequenced
- Remove all stopwords and punctuation

In [None]:
# --- Hyperparameters shared by the tokenizer, the padding step and the models ---
vocab_size = 12000      # passed to the tokenizer (num_words) and the Embedding layers
embedding_dim = 16      # size of each embedding vector
max_length = 50         # length every sequence is padded / truncated to
trunc_type = 'post'     # truncation side passed to pad_sequences
pad_type = 'post'       # padding side passed to pad_sequences
oov_tok = "<OOV>"       # token used for out-of-vocabulary words

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# For every split: flatten the per-sentence token/tag lists to one entry per
# token, then filter out stopwords and punctuation along with their labels.
train_set, train_label_set = remove_stopwords_punctuations(
    *conv2d_to_1d(train_df.tokens.values, train_df.ner_tags)
)
valid_set, valid_label_set = remove_stopwords_punctuations(
    *conv2d_to_1d(valid_df.tokens.values, valid_df.ner_tags)
)
test_set, test_label_set = remove_stopwords_punctuations(
    *conv2d_to_1d(test_df.tokens.values, test_df.ner_tags)
)



**Pad the sequences so that they are all the same length**


In [None]:
# Fit the vocabulary on the training tokens only; validation tokens are then
# encoded with that same vocabulary.
tokenizer.fit_on_texts(train_set)
word_index = tokenizer.word_index

# Every entry of train_set / valid_set is a single token, so each padded row
# holds that token's id(s) followed by zeros up to max_length.
# NOTE(review): confirm that per-token (context-free) classification is intended.
training_sequences = tokenizer.texts_to_sequences(train_set)
training_padded = pad_sequences(training_sequences, maxlen=max_length,
                                truncating=trunc_type, padding=pad_type)

validation_sequences = tokenizer.texts_to_sequences(valid_set)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length,
                                  truncating=trunc_type, padding=pad_type)

training_labels_final = np.array(train_label_set)
validation_labels_final = np.array(valid_label_set)

# Features and labels must stay aligned one-to-one.
assert len(training_padded) == len(training_labels_final)
assert len(validation_padded) == len(validation_labels_final)

# Preview only the first few padded rows rather than printing the whole array.
training_padded[:5]

**Define a helper that evaluates the test loss and accuracy of a model**

In [None]:
def predict_tokens(model_type,model, tokens,token_labels):
    """Evaluate ``model`` on ``tokens`` and print its loss and accuracy.

    The tokens are encoded with the module-level ``tokenizer`` and padded with
    the same ``max_length``, ``pad_type`` and ``trunc_type`` settings that are
    used for the training data, so evaluation inputs are preprocessed exactly
    like the inputs the model was fit on.

    Args:
        model_type: Human-readable model name appended to the printed line.
        model: Model object exposing ``evaluate(inputs, labels)``.
        tokens: List of token strings to evaluate on.
        token_labels: Labels aligned one-to-one with ``tokens``.
    """
    # Create the sequences
    sample_sequences = tokenizer.texts_to_sequences(tokens)
    tokens_padded = pad_sequences(sample_sequences, maxlen=max_length,
                                  truncating=trunc_type, padding=pad_type)

    results = model.evaluate(tokens_padded, token_labels)
    print("test loss, test acc:", results, " <-> ", model_type)

**Define a helper that plots a given metric from the model's training history**

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
    """Plot a training metric and, when recorded, its validation counterpart.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch values.
        string: Name of the metric to plot, e.g. ``"accuracy"`` or ``"loss"``.
            The validation curve is looked up under ``'val_' + string``.
    """
    val_key = 'val_' + string
    fig, ax = plt.subplots()
    ax.plot(history.history[string], label=string)
    # Skip the validation curve when no such metric was recorded, instead of
    # failing with a KeyError.
    if val_key in history.history:
        ax.plot(history.history[val_key], label=val_key)
    ax.set_xlabel("Epochs")
    ax.set_ylabel(string)
    ax.set_title(string + " per epoch")
    ax.legend()
    plt.show()

**Implement the training function for each type of model**

- Define an early stopping callback with patience 5

In [None]:
# Stop training once the validation loss has not improved for 5 consecutive
# epochs and roll the model back to its best weights. Monitoring the training
# loss instead would rarely trigger and cannot detect overfitting.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                            restore_best_weights=True)

def fit_model_and_show_results (model, num_epochs,train_padded, train_label, valid_padded, valid_label):
    """Print the model summary, train the model and plot its learning curves.

    Args:
        model: Compiled model exposing ``summary`` and ``fit``.
        num_epochs: Maximum number of epochs; the module-level early-stopping
            ``callback`` may end training sooner.
        train_padded: Padded training sequences.
        train_label: Labels for ``train_padded``.
        valid_padded: Padded validation sequences.
        valid_label: Labels for ``valid_padded``.
    """
    model.summary()
    history = model.fit(train_padded, train_label, epochs=num_epochs,
                        validation_data=(valid_padded, valid_label), callbacks=[callback])
    # Show accuracy and loss curves (training vs. validation) after training.
    plot_graphs(history, "accuracy")
    plot_graphs(history, "loss")

**Use only Embedding to train the network**

In [None]:
# Baseline: average the embedding vectors over the sequence, then classify
# with a 3-way softmax.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                              input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(rate=0.6),
    tf.keras.layers.Dense(units=3, activation='softmax'),
])

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

fit_model_and_show_results(
    model,
    num_epochs=100,
    train_padded=training_padded,
    train_label=training_labels_final,
    valid_padded=validation_padded,
    valid_label=validation_labels_final,
)

**Use a CNN model to train network**

In [None]:
# 1D convolution over the embedded sequence followed by global max pooling.
# GlobalMaxPooling1D already returns a 2D (batch, filters) tensor, so no
# Flatten layer is needed before the classifier head.
model_cnn = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Conv1D(16, 2, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(3, activation='softmax')
])

# Labels are integer class ids, hence the sparse categorical loss.
model_cnn.compile(loss='sparse_categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=0.00007),
                  metrics=['accuracy'])

fit_model_and_show_results(
    model_cnn,
    num_epochs=50,
    train_padded=training_padded,
    train_label=training_labels_final,
    valid_padded=validation_padded,
    valid_label=validation_labels_final,
)

**Use a Bidirectional GRU model to train the network**

In [None]:
# Bidirectional GRU encoder over the embedded sequence with a 3-way softmax head.
model_gru = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                              input_length=max_length),
    tf.keras.layers.Bidirectional(layer=tf.keras.layers.GRU(units=32)),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(units=3, activation='softmax'),
])

# Labels are integer class ids, hence the sparse categorical loss.
model_gru.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00007),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

fit_model_and_show_results(
    model_gru,
    num_epochs=20,
    train_padded=training_padded,
    train_label=training_labels_final,
    valid_padded=validation_padded,
    valid_label=validation_labels_final,
)

**Use a Bidirectional LSTM model to train the network**

In [None]:
# Single bidirectional LSTM encoder; the LSTM width reuses embedding_dim.
model_bidi_lstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                              input_length=max_length),
    tf.keras.layers.Bidirectional(layer=tf.keras.layers.LSTM(units=embedding_dim)),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(units=3, activation='softmax'),
])

# Labels are integer class ids, hence the sparse categorical loss.
model_bidi_lstm.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00007),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

fit_model_and_show_results(
    model_bidi_lstm,
    num_epochs=20,
    train_padded=training_padded,
    train_label=training_labels_final,
    valid_padded=validation_padded,
    valid_label=validation_labels_final,
)

**Use multiple (stacked) Bidirectional LSTMs**

In [None]:
# Two stacked bidirectional LSTMs. The first returns the full sequence so the
# second LSTM receives one vector per timestep.
model_multiple_bidi_lstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                              input_length=max_length),
    tf.keras.layers.Bidirectional(
        layer=tf.keras.layers.LSTM(units=embedding_dim, return_sequences=True)
    ),
    tf.keras.layers.Bidirectional(layer=tf.keras.layers.LSTM(units=embedding_dim)),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(units=3, activation='softmax'),
])

# Labels are integer class ids, hence the sparse categorical loss.
model_multiple_bidi_lstm.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00007),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

fit_model_and_show_results(
    model_multiple_bidi_lstm,
    num_epochs=20,
    train_padded=training_padded,
    train_label=training_labels_final,
    valid_padded=validation_padded,
    valid_label=validation_labels_final,
)

**Compare the models with the Test Data**

In [None]:
# Convert the test labels once and evaluate every trained model on the same data.
test_labels_final = np.array(test_label_set)

models_to_compare = [
    ("Only Embedding Model", model),
    ("CNN model", model_cnn),
    ("GRU model", model_gru),
    ("Bidi LSTM model", model_bidi_lstm),
    ("Multiple Bidi LSTM model", model_multiple_bidi_lstm),
]
for model_name, trained_model in models_to_compare:
    predict_tokens(model_name, trained_model, test_set, test_labels_final)