In [None]:
# mount google drive
# Makes the Drive contents available under /content/drive, which is where the
# dataset path used by the later cells points.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# set path for files
# Base directory on the mounted Drive; later cells build file names as `path + <file name>`,
# so the trailing slash is required.
path = '/content/drive/My Drive/thesis_dataset/'

In [None]:
# install required packages
!pip install transformers



In [None]:
# import all required packages/modules
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer
from transformers import TFBertModel, BertConfig
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# define constants
# MAX_TOKENS: passed to process_data as the length every tokenized question is truncated/padded to
MAX_TOKENS = 64
# BERT_PRETRAIN_MODEL_NAME: checkpoint name given to the tokenizer, config and model `from_pretrained` calls
BERT_PRETRAIN_MODEL_NAME = "bert-base-cased"

In [None]:
# read data from excel
# The sheet is selected by keyword. `read_excel` has no `encoding` parameter in
# current pandas (passing one raises TypeError); the workbook is decoded by the
# Excel engine, so no encoding argument is needed.
df = pd.read_excel(path + "All_Questions_V1.xlsx", sheet_name="data")
df.head(1)

Unnamed: 0,SlNo,Question,Relation,NER_Tag,Q_Len,T_Len,Subject,Subject_old,Subject_URI_old,Subject_URI,Relation_URI
0,1,what are the brand names of Metipranolol,brand,O O O O O O B-E,7,7,Metipranolol,Metipranolol,http://bio2rdf.org/drugbank:DB01214,http://bio2rdf.org/drugbank:DB01214,http://bio2rdf.org/drugbank_vocabulary:brand


In [None]:
# split the full dataset into train, valid and test dataset
# Two stratified splits on 'Relation': 20% of all rows are held out for testing,
# then 10% of the remaining rows are held out for validation.
rest, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
    stratify=df['Relation'],
)
train, valid = train_test_split(
    rest,
    test_size=0.1,
    random_state=0,
    stratify=rest['Relation'],
)
train_size = len(train)
test_size = len(test)
validation_size = len(valid)
print(f'Train:{train_size}, Test: {test_size}, Validation: {validation_size}')

Train:406, Test: 114, Validation: 46


In [None]:
# create instance of tokenizer from BERT pretrained model
# do_lower_case=False is passed explicitly, in line with the cased checkpoint named in
# BERT_PRETRAIN_MODEL_NAME.
tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAIN_MODEL_NAME, do_lower_case=False)

In [None]:
# create instance of Label Encoder
# Module-level encoder that process_data uses to turn the 'Relation' strings into
# integer class ids.
labelencoder = LabelEncoder()

In [None]:
# process the question phrase, labels to return input_ids, attention_masks, encoded-numeric labels and label names
def process_data(df_data, tokenizer, max_tokens, train=False, fit_labels=None):
  """Tokenize the questions for BERT and, optionally, encode the 'Relation' labels.

  Args:
    df_data: DataFrame with a 'Question' column and, when `train` is true,
      a 'Relation' column.
    tokenizer: tokenizer whose `encode` method turns one question into a list
      of token ids.
    max_tokens: length every token-id sequence is truncated / zero-padded to.
    train: when true, labels are encoded with the module-level `labelencoder`;
      otherwise empty labels and label names are returned.
    fit_labels: controls whether `labelencoder` is (re)fitted on `df_data`.
      None (default) fits only if the encoder has not been fitted yet, so the
      first labelled split that is processed (the training split) defines the
      label -> id mapping and every later split reuses it. Pass True to force
      a re-fit (e.g. after re-splitting the data), False to never fit.

  Returns:
    (input_ids, attention_masks, numeric_labels, label_names) where the first
    two are arrays with one row per question and `max_tokens` columns,
    `numeric_labels` is an array of integer class ids (empty when `train` is
    false) and `label_names` lists the class names in id order.

  Raises:
    ValueError: raised by `labelencoder.transform` when `df_data` contains a
      label the encoder was not fitted on.
  """
  if train:
    if fit_labels is None:
      # a fitted LabelEncoder exposes `classes_`; an unfitted one does not
      fit_labels = not hasattr(labelencoder, "classes_")
    if fit_labels:
      labelencoder.fit(df_data['Relation'])
    # Reusing one fitted mapping keeps class ids identical across train / valid / test.
    # Re-fitting on each split would shift the ids whenever a split lacks a class.
    numeric_labels = labelencoder.transform(df_data['Relation'])
    # Names come from the encoder itself (sorted, id order), independent of how many
    # other columns the frame has.
    label_names = labelencoder.classes_.tolist()
  else:
    numeric_labels, label_names = [], []

  # process data and provide input_ids and attention_masks
  tokens_list = [
      tokenizer.encode(question, max_length=max_tokens, truncation=True, add_special_tokens=True)
      for question in tqdm(df_data['Question'])
  ]
  # we use post padding for BERT
  padded_tokens_list = pad_sequences(tokens_list, maxlen=max_tokens, truncating="post", padding="post", dtype="long", value=0)

  # create attention masks: 1 for real tokens, 0 for the zero padding added above
  attn_masks = (padded_tokens_list > 0).astype(int)

  return padded_tokens_list, attn_masks, np.asarray(numeric_labels), label_names

In [None]:
# process question phrases, labels to get input_ids, attention_masks for BERT input and target numeric labels for Classical ML models
# Labels are requested for all three splits (train=True); only the label names
# returned for the training split are kept.
(train_input_ids, train_attention_masks,
 train_labels, labels) = process_data(train, tokenizer, MAX_TOKENS, train=True)
(valid_input_ids, valid_attention_masks,
 valid_labels, _) = process_data(valid, tokenizer, MAX_TOKENS, train=True)
(test_input_ids, test_attention_masks,
 test_labels, _) = process_data(test, tokenizer, MAX_TOKENS, train=True)
num_class = len(labels)

HBox(children=(FloatProgress(value=0.0, max=406.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=114.0), HTML(value='')))




In [None]:
# create model from pretrained BERT model
# Load the checkpoint's configuration, switch on output_hidden_states, then build
# the TF model from the same checkpoint using that configuration.
config_params = BertConfig.from_pretrained(BERT_PRETRAIN_MODEL_NAME)
config_params.output_hidden_states=True
model = TFBertModel.from_pretrained(BERT_PRETRAIN_MODEL_NAME, config = config_params)

Some weights of the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# obtain outputs from BERT model
# One forward pass per split; each split is fed as a single batch.
train_outputs = model(dict(input_ids=train_input_ids, attention_mask=train_attention_masks))
valid_outputs = model(dict(input_ids=valid_input_ids, attention_mask=valid_attention_masks))
test_outputs = model(dict(input_ids=test_input_ids, attention_mask=test_attention_masks))

In [None]:
# extract feature vector from BERT output CLS token
def _cls_features(bert_outputs):
  """Return, as a NumPy array, position 0 along the token axis of the first model output."""
  return bert_outputs[0][:, 0, :].numpy()

train_features = _cls_features(train_outputs)
valid_features = _cls_features(valid_outputs)
test_features = _cls_features(test_outputs)

In [None]:
# sanity check: feature matrix and label vector must have the same number of rows
print(train_features.shape, train_labels.shape, sep="\n")

(406, 768)
(406,)


In [None]:
# function to print classification accuracy
def print_accuracy(model, valid_features, valid_labels, test_features, test_labels):
  """Print the value of `model.score` on the validation split, then on the test split.

  Args:
    model: fitted estimator exposing `score(features, labels)`.
    valid_features, valid_labels: validation inputs and targets.
    test_features, test_labels: test inputs and targets.

  Returns:
    None; the results are only printed.
  """
  splits = (
      ("Validation Dataset", valid_features, valid_labels),
      ("Testing Dataset", test_features, test_labels),
  )
  for position, (title, features, labels) in enumerate(splits):
    if position:
      # divider between the two reports
      print("----------------------------------------------------------------------")
    print(title)
    print(model.score(features, labels))

In [None]:
# Decision tree model
# random_state is fixed (same seed as the data split) so the fitted tree, and
# therefore the printed accuracies, are reproducible across runs.
dtree_model = DecisionTreeClassifier(max_depth=2, random_state=0).fit(train_features, train_labels)
print_accuracy(dtree_model, valid_features, valid_labels, test_features, test_labels)

Validation Dataset
0.06521739130434782
----------------------------------------------------------------------
Testing Dataset
0.02631578947368421


In [None]:
# Random forest model
# random_state is fixed (same seed as the data split) so the bootstrap samples and
# feature sub-sampling, and therefore the printed accuracies, are reproducible.
rf_model = RandomForestClassifier(max_depth=2, random_state=0).fit(train_features, train_labels)
print_accuracy(rf_model, valid_features, valid_labels, test_features, test_labels)

Validation Dataset
0.32608695652173914
----------------------------------------------------------------------
Testing Dataset
0.2543859649122807


In [None]:
# Gaussian naive bayes model
# Build the estimator first, then fit it on the training features.
gnb_model = GaussianNB()
gnb_model.fit(train_features, train_labels)
print_accuracy(gnb_model, valid_features, valid_labels, test_features, test_labels)

Validation Dataset
0.7608695652173914
----------------------------------------------------------------------
Testing Dataset
0.5526315789473685


In [None]:
# K nearest neighbor model
# Build the estimator (5 neighbours) first, then fit it on the training features.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(train_features, train_labels)
print_accuracy(knn_model, valid_features, valid_labels, test_features, test_labels)

Validation Dataset
0.5217391304347826
----------------------------------------------------------------------
Testing Dataset
0.42105263157894735


In [None]:
# linear SVM classifier model
# Build the estimator first, then fit it on the training features.
svm_model_linear = SVC(kernel='linear', C=1)
svm_model_linear.fit(train_features, train_labels)
print_accuracy(svm_model_linear, valid_features, valid_labels, test_features, test_labels)

Validation Dataset
0.6739130434782609
----------------------------------------------------------------------
Testing Dataset
0.6491228070175439


In [None]:
# Logistic regression classifier model
# With max_iter=100 the solver stops at the iteration limit (see the warning
# recorded in this cell's output).
log_reg_model = LogisticRegression(C=1, max_iter=100)
log_reg_model.fit(train_features, train_labels)
print_accuracy(log_reg_model, valid_features, valid_labels, test_features, test_labels)

Validation Dataset
0.8260869565217391
----------------------------------------------------------------------
Testing Dataset
0.6666666666666666


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




---


Both the logistic regression (LR) and SVM models perform comparably well. However, the LR model is not stable because of non-convergence. Nevertheless, we conduct a grid search for both algorithms.



---



In [None]:

#param_grid = {'C': np.linspace(10, 100, 5),
#              'max_iter': np.linspace(50, 500, 10)}
#grid = GridSearchCV(LogisticRegression(), param_grid, refit = True, verbose = 3) 
#grid.fit(train_features, train_labels) 

In [None]:
#print(grid.best_params_) 
#print(grid.best_estimator_) 
#{'C': 55.0, 'max_iter': 200.0}
#LogisticRegression(C=55.0, class_weight=None, dual=False, fit_intercept=True,
#                   intercept_scaling=1, l1_ratio=None, max_iter=200.0,
#                   multi_class='auto', n_jobs=None, penalty='l2',
#                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
#                   warm_start=False)

In [None]:
# Logistic regression refitted with the best parameters reported by the grid search.
# max_iter must be an integer: the grid was built with np.linspace, which yields
# floats (200.0), and current scikit-learn rejects a float iteration count.
# Only the non-default hyper-parameters are passed. The remaining arguments of the
# printed estimator were library defaults, some of which (e.g. multi_class) are
# deprecated in newer scikit-learn releases.
log_reg_model = LogisticRegression(C=55.0, max_iter=200).fit(train_features, train_labels)
print_accuracy(log_reg_model, valid_features, valid_labels, test_features, test_labels)

Validation Dataset
0.8478260869565217
----------------------------------------------------------------------
Testing Dataset
0.7280701754385965


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
#param_grid = {'C': [0.1, 1, 10, 100, 1000], 
#			'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
#			'kernel': ['rbf', 'linear', 'sigmoid', 'poly']} 
#grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 
#grid.fit(train_features, train_labels) 

In [None]:
#print(grid.best_params_) 
#print(grid.best_estimator_) 
#{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
#SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
#    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
#    max_iter=-1, probability=False, random_state=None, shrinking=True,
#    tol=0.001, verbose=False).fit(train_features, train_labels)

In [None]:
# SVM refitted with the best parameters reported by the grid search.
# NOTE: the variable keeps the name `svm_model_linear` (and overwrites the earlier
# linear model) although the selected kernel is RBF.
# Only the tuned hyper-parameters are spelled out; every other argument of the
# printed estimator equals the SVC default.
svm_model_linear = SVC(C=10, gamma=0.01, kernel='rbf')
svm_model_linear.fit(train_features, train_labels)
print_accuracy(svm_model_linear, valid_features, valid_labels, test_features, test_labels)

Validation Dataset
0.7391304347826086
----------------------------------------------------------------------
Testing Dataset
0.6491228070175439




---


The best models found by grid search for both algorithms, SVM and logistic regression (LR), perform comparably well. However, the LR model built with the best parameters does not converge. Non-convergence means the model did not fit the data properly, and its estimates have high variance and are unstable [1],[2]. Hence, we choose to use the SVM classifier.


---


References:
1. https://www.rasch.org/rmt/rmt11b.htm
2. Thompson, Warren Robert (2009). *Variable selection of correlated predictors in logistic regression: investigating the diet-heart hypothesis*.


---



**References**

Followed Examples from


---

https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/

https://mccormickml.com/2019/07/22/BERT-fine-tuning/

http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

https://www.kaggle.com/nkaenzig/bert-tensorflow-2-huggingface-transformers

https://colab.research.google.com/drive/1ZQvuAVwA3IjybezQOXnrXMGAnMyZRuPU#scrollTo=tBa6vRHknSkv


---

