# Binary Classification

In this notebook, question–answer pairs are classified into two classes [0: related, 1: unrelated] using the ParsBERT, sinabert8G, mBERT, and XLM-RoBERTa models. For this purpose, the embedding vectors produced by each BERT-based model for the question and for the answer are fed to a single fully connected feed-forward layer that predicts the label.


In [None]:
# Report the NVIDIA GPU (if any) attached to this runtime.
!nvidia-smi

## Python 3.7

In [None]:
# Refresh the apt package index, then install the Python 3.7 interpreter.
!sudo apt-get update -y
!sudo apt-get install python3.7
# Clear the apt output from the cell.
from IPython.display import clear_output
clear_output()
# Register python3.7 as an alternative for /usr/bin/python3 (priority 1).
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1

# Choose one of the listed alternatives at the prompt.
!sudo update-alternatives --config python3

# Configuring the unversioned `python` alternative used to work, but no
# longer does (at least for the original author).
# !sudo update-alternatives --config python

# Check the result
!python3 --version

# Attention: pip must be installed as well; the cells below rely on it.
!sudo apt install python3-pip

In [None]:
# Install distutils for Python 3.7.
# NOTE(review): presumably required for pip to work under 3.7 — confirm.
!apt install python3.7-distutils

## Requirements

In [None]:
# Pin sacremoses to 0.0.45.
# NOTE(review): presumably for compatibility with transformers 3.1.0 — confirm.
!pip install sacremoses==0.0.45

In [None]:
# Install the pinned transformers release (quietly) and sentencepiece.
!pip install -q transformers==3.1.0
!pip install sentencepiece

In [None]:
# Display the metadata of the installed sentencepiece package.
!pip show sentencepiece

## Model

In [None]:
import json
import pandas
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
import torch
from torch.nn.utils.rnn import pad_sequence
import transformers
from random import *
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoTokenizer, AutoModel
from transformers import XLMRobertaConfig, XLMRobertaTokenizer, XLMRobertaModel


# Load the tokenizer and the encoder used by every cell below.
# NOTE(review): the tokenizer is loaded from the ParsBERT checkpoint while the
# encoder weights come from SINA-BERT; this presumably relies on both sharing
# the same vocabulary — confirm before switching either checkpoint.
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased")
bert_model = AutoModel.from_pretrained('hooshafzar/SINA-BERT')
# Alternative checkpoints (replace BOTH lines above consistently):
#   ParsBERT:    AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased")
#                AutoModel.from_pretrained("HooshvareLab/bert-fa-base-uncased")
#   mBERT:       AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
#                AutoModel.from_pretrained("bert-base-multilingual-cased")
#   XLM-RoBERTa: XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
#                XLMRobertaModel.from_pretrained('xlm-roberta-base',return_dict=False)
# Sanity check on the loaded tokenizer.
# NOTE(review): from_pretrained presumably raises on failure rather than
# returning None, in which case this branch never fires — confirm.
if tokenizer is None:
  print('tokenizer is None')



In [None]:
# Hyperparameters shared by the cells below.
SPLIT_RATIO = 0.1  # fraction of the labeled data held out as the test set
DROPOUT_RATE = .1  # dropout probability applied before the classifier layer
LEARNING_RATE = 2e-5  # learning rate passed to the AdamW optimizer
NUM_EPOCHS = 10  # number of passes over the training loader
TRAIN_BATCH_SIZE = 8  # batch size of the training loader
TEST_BATCH_SIZE = 8  # batch size of the test loader

In [None]:
# Load the labeled question/answer pairs into a DataFrame.
# The encoding is passed explicitly so the text is decoded identically on every
# platform instead of depending on the locale default (the prediction file
# written at the end of the notebook is UTF-8 as well).
with open('4400_Qstn_pediatric_Gast.json', encoding='utf-8') as handle:
    labeled_data = pandas.read_json(handle)

In [None]:
# Distinct label values found in the data, in ascending order.
labels = sorted(labeled_data['label'].unique())
print(labels)

# Stratified hold-out split with a fixed seed; both parts get a fresh
# 0..n-1 index so they can be addressed positionally later on.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        labeled_data,
        test_size=SPLIT_RATIO,
        random_state=1,
        stratify=labeled_data['label'],
    )
)

In [None]:
# Show how many test rows fall into each class.
test['label'].value_counts()

In [None]:
class medicalDataset(torch.utils.data.Dataset):
    """Map-style dataset of (question, answer, label) triples.

    Every item carries the raw texts, the label, and the question and the
    answer tokenized *separately*, each padded/truncated to ``max_len``.

    Args:
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        questions: Indexable, sized sequence of question texts.
        answers: Indexable, sized sequence of answer texts, aligned with
            ``questions``.
        labels: Indexable, sized sequence of labels, aligned with
            ``questions``.
        max_len: Length every encoded sequence is padded/truncated to.
        label_list: The possible label values; stored for reference only.

    Raises:
        ValueError: If ``questions``, ``answers`` and ``labels`` differ in
            length.
    """

    def __init__(self, tokenizer, questions, answers, labels, max_len,label_list):
        # Fail early on misaligned inputs instead of with an IndexError (or
        # silently ignored rows) in the middle of an epoch.
        if not (len(questions) == len(answers) == len(labels)):
            raise ValueError(
                'questions, answers and labels must have the same length, got '
                f'{len(questions)}, {len(answers)} and {len(labels)}')
        self.questions = questions
        self.answers = answers
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_list = label_list

    def __len__(self):
        """Return the number of question/answer pairs."""
        return len(self.questions)

    def _encode(self, text):
        """Tokenize ``text`` with the settings shared by questions and answers."""
        return self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt')

    def __getitem__(self, item):
        """Return the texts, label and encoded tensors of pair ``item``.

        The returned dict has the keys ``question``, ``answer``, ``label`` and,
        for each of the prefixes ``q_`` and ``a_``, ``input_ids``,
        ``attention_mask`` and ``token_type_ids``.
        """
        # str() guards against non-string cells (e.g. NaN for a missing text).
        question = str(self.questions[item])
        answer = str(self.answers[item])
        encoding_q = self._encode(question)
        encoding_a = self._encode(answer)

        # flatten() collapses each tensor to 1-D (the tokenizer presumably
        # returns a leading batch axis of size 1 with return_tensors='pt').
        return {
            'question': question,
            'answer': answer,
            'label': self.labels[item],
            'q_input_ids': encoding_q['input_ids'].flatten(),
            'a_input_ids': encoding_a['input_ids'].flatten(),
            'q_attention_mask': encoding_q['attention_mask'].flatten(),
            'a_attention_mask': encoding_a['attention_mask'].flatten(),
            'q_token_type_ids': encoding_q['token_type_ids'].flatten(),
            'a_token_type_ids': encoding_a['token_type_ids'].flatten(),
        }


def create_data_loader(x, y, z, tokenizer, max_len, batch_size, label_list, shuffle=False):
    """Wrap question/answer/label sequences in a batched ``DataLoader``.

    Args:
        x: Sequence of question texts.
        y: Sequence of answer texts, aligned with ``x``.
        z: Sequence of labels, aligned with ``x``.
        tokenizer: Tokenizer handed to ``medicalDataset``.
        max_len: Length every encoded sequence is padded/truncated to.
        batch_size: Number of pairs per batch.
        label_list: The possible label values, handed to ``medicalDataset``.
        shuffle: Whether to reshuffle the pairs at every epoch. Defaults to
            ``False``, which keeps the original fixed iteration order.

    Returns:
        A ``torch.utils.data.DataLoader`` over a ``medicalDataset``.
    """
    dataset = medicalDataset(
        questions=x,
        answers=y,
        labels=z,
        tokenizer=tokenizer,
        max_len=max_len,
        label_list=label_list)

    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
label_list = [0, 1]

# Questions and answers are padded/truncated to 300 tokens each.
# NOTE(review): create_data_loader builds its DataLoader without shuffling, so
# the training batches are visited in the same order in every epoch.
train_data_loader = create_data_loader(
    x=train['question'].to_numpy(),
    y=train['answer'].to_numpy(),
    z=train['label'].to_numpy(),
    tokenizer=tokenizer,
    max_len=300,
    batch_size=TRAIN_BATCH_SIZE,
    label_list=label_list,
)
test_data_loader = create_data_loader(
    x=test['question'].to_numpy(),
    y=test['answer'].to_numpy(),
    z=test['label'].to_numpy(),
    tokenizer=tokenizer,
    max_len=300,
    batch_size=TEST_BATCH_SIZE,
    label_list=label_list,
)

In [None]:
class Net(torch.nn.Module):
    """Relatedness classifier over separately encoded question and answer.

    The same encoder is applied to the question and to the answer; element 1
    of each encoder output is concatenated, passed through dropout and mapped
    to class logits by one linear layer.

    Args:
        encoder: Encoder module exposing ``config.hidden_size``. Defaults to
            the notebook-level ``bert_model``.
        dropout_rate: Dropout probability applied before the classifier.
            Defaults to the notebook-level ``DROPOUT_RATE``.
        num_labels: Number of output classes (default 2).
    """

    def __init__(self, encoder=None, dropout_rate=None, num_labels=2):
        super().__init__()
        # The globals are resolved only when no argument is given, so the
        # original ``Net()`` call keeps its exact behavior. Attribute names
        # are unchanged, which keeps saved state_dict keys compatible.
        self.bert_model = bert_model if encoder is None else encoder
        rate = DROPOUT_RATE if dropout_rate is None else dropout_rate
        self.dropout = torch.nn.Dropout(rate)
        # Input width is doubled because question and answer vectors are concatenated.
        self.classifier = torch.nn.Linear(self.bert_model.config.hidden_size * 2, num_labels)

    def forward(self, q_input_ids, q_attention_mask, q_token_type_ids,a_input_ids, a_attention_mask, a_token_type_ids):
        """Return the class logits for a batch of question/answer pairs.

        The ``q_*`` arguments are the tokenized questions and the ``a_*``
        arguments the tokenized answers, as produced by ``medicalDataset``.
        """
        qout = self.bert_model(input_ids=q_input_ids, attention_mask=q_attention_mask, token_type_ids=q_token_type_ids)
        aout = self.bert_model(input_ids=a_input_ids, attention_mask=a_attention_mask, token_type_ids=a_token_type_ids)

        # Element 1 of the encoder output is presumably the pooled sentence
        # vector (Hugging Face BERT convention) — confirm for other encoders.
        qa = torch.cat([qout[1], aout[1]], -1)
        qa = self.dropout(qa)
        return self.classifier(qa)

# Build the classifier on top of the already-loaded encoder.
model = Net()
# Prefer the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Move the model's parameters to the chosen device; as the last expression of
# the cell this also displays the module structure.
model.to(device)

In [None]:
from tqdm.notebook import tqdm

from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup


criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# The scheduler is stepped once per batch, so the schedule must span every
# optimizer update of the whole run.
total_steps = len(train_data_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Batch entries forwarded to the model; the names match Net.forward's parameters.
train_input_keys = (
    'q_input_ids', 'q_attention_mask', 'q_token_type_ids',
    'a_input_ids', 'a_attention_mask', 'a_token_type_ids',
)

for epoch in tqdm(range(1, NUM_EPOCHS + 1)):
    print('EPOCH:', epoch)
    # Make sure dropout is active even if the model was switched to eval mode
    # by an earlier cell run.
    model.train()
    running_loss = 0.0
    for dl in tqdm(train_data_loader, total=len(train_data_loader)):
        optimizer.zero_grad()
        batch = {key: dl[key].to(device) for key in train_input_keys}
        label = dl['label'].to(device)

        y = model(**batch)
        loss = criterion(y, label)
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()

    # Report progress so a diverging or stalled run is visible.
    print('mean training loss:', running_loss / max(len(train_data_loader), 1))




In [None]:
# Persist only the weights; `epoch` still holds the number of the last
# training epoch and becomes part of the file name.
torch.save(model.state_dict(), 'sinabert_{}_epc.model'.format(epoch))

# Evaluation
The performance of the model is evaluated using accuracy, precision, recall, and F1 score.

In [None]:
preds, trues = [], []
# Batch entries forwarded to the model; the names match Net.forward's parameters.
eval_input_keys = (
    'q_input_ids', 'q_attention_mask', 'q_token_type_ids',
    'a_input_ids', 'a_attention_mask', 'a_token_type_ids',
)

# Switch to eval mode so dropout is disabled and predictions are
# deterministic; the previous mode is restored afterwards so re-running the
# training cell behaves as before.
was_training = model.training
model.eval()
try:
    # Inference needs no gradients; skipping them saves memory and time.
    with torch.no_grad():
        for dl in tqdm(test_data_loader, total=len(test_data_loader)):
            batch = {key: dl[key].to(device) for key in eval_input_keys}
            y = model(**batch)
            preds.extend(torch.argmax(y, -1).cpu().tolist())
            # Collect plain Python ints rather than 0-d tensors.
            trues.extend(dl['label'].tolist())
finally:
    model.train(was_training)


print(classification_report(trues, preds, digits=4))

In [None]:
# Write one "<global_id>\t<predicted label>" line per test row.
global_ids = test['global_id'].tolist()
print(len(global_ids))
# Stale or partial predictions would be paired with the wrong ids; refuse to
# write anything in that case instead of failing halfway through the file.
if len(preds) != len(global_ids):
    raise ValueError(
        f'got {len(preds)} predictions for {len(global_ids)} test rows; '
        're-run the evaluation cell')
with open('sinabert_dgs_pred.txt', 'w', encoding='utf-8') as writer:
    for gid, p in zip(global_ids, preds):
        # f-string formatting also handles ids that are not strings.
        writer.write(f'{gid}\t{p}\n')
