# Bert-based+CNN Sentiment Analysis

In this notebook, medical comments are classified into three categories (satisfied, unsatisfied, no-idea) using the ParsBERT, SinaBERT (sinabert8G), mBERT and XLM-RoBERTa models. For this purpose, the embedding vectors produced by each BERT-based model are fed to a CNN with 100 filters for each of the kernel sizes [2, 3, 4, 5, 6], followed by max pooling and a single fully connected feed-forward layer that predicts the label of each input comment.


In [None]:
!nvidia-smi
!pip install -q transformers
!pip install -q hazm
#!pip install -q clean-text[gpl]
!pip install sentencepiece
#!pip install hazm

In [None]:
# Import required packages

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

import hazm
from hazm import *

#import plotly.express as px
#import plotly.graph_objects as go

from tqdm.notebook import tqdm

import os
import re
import json
import copy
import collections

from transformers import BertConfig, BertTokenizer
from transformers import BertModel,GPT2LMHeadModel,AutoConfig, AutoTokenizer
from transformers import XLMRobertaConfig, XLMRobertaTokenizer
from transformers import XLMRobertaModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch, gc

gc.collect()
torch.cuda.empty_cache()



In [None]:
# Load the raw corpus and keep only the text column and the label column.
data = pd.read_csv('medical-sentiment.csv', encoding='utf-8').loc[:, ['comment', 'sentiment']]

# Preview the first rows (shown as the cell output when run in a notebook).
data.head()

In [None]:
# Balance the classes: every class is capped at twice the size of the smallest one.
# A fixed seed makes the sub-sampling and the final shuffle reproducible, so the
# seeded train/valid/test split that follows always sees the same rows.
BALANCE_SEED = 1

S_data = data[data['sentiment'] == 'satisfied']
U_data = data[data['sentiment'] == 'unsatisfied']
N_data = data[data['sentiment'] == 'no-idea']

cutting_point = min(len(S_data), len(U_data), len(N_data)) * 2

# Only classes holding at least `cutting_point` rows are down-sampled;
# smaller classes are kept whole.
if cutting_point <= len(S_data):
    S_data = S_data.sample(n=cutting_point, random_state=BALANCE_SEED).reset_index(drop=True)

if cutting_point <= len(U_data):
    U_data = U_data.sample(n=cutting_point, random_state=BALANCE_SEED).reset_index(drop=True)

if cutting_point <= len(N_data):
    N_data = N_data.sample(n=cutting_point, random_state=BALANCE_SEED).reset_index(drop=True)

# Merge the classes and shuffle the rows so they are no longer grouped by label.
new_data = pd.concat([S_data, U_data, N_data])
new_data = new_data.sample(frac=1, random_state=BALANCE_SEED).reset_index(drop=True)
new_data.info()

In [None]:
# Alphabetically ordered label names; a label's position in this list is its integer id.
labels = sorted(data['sentiment'].unique())

new_data['sentiment_id'] = [labels.index(name) for name in new_data['sentiment']]

# Hold out 5% of the rows for testing, then 10% of the remainder for validation.
# Both splits are stratified on the label and use a fixed seed.
train, test = train_test_split(
    new_data, test_size=0.05, random_state=1, stratify=new_data['sentiment'])
train, valid = train_test_split(
    train, test_size=0.1, random_state=1, stratify=train['sentiment'])

# Renumber the rows of every split from 0.
train, valid, test = (part.reset_index(drop=True) for part in (train, valid, test))

x_train = train['comment'].tolist()
y_train = train['sentiment_id'].tolist()
x_valid = valid['comment'].tolist()
y_valid = valid['sentiment_id'].tolist()
x_test = test['comment'].tolist()
y_test = test['sentiment_id'].tolist()

In [None]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
train_on_gpu = torch.cuda.is_available()
device = torch.device("cuda:0") if train_on_gpu else torch.device("cpu")
print(f'device: {device}')

Set the MODEL_NAME_OR_PATH parameter to the checkpoint of the model being trained; it has to be changed for each model.

In [None]:
# General configuration
MAX_LEN = 512  # token length every comment is padded / truncated to
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8

EPOCHS = 3
EEVERY_EPOCH = 1000  # handed to train_op as print_every_step (evaluate every N steps)
LEARNING_RATE = 2e-5
CLIP = 0.0  # max gradient norm; train_op only clips when this is > 0.0

MODEL_NAME_OR_PATH = 'hooshafzar/SINA-BERT'

# Alternatives:
#   for parsbert:   "HooshvareLab/bert-fa-base-uncased"
#   for mbert:      "bert-base-multilingual-cased"
#   for xlmroberta: "xlm-roberta-base"

OUTPUT_PATH = 'sentiment_sina_bert_model.bin'
# Create the checkpoint directory only when OUTPUT_PATH actually contains one:
# os.path.dirname() returns '' for a bare file name, and os.makedirs('') raises
# FileNotFoundError.
_output_dir = os.path.dirname(OUTPUT_PATH)
if _output_dir:
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# Two-way lookup between label names and their integer ids
# (an id is the label's position in `labels`).
label2id = dict(zip(labels, range(len(labels))))
id2label = {index: name for name, index in label2id.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

Only for XLMRoberta

In [None]:
#tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
#config = XLMRobertaConfig.from_pretrained(
#    MODEL_NAME_OR_PATH, **{
#        'label2id': label2id,
#        'id2label': id2label,
#    })

#print(config.to_json_string())

For ParsBERT, SinaBERT and mBERT

In [None]:
# Tokenizer and model configuration for the BERT-style checkpoints.
# NOTE(review): the checkpoint name is hard-coded here instead of being read from
# MODEL_NAME_OR_PATH; confirm this vocabulary matches the model being trained.
tokenizer = BertTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased")
config = BertConfig.from_pretrained(
    "HooshvareLab/bert-fa-base-uncased",
    label2id=label2id,
    id2label=id2label,
)

print(config.to_json_string())

In [None]:
class medicalDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one comment per item and optionally attaches its label.

    Args:
        tokenizer: object exposing ``encode_plus`` (called with the keyword
            arguments used in ``__getitem__``).
        comments: indexable collection of comments; each item is passed through ``str``.
        targets: optional list or ``np.ndarray`` of labels. Any other value
            (e.g. ``None``) produces items without a ``'targets'`` entry.
        label_list: optional list of label names; a name's position is its class id.
        max_len: length every encoded comment is padded / truncated to.
    """

    def __init__(self, tokenizer, comments, targets=None, label_list=None, max_len=128):
        self.comments = comments
        self.targets = targets
        self.has_target = isinstance(targets, (list, np.ndarray))

        self.tokenizer = tokenizer
        self.max_len = max_len

        self.label_map = {label: i for i, label in enumerate(label_list)} if isinstance(label_list, list) else {}

    def __len__(self):
        return len(self.comments)

    def _target_id(self, item):
        """Return the integer class id of the target stored at position ``item``."""
        raw = self.targets[item]
        mapped = self.label_map.get(str(raw))
        if mapped is not None:
            return mapped
        # Not a known label name: accept targets that already are class ids.
        # (Falling back to ``str(raw)`` cannot be converted to a long tensor.)
        try:
            return int(raw)
        except (TypeError, ValueError) as err:
            raise ValueError(
                f'unknown target {raw!r}: expected one of {list(self.label_map)} '
                'or an integer class id') from err

    def __getitem__(self, item):
        """Return a dict with the comment text, its encoded tensors and, if available, its target."""
        comment = str(self.comments[item])

        if self.has_target:
            target = self._target_id(item)

        encoding = self.tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt')

        # flatten() turns each returned tensor into a 1-D tensor for this single comment.
        inputs = {
            'comment': comment,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
        }

        if self.has_target:
            inputs['targets'] = torch.tensor(target, dtype=torch.long)

        return inputs


def create_data_loader(x, y, tokenizer, max_len, batch_size, label_list):
    """Wrap comments and optional targets in a ``medicalDataset`` and return a DataLoader over it.

    Args:
        x: comments to encode.
        y: targets for the comments, or ``None`` when there are none.
        tokenizer: tokenizer handed to the dataset.
        max_len: length every comment is padded / truncated to.
        batch_size: number of items per batch.
        label_list: label names handed to the dataset.

    Returns:
        A ``torch.utils.data.DataLoader``; only ``batch_size`` is set, every
        other loader option keeps its default.
    """
    return torch.utils.data.DataLoader(
        medicalDataset(tokenizer, x, y, label_list, max_len),
        batch_size=batch_size,
    )

In [None]:
# Label names in class-id order (a name's index is its id).
label_list = ['no-idea', 'satisfied', 'unsatisfied']

# Train / validation loaders carry the label names as targets;
# the test loader is built without targets.
train_data_loader = create_data_loader(
    x=train['comment'].to_numpy(),
    y=train['sentiment'].to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    batch_size=TRAIN_BATCH_SIZE,
    label_list=label_list)
valid_data_loader = create_data_loader(
    x=valid['comment'].to_numpy(),
    y=valid['sentiment'].to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    batch_size=VALID_BATCH_SIZE,
    label_list=label_list)
test_data_loader = create_data_loader(
    x=test['comment'].to_numpy(),
    y=None,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    batch_size=TEST_BATCH_SIZE,
    label_list=label_list)

In [None]:
import torch.nn.functional as F

class SentimentModel(nn.Module):
    """Pretrained encoder followed by parallel 1-D convolutions, max pooling and a linear classifier.

    Reads the module-level names ``MODEL_NAME_OR_PATH``, ``tokenizer`` and ``MAX_LEN``.

    Args:
        config: object providing ``hidden_size`` and ``num_labels``.
    """

    def __init__(self, config):
        super().__init__()
        # return_dict=False: the encoder returns a tuple, which forward() unpacks.
        encoder = BertModel.from_pretrained(MODEL_NAME_OR_PATH, return_dict=False)
        # for xlmroberta: XLMRobertaModel.from_pretrained()
        encoder.resize_token_embeddings(len(tokenizer))
        self.bert = encoder
        self.dropout = nn.Dropout(0.1)

        # One convolution per kernel size, each with its own number of output channels.
        self.num_filters = [100, 100, 100, 100, 100]
        self.filter_sizes = [2, 3, 4, 5, 6]
        convolutions = []
        for out_channels, kernel_size in zip(self.num_filters, self.filter_sizes):
            convolutions.append(
                nn.Conv1d(in_channels=config.hidden_size,
                          out_channels=out_channels,
                          kernel_size=kernel_size))
        self.conv1d_list = nn.ModuleList(convolutions)

        # fc1 acts on the last axis of the permuted encoder output (size MAX_LEN);
        # fc2 maps the concatenated pooled features to the label logits.
        self.fc1 = nn.Linear(MAX_LEN, config.hidden_size)
        self.fc2 = nn.Linear(np.sum(self.num_filters), config.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the unnormalised class scores (logits) for a batch."""
        sequence_output, _ = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)

        # Swap the last two axes before fc1 and the convolutions
        # (assumes sequence_output is (batch, seq_len, hidden) - TODO confirm).
        features = self.fc1(sequence_output.permute(0, 2, 1))

        pooled = []
        for conv1d in self.conv1d_list:
            activated = F.relu(conv1d(features))
            # Max over the whole last axis, then drop that now size-1 axis.
            pooled.append(
                F.max_pool1d(activated, kernel_size=activated.shape[2]).squeeze(dim=2))

        return self.fc2(self.dropout(torch.cat(pooled, dim=1)))
#'/content/gdrive/My Drive/Colab Notebooks/informal_model_final'        

In [None]:
# Drop any previous model reference before building a new one (presumably so a
# re-run of this cell can release the old model first - TODO confirm).
pt_model = None
pt_model = SentimentModel(config=config).to(device)

print('pt_model', type(pt_model))

In [None]:
def simple_accuracy(y_true, y_pred):
    """Return the fraction of positions at which ``y_true`` and ``y_pred`` agree.

    Both arguments may be any array-like (lists, tuples, NumPy arrays); they are
    converted with ``np.asarray`` so that the comparison is element-wise even for
    plain Python sequences.
    """
    return (np.asarray(y_true) == np.asarray(y_pred)).mean()

def acc_and_f1(y_true, y_pred, average='weighted'):
    """Return ``{"acc": ..., "f1": ...}`` for the given true and predicted labels.

    ``acc`` comes from ``simple_accuracy``; ``f1`` comes from ``f1_score`` using
    the requested ``average`` mode.
    """
    return {
        "acc": simple_accuracy(y_true, y_pred),
        "f1": f1_score(y_true=y_true, y_pred=y_pred, average=average),
    }

def y_loss(y_true, y_pred, losses):
    """Convert collected targets / predictions to NumPy arrays and average the losses.

    Args:
        y_true: sequence of tensors holding the targets.
        y_pred: sequence of tensors holding the predictions.
        losses: sequence of loss values.

    Returns:
        ``([y_true_array, y_pred_array], mean_loss)``.
    """
    arrays = [torch.stack(items).cpu().detach().numpy() for items in (y_true, y_pred)]
    return arrays, np.mean(losses)


def eval_op(model, data_loader, loss_fn):
    """Evaluate ``model`` on every batch of ``data_loader`` without tracking gradients.

    The model is switched to eval mode and left in that mode on return.

    Args:
        model: network called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword tensors.
        data_loader: yields batches (dicts) that contain those three keys plus ``'targets'``.
        loss_fn: callable ``(outputs, targets)`` returning a scalar loss tensor.

    Returns:
        ``([y_true, y_pred], mean_loss)`` as produced by ``y_loss``.
    """
    model.eval()

    batch_losses, predicted, expected = [], [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader), desc="Evaluation... "):
            # Move the batch to the configured device.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

            # The predicted class is the index of the largest output per row.
            preds = torch.max(outputs, dim=1).indices

            batch_losses.append(loss_fn(outputs, targets).item())
            predicted.extend(preds)
            expected.extend(targets)

    return y_loss(expected, predicted, batch_losses)


def train_op(model,
             data_loader,
             loss_fn,
             optimizer,
             scheduler,
             step=0,
             print_every_step=100,
             eval=False,
             eval_cb=None,
             eval_loss_min=np.inf,
             eval_data_loader=None,
             clip=0.0):
    """Train ``model`` for one pass over ``data_loader``, optionally evaluating along the way.

    Args:
        model: network called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword tensors.
        data_loader: yields batches (dicts) that contain those three keys plus ``'targets'``.
        loss_fn: callable ``(outputs, targets)`` returning a scalar loss tensor.
        optimizer: optimizer, stepped once per batch.
        scheduler: learning-rate scheduler, stepped once per batch.
        step: global step counter to continue from.
        print_every_step: when ``eval`` is true, evaluate every this many steps.
        eval: enable the periodic evaluation on ``eval_data_loader``.
        eval_cb: optional callable ``(model, step, train_score, train_loss,
            eval_score, eval_loss, eval_loss_min)`` returning the new ``eval_loss_min``.
        eval_loss_min: lowest evaluation loss seen so far.
        eval_data_loader: loader passed to ``eval_op`` for the periodic evaluation.
        clip: maximum gradient norm; gradients are clipped only when this is > 0.0.

    Returns:
        ``(train_y, train_loss, step, eval_loss_min)`` where ``train_y`` and
        ``train_loss`` come from ``y_loss`` over the whole pass.
    """
    model.train()

    losses = []
    y_pred = []
    y_true = []

    for dl in tqdm(data_loader, total=len(data_loader), desc="Training... "):
        step += 1

        # move tensors to the configured device
        input_ids = dl['input_ids'].to(device)
        attention_mask = dl['attention_mask'].to(device)
        token_type_ids = dl['token_type_ids'].to(device)
        targets = dl['targets'].to(device)

        # clear the gradients of all optimized variables
        optimizer.zero_grad()

        # compute predicted outputs by passing inputs to the model
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)

        # the predicted class is the index of the largest output per row
        _, preds = torch.max(outputs, dim=1)

        # calculate the batch loss and keep it for the running average
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        # compute gradient of the loss with respect to model parameters
        loss.backward()

        # optional gradient-norm clipping
        if clip > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)

        # perform optimization and scheduler steps
        optimizer.step()
        scheduler.step()

        y_pred.extend(preds)
        y_true.extend(targets)

        # The running train metrics are only consumed on evaluation steps, so they
        # are computed there rather than re-stacking every prediction on each step.
        if eval and step % print_every_step == 0:
            train_y, train_loss = y_loss(y_true, y_pred, losses)
            train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

            eval_y, eval_loss = eval_op(model, eval_data_loader, loss_fn)
            eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

            # eval_op leaves the model in eval mode; switch back so the rest of
            # the epoch keeps training with dropout enabled.
            model.train()

            if callable(eval_cb):
                eval_loss_min = eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min)

    train_y, train_loss = y_loss(y_true, y_pred, losses)

    return train_y, train_loss, step, eval_loss_min

In [None]:
# Optimizer over all model parameters.
# NOTE(review): this is the AdamW imported from `transformers` (it accepts
# `correct_bias`); confirm the installed transformers release still ships it.
optimizer = AdamW(pt_model.parameters(), lr=LEARNING_RATE, correct_bias=False)

# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss()

step = 0
# `np.inf` rather than the `np.Inf` alias, which NumPy 2.0 removed.
eval_loss_min = np.inf
history = collections.defaultdict(list)


def eval_callback(epoch, epochs, output_path):
    """Build the evaluation callback used by ``train_op`` for one epoch.

    The returned callback prints a progress line and, when the evaluation loss
    is not larger than the best loss so far, saves the model's ``state_dict``
    to ``output_path``. It returns the (possibly updated) best loss.
    """
    def eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min):
        progress = [
            'Epoch: {}/{}...'.format(epoch, epochs),
            'Step: {}...'.format(step),
            'Train Loss: {:.6f}...'.format(train_loss),
            'Train Acc: {:.3f}...'.format(train_score['acc']),
            'Valid Loss: {:.6f}...'.format(eval_loss),
            'Valid Acc: {:.3f}...'.format(eval_score['acc']),
        ]
        print(''.join(progress))

        # No improvement (this also covers a NaN loss): keep the previous best.
        if not (eval_loss <= eval_loss_min):
            return eval_loss_min

        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            eval_loss_min,
            eval_loss))
        torch.save(model.state_dict(), output_path)
        return eval_loss

    return eval_cb


# One training pass plus one full validation pass per epoch; the per-epoch
# metrics are collected in `history`.
for epoch in tqdm(range(1, EPOCHS + 1), desc="Epochs... "):
    epoch_cb = eval_callback(epoch, EPOCHS, OUTPUT_PATH)

    train_y, train_loss, step, eval_loss_min = train_op(
        pt_model,
        train_data_loader,
        loss_fn,
        optimizer,
        scheduler,
        step=step,
        print_every_step=EEVERY_EPOCH,
        eval=True,
        eval_cb=epoch_cb,
        eval_loss_min=eval_loss_min,
        eval_data_loader=valid_data_loader,
        clip=CLIP)
    train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

    eval_y, eval_loss = eval_op(pt_model, valid_data_loader, loss_fn)
    eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

    epoch_metrics = {
        'train_acc': train_score['acc'],
        'train_loss': train_loss,
        'val_acc': eval_score['acc'],
        'val_loss': eval_loss,
    }
    for metric_name, metric_value in epoch_metrics.items():
        history[metric_name].append(metric_value)


In [None]:
# Save the weights of the model as it stands after the last epoch.
# NOTE(review): the evaluation callback also writes to OUTPUT_PATH whenever the
# validation loss improves, so this save replaces that checkpoint with the
# final weights; confirm this is intended.
torch.save(pt_model.state_dict(), OUTPUT_PATH)  

In [None]:
def predict(model, comments, tokenizer, max_len=128, batch_size=32):
    """Predict a class index and class probabilities for every comment.

    Args:
        model: network called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword tensors; it is switched to eval mode.
        comments: comments to classify.
        tokenizer: tokenizer used to encode the comments.
        max_len: length every comment is padded / truncated to.
            NOTE(review): ``SentimentModel.fc1`` is built with ``MAX_LEN`` input
            features, so this presumably has to equal ``MAX_LEN`` for that
            model - confirm.
        batch_size: number of comments per batch.

    Returns:
        ``(predictions, prediction_probs)`` as NumPy arrays: the index of the
        largest model output per comment, and the softmax of the outputs.
    """
    loader = create_data_loader(comments, None, tokenizer, max_len, batch_size, None)

    class_ids, class_probs = [], []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(loader, position=0):
            # Move the batch to the configured device.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

            class_ids.extend(torch.max(outputs, dim=1).indices)
            class_probs.extend(F.softmax(outputs, dim=1))

    return (
        torch.stack(class_ids).cpu().detach().numpy(),
        torch.stack(class_probs).cpu().detach().numpy(),
    )

In [None]:
# Classify the held-out test comments. The sequence length is taken from MAX_LEN
# (instead of a repeated literal) because the model's fc1 layer is sized with it.
test_comments = test['comment'].to_numpy()
preds, probs = predict(pt_model, test_comments, tokenizer, max_len=MAX_LEN)

In [None]:
# True class ids (position of each label name in label_list) versus the predictions.
y_test = [label_list.index(label) for label in test['sentiment'].values]
y_pred = preds

weighted_f1 = f1_score(y_test, y_pred, average="weighted")
print(f'F1: {weighted_f1}')
print()
print(classification_report(y_test, y_pred, target_names=label_list, digits=4))

# Evaluation
The performance of the model has been evaluated through accuracy, recall, precision and f1 metrics. The results are as follows:

Test data set comments and their predicted labels by the trained model can be seen below:

In [None]:
# Print every test comment followed by the name of its predicted label.
for comment, predicted_id in zip(test['comment'], preds):
    print(comment, id2label[predicted_id])

In [None]:
# Print every test comment followed by the name of its predicted label.
for row, comment in enumerate(test['comment']):
    print(comment, id2label[preds[row]])