# Aspect based sentiment analysis
## BERT model

In [2]:
from transformers import BertTokenizer, BertModel
from transformers.trainer_utils import set_seed
import torch
from torch import nn
import pandas as pd
import random
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

set_seed(1234)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
class CustomBertTokenClassifier(nn.Module):
    def __init__(self, bert, num_labels):
        super().__init__()
        self.bert = bert
        self.embedding_dim = bert.config.to_dict()['hidden_size']
        self.dropout = nn.Dropout(0.5)
        self.num_labels = num_labels

        """uncomment below for Bert-linear"""
        self.fc = nn.Linear(self.embedding_dim, num_labels)

        '''OR uncomment below for Bert-Gru'''
        # self.gru = nn.GRU(input_size=self.embedding_dim, hidden_size=256, batch_first=True)
        # self.fc = nn.Linear(256, num_labels)


    def forward(self, input_ids, attention_mask, labels=None):
        # text = [batch size, sent len]
        bert_outs = self.bert(input_ids, attention_mask)  # outputs = {last_hidden_state, hidden_states}
        embedding = bert_outs.last_hidden_state  # embedding = [batch size, sent len, emb dim=768]
        #all_hidden = bert_outs.hidden_states  # all_hidden = tuple of tensors size=13, each tensor = [batch size, sent_len, hidden_dim=768]
        #print('embedding:', embedding.shape)
        #print('all_hidden:', f"Tuple length:{len(all_hidden)}", [t.shape for t in all_hidden])
        dropped = self.dropout(embedding)

        """uncomment below for Bert-linear"""
        logits = self.fc(dropped) # output = [batch size, sent len, output dim]
        
        '''OR uncomment below for Bert-Gru'''
        # output,h_n = self.gru(dropped)
        # logits = self.fc(output)

        loss = None
        if labels is not None:
            # print(labels.size())
            loss_fct = nn.CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return logits, loss

# Sample code for training the pytorch model
def train_model(epochs, model, optimizer, train_dataloader):
    for ep in range(epochs):
        model.train()
        total_loss = 0
        for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            optimizer.zero_grad()
            logits, loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            #label_ids = b_labels.to('cpu').numpy()
            total_loss += loss.item()
            # Backward pass
            loss.backward()
            optimizer.step()
        print(f'Loss = {total_loss / len(train_dataloader)}')
        
def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()
    total_loss = 0
    for step, batch in tqdm(enumerate(val_dataloader), total=len(val_dataloader)):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            _, loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        #label_ids = b_labels.to('cpu').numpy()
        total_loss += loss.item()
    print(f'Loss = {total_loss / len(val_dataloader)}')
    return

In [5]:
df_train = pd.read_csv('/content/preproc_bert_16_restaurant_train.csv')
df_train['token_ids'] = df_train['token_ids'].apply(eval)
df_train['labels'] = df_train['labels'].apply(eval)
l_train=df_train['labels'].tolist()
max_len_train=max(len(i) for i in l_train)
print(max_len_train)
df_train.head(5)

92


Unnamed: 0,sid,token_ids,labels
0,1004293:0,"[101, 13325, 2013, 3025, 8466, 2023, 2109, 200...","[O, O, O, O, O, O, O, O, O, O, O, B-neg, O, O,..."
1,1004293:1,"[101, 2057, 1010, 2045, 2020, 2176, 1997, 2149...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,1004293:2,"[101, 2027, 2196, 2716, 2149, 19394, 5649, 271...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,1004293:3,"[101, 1996, 2833, 2001, 10223, 6508, 1011, 220...","[O, O, B-neg, O, O, O, O, O, O, O, O, O, O, O,..."
4,1004293:4,"[101, 2044, 2035, 2008, 1010, 2027, 10865, 200...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


In [6]:
df_test = pd.read_csv('/content/preproc_bert_16_restaurant_test.csv')
df_test['token_ids'] = df_test['token_ids'].apply(eval)
df_test['labels'] = df_test['labels'].apply(eval)
l_test=df_test['labels'].tolist()
max_len_test=max(len(i) for i in l_test)
print(max_len_test)
df_test.head(5)

93


Unnamed: 0,sid,token_ids,labels
0,en_BlueRibbonSushi_478218171:0,"[101, 9805, 2213, 999, 102]","[O, O, O, O, O]"
1,en_BlueRibbonSushi_478218171:1,"[101, 4240, 2428, 2204, 10514, 6182, 1012, 102]","[O, O, O, O, B-pos, I-pos, O, O]"
2,en_BlueRibbonSushi_478218171:2,"[101, 2025, 1996, 5221, 8810, 2021, 11706, 101...","[O, O, O, O, B-neu, O, O, O, O]"
3,en_BlueRibbonSushi_478218171:3,"[101, 2665, 5572, 13675, 21382, 7987, 9307, 20...","[O, B-pos, I-pos, I-pos, I-pos, I-pos, I-pos, ..."
4,en_BlueRibbonSushi_478218171:4,"[101, 2123, 1005, 1056, 2681, 1996, 4825, 2302...","[O, O, O, O, O, O, O, O, O, O, O]"


In [7]:
# MAX_LEN=max(max_len_test,max_len_train)+2
# print(MAX_LEN)

In [8]:
# Simple output mapping:

'''Uncomment below for 16 restaurant data '''
label_mapping = {'O': 0, 'B-pos': 1, 'B-neg': 2, 'B-neu': 3, 'I-pos': 4, 'I-neg': 5, 'I-neu': 6}

'''OR Uncomment below for 14 restaurant/laptop data '''
# label_mapping = {'O': 0, 'B-pos': 1, 'B-neg': 2, 'B-neu': 3, 'B-con': 4, 'I-pos': 5, 'I-neg': 6, 'I-neu': 7, 'I-con': 8}


def generate_input_masks_labels(df, MAX_LEN):
    input_ids  = list(df['token_ids'])
    attention_masks = [torch.ones((len(x)), dtype=torch.long) for x in input_ids]
    labels = [[label_mapping[x] for x in seq] for seq in df['labels']]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post", value=0)
    attention_masks = pad_sequences(attention_masks, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post", value=0)
    labels = pad_sequences(labels, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post", value=0)
    input_ids = torch.tensor(input_ids).long()
    attention_masks = torch.tensor(attention_masks).long()
    labels = torch.tensor(labels).long()
    return input_ids, attention_masks, labels

In [9]:
# df_train, df_test = train_test_split(df,test_size=0.1)

In [10]:
input_ids, attention_masks, labels = generate_input_masks_labels(df_train, max_len_train)
print(input_ids.shape)
print(attention_masks.shape)
print(labels.shape)

torch.Size([2000, 92])
torch.Size([2000, 92])
torch.Size([2000, 92])


In [11]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
BATCH_SIZE=10

train_data = TensorDataset(input_ids, attention_masks, labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

In [12]:
input_ids_test, attention_masks_test, labels_test = generate_input_masks_labels(df_test, max_len_test)
test_data = TensorDataset(input_ids_test, attention_masks_test, labels_test)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

In [13]:
from transformers import AdamW
pred_log = []
epochs = 3
num_labels = len(label_mapping)
#num_labels = 3
bert = BertModel.from_pretrained("bert-base-uncased")
model = CustomBertTokenClassifier(bert, num_labels)
model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.05},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5)
for ep in range(epochs):
    print(f"=== Training phase {ep+1} ====")
    train_model(1, model, optimizer, train_dataloader)
    print(f"=== Eval phase {ep+1} ====")
    evaluate(model,test_dataloader)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/200 [00:00<?, ?it/s]

=== Training phase 1 ====


100%|██████████| 200/200 [00:23<00:00,  8.48it/s]
  4%|▍         | 3/68 [00:00<00:02, 28.82it/s]

Loss = 0.2645166733115911
=== Eval phase 1 ====


100%|██████████| 68/68 [00:02<00:00, 28.67it/s]
  0%|          | 1/200 [00:00<00:22,  8.77it/s]

Loss = 0.18678577770205104
=== Training phase 2 ====


100%|██████████| 200/200 [00:23<00:00,  8.55it/s]
  4%|▍         | 3/68 [00:00<00:02, 28.88it/s]

Loss = 0.11637852341867984
=== Eval phase 2 ====


100%|██████████| 68/68 [00:02<00:00, 28.68it/s]
  0%|          | 1/200 [00:00<00:22,  8.82it/s]

Loss = 0.15269656839561374
=== Training phase 3 ====


100%|██████████| 200/200 [00:23<00:00,  8.54it/s]
  4%|▍         | 3/68 [00:00<00:02, 28.54it/s]

Loss = 0.059641815554350615
=== Eval phase 3 ====


100%|██████████| 68/68 [00:02<00:00, 28.71it/s]

Loss = 0.17850478386561222





In [14]:
def evaluate_prob(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()
    total_loss = 0
    all_pred = []
    for step, batch in tqdm(enumerate(val_dataloader), total=len(val_dataloader)):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            logits, loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        #all_logits.append(logits)
        #label_ids = b_labels.to('cpu').numpy()
        logits = logits.cpu().numpy()
        all_pred.append(np.argmax(logits,axis=2))
        # all_logits.append((np.argmax(logits,axis=2),b_labels.cpu().numpy()))
        total_loss += loss.item()
    print(f'Loss = {total_loss / len(val_dataloader)}')
    return np.concatenate(all_pred, axis=0)

In [15]:
pred = evaluate_prob(model,test_dataloader)

100%|██████████| 68/68 [00:02<00:00, 28.67it/s]

Loss = 0.17850478386561222





In [16]:
# print(pred[:5])

In [17]:
from sklearn.metrics import confusion_matrix
#all_pred = np.concatenate(pred, axis=0)
#print(pred.shape)
test_labels_np = labels_test.cpu().numpy().flatten()
pred = pred.flatten()

att_mask = attention_masks_test.cpu().numpy().flatten() == 1
test_labels_final = test_labels_np[att_mask]
pred_final = pred[att_mask]

#print(test_labels_np.shape)
pd.DataFrame(confusion_matrix(test_labels_final, pred_final))

Unnamed: 0,0,1,2,3,4,5,6
0,10959,75,42,0,81,27,0
1,52,386,15,0,16,0,0
2,24,13,76,0,1,1,0
3,7,13,10,0,0,0,0
4,111,18,0,0,394,4,0
5,21,0,0,0,11,13,0
6,16,0,0,0,9,1,0


In [18]:
from sklearn.metrics import classification_report

'''Uncomment below for 16 restaurant data '''
print(classification_report(test_labels_final,pred_final,labels=[1,2,3,4,5,6], target_names=["B-pos",'B-neg','B-neu','I-pos','I-neg','I-neu']))

'''OR Uncomment below for 14 restaurant data '''
# print(classification_report(test_labels_final,pred_final,labels=[1,2,3,4,5,6,7,8], target_names=["B-pos",'B-neg','B-neu','B-con','I-pos','I-neg','I-neu','I-con']))

              precision    recall  f1-score   support

       B-pos       0.76      0.82      0.79       469
       B-neg       0.53      0.66      0.59       115
       B-neu       0.00      0.00      0.00        30
       I-pos       0.77      0.75      0.76       527
       I-neg       0.28      0.29      0.29        45
       I-neu       0.00      0.00      0.00        26

   micro avg       0.72      0.72      0.72      1212
   macro avg       0.39      0.42      0.40      1212
weighted avg       0.69      0.72      0.70      1212



  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# from sklearn.metrics import precision_score, recall_score, f1_score 
# micro_precision = precision_score(test_labels_final,pred_final, labels=[1,2,3,4,5,6], average='micro')
# micro_recall = recall_score(test_labels_final,pred_final, labels=[1,2,3,4,5,6], average='micro')
# micro_f1 = f1_score(test_labels_final,pred_final, labels=[1,2,3,4,5,6], average='micro')
# print(f'micro_precision: {micro_precision:.3f}')
# print(f'micro_recall: {micro_recall:.3f}')
# print(f'micro_f1: {micro_f1:.3f}')

micro_precision: 0.721
micro_recall: 0.717
micro_f1: 0.719


In [20]:
def conv2BIO(x):
    if x == 0:
        return 0
    if x > 4:
        return 2
    return 1

vfunc = np.vectorize(conv2BIO)

bio_labels = vfunc(test_labels_final)
bio_pred = vfunc(pred_final)

pd.DataFrame(confusion_matrix(bio_labels, bio_pred))

Unnamed: 0,0,1,2
0,10959,198,27
1,194,942,5
2,37,20,14


In [21]:
print(classification_report(bio_labels,bio_pred, labels=[1,2], target_names=["Pos",'Neg']))

              precision    recall  f1-score   support

         Pos       0.81      0.83      0.82      1141
         Neg       0.30      0.20      0.24        71

   micro avg       0.79      0.79      0.79      1212
   macro avg       0.56      0.51      0.53      1212
weighted avg       0.78      0.79      0.78      1212



In [22]:
# micro_precision = precision_score(bio_labels,bio_pred, labels=[1,2], average='micro')
# micro_recall = recall_score(bio_labels,bio_pred, labels=[1,2], average='micro')
# micro_f1 = f1_score(bio_labels,bio_pred, labels=[1,2], average='micro')
# print('micro_precision: ',micro_precision)
# print('micro_recall: ', micro_recall)
# print('micro_f1: ',micro_f1)

micro_precision:  0.7927031509121062
micro_recall:  0.7887788778877888
micro_f1:  0.7907361455748553


In [None]:
#  torch.cuda.empty_cache()