In [1]:
# Install dependencies
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 14.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 66.5 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyY

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/PIL

/content/drive/MyDrive/PIL


In [4]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.optim import lr_scheduler

import logging
logging.basicConfig(level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore")

In [5]:
class SentimentClassifier(nn.Module):
  """
  This class defines the model architecture which is simply a fully-connected
  layer on top of a pre-trained BERT model. 
  """

  def __init__(self, BERT_MODEL):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, 2) # Binary Classifier

  def forward(self, ids, mask, token_type_ids):
    last_hidden_state, pooled_output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
    output = self.drop(pooled_output)
    return self.out(output)

In [6]:
class SentiHood:
  """
  This class tokenizes the input text using the pre-trained BERT tokenizer 
  (wordpiece) and returns the corresponding tensors.
  """
  
  def __init__(self, text, auxiliary_sentence, targets, tokenizer, max_len):
    self.text = text
    self.auxiliary_sentence = auxiliary_sentence
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.targets = targets

  def __len__(self):
    return len(self.targets)

  def __getitem__(self, item):
    text = str(self.text[item])
    auxiliary_sentence = str(self.auxiliary_sentence[item])
    targets = self.targets[item]

    text = text + ' ' + auxiliary_sentence

    inputs = self.tokenizer.encode_plus(
        text,
        add_special_tokens = True,
        max_length = self.max_len,
        pad_to_max_length = True
    )

    ids = inputs["input_ids"]
    mask = inputs["attention_mask"]
    token_type_ids = inputs["token_type_ids"]

    return {
        "ids": torch.tensor(ids, dtype=torch.long),
        "mask": torch.tensor(mask, dtype=torch.long),
        "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        "targets": torch.tensor(targets, dtype=torch.long)
    }

In [7]:
def loss_function(outputs, targets):
	"""
	This function defines the loss function which is used to train the model, i.e.
	CrossEntropy.
	"""

	# probability, predicted = torch.max(outputs, 1)
	# print(f"Predicted = {predicted.cpu().detach().numpy()}\nTargets = {targets}")

	return nn.CrossEntropyLoss(reduction='mean')(outputs, targets)


In [8]:
def train_loop_function(data_loader, model, optimizer, device):
  """
  This function defines the training loop over the entire training set.
  """

  model.train()

  running_loss = 0.0
  for bi, d in enumerate(data_loader):
    ids = d["ids"]
    mask = d["mask"]
    token_type_ids = d["token_type_ids"]
    targets = d["targets"]

    ids = ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)
    token_type_ids = token_type_ids.to(device, dtype=torch.long)
    targets = targets.to(device, dtype=torch.long)

    optimizer.zero_grad()

    outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    loss = loss_function(outputs, targets)

    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    if bi % 10 == 0 and bi!=0:
      temp = f'Batch index = {bi}\tLoss = {running_loss/10}'
      print(temp)

      f1 = open('/content/drive/MyDrive/PIL/BERT-ABSA/Bert-pair/NLI-B/Models/' + 'loss.txt', 'a+')
      temp = temp + '\n'
      f1.write(temp)
      f1.close()

      running_loss = 0.0

In [9]:
def eval_loop_function(data_loader, model, device):
  """
  This function defines the evaluation loop over the entire validation set.
  It also computes accuracy of the trained model, which is used to select the 
  best model.
  """

  model.eval()

  corrects = 0
  total = 0
  for bi, d in enumerate(data_loader):
    ids = d["ids"]
    mask = d["mask"]
    token_type_ids = d["token_type_ids"]
    targets = d["targets"]

    ids = ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)
    token_type_ids = token_type_ids.to(device, dtype=torch.long)
    targets = targets.to(device, dtype=torch.long)

    outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

    _, predicted = torch.max(outputs, 1)
    total = total + targets.size(0)
    corrects = corrects + (predicted==targets).sum().item()

    print(f"bi: {bi}\nPredicted: {predicted}\nTargets: {targets}")

  accuracy = corrects / total * 100
  f1 = open('/content/drive/MyDrive/PIL/BERT-ABSA/Bert-pair/NLI-B/Models/' + 'accuracy.txt', 'a+')
  temp = f"Corrects: {corrects}\tTotal: {total}\tAccuracy: {accuracy}\n"
  f1.write(temp)
  f1.close()

  return accuracy

In [11]:
def run():
  """
  This function defines hyperparameters, model and optimizer, loads required
  datasets and initiate the training and validation procedures.
  """

  TRAIN_MAX_LEN = 160
  VALID_MAX_LEN = 160
  TRAIN_BATCH_SIZE = 24
  VALID_BATCH_SIZE = 24
  EPOCHS = 4
  BERT_MODEL = 'bert-base-uncased'
  LEARNING_RATE = 2e-5

  locations = ['LOCATION1', 'LOCATION2']
  aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety','shopping', 'touristy', 'transit-location']

  training_set_path = '/content/drive/MyDrive/PIL/BERT-ABSA/Bert-pair/NLI-B/Datasets/training_set.csv'
  validation_set_path = '/content/drive/MyDrive/PIL/BERT-ABSA/Bert-pair/NLI-B/Datasets/validation_set.csv'

  df_train = pd.read_csv(training_set_path)
  df_valid = pd.read_csv(validation_set_path)
  df_train = df_train.reset_index(drop=True)
  df_valid = df_valid.reset_index(drop=True)

  tokenizer = transformers.BertTokenizer.from_pretrained(BERT_MODEL)

  train_dataset = SentiHood(
      text = df_train['text'].values,
      auxiliary_sentence = df_train['auxiliary_sentence'],
      targets = df_train['sentiment'].values,
      tokenizer = tokenizer,
      max_len = TRAIN_MAX_LEN
  )
  print(f"Training Set: {len(train_dataset)}")

  # Custom sampler to compensate class imbalance in the dataset
  # ============================================================================
  class_counts = []
  for i in range(2):
    class_counts.append(df_train[df_train['sentiment']==i].shape[0])
  print(f"Class Counts: {class_counts}")
  
  num_samples = sum(class_counts)
  labels = df_train['sentiment'].values

  class_weights = []
  for i in range(len(class_counts)):
    if class_counts[i] != 0:
      class_weights.append(num_samples/class_counts[i])
    else:
      class_weights.append(0)

  weights = [class_weights[labels[i]] for i in range(int(num_samples))]
  sampler = torch.utils.data.sampler.WeightedRandomSampler(torch.DoubleTensor(weights), int(num_samples))
  # ============================================================================

  train_data_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size = TRAIN_BATCH_SIZE,
      shuffle = False,
      sampler = sampler
  )

  valid_dataset = SentiHood(
      text = df_valid['text'].values,
      auxiliary_sentence = df_train['auxiliary_sentence'],
      targets = df_valid['sentiment'].values,
      tokenizer = tokenizer,
      max_len = VALID_MAX_LEN
  )
  print(f"Validation Set: {len(valid_dataset)}")

  valid_data_loader = torch.utils.data.DataLoader(
      valid_dataset,
      batch_size = VALID_BATCH_SIZE,
      shuffle = False
  )

  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  print(f"Device: {device}")

  model = SentimentClassifier(BERT_MODEL)
  model = model.to(device)

  num_train_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE * EPOCHS)
  optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

  scheduler = lr_scheduler.StepLR(
      optimizer,
      step_size = 1,
      gamma = 0.8
  )

  for epoch in range(EPOCHS):
    train_loop_function(data_loader=train_data_loader, model=model, optimizer=optimizer, device=device)
    accuracy = eval_loop_function(data_loader=valid_data_loader, model=model, device=device)

    print(f"\nEpoch = {epoch}\tAccuracy Score = {accuracy}")
    print(f"Learning Rate = {scheduler.get_lr()[0]}\n")

    scheduler.step()

    torch.save(model, '/content/drive/MyDrive/PIL/BERT-ABSA/Bert-pair/NLI-B/Models/' + str(epoch) + '.bin')

if __name__ == "__main__":
  run()

Training Set: 135072
Class Counts: [90048, 45024]
Validation Set: 33732
Device: cuda:0


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
       device='cuda:0')
Targets: tensor([0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1],
       device='cuda:0')
bi: 407
Predicted: tensor([0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1],
       device='cuda:0')
Targets: tensor([0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1],
       device='cuda:0')
bi: 408
Predicted: tensor([0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1],
       device='cuda:0')
Targets: tensor([0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1],
       device='cuda:0')
bi: 409
Predicted: tensor([0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1],
       device='cuda:0')
Targets: tensor([0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1],
       device='cuda:0')
bi: 410
Predicted: tensor([0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,