<a href="https://colab.research.google.com/github/Mulac/TDA-SentimentAnalysis/blob/master/transformer_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Transformer Model**

In [None]:
!pip install transformers

In [None]:
import torch
from torchtext import data
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim
from time import perf_counter

In [None]:
# Load the tokenizer that belongs to the 'bert-base-cased' checkpoint.
# It is used below both to split tweets into tokens and to look up the
# ids of BERT's special tokens when the torchtext fields are declared.
tokeniser = BertTokenizer.from_pretrained('bert-base-cased')

# Creating the dataset

In [None]:
def preprocess_tweet(tweet):
  """Turn an already-tokenised tweet into BERT vocabulary ids.

  Args:
    tweet: the tokens of one tweet, as produced by the field's tokenizer.

  Returns:
    Whatever `tokeniser.convert_tokens_to_ids` returns for those tokens
    (the corresponding vocabulary ids).
  """
  ## TO BE IMPLEMENTED (no additional tweet clean-up is performed yet)
  token_ids = tokeniser.convert_tokens_to_ids(tweet)
  return token_ids

In [None]:
""" 
Defines the fields for the dataset.
The tweets will be tokenised and preprocessed here.
"""

# BERT expects every sequence to be framed as [CLS] ... [SEP].
# BertTokenizer defines no EOS token (its eos_token_id is None), so passing
# eos_token_id here meant that no closing token was appended at all.
# The end-of-sequence marker for BERT is the separator token, sep_token_id.
TEXT = data.Field(tokenize = tokeniser.tokenize,
                  sequential = True,
                  use_vocab = False,                  # preprocess_tweet already yields vocabulary ids
                  batch_first = True,
                  preprocessing = preprocess_tweet,
                  init_token = tokeniser.cls_token_id,
                  eos_token = tokeniser.sep_token_id,
                  pad_token = tokeniser.pad_token_id,
                  unk_token = tokeniser.unk_token_id)

# Sentiment labels are stored as integer class indices (torch.long).
LABEL = data.LabelField(dtype = torch.long)

# Assumes the CSV has the label in its first column and the tweet text in
# its second -- TODO confirm against data/tweets.csv.
fields = [('label', LABEL), ('tweet', TEXT)]

In [None]:
"""
Creates dataset and splits it into training data and validation data.
"""
# Read the whole CSV (header row skipped) using the field definitions above.
tweet_data = data.TabularDataset(
    path = 'data/tweets.csv',
    format = "csv",
    fields = fields,
    skip_header = True,
)

# split() is called with its defaults; the sizes recorded below
# (10248 / 4392) correspond to a 70/30 train/validation split.
train_data, valid_data = tweet_data.split()

In [None]:
# Report the size of each split (the second line used to be mislabelled
# as "Training data size").
print("Training data size:", len(train_data))
print("Validation data size:", len(valid_data), "\n")

# Show the first example of each split, then decode its ids back into
# tokens to sanity-check the tokenisation round trip.
train_example = vars(train_data[0])
print("Row/Example object in the training data set:", train_example)
print("Converting the tweet text back into tokens gives:", tokeniser.convert_ids_to_tokens(train_example['tweet']), "\n")

valid_example = vars(valid_data[0])
print("Row/Example object in the validation data set:", valid_example)
print("Converting the tweet text back into tokens gives:", tokeniser.convert_ids_to_tokens(valid_example['tweet']))

Training data size: 10248
Training data size: 4392 

Row/Example object in the training data set: {'label': 'negative', 'tweet': [137, 1237, 1592, 3161, 1128, 1541, 1444, 1199, 8132, 1555, 2013, 1111, 1240, 13143, 142, 27485, 1107, 1103, 2106, 1107, 2290, 119, 8375, 148, 10973, 1120, 21268, 22572, 4419, 1123, 188, 1732, 1181]}
Converting the tweet text back into tokens gives: ['@', 'American', '##A', '##ir', 'you', 'really', 'need', 'some', 'customer', 'service', 'training', 'for', 'your', 'unhappy', 'E', '##Es', 'in', 'the', 'morning', 'in', 'Chicago', '.', 'Gate', 'K', '##20', 'at', '430', 'ch', '##king', 'her', 's', '##ch', '##d'] 

Row/Example object in the validation data set: {'label': 'neutral', 'tweet': [137, 10859, 1592, 3161, 1202, 1207, 7306, 1435, 1149, 1120, 9191, 136]}
Converting the tweet text back into tokens gives: ['@', 'Southwest', '##A', '##ir', 'do', 'new', 'flights', 'come', 'out', 'at', 'midnight', '?']


In [None]:
"""
Builds the vocab object for the tweet and label fields in the training data.
"""
# NOTE(review): TEXT is declared with use_vocab=False, so this vocab is
# presumably never consulted when tweets are numericalised -- confirm
# before removing the call.
TEXT.build_vocab(train_data)
# Builds the label vocabulary from the training split only; the resulting
# string -> index mapping is printed in the next cell.
LABEL.build_vocab(train_data)

In [None]:
# Inspect which integer class index each sentiment label was assigned.
print("Mapping of tokens to numbers for the labels:", LABEL.vocab.stoi)

Mapping of tokens to numbers for the labels: defaultdict(<function _default_unk_index at 0x7f5a271a9ae8>, {'negative': 0, 'neutral': 1, 'positive': 2})


In [None]:
"""
Creates batch iterators over the training and validation sets.
Examples are grouped by tweet length, so each batch has similarly sized tweets.
"""
# One iterator per split, in the same order as the dataset tuple.
# The sort key is the number of token ids in each tweet.
train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=32,
    sort_key=lambda example: len(example.tweet),
)

In [None]:
# len() of an iterator here is its number of batches, not its number of examples.
print("Number of batches (Training set):", len(train_iter), "\n")
print("Number of batches (Validation set):",len(valid_iter))

# Building the model

To get the embeddings for the tweets, we will use BERT (pre-trained **Bidirectional Encoder Representations from Transformers** by Google). These embeddings will be passed to the GRU (Gated Recurrent Unit).

In [None]:
# Load the pre-trained 'bert-base-cased' encoder (the same checkpoint name
# the tokeniser was loaded from); it is handed to Network below to supply
# the token embeddings.
bert = BertModel.from_pretrained('bert-base-cased')

In [None]:
class Network(nn.Module):
    """Sentiment classifier: BERT embeddings -> bidirectional GRU -> linear layer.

    BERT is only used as a feature extractor: it is always called under
    ``torch.no_grad()``, so no gradients flow into it.

    Args:
        bert: pre-trained BERT encoder. Must expose ``config.hidden_size`` and
            return the per-token embeddings as the first element of its output.
        hidden_dim: hidden size of each GRU direction.
        output_dim: number of classes (one logit per class is produced).
        num_layers: number of stacked GRU layers.
        dropout: dropout probability, applied between GRU layers (only when
            ``num_layers > 1``) and to the final hidden state before the
            linear layer.
    """

    def __init__(self, bert, hidden_dim, output_dim, num_layers, dropout):

        super().__init__()

        self.bert = bert

        # Size of the GRU's input features = BERT's embedding dimension.
        embedding_dim = bert.config.hidden_size

        # nn.GRU only applies dropout *between* stacked layers; passing a
        # non-zero value with a single layer has no effect and triggers a
        # PyTorch warning, so disable it in that case.
        rnn_dropout = dropout if num_layers > 1 else 0

        self.rnn = nn.GRU(input_size = embedding_dim,
                          hidden_size = hidden_dim,
                          num_layers = num_layers,
                          bidirectional = True,
                          batch_first = True,
                          dropout = rnn_dropout)

        # Bidirectional, so the forward and backward hidden states are
        # concatenated: the linear layer sees hidden_dim * 2 features.
        self.lin = nn.Linear(hidden_dim * 2, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, tweet):
        """Compute class logits for a batch of token-id sequences.

        Args:
            tweet: integer tensor of token ids, batch first.

        Returns:
            Tensor with one row per tweet and ``output_dim`` unnormalised
            scores (logits) per row.
        """
        # Tell BERT which positions are padding so that real tokens do not
        # attend to pad tokens. Assumes the batch was padded with the same id
        # as bert.config.pad_token_id (the tokenizer's pad id) -- TODO confirm
        # if a different checkpoint is used. Without a known pad id, BERT is
        # called exactly as before (no mask).
        pad_id = getattr(self.bert.config, 'pad_token_id', None)
        attention_mask = None if pad_id is None else (tweet != pad_id).long()

        with torch.no_grad():            # don't want to train BERT
            embedded = self.bert(tweet, attention_mask=attention_mask)[0]  # token embeddings from BERT

        # Only the final hidden states are needed; the per-step outputs are discarded.
        _, hidden_state = self.rnn(embedded)

        # The last two entries are the top layer's forward and backward final
        # hidden states; concatenate them along the feature dimension.
        hidden_state = self.dropout(torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1))

        # Final linear layer: one logit per class for each tweet. These are
        # raw scores, not probabilities (nn.CrossEntropyLoss expects logits).
        output = self.lin(hidden_state)

        return output

# Hyper-parameters and creating the model


In [None]:
# GRU / classifier-head settings.
HIDDEN_DIM = 256       # hidden size of each GRU direction
NUM_LAYERS = 2         # stacked GRU layers
DROPOUT_PROB = 0.25    # dropout probability used inside Network
OUTPUT_DIM= 3          # one output per sentiment class (negative / neutral / positive)

# Wrap the pre-trained BERT encoder with the GRU + linear head.
model = Network(
    bert,
    hidden_dim = HIDDEN_DIM,
    output_dim = OUTPUT_DIM,
    num_layers = NUM_LAYERS,
    dropout = DROPOUT_PROB,
)

We don't want to train the BERT model, so we set *requires_grad = False* on BERT's parameters.

In [None]:
# Freeze every parameter whose qualified name starts with 'bert'
# (i.e. those registered under model.bert); all others stay trainable.
for param_name, param in model.named_parameters():
  if not param_name.startswith('bert'):
    continue
  param.requires_grad = False

# Training the model

In [None]:
# Multi-class problem, so cross-entropy on the raw model outputs is the loss.
criterion = nn.CrossEntropyLoss()

# Adam with its default hyper-parameters over all of the model's parameters.
optimiser = optim.Adam(model.parameters())

In [None]:
"""
Training loop
"""
def train():
  """Run one training epoch over `train_iter` and report the accuracy.

  Uses the module-level `model`, `optimiser`, `criterion` and `train_iter`.
  Accuracy is accumulated from the predictions made during the epoch, i.e.
  while the weights are still being updated.

  Returns:
    The training accuracy for the epoch as a percentage, or 0.0 when the
    iterator yielded no examples (previously a ZeroDivisionError).
  """

  model.train()
  # NOTE(review): model.train() switches every sub-module into training mode,
  # including the frozen BERT encoder, so BERT's own dropout presumably stays
  # active here. Call model.bert.eval() after this line if deterministic
  # embeddings are wanted -- confirm the intended behaviour.
  correct = 0
  total = 0

  for batch in train_iter:

    optimiser.zero_grad()   # clear gradients left over from the previous batch

    output = model(batch.tweet)

    loss = criterion(output, batch.label)
    loss.backward()

    optimiser.step()

    # Index of the highest-scoring sentiment class for each tweet.
    predictions = torch.argmax(output, dim=1)

    total += len(batch.label)
    correct += (predictions == batch.label).sum().item()

  accuracy = (correct/total)*100 if total else 0.0
  print("Training accuracy:", accuracy)
  return accuracy


In [None]:
def evaluate():
  """Measure the model's accuracy on `valid_iter` without updating weights.

  Uses the module-level `model` and `valid_iter`. The model is put in
  evaluation mode and gradients are disabled for the whole pass.

  Returns:
    The validation accuracy as a percentage, or 0.0 when the iterator
    yielded no examples (previously a ZeroDivisionError).
  """

  model.eval()
  correct = 0
  total = 0

  with torch.no_grad():
    for batch in valid_iter:

      output = model(batch.tweet)

      # Index of the highest-scoring sentiment class for each tweet.
      predictions = torch.argmax(output, dim=1)

      total += len(batch.label)
      correct += (predictions == batch.label).sum().item()

  accuracy = (correct/total)*100 if total else 0.0
  print("Validation accuracy:", accuracy)
  return accuracy


In [None]:
EPOCHS = 5

# One pass per epoch: train, report how long the training pass took, then
# evaluate on the validation set (evaluation time is not included).
for epoch_number in range(1, EPOCHS + 1):

  print("\nEpoch", epoch_number)
  started_at = perf_counter()
  train()
  elapsed_seconds = perf_counter() - started_at
  print("Epoch time:", elapsed_seconds / 60, "minutes \n")
  evaluate()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        1, 2, 0, 0, 0, 0, 0, 0])
labe tensor([0, 2, 1, 0, 0, 0, 1, 0, 0, 0, 2, 2, 0, 2, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
TRAIN correct:  2237 total:  2752 

pred tensor([0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1])
labe tensor([1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0,
        1, 0, 0, 0, 1, 0, 0, 1])
TRAIN correct:  2267 total:  2784 

pred tensor([0, 1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 2, 0, 2, 2, 0, 0, 1, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 1, 0, 1, 0])
labe tensor([0, 1, 1, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 2, 2, 0, 0, 1, 0, 0, 2, 0, 2, 0,
        1, 0, 0, 1, 1, 0, 1, 0])
TRAIN correct:  2292 total:  2816 

pred tensor([0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 2, 1, 0, 0, 0, 0])
labe tensor([0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 2, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0,
 