<a href="https://colab.research.google.com/github/Mulac/TDA-SentimentAnalysis/blob/master/transformer_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Transformer Model**

In [None]:
# Install the `transformers` package, which provides the BertTokenizer and
# BertModel classes imported in the next cell.
# NOTE(review): the version is unpinned — consider pinning it so the notebook
# stays reproducible.
!pip install transformers

In [2]:
import torch
from torchtext import data
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim
from time import perf_counter
import pandas as pd 


In [3]:
# WordPiece tokeniser for the 'bert-base-uncased' checkpoint; it is used below
# both to split tweets into tokens and to map those tokens to ids.
tokeniser = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




# Creating the dataset

In [4]:
def preprocess_tweet(tweet):
  """Map an already-tokenised tweet to the tokeniser's vocabulary ids.

  Args:
    tweet: The tokens of one tweet, as produced by ``tokeniser.tokenize``.

  Returns:
    Whatever ``tokeniser.convert_tokens_to_ids`` returns for those tokens.
  """
  token_ids = tokeniser.convert_tokens_to_ids(tweet)
  return token_ids

In [5]:
""" 
Defines the fields for the dataset.
The tweets will be tokenised and preprocessed here.
"""

# Tweets are converted to ids by the BERT tokeniser itself, so torchtext's own
# vocabulary is switched off (use_vocab = False) and every special token is
# supplied as an id rather than as a string.
#
# BERT has no end-of-sequence token: tokeniser.eos_token_id is None, which
# meant no terminator was appended to the tweets at all. BERT sequences are
# expected to look like "[CLS] tokens... [SEP]", so [SEP] is used as the
# closing token.
TEXT = data.Field(tokenize = tokeniser.tokenize,
                  sequential = True,
                  use_vocab = False,
                  batch_first = True,
                  preprocessing = preprocess_tweet,
                  init_token = tokeniser.cls_token_id,
                  eos_token = tokeniser.sep_token_id,
                  pad_token = tokeniser.pad_token_id,
                  unk_token = tokeniser.unk_token_id)

# Sentiment label of each tweet, numericalised as a long tensor.
LABEL = data.LabelField(dtype = torch.long)

# (column name, field) pairs, listed in the order they are matched to the
# columns of the CSV file.
fields = [('label', LABEL), ('tweet', TEXT)]

In [6]:
"""
Creates dataset and splits it into training data and validation data.
"""

# Download the labelled tweet CSV from the project's GitHub repository into the
# current working directory (the file keeps its remote name).
!curl --remote-name \
     -H 'Accept: application/vnd.github.v3.raw' \
     --location https://raw.githubusercontent.com/Mulac/TDA-SentimentAnalysis/master/data/0.75_min_confidence.csv

# Parse the CSV with the (label, tweet) fields defined above; the header row is skipped.
tweet_data = data.TabularDataset(path = '0.75_min_confidence.csv', format = "csv", fields = fields, skip_header = True)
# split() is called with its default arguments.
# NOTE(review): the recorded sizes (7323 / 3138) suggest a 70/30 split; no random
# state is passed, so the split presumably differs between runs — confirm.
train_data, valid_data = tweet_data.split()

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1207k  100 1207k    0     0  3136k      0 --:--:-- --:--:-- --:--:-- 3136k


In [7]:
# Sizes of the two splits. The second line previously repeated the label
# "Training data size:" although it reports the validation split.
print("Training data size:", len(train_data))
print("Validation data size:", len(valid_data), "\n")

# Show one example from each split, both as stored (token ids) and decoded back
# into WordPiece tokens, to sanity-check the preprocessing.
print("Row/Example object in the training data set:", vars(train_data[0]))
print("Converting the tweet text back into tokens gives:", tokeniser.convert_ids_to_tokens(vars(train_data[0])['tweet']), "\n")

print("Row/Example object in the validation data set:", vars(valid_data[0]))
print("Converting the tweet text back into tokens gives:", tokeniser.convert_ids_to_tokens(vars(valid_data[0])['tweet']))

Training data size: 7323
Training data size: 3138 

Row/Example object in the training data set: {'label': 'positive', 'tweet': [1030, 6892, 16558, 5657, 20666, 2549, 2013, 10108, 2000, 5887, 2050, 999, 1998, 2026, 12476, 3462, 16742, 2003, 2728, 999]}
Converting the tweet text back into tokens gives: ['@', 'jet', '##bl', '##ue', '232', '##4', 'from', 'orlando', 'to', 'dc', '##a', '!', 'and', 'my', 'awesome', 'flight', 'attendant', 'is', 'robert', '!'] 

Row/Example object in the validation data set: {'label': 'negative', 'tweet': [1030, 4943, 11215, 2746, 2039, 2006, 1016, 8093, 2015, 2006, 1010, 2145, 4033, 1005, 1056, 5287, 2000, 1037, 16360]}
Converting the tweet text back into tokens gives: ['@', 'southwest', '##air', 'coming', 'up', 'on', '2', '##hr', '##s', 'on', ',', 'still', 'haven', "'", 't', 'spoken', 'to', 'a', 'rep']


In [8]:
"""
Builds the vocab object for the tweet and label fields in the training data.
"""
# NOTE(review): TEXT is declared with use_vocab = False, so this vocab is
# presumably never consulted when numericalising tweets — confirm before removing.
TEXT.build_vocab(train_data)
# Builds the label -> index mapping (LABEL.vocab.stoi) from the training labels only.
LABEL.build_vocab(train_data)

In [9]:
# Show which integer index each sentiment label was assigned.
label_to_index = LABEL.vocab.stoi
print("Mapping of tokens to numbers for the labels:", label_to_index)

Mapping of tokens to numbers for the labels: defaultdict(<function _default_unk_index at 0x7fc2c523de18>, {'negative': 0, 'neutral': 1, 'positive': 2})


In [10]:
"""
Creates batch iterators over the training and validation datasets.
Examples are bucketed by tweet length, so each batch has similarly sized tweets.
"""
# One iterator per split, in the same order as the datasets tuple.
# Examples are keyed by their number of token ids so that tweets of similar
# length end up in the same batch.
train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=32,
    sort_key=lambda example: len(example.tweet),
)

In [11]:
# len() of an iterator is its number of batches per epoch.
train_batch_count = len(train_iter)
valid_batch_count = len(valid_iter)

print("Number of batches (Training set):", train_batch_count, "\n")
print("Number of batches (Validation set):", valid_batch_count)

Number of batches (Training set): 229 

Number of batches (Validation set): 99


# Building the model

To get the embeddings for the tweets, we will use BERT (pre-trained **Bidirectional Encoder Representations from Transformers** by Google). These embeddings will be passed to the GRU (Gated Recurrent Unit).

In [12]:
# Pre-trained BERT encoder, loaded from the same 'bert-base-uncased' checkpoint
# name as the tokeniser above.
bert = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [13]:
class Network(nn.Module):
    """Sentiment classifier: BERT embeddings -> bidirectional GRU -> linear layer.

    BERT is used purely as a feature extractor: its forward pass runs under
    ``torch.no_grad()``, so no gradients flow into it.

    Args:
        bert: Pre-trained BERT encoder. ``bert.config.hidden_size`` determines
            the GRU input size.
        hidden_dim: Hidden size of the GRU, per direction.
        output_dim: Number of classes; the model emits one logit per class.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability, applied between GRU layers (only when
            ``num_layers > 1``) and to the concatenated final hidden state.
        pad_token_id: Id of the padding token. When omitted it is read from
            ``bert.config.pad_token_id``; if that is unavailable too, no
            attention mask is built and BERT attends to every position.
    """

    def __init__(self, bert, hidden_dim, output_dim, num_layers, dropout, pad_token_id=None):

        super().__init__()

        self.bert = bert

        # Needed to build the attention mask that hides padding from BERT.
        if pad_token_id is None:
            pad_token_id = getattr(bert.config, 'pad_token_id', None)
        self.pad_token_id = pad_token_id

        # Size of the input features for the GRU = BERT's embedding dimension.
        embedding_dim = bert.config.hidden_size

        # Inter-layer dropout has no effect on a single-layer GRU and makes
        # PyTorch emit a warning, so it is only enabled for stacked layers.
        self.rnn = nn.GRU(input_size = embedding_dim,
                          hidden_size = hidden_dim,
                          num_layers = num_layers,
                          bidirectional = True,
                          batch_first = True,
                          dropout = dropout if num_layers > 1 else 0.0)

        # Bidirectional, so the forward and backward hidden states are
        # concatenated: the linear layer sees hidden_dim * 2 features.
        self.lin = nn.Linear(hidden_dim * 2, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, tweet):
        """Classify a batch of tweets.

        Args:
            tweet: Tensor of token ids; assumed to be (batch, seq_len) because
                the GRU is built with ``batch_first = True``.

        Returns:
            Tensor of unnormalised class scores (logits), one row per tweet
            and ``output_dim`` columns. These are not probabilities; apply
            softmax if probabilities are needed.
        """
        # Mark real tokens with 1 and padding with 0 so that BERT's
        # self-attention ignores the padded positions.
        attention_mask = None
        if self.pad_token_id is not None:
            attention_mask = (tweet != self.pad_token_id).long()

        with torch.no_grad():            # don't want to train BERT
            embedded = self.bert(tweet, attention_mask = attention_mask)[0]  # per-token embeddings

        # Only the final hidden states are needed, not the per-step outputs.
        _, hidden_state = self.rnn(embedded)

        # The last two entries are the top layer's forward and backward states.
        final_hidden = torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1)

        return self.lin(self.dropout(final_hidden))

# Hyper-parameters and creating the model


In [14]:
# Hyper-parameters of the GRU classifier that sits on top of BERT.
HIDDEN_DIM = 256      # GRU hidden size per direction
NUM_LAYERS = 2        # stacked GRU layers
DROPOUT_PROB = 0.25   # dropout probability passed to the network
OUTPUT_DIM = 3        # one output per sentiment label

model = Network(
    bert=bert,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT_PROB,
)

We don't want to train the BERT model, so we set `requires_grad = False` on all of BERT's parameters.

In [15]:
# Freeze every parameter whose name starts with 'bert' so that it is not
# updated during training.
bert_parameters = (
    param
    for param_name, param in model.named_parameters()
    if param_name.startswith('bert')
)
for param in bert_parameters:
  param.requires_grad = False

# Training the model

In [16]:
# Multiple classes, so cross-entropy loss is appropriate.
criterion = nn.CrossEntropyLoss()

# Adam with its default settings over the model's parameters.
optimiser = optim.Adam(params=model.parameters())

In [17]:
"""
Training loop
"""
def train():
  """Run one pass over ``train_iter``, updating the model after every batch.

  Uses the module-level ``model``, ``optimiser`` and ``criterion``. Prints the
  accuracy (in percent) accumulated over the whole pass; returns nothing.
  """
  model.train()
  n_correct, n_seen = 0, 0

  for batch in train_iter:
    labels = batch.label

    # Standard optimisation step: reset gradients, forward, backward, update.
    optimiser.zero_grad()
    logits = model(batch.tweet)
    batch_loss = criterion(logits, labels)
    batch_loss.backward()
    optimiser.step()

    # Index of the highest-scoring sentiment for each tweet in the batch.
    predicted = torch.argmax(logits, dim=1)

    n_seen += len(labels)
    n_correct += (predicted == labels).sum().item()

  print("Training accuracy:", (n_correct/n_seen)*100)


In [18]:
def evaluate():
  """Measure the model's accuracy on ``valid_iter`` without updating it.

  Puts the module-level ``model`` in evaluation mode, disables gradient
  tracking, and prints the accuracy (in percent) over all validation batches.
  Returns nothing.
  """
  model.eval()
  n_correct, n_seen = 0, 0

  with torch.no_grad():
    for batch in valid_iter:
      labels = batch.label
      logits = model(batch.tweet)

      # Index of the highest-scoring sentiment for each tweet in the batch.
      predicted = torch.argmax(logits, dim=1)

      n_seen += len(labels)
      n_correct += (predicted == labels).sum().item()

  print("Validation accuracy:", (n_correct/n_seen)*100)


In [19]:
EPOCHS = 5

# Each epoch: one timed training pass followed by a validation pass.
for epoch in range(EPOCHS):
  epoch_number = epoch + 1
  print("\nEpoch", epoch_number)

  # Only the training pass is timed.
  start = perf_counter()
  train()
  elapsed_minutes = (perf_counter() - start) / 60
  print("Epoch time:", elapsed_minutes, "minutes \n")

  evaluate()


Epoch 1
Training accuracy: 80.02184896900178
Epoch time: 17.66147328845 minutes 

Validation accuracy: 85.11790949649458

Epoch 2
Training accuracy: 85.83913696572442
Epoch time: 17.888755843583336 minutes 

Validation accuracy: 85.50031867431484

Epoch 3
Training accuracy: 86.87696299330875
Epoch time: 17.84626606416667 minutes 

Validation accuracy: 87.06182281708095

Epoch 4
Training accuracy: 87.83285538713642
Epoch time: 17.939545806500004 minutes 

Validation accuracy: 87.69917144678139

Epoch 5
Training accuracy: 88.07865628840639
Epoch time: 17.898101863183335 minutes 

Validation accuracy: 87.79477374123645
