<a href="https://colab.research.google.com/github/myomyint-maung/nlp-assignments/blob/main/05-Sentiment-Analysis/SST2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import torch, torchtext, torchdata
from torch import nn
import time

# Choose computing device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Set SEED for reproducibility
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cuda


In [2]:
# Load SST2
from torchtext.datasets import SST2
train = SST2(split='train')
val = SST2(split='dev')

In [3]:
# Check a train sample
for text, sentiment in train:
    print(text, sentiment)
    break

hide new secretions from the parental units 0


In [4]:
# Check a val sample
for text, sentiment in val:
    print(text, sentiment)
    break

it 's a charming and often affecting journey . 1


In [5]:
# Extract the training and validation data
datasets    = [train, val]
train_data  = []
val_data    = []
data        = [train_data, val_data]

for i in range(len(data)):
    dataset = datasets[i]
    for text, sentiment in dataset:
        data[i].append((text, sentiment))

len(train_data), len(val_data)

(67349, 872)

In [6]:
# Convert the datasets into dataframes
import pandas as pd

train_df  = pd.DataFrame(train_data, columns=["Text", "Sentiment"])
val_df    = pd.DataFrame(val_data, columns=["Text", "Sentiment"])

In [7]:
# Check the dataframes
print(train_df.head())
print(val_df.head())

                                                Text  Sentiment
0        hide new secretions from the parental units          0
1                contains no wit , only labored gags          0
2  that loves its characters and communicates som...          1
3  remains utterly satisfied to remain the same t...          0
4  on the worst revenge-of-the-nerds clichés the ...          0
                                                Text  Sentiment
0     it 's a charming and often affecting journey .          1
1                  unflinchingly bleak and desperate          0
2  allows us to hope that nolan is poised to emba...          1
3  the acting , costumes , music , cinematography...          1
4                   it 's slow -- very , very slow .          0


In [8]:
# Check the class labels in the dataframes
print(train_df.Sentiment.value_counts())
print(val_df.Sentiment.value_counts())

1    37569
0    29780
Name: Sentiment, dtype: int64
1    444
0    428
Name: Sentiment, dtype: int64


**Note: 0 means negative and 1 means positive.**

In [9]:
# Create a tokenizer
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [10]:
# Create vocab out of the training set
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data_iter):
    for text, _ in data_iter:
        yield tokenizer(text)
        
vocab = build_vocab_from_iterator(yield_tokens(train),
                                  specials=['<unk>','<pad>','<bos>','<eos>'])

vocab(['<unk>', '<pad>', '<bos>', '<eos>'])

[0, 1, 2, 3]

In [11]:
# Set <unk> as the default index of the vocab
vocab.set_default_index(vocab['<unk>'])

vocab['hahaha']

0

In [12]:
# Create idex2word dictionary
idx2word = vocab.get_itos()

idx2word[0:4]

['<unk>', '<pad>', '<bos>', '<eos>']

In [13]:
# Check the vocab size
len(vocab)

13891

In [14]:
# Load FastText embeddings
from torchtext.vocab import FastText

fast_vectors = FastText(language='simple')

In [15]:
# Select FastText embeddings for the vocab
fast_embedding = fast_vectors.get_vecs_by_tokens(vocab.get_itos()).to(device)

fast_embedding.shape

torch.Size([13891, 300])

In [16]:
# Create a function to collate batches
from torch.utils.data   import DataLoader
from torch.nn.utils.rnn import pad_sequence

text_pipeline  = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

pad_idx = vocab['<pad>']

def collate_batch(batch):
    label_list, text_list, length_list = [], [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        length_list.append(processed_text.size(0))
        
    return torch.tensor(label_list, dtype=torch.int64), \
        pad_sequence(text_list, padding_value=pad_idx, batch_first=True), \
        torch.tensor(length_list, dtype=torch.int64)

In [17]:
# Prepare data loaders
batch_size = 64

train_loader = DataLoader(train_data, batch_size = batch_size,
                          shuffle=True, collate_fn=collate_batch)

val_loader   = DataLoader(val_data, batch_size = batch_size,
                          shuffle=True, collate_fn=collate_batch)

In [18]:
# Check a sample batch from the train_loader
for label, text, length in train_loader:
  break

label, text, length

(tensor([0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
         0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
         1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1]),
 tensor([[  420,    51,     8,  ...,     1,     1,     1],
         [ 3043,     1,     1,  ...,     1,     1,     1],
         [   19, 12666,   672,  ...,     1,     1,     1],
         ...,
         [ 9460,     1,     1,  ...,     1,     1,     1],
         [   32,    36,  4124,  ...,     1,     1,     1],
         [   13,     6,   121,  ...,     1,     1,     1]]),
 tensor([ 8,  1, 13,  1,  1, 16,  4, 13,  5, 14, 30, 37,  2, 37, 29, 17, 15,  4,
          8, 15, 24,  2, 12,  6, 36, 19, 25,  5, 29, 10,  4,  2, 21,  7,  5,  9,
         19, 20,  9,  8,  3,  1, 18,  3,  4,  3,  9, 21,  9,  5,  8,  6, 15, 10,
         13,  9, 17,  9,  1,  3,  5,  1, 10, 17]))

In [19]:
label.shape, text.shape, length.shape

(torch.Size([64]), torch.Size([64, 37]), torch.Size([64]))

In [20]:
# Define the lengths of the data loaders
train_loader_length = len(list(iter(train_loader)))
val_loader_length   = len(list(iter(val_loader)))

train_loader_length, val_loader_length

(1053, 14)

In [21]:
# Create the LSTM model
class LSTM(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, output_dim, num_layers, bidirectional, dropout):
        super().__init__()
        #put padding_idx so asking the embedding layer to ignore padding
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(emb_dim, 
                           hid_dim, 
                           num_layers=num_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        self.fc = nn.Linear(hid_dim * 2, output_dim)
        
    def forward(self, text, text_lengths):
        #text = [batch size, seq len]
        embedded = self.embedding(text)
        
        #++ pack sequence ++
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'), enforce_sorted=False, batch_first=True)
        
        #embedded = [batch size, seq len, embed dim]
        packed_output, (hn, cn) = self.lstm(packed_embedded)  #if no h0, all zeroes
        
        #++ unpack in case we need to use it ++
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        
        #output = [batch size, seq len, hidden dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        hn = torch.cat((hn[-2,:,:], hn[-1,:,:]), dim = 1)
        #hn = [batch size, hidden dim * num directions]
        
        return self.fc(hn)

In [22]:
#explicitly initialize weights for better learning
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.zeros_(param)
            elif 'weight' in name:
                nn.init.orthogonal_(param)

In [23]:
# Set the parameters for the LSTM model
input_dim  = len(vocab)
hid_dim    = 256
emb_dim    = 300
output_dim = 2

#for biLSTM
num_layers = 2
bidirectional = True
dropout = 0.5

model = LSTM(input_dim, emb_dim, hid_dim, output_dim, num_layers, bidirectional, dropout).to(device)
model.apply(initialize_weights)
model.embedding.weight.data = fast_embedding #**<------applied the fast text embedding as the initial weights

In [24]:
# Print the model's parameters
def count_parameters(model):
    params = [p.numel() for p in model.parameters() if p.requires_grad]
    for item in params:
        print(f'{item:>6}')
    print(f'______\n{sum(params):>6}')
    
count_parameters(model)

4167300
307200
262144
  1024
  1024
307200
262144
  1024
  1024
524288
262144
  1024
  1024
524288
262144
  1024
  1024
  1024
     2
______
6888070


In [25]:
# Set the hyperparameters for training
import torch.optim as optim

lr=1e-3
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [26]:
# Create a function to calculate prediction accuracy
def accuracy(preds, y):
    
    predicted = torch.max(preds.data, 1)[1]
    batch_corr = (predicted == y).sum()
    acc = batch_corr / len(y)
    
    return acc

In [27]:
# Create a function to train the model
def train(model, loader, optimizer, criterion, loader_length):
    epoch_loss = 0
    epoch_acc = 0
    model.train() #useful for batchnorm and dropout
    
    for i, (label, text, text_length) in enumerate(loader): 
        label = label.to(device) #(batch_size, )
        text = text.to(device) #(batch_size, seq len)
                
        #predict
        predictions = model(text, text_length).squeeze(1) #output by the fc is (batch_size, 1), thus need to remove this 1
        
        #calculate loss
        loss = criterion(predictions, label)
        acc  = accuracy(predictions, label)
        
        #backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
                        
    return epoch_loss / loader_length, epoch_acc / loader_length

In [28]:
# Create a function to evaluate the model
def evaluate(model, loader, criterion, loader_length):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for i, (label, text, text_length) in enumerate(loader): 
            label = label.to(device) #(batch_size, )
            text  = text.to(device)  #(seq len, batch_size)

            predictions = model(text, text_length).squeeze(1) 
            
            loss = criterion(predictions, label)
            acc  = accuracy(predictions, label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / loader_length, epoch_acc / loader_length

In [29]:
# Create a function to calculate training time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [30]:
# Create the folder to save models
import os
from os import path

if path.exists('./models') == False:
  os.mkdir('./models')

In [31]:
# Train and save the model
best_val_loss = float('inf')
num_epochs      = 2

save_path = f'models/{model.__class__.__name__}_SST2.pt'

train_losses = []
train_accs = []
val_losses = []
val_accs = []

for epoch in range(num_epochs):
    
    start_time = time.time()

    train_loss, train_acc = train(model, train_loader, optimizer, criterion, train_loader_length)
    val_loss, val_acc = evaluate(model, val_loader, criterion, val_loader_length)
    
    #for plotting
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    val_losses.append(val_loss)
    val_accs.append(val_acc)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), save_path)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\t Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')   
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%')

Epoch: 01 | Time: 0m 17s
	 Train Loss: 0.307 | Train Acc: 86.95%
	 Val. Loss: 0.475 |  Val. Acc: 83.64%
Epoch: 02 | Time: 0m 17s
	 Train Loss: 0.153 | Train Acc: 94.27%
	 Val. Loss: 0.392 |  Val. Acc: 85.11%


In [32]:
# Load the saved model
model.load_state_dict(torch.load(save_path))

<All keys matched successfully>

In [33]:
# Test on some random text
test_str = "Google is now falling nonstop.  The price is really bad now."
text = torch.tensor(text_pipeline(test_str)).to(device)
text = text.reshape(1, -1)  #because batch_size is 1
text_length = torch.tensor([text.size(1)]).to(dtype=torch.int64)

def predict(text, text_length):
    with torch.no_grad():
        output = model(text, text_length).squeeze(1)
        predicted = torch.max(output.data, 1)[1]
        return predicted

predict(text, text_length)

tensor([0], device='cuda:0')

In [34]:
# Test on some negating positive sentence
test_str = "He is hardly intelligent."
text = torch.tensor(text_pipeline(test_str)).to(device)
text = text.reshape(1, -1)  #because batch_size is 1
text_length = torch.tensor([text.size(1)]).to(dtype=torch.int64)

predict(text, text_length)

tensor([0], device='cuda:0')

In [35]:
# Test on another negating positive sentence
test_str = "You will be wrong to think that he is smart."
text = torch.tensor(text_pipeline(test_str)).to(device)
text = text.reshape(1, -1)  #because batch_size is 1
text_length = torch.tensor([text.size(1)]).to(dtype=torch.int64)

predict(text, text_length)

tensor([0], device='cuda:0')

In [36]:
# Test on some negating negative sentence
test_str = "The reviews for the movie were not bad."
text = torch.tensor(text_pipeline(test_str)).to(device)
text = text.reshape(1, -1)  #because batch_size is 1
text_length = torch.tensor([text.size(1)]).to(dtype=torch.int64)

predict(text, text_length)

tensor([1], device='cuda:0')

In [37]:
# Test on another negating negative sentence
test_str = "There is nothing to dislike about the party."
text = torch.tensor(text_pipeline(test_str)).to(device)
text = text.reshape(1, -1)  #because batch_size is 1
text_length = torch.tensor([text.size(1)]).to(dtype=torch.int64)

predict(text, text_length)

tensor([0], device='cuda:0')

### Conclusion

**The SST2 model achieved over 94% training accuracy, which is comparable to the accuracy achieved by the SST model, and over 85% validation accuracy, which is better than that of the SST model. However, like the SST model, the SST2 model has a shortcoming in classifying negating negative sentences.**