In [22]:
import numpy as np
import pandas as pd

import torch
import torchvision
import torch.nn as nn   # neural network modules
import torch.optim as optim   # optimization algorithms
import torch.nn.functional as F   # functions without parameters like activation functions
from torch.utils.data import TensorDataset, DataLoader, Dataset   # dataset management, create batches
import torchvision.datasets as datasets   # standard datasets on pytorch
import torchvision.transforms as transforms   #transform datasets

import matplotlib.pyplot as plt

# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [24]:
train_dataset = pd.read_csv("IMDB_Dataset.csv")

print(f"Full train dataset shape is {train_dataset.shape}")

Full train dataset shape is (50000, 2)


In [25]:
train_dataset = train_dataset.sample(frac=0.2, random_state=123)

In [26]:
train_dataset.head()

Unnamed: 0,review,sentiment
11872,"This movie was beyond awful, it was a pimple o...",negative
40828,As of this writing John Carpenter's 'Halloween...,positive
36400,I must admit a slight disappointment with this...,positive
5166,Oh dear! The BBC is not about to be knocked of...,negative
30273,its a totally average film with a few semi-alr...,negative


In [27]:
#train_dataset['review'][0]

In [28]:
train_dataset['review'].str.split().str.len().mean()

229.3021

In [29]:
from transformers import DistilBertTokenizer   # frequently used for sentiment analysis
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [30]:
# create a class to tokenize the dataset

class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        
    def __len__(self):   #returns the length of the dataset, which is the number of samples or instances in the dataset
        return len(self.texts)
    
    def __getitem__(self, index):   #retrieving a single sample from the dataset based on the provided index
        text = self.texts[index]
        label = self.labels[index]
        #retrieves text and label at the given index from their lists respectively
        
        #used to tokenize the input text using the specified tokenizer
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,   #Adds special tokens like [CLS] (start token) and [SEP] (separator token) to mark the beginning and end of the text
            padding='max_length',   #Pads or truncates the tokenized text to a fixed length specified by max_length
            truncation=True,   #Truncates the text if it exceeds the maximum length
            max_length=512,   #Specifies the maximum length of the tokenized sequence
            return_tensors='pt'   #Returns the tokenized tensors in PyTorch format
        )
        #encoding object contains the tokenized input --> extract the input_ids and attention_mask tensors from it
        #The squeeze() function is used to remove any unnecessary dimensions from the tensors
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label)
        }
        #returns a dictionary that contains the tokenized input (input_ids and attention_mask) along with the corresponding label (torch.tensor(label)). 
        #The input tensors are ready to be fed into the LSTM model, and the label is converted to a PyTorch tensor

In [31]:
from sklearn.model_selection import train_test_split
X,y = train_dataset['review'].values, train_dataset['sentiment'].values
x_train,x_test,y_train,y_test = train_test_split(X,y, test_size = 0.2, stratify=y)
# split the dataset into train and test sets in a way that preserves the same proportions of examples in each class as observed in the original dataset

print(f'shape of train data is {x_train.shape}')
print(f'shape of test data is {x_test.shape}')

shape of train data is (8000,)
shape of test data is (2000,)


In [32]:
label_map = {'negative': 0, 'positive': 1}

# Convert labels to numerical values
y_train = np.array([label_map[label] for label in y_train])
y_test = np.array([label_map[label] for label in y_test])

train_dataset = SentimentDataset(x_train, y_train)
test_dataset = SentimentDataset(x_test, y_test)

In [33]:
vocab_size = len(tokenizer.vocab)
print(vocab_size)

30522


In [34]:
#print(tokenizer.vocab)

In [35]:
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=batch_size)

In [36]:
for index, data in enumerate(train_loader):
    print(f"Sample {index}: {data}")
    if index >= 5:
        break

Sample 0: {'input_ids': tensor([[  101,  1045, 12860,  ...,  1010,  4901,   102],
        [  101,  2023,  2003,  ...,     0,     0,     0],
        [  101,  1045,  1005,  ...,     0,     0,     0],
        ...,
        [  101,  2043,  1045,  ...,     0,     0,     0],
        [  101,  3532, 11280,  ...,     0,     0,     0],
        [  101,  2009,  1005,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'label': tensor([1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
        1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
        0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1])}
Sample 1: {'input_ids': tensor([[ 101, 2025, 2069,  ...,    0,    0,    0],
        [ 101, 2017, 2763,  ...,    0,    0,    0],
        [ 101, 1045, 

In [37]:
# hyperparameters

embedding_dim = 200   #represents the size of the input vectors that will be fed into the LSTM layer. 
# It determines the number of features that each word token will be represented with in the LSTM model

vocab_size = vocab_size
hidden_size = 256
num_layers = 2
num_classes = 2
learning_rate = 0.001
num_epochs = 6

In [38]:
class LSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_size, num_layers, num_classes, vocab_size):
        super(LSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first = True, bidirectional = True)
        # batch_first --> If True, then the input and output tensors are provided as (batch, seq, feature) instead of (seq, batch, feature)
        # bidirectional --> If True, becomes a bidirectional LSTM
        self.fc = nn.Linear(hidden_size * 2, num_classes)
        # need to multiply by 2 because one layer going forward and the other going backward
        
    def forward(self, input_ids, attention_mask):
        # need to define hidden state and cell state to be sent into the LSTM
        h0 = torch.zeros(self.num_layers * 2, input_ids.size(0), self.hidden_size).to(device) 
        c0 = torch.zeros(self.num_layers * 2, input_ids.size(0), self.hidden_size).to(device)
        # need to multiply by 2 because one layer going forward and the other going backward
        # but they are all concatenated for the same hidden state
        # x.size(0) number of examples in a batch size 
        
        embedded = self.embedding(input_ids)
        
        
        embedded = embedded * attention_mask.unsqueeze(-1)
        #attention_mask.unsqueeze(-1) adds an extra dimension to the attention_mask tensor at the last dimension. 
        #This is done to match the dimensions of the embedded tensor so that the element-wise multiplication can be performed.
        #multiplies each element of the embedded tensor with the corresponding element of the attention_mask tensor. Since the attention_mask tensor has values 0 or 1,
        #this operation effectively masks out the padding elements in the embedded tensor by setting their values to 0
        
        out, (hidden_state, cell_state) = self.lstm(embedded, (h0, c0))   # only output is used here
        out = self.fc(out[:, -1, :])   # take the last hidden state and send to the linear layer
        
        return out

In [39]:
model = LSTM(embedding_dim=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, num_classes=num_classes, vocab_size=vocab_size).to(device)

In [40]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)  

In [41]:
def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of iter(training_loader) so that we can track the batch index and do some intra-epoch reporting
    for index, data in enumerate(train_loader):

        # Every data instance is an input + label pair
        batch_x = data['input_ids']
        batch_y = data['label']
        attention_mask = data['attention_mask']
        
        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(batch_x, attention_mask = attention_mask)

        # Compute the loss and its gradients
        loss = criterion(outputs, batch_y)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item() #retrieves the scalar value of the loss function for the current batch
        if index % 10 == 9: #reports on loss every 10 batches
            last_loss = running_loss / 10 # loss per batch
            print(f'  batch {index + 1} loss: {last_loss}')
            tb_x = epoch_index * len(train_loader) + index + 1  #variable is used to set the x-axis value for the scalar summary in TensorBoard, based on the current epoch and batch index
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0

    return last_loss #last calculated batch loss

In [42]:
# Initializing in a separate cell so we can easily add more epochs to the same run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'runs/spaceship_trainer_{timestamp}')
epoch_number = 0

EPOCHS = 5

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print(f'EPOCH {epoch_number + 1}:')

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number, writer) # runs the training function above 

    # We don't need gradients on to do reporting
    model.train(False)

    running_vloss = 0.0
    for index, vdata in enumerate(val_loader):
        vbatch_x = vdata['input_ids']
        vbatch_y = vdata['label']
        vattention_mask = vdata['attention_mask']
        
        voutputs = model(vbatch_x, vattention_mask)
        vloss = criterion(voutputs, vbatch_y)
        running_vloss += vloss

    avg_vloss = running_vloss / (index + 1)
    print(f'LOSS: train {avg_loss} validation {avg_vloss}')

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()
    '''
    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = f'model_{timestamp}_{epoch_number}'
        torch.save(model.state_dict(), model_path)
    '''
    epoch_number += 1

EPOCH 1:
  batch 10 loss: 0.6949404120445252
  batch 20 loss: 0.6939243018627167
  batch 30 loss: 0.688591581583023
  batch 40 loss: 0.6952060580253601
  batch 50 loss: 0.6894813716411591
  batch 60 loss: 0.6969460725784302
  batch 70 loss: 0.6929971694946289
  batch 80 loss: 0.6935159146785737
  batch 90 loss: 0.6936171233654023
  batch 100 loss: 0.6933436453342438
  batch 110 loss: 0.6944387555122375
  batch 120 loss: 0.6944052100181579
LOSS: train 0.6944052100181579 validation 0.692832350730896
EPOCH 2:
  batch 10 loss: 0.6885309815406799
  batch 20 loss: 0.6869628667831421
  batch 30 loss: 0.6871935427188873
  batch 40 loss: 0.6903585195541382
  batch 50 loss: 0.6900281369686126
  batch 60 loss: 0.6896438241004944
  batch 70 loss: 0.6893807768821716
  batch 80 loss: 0.6850758910179138
  batch 90 loss: 0.6914754092693329
  batch 100 loss: 0.6889065563678741
  batch 110 loss: 0.6903203547000885
  batch 120 loss: 0.6864817500114441
LOSS: train 0.6864817500114441 validation 0.698512017

In [47]:
def check_accuracy(loader,model): 
    if loader == train_loader: 
        print("checking accuracy on training data")
    else:
        print("checking accuracy on test data")
    num_correct = 0
    num_samples = 0
    model.eval() 
    
    with torch.no_grad(): 
        for data in loader:
            x= data['input_ids'].to(device=device)
            y= data['label'].to(device=device)
            am = data['attention_mask'].to(device=device)
            
            outputs = model(x,am)
            
            
            _, predictions = outputs.max(1)
            
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
            
        print(f'{num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}')   #2dp
        
    model.train()

In [48]:
check_accuracy(train_loader,model)
check_accuracy(val_loader,model)

checking accuracy on training data
4578 / 8000 with accuracy 57.23
checking accuracy on test data
1010 / 2000 with accuracy 50.50
