In [1]:
! nvidia-smi

Sun Jul 11 03:47:01 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from torchtext import datasets

train_iter, test_iter = datasets.IMDB(split=('train', 'test'))
train_data, test_data = list(train_iter), list(test_iter)

aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:01<00:00, 68.2MB/s]


In [3]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [4]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

tokenizer = get_tokenizer('spacy', language='en')
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=["<unk>",'<PAD>'])
vocab.set_default_index(vocab["<unk>"])

In [5]:
text_transform = lambda x: vocab(tokenizer(x))
label_transform = lambda x: 1 if x == 'pos' else 0

In [6]:
for idx, (label, line) in enumerate(train_data):
  print(label, line)
  if idx == 10:
    break

neg I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between,

In [7]:
pad_idx = vocab.get_stoi()['<PAD>']

In [8]:
import random

from sklearn.model_selection import train_test_split
train_data, valid_data = train_test_split(train_data, random_state = random.seed(SEED))

In [9]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 18750
Number of validation examples: 6250
Number of testing examples: 25000


In [10]:
print(vocab.get_itos()[:10])

['<unk>', '<PAD>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


In [11]:
print(vocab.get_stoi())



In [12]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
   label_list, text_list, text_len = [], [], []
   for (_label, _text) in batch:
        label_list.append(label_transform(_label))
        processed_text = torch.tensor(text_transform(_text))
        text_list.append(processed_text)
        text_len.append(len(processed_text))
   return torch.tensor(label_list, dtype=torch.float), pad_sequence(text_list, padding_value=pad_idx), torch.LongTensor(text_len)

## Sorting The Text with in a group similar to Bucket Iterator

To group the texts with similar length together, like introduced in the legacy `BucketIterator` class, first of all, we randomly create multiple "pools", and each of them has a size of `batch_size * 100`. Then, we sort the samples within the individual pool by length. This idea can be implemented succintly through `batch_sampler` argument of PyTorch `Dataloader`. `batch_sampler` accepts 'Sampler' or Iterable object that yields indices of next batch. In the code below, we implemented a generator that yields batch of indices for which the corresponding batch of data is of similar length. 

In [13]:
import random
from torch.utils.data.sampler import Sampler
class custom_batch_sampler(Sampler):
  def __init__(self, data, batch_size):
    self.data = data
    self.batch_size = batch_size 
    self.indices = [(i, len(tokenizer(line[1]))) for i, line in enumerate(self.data)] ## Length of text on indices 1
    self.pooled_indices = []
    for i in range(0, len(self.indices), self.batch_size * 100):
        self.pooled_indices.extend(sorted(self.indices[i:i + self.batch_size * 100], key=lambda x: x[1]))  ## Sorted based on the text only ## the first is indice and other is length
    self.pooled_indices = [x[0] for x in self.pooled_indices]  ## seperating only indices

  def __iter__(self):
    for i in range(0, len(self.pooled_indices), self.batch_size):
        yield self.pooled_indices[i:i + self.batch_size]
  
  def __len__(self):
    return len(self.data)//self.batch_size

In [14]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_dataloader =  DataLoader(train_data, batch_sampler=custom_batch_sampler(train_data, BATCH_SIZE),
                              collate_fn=collate_batch, num_workers=2, pin_memory=True)
valid_dataloader = DataLoader(valid_data, batch_sampler=custom_batch_sampler(valid_data, BATCH_SIZE),
                              collate_fn=collate_batch, num_workers=2, pin_memory=True)
test_dataloader = DataLoader(test_data, batch_sampler=custom_batch_sampler(test_data, BATCH_SIZE),
                              collate_fn=collate_batch, num_workers=2, pin_memory=True)

In [15]:
print('Train')
for batch in train_dataloader:
    print(f'Text matrix size: {batch[1].size()}')
    print(f'Target vector size: {batch[0].size()}')
    print(f'Length vector size: {batch[2].size()}')
    break
    
print('\nValid:')
for batch in valid_dataloader:
    print(f'Text matrix size: {batch[1].size()}')
    print(f'Target vector size: {batch[0].size()}')
    print(f'Length vector size: {batch[2].size()}')
    break
    
print('\nTest:')
for batch in test_dataloader:
    print(f'Text matrix size: {batch[1].size()}')
    print(f'Target vector size: {batch[0].size()}')
    print(f'Length vector size: {batch[2].size()}')
    break

Train
Text matrix size: torch.Size([50, 128])
Target vector size: torch.Size([128])
Length vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([56, 128])
Target vector size: torch.Size([128])
Length vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([49, 128])
Target vector size: torch.Size([128])
Length vector size: torch.Size([128])


In [16]:
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx )
        
        self.LSTM = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout, bidirectional=bidirectional)
        
        self.fc = nn.Linear(hidden_dim * 2 , output_dim)

        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):

        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'), enforce_sorted=False)

        output, (hidden, cell) = self.LSTM(packed_embedded)


        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        #assert torch.equal(output[-1,:,:], hidden.squeeze(0))

        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        
        return self.fc(hidden)

In [17]:
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = pad_idx

model = LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)

In [18]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 14,452,357 trainable parameters


In [19]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [20]:
criterion = nn.BCEWithLogitsLoss()

In [21]:
model = model.to(device)
criterion = criterion.to(device)

In [22]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [23]:
def calculate_metrics(preds, y):
    
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    Acc = correct.sum() / len(correct)
    TP = ((rounded_preds == 1) & (y == 1)).sum()
    TN = ((rounded_preds == 0) & (y == 0)).sum()
    FP = ((rounded_preds == 1) & (y == 0)).sum()
    FN = ((rounded_preds == 0) & (y == 1)).sum()
    return Acc.item(),TP.item(),TN.item(),FP.item(), FN.item()

In [32]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    global train_acc, train_tp, train_tn, train_fp, train_fn
    train_acc= train_tp= train_tn= train_fp= train_fn=0
    
    for batch in iterator:
        
        optimizer.zero_grad()

        label, text, text_length = batch        
        label, text, text_length = label.to(device), text.to(device), text_length.to(device)
        predictions = model(text, text_length).squeeze(1)
        
        loss = criterion(predictions, label)
        
        acc = binary_accuracy(predictions, label)
        temp1, temp2, temp3, temp4, temp5 = calculate_metrics(predictions, label)
        train_acc, train_tp, train_tn, train_fp, train_fn = train_acc+temp1, train_tp+temp2, train_tn+temp3, train_fp+temp4, train_fn+temp5

        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [34]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    global valid_acc, valid_tp, valid_tn, valid_fp, valid_fn
    valid_acc= valid_tp= valid_tn= valid_fp= valid_fn=0
    with torch.no_grad():
    
        for batch in iterator:

            label, text, text_length = batch        
            label, text, text_length = label.to(device), text.to(device), text_length.to(device)
            predictions = model(text, text_length).squeeze(1)
            
            loss = criterion(predictions, label)
            
            acc = binary_accuracy(predictions, label)
            temp1, temp2, temp3, temp4, temp5 = calculate_metrics(predictions, label)
            valid_acc, valid_tp, valid_tn, valid_fp, valid_fn = valid_acc+temp1, valid_tp+temp2, valid_tn+temp3, valid_fp+temp4, valid_fn+temp5
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [35]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [36]:
from prettytable import PrettyTable

N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_dataloader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    con = PrettyTable(['Train', 'Actual Positive', 'Actual Negative'])
    con.add_row(['Predicted Positive',train_tp,train_fp])
    con.add_row(['Predicted Negative',train_fn,train_tn])
    print(con)

    con = PrettyTable(['Valid', 'Actual Positive', 'Actual Negative'])
    con.add_row(['Predicted Positive',valid_tp,valid_fp])
    con.add_row(['Predicted Negative',valid_fn,valid_tn])
    print(con)

    table = PrettyTable(['Metrics', 'Train', 'Valid'])
    table.add_row(['Loss', f'{train_loss:.3f}',f'{valid_loss:.3f}'])
    table.add_row(['Accuracy', f'{((train_tp+train_tn)/(train_tp+train_fp+train_fn+train_tn))*100:.2f}%',f'{((valid_tp+valid_tn)/(valid_tp+valid_fp+valid_fn+valid_tn))*100:.2f}%'])    
    table.add_row(['Precision', f'{(train_tp/(train_fp+train_tp))*1:.2f}',f'{(valid_tp/(valid_fp+valid_tp))*1:.2f}'])
    table.add_row(['Recall', f'{(train_tp/(train_fn+train_tp))*1:.2f}',f'{(valid_tp/(valid_fn+valid_tp))*1:.2f}'])
    table.add_row(['F1 score', f'{(train_tp/(train_tp+0.5*(train_fp+train_fn)))*1:.2f}',f'{(valid_tp/(valid_tp+0.5*(valid_fp+valid_fn)))*1:.2f}'])
    print(table)

Epoch: 01 | Epoch Time: 0m 41s
	Train Loss: 0.673 | Train Acc: 59.35%
	 Val. Loss: 0.670 |  Val. Acc: 61.52%
+--------------------+-----------------+-----------------+
|       Train        | Actual Positive | Actual Negative |
+--------------------+-----------------+-----------------+
| Predicted Positive |       5635      |       3934      |
| Predicted Negative |       3767      |       5414      |
+--------------------+-----------------+-----------------+
+--------------------+-----------------+-----------------+
|       Valid        | Actual Positive | Actual Negative |
+--------------------+-----------------+-----------------+
| Predicted Positive |       2627      |       2013      |
| Predicted Negative |       471       |       1139      |
+--------------------+-----------------+-----------------+
+-----------+--------+--------+
|  Metrics  | Train  | Valid  |
+-----------+--------+--------+
|    Loss   | 0.673  | 0.670  |
|  Accuracy | 58.93% | 60.26% |
| Precision |  0.59  | 

In [37]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model, test_dataloader, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.380 | Test Acc: 84.86%


In [38]:
import spacy
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    classification = {0: 'neg', 1: 'pos'}
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [vocab.get_stoi()[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor).squeeze(0))
    return classification[round(prediction.item())]

In [39]:
predict_sentiment(model, "This film is terrible")

'neg'

In [40]:
predict_sentiment(model, "This film is great")

'pos'

In [41]:
cpu_model = model.to('cpu')
torch.save(cpu_model.state_dict(), 'upgraded_sentiment_analysis.pt')