## Load and Preprocess Text

In [1]:
!pip install text-preprocessing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from text_preprocessing import preprocess_text
from text_preprocessing import to_lower, remove_email, remove_url, remove_punctuation

preprocess_functions = [to_lower, remove_email, remove_url, remove_punctuation]

def clean_text(text):
    return preprocess_text(text, preprocess_functions)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
def sentiment2label(sentiment):
    return 0 if sentiment == 'negative' else 1

In [4]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/PyTorch/PyTorch-NLP-Tutorial/Corpus/IMDB Dataset.csv')
df['text'] = df['review'].apply(clean_text)
df['label'] = df['sentiment'].apply(sentiment2label)
df = df[['text', 'label']]
df = df.sample(frac=1)
df.dropna(inplace=True) 
df.reset_index(drop=True, inplace=True)

In [5]:
# splitting into train and test set
train_df = df.iloc[:int(len(df)*0.8), :].reset_index(drop=True)
test_df = df.iloc[int(len(df)*0.8):, :].reset_index(drop=True)

## Build the Vocabulary class to convert text into numerical values

In [6]:
class Vocabulary: 
    def __init__(self, freq_threshold=10, max_size=100000):
        '''
        freq_threshold : the minimum times a word must occur in corpus to be included in vocabulary
        max_size : max vocab size
        '''
        self.freq_threshold = freq_threshold
        self.max_size = max_size

        self.itos = {0: '<PAD>', 1:'<SOS>', 2:'<EOS>', 3: '<UNK>'}
        self.stoi = {k:j for j, k in self.itos.items()} 
          
    
    def __len__(self):
        return len(self.itos)
    

    @staticmethod
    def tokenizer(text):
        return [tok.lower().strip() for tok in text.split(' ')]
    
    
    def build_vocabulary(self, sentence_list):
        '''
        build the vocabulary: create a dictionary mapping of index to string (itos) and string to index (stoi)
        (itos) -> {5:'the', 6:'a', 7:'an'} | (stoi) -> {'the':5, 'a':6, 'an':7}
        '''
        frequencies = {} 
        idx = 4  # because 4 tokens already added -> (itos) -> {0: '<PAD>', 1:'<SOS>', 2:'<EOS>', 3: '<UNK>'}
        
        # calculate the freq of words
        for sentence in sentence_list:
            for word in self.tokenizer(sentence):
                if word not in frequencies.keys():
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1
                    
                    
        # limit vocab by removing low freq words
        frequencies = {k:v for k,v in frequencies.items() if v > self.freq_threshold} 
        
        # limit vocab to the max_size specified
        frequencies = dict(sorted(frequencies.items(), key = lambda x: -x[1])[:self.max_size-idx]) # idx = 4 for pad, start, end , unk
            
        # create vocab
        for word in frequencies.keys():
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1


    # to convert text into numeric values
    def numericalize(self, text):
        '''
        convert the list of words to a list of corresponding indexes
        eg. cat and a dog -> [4, 5, 6, 3]
        '''   
        tokenized_text = self.tokenizer(text)  # tokenize text 
        numericalized_text = []

        for token in tokenized_text:
            if token in self.stoi.keys():
                numericalized_text.append(self.stoi[token])
            else: # out-of-vocab (OOV) words are represented by UNK token index
                numericalized_text.append(self.stoi['<UNK>'])
                
        return numericalized_text

## Custom Dataloader

**Train DataLoader**

In [7]:
import torch
from torch.nn.utils.rnn import pad_sequence

class TrainDataset(torch.utils.data.Dataset):

    def __init__(self, df, text_column, label_column, freq_threshold=5, vocab_size=10000):
        self.df = df
        
        # get texts and labels
        self.texts = self.df[text_column]
        self.labels = self.df[label_column]
        
        # build vocabulary
        self.vocab = Vocabulary(freq_threshold, vocab_size)
        self.vocab.build_vocabulary(self.texts.tolist())


    def __len__(self):
        return len(self.df)
    
    
    def __getitem__(self, index):
        text = self.texts[int(index)]
        label = self.labels[index]
            
        # numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1,12,2,9,24,2]
        numerialized_text = [self.vocab.stoi["<SOS>"]]
        numerialized_text += self.vocab.numericalize(text)
        numerialized_text.append(self.vocab.stoi["<EOS>"])
    
        label = [float(label)]
        
        return torch.tensor(numerialized_text), torch.tensor(label, requires_grad = True) 

In [8]:
train_dataset = TrainDataset(
    df = train_df, text_column = 'text', label_column = 'label', 
    freq_threshold = 10, vocab_size = 25000
)

print(f'{df.loc[1]}\n')

text, label = train_dataset[1]
print(text)
print(label)

text     very much a film from the times  extremely lon...
label                                                    0
Name: 1, dtype: object

tensor([    1,    56,    76,     5,    22,    39,     4,   216,    34,   555,
          205,   834,    19,    59,   401,    86,  1947,     6,    37,   210,
         1036,   906,   183,     4,   111,   369,    40,   943,    28,     5,
          928,     7,   486,   300,   407,     6,  1948,    65,    17,   729,
         5522,    18, 12006,    15,  7557,     9,  4341,  1935,    30,  2373,
          462,    30,  2283,    76,     7,    27,    63, 20660,   145,     3,
        14547, 15682,     9,   206,     5,  1786,     8,    68,    21,    30,
          103,     9,  1862,     3,    15,     4,    22,    48,    31,  5421,
         1142,  1235,    10,    14,    11,  4021,   108,  1397,     7,   696,
        12741,     6,  9217, 13245,   140,    17,     4,  5388,  1287,     7,
            4,     3,  7612,    14,   108,    88,   643,    26,  4945,     7,


**Test DataLoader**

In [9]:
class TestDataset(torch.utils.data.Dataset):

    def __init__(self, train_dataset, df, text_column, label_column):
        self.train_dataset = train_dataset
        self.df = df
        
        # get texts and labels
        self.texts = self.df[text_column]
        self.labels = self.df[label_column]
        
        # utilizing vocabulary created using training set
        self.vocab = self.train_dataset.vocab


    def __len__(self):
        return len(self.df)
    
    
    def __getitem__(self, index):
        text = self.texts[int(index)]
        label = self.labels[index]
            
        # numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1,12,2,9,24,2]
        numerialized_text = [self.vocab.stoi["<SOS>"]]
        numerialized_text += self.vocab.numericalize(text)
        numerialized_text.append(self.vocab.stoi["<EOS>"])
    
        label = [float(label)]
        
        return torch.tensor(numerialized_text), torch.tensor(label, requires_grad = True) 

In [10]:
test_dataset = TestDataset(
    train_dataset = train_dataset, df = test_df, text_column = 'text', label_column = 'label'
)

print(f'{df.loc[100]}\n')

text, label = test_dataset[100]
print(text)
print(label)

text     neat premise very unrealistic what i learned f...
label                                                    0
Name: 100, dtype: object

tensor([    1,     4,    90,    63,    12,   204,    13,    22,    12,   421,
           11,    11,    16, 11182,    15,    12,   233,     5,  1565,  3285,
         3705,    12,    93,   143,   334,    19,     4,  2015,     7,   208,
         4164, 15913,    12,  3470,    49,     7,    14, 13442,    15,     8,
           73,    10,    67,   983,     8,   187,   130,   254,  1332,    38,
           26,   399,    52, 12012,   746,     8,    82,    12,   641,    11,
            4, 12012,     3,    15, 12012,   151,    79,    11,    58,   157,
         2304,   296,    27,  4919,    16,    24,     7,    13,  3878,    15,
           13,    22,   412,  1160,    50,     7,  2202,    15,    12,    84,
          415,    14,  2304,   492,     4,   594,  1673,     7,   388,  1532,
            6,   607,    19,    27,     3,    15,    84,    30,    16,     5

## Collate Fn of DataLoader

In [11]:
class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
        
    def __call__(self, batch):
        source = [item[0] for item in batch] 
        source = pad_sequence(source, batch_first=True, padding_value = self.pad_idx) 
        
        target = torch.tensor([item[1].item() for item in batch])
        return source, target

In [12]:
train_loader = torch.utils.data.DataLoader(
    dataset = train_dataset, batch_size = 32, num_workers = 1, shuffle = True, pin_memory = True, drop_last = True,
    collate_fn = MyCollate(pad_idx = train_dataset.vocab.stoi["<PAD>"])
)

In [13]:
test_loader = torch.utils.data.DataLoader(
    dataset = test_dataset, batch_size = 64, num_workers = 1, shuffle = True, pin_memory = True, 
    collate_fn = MyCollate(pad_idx = train_dataset.vocab.stoi["<PAD>"])
)

In [14]:
for idx, (texts, labels) in enumerate(train_loader):
    print(texts.shape, labels.shape)
    if idx >= 4:
        break

torch.Size([32, 601]) torch.Size([32])
torch.Size([32, 966]) torch.Size([32])
torch.Size([32, 614]) torch.Size([32])
torch.Size([32, 803]) torch.Size([32])
torch.Size([32, 871]) torch.Size([32])


In [15]:
for (text, label) in train_loader:
    print(f"{text.shape}\n{type(text)}\n{text}")
    print(f"\n{label.shape}\n{type(label)}\n{label}")
    break

torch.Size([32, 955])
<class 'torch.Tensor'>
tensor([[   1,  104, 9452,  ...,    0,    0,    0],
        [   1,   12,   28,  ...,    0,    0,    0],
        [   1,   32,  250,  ...,    0,    0,    0],
        ...,
        [   1,   41,  372,  ...,    0,    0,    0],
        [   1,   12,  175,  ...,    0,    0,    0],
        [   1,   13,   20,  ...,    0,    0,    0]])

torch.Size([32])
<class 'torch.Tensor'>
tensor([1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1.])


## Build the Neural Bag-of-Words Model

In [16]:
import torch.nn as nn

class BagOfWords(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.fc = nn.Linear(embedding_dim, output_dim)
    
    def forward(self, ids):
        # ids = [batch size, seq len]
        embedded = self.embedding(ids)
        # embedded = [batch size, seq len, embedding dim]
        pooled = embedded.mean(dim=1)
        # pooled = [batch size, embedding dim]
        prediction = self.fc(pooled)
        # prediction = [batch size, output dim]
        return prediction

In [17]:
# hyperparameters
vocab_size = train_dataset.vocab.__len__()
embedding_dim = 256
output_dim = len(set(df['label'].values))
pad_index = train_dataset.vocab.stoi["<PAD>"]

model = BagOfWords(vocab_size, embedding_dim, output_dim, pad_index)

In [18]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,400,514 trainable parameters


In [19]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

## Train and Test the Model

## Training

In [20]:
from tqdm import tqdm
import sys

def train(model, iterator, optimizer, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in tqdm(iterator):
        
        optimizer.zero_grad()

        text, label = batch
        text, label = text.to(device), label.to(device)
                
        predictions = model(text)
        predictions_values, predictions_idxs = torch.max(predictions, dim=1, keepdim=False)  # [0] to return values, [1] to return indices

        predictions_values = torch.round(predictions_values)

        loss = criterion(predictions_values.float(), label)
        
        correction = (predictions_values == label).float()
        acc = correction.sum() / len(correction)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [21]:
def evaluate(model, iterator, optimizer, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in tqdm(iterator):

            text, label = batch
            text, label = text.to(device), label.to(device)
                    
            predictions = model(text)
            predictions_values, predictions_idxs = torch.max(predictions, dim=1, keepdim=False)  # [0] to return values, [1] to return indices

            predictions_values = torch.round(predictions_values)

            loss = criterion(predictions_values.float(), label)
            
            correction = (predictions_values == label).float()
            acc = correction.sum() / len(correction)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [22]:
import numpy as np

n_epochs = 20
best_valid_loss = float('inf')

train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(n_epochs):
    print(f'epoch: {epoch+1}')

    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, test_loader, optimizer, criterion, device)

    train_losses.append(train_loss)
    train_accs.append(train_acc)
    valid_losses.append(valid_loss)
    valid_accs.append(valid_acc)
    
    epoch_train_loss = np.mean(train_loss)
    epoch_train_acc = np.mean(train_acc)
    epoch_valid_loss = np.mean(valid_loss)
    epoch_valid_acc = np.mean(valid_acc)
    
    if epoch_valid_loss < best_valid_loss:
        best_valid_loss = epoch_valid_loss
        torch.save(model.state_dict(), 'nbow.pt')
    
    print(f'train_loss: {epoch_train_loss:.3f}, train_acc: {epoch_train_acc:.3f}')
    print(f'valid_loss: {epoch_valid_loss:.3f}, valid_acc: {epoch_valid_acc:.3f}\n')


epoch: 1


100%|██████████| 1250/1250 [00:12<00:00, 100.86it/s]
100%|██████████| 157/157 [00:02<00:00, 68.62it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.446, valid_acc: 0.496

epoch: 2


100%|██████████| 1250/1250 [00:11<00:00, 106.14it/s]
100%|██████████| 157/157 [00:02<00:00, 69.69it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.437, valid_acc: 0.496

epoch: 3


100%|██████████| 1250/1250 [00:11<00:00, 106.72it/s]
100%|██████████| 157/157 [00:02<00:00, 67.77it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.464, valid_acc: 0.497

epoch: 4


100%|██████████| 1250/1250 [00:11<00:00, 106.57it/s]
100%|██████████| 157/157 [00:02<00:00, 67.67it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.464, valid_acc: 0.497

epoch: 5


100%|██████████| 1250/1250 [00:11<00:00, 105.01it/s]
100%|██████████| 157/157 [00:02<00:00, 67.88it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.411, valid_acc: 0.495

epoch: 6


100%|██████████| 1250/1250 [00:15<00:00, 80.98it/s]
100%|██████████| 157/157 [00:02<00:00, 67.26it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.429, valid_acc: 0.496

epoch: 7


100%|██████████| 1250/1250 [00:11<00:00, 106.54it/s]
100%|██████████| 157/157 [00:02<00:00, 68.69it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.446, valid_acc: 0.496

epoch: 8


100%|██████████| 1250/1250 [00:11<00:00, 104.62it/s]
100%|██████████| 157/157 [00:02<00:00, 67.52it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.411, valid_acc: 0.495

epoch: 9


100%|██████████| 1250/1250 [00:11<00:00, 106.04it/s]
100%|██████████| 157/157 [00:02<00:00, 67.48it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.455, valid_acc: 0.497

epoch: 10


100%|██████████| 1250/1250 [00:11<00:00, 106.93it/s]
100%|██████████| 157/157 [00:02<00:00, 67.51it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.411, valid_acc: 0.495

epoch: 11


100%|██████████| 1250/1250 [00:11<00:00, 106.45it/s]
100%|██████████| 157/157 [00:02<00:00, 68.78it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.437, valid_acc: 0.496

epoch: 12


100%|██████████| 1250/1250 [00:11<00:00, 107.15it/s]
100%|██████████| 157/157 [00:02<00:00, 66.88it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.446, valid_acc: 0.496

epoch: 13


100%|██████████| 1250/1250 [00:11<00:00, 105.50it/s]
100%|██████████| 157/157 [00:02<00:00, 66.91it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.420, valid_acc: 0.495

epoch: 14


100%|██████████| 1250/1250 [00:11<00:00, 105.91it/s]
100%|██████████| 157/157 [00:02<00:00, 65.84it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.402, valid_acc: 0.495

epoch: 15


100%|██████████| 1250/1250 [00:11<00:00, 106.08it/s]
100%|██████████| 157/157 [00:02<00:00, 66.11it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.446, valid_acc: 0.496

epoch: 16


100%|██████████| 1250/1250 [00:11<00:00, 105.89it/s]
100%|██████████| 157/157 [00:02<00:00, 67.84it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.420, valid_acc: 0.495

epoch: 17


100%|██████████| 1250/1250 [00:11<00:00, 107.40it/s]
100%|██████████| 157/157 [00:02<00:00, 66.67it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.429, valid_acc: 0.496

epoch: 18


100%|██████████| 1250/1250 [00:11<00:00, 106.54it/s]
100%|██████████| 157/157 [00:02<00:00, 67.87it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.411, valid_acc: 0.495

epoch: 19


100%|██████████| 1250/1250 [00:11<00:00, 104.91it/s]
100%|██████████| 157/157 [00:02<00:00, 67.56it/s]


train_loss: 55.341, train_acc: 0.501
valid_loss: 133.446, valid_acc: 0.496

epoch: 20


100%|██████████| 1250/1250 [00:11<00:00, 106.25it/s]
100%|██████████| 157/157 [00:02<00:00, 66.33it/s]

train_loss: 55.341, train_acc: 0.501
valid_loss: 133.437, valid_acc: 0.496






# Inference

In [23]:
def predict_sentiment(text, model, device):
    ids = train_dataset.vocab.numericalize(text)
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
    prediction = model(tensor).squeeze(dim=0)
    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    print(f"{'Negative' if predicted_class == 0 else 'Positive'} | probability score = {predicted_probability:.4f}")

In [24]:
predict_sentiment('This film is terrible', model, device)

Negative | probability score = 0.5226


In [25]:
predict_sentiment('This film is great', model, device)

Positive | probability score = 0.5190
