In [1]:
!pip install text-preprocessing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting text-preprocessing
  Downloading text_preprocessing-0.1.1-py2.py3-none-any.whl (9.6 kB)
Collecting names-dataset==2.1
  Downloading names_dataset-2.1.0-py3-none-any.whl (62.6 MB)
[K     |████████████████████████████████| 62.6 MB 233 kB/s 
[?25hCollecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting pyspellchecker
  Downloading pyspellchecker-0.7.1-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 59.0 MB/s 
[?25hCollecting unittest-xml-reporting
  Downloading unittest_xml_reporting-3.2.0-py2.py3-none-any.whl (20 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 75.4 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp3

In [2]:
from text_preprocessing import preprocess_text
from text_preprocessing import to_lower, remove_email, remove_url, remove_punctuation

preprocess_functions = [to_lower, remove_email, remove_url, remove_punctuation]

def clean_text(text):
    return preprocess_text(text, preprocess_functions)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [3]:
def sentiment2label(sentiment):
    return 0 if sentiment == 'negative' else 1

In [8]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/PyTorch/PyTorch-NLP-Tutorial/Corpus/IMDB Dataset.csv')
df['text'] = df['review'].apply(clean_text)
df['label'] = df['sentiment'].apply(sentiment2label)
df = df[['text', 'label']]
df = df.sample(frac=1)
df.dropna(inplace=True) 
df.reset_index(drop=True, inplace=True)

In [9]:
train_df = df.iloc[:int(len(df)*0.8), :].reset_index(drop=True)
test_df = df.iloc[int(len(df)*0.8):, :].reset_index(drop=True)

In [10]:
class Vocabulary: 
    def __init__(self, freq_threshold=10, max_size=100000):
        '''
        freq_threshold : the minimum times a word must occur in corpus to be included in vocabulary
        max_size : max vocab size
        '''
        self.freq_threshold = freq_threshold
        self.max_size = max_size

        self.itos = {0: '<PAD>', 1:'<SOS>', 2:'<EOS>', 3: '<UNK>'}
        self.stoi = {k:j for j, k in self.itos.items()} 
          
    
    def __len__(self):
        return len(self.itos)
    

    @staticmethod
    def tokenizer(text):
        return [tok.lower().strip() for tok in text.split(' ')]
    
    
    def build_vocabulary(self, sentence_list):
        '''
        build the vocabulary: create a dictionary mapping of index to string (itos) and string to index (stoi)
        (itos) -> {5:'the', 6:'a', 7:'an'} | (stoi) -> {'the':5, 'a':6, 'an':7}
        '''
        frequencies = {} 
        idx = 4  # because 4 tokens already added -> (itos) -> {0: '<PAD>', 1:'<SOS>', 2:'<EOS>', 3: '<UNK>'}
        
        # calculate the freq of words
        for sentence in sentence_list:
            for word in self.tokenizer(sentence):
                if word not in frequencies.keys():
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1
                    
                    
        # limit vocab by removing low freq words
        frequencies = {k:v for k,v in frequencies.items() if v > self.freq_threshold} 
        
        # limit vocab to the max_size specified
        frequencies = dict(sorted(frequencies.items(), key = lambda x: -x[1])[:self.max_size-idx]) # idx = 4 for pad, start, end , unk
            
        # create vocab
        for word in frequencies.keys():
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1

 
    def numericalize(self, text):
        '''
        convert the list of words to a list of corresponding indexes
        eg. cat and a dog -> [4, 5, 6, 3]
        '''   
        tokenized_text = self.tokenizer(text)  # tokenize text 
        numericalized_text = []

        for token in tokenized_text:
            if token in self.stoi.keys():
                numericalized_text.append(self.stoi[token])
            else: # out-of-vocab (OOV) words are represented by UNK token index
                numericalized_text.append(self.stoi['<UNK>'])
                
        return numericalized_text

In [11]:
import torch
from torch.nn.utils.rnn import pad_sequence

In [12]:
class TrainDataset(torch.utils.data.Dataset):

    def __init__(self, df, text_column, label_column, freq_threshold=5, vocab_size=10000):
        self.df = df
        
        # get texts and labels
        self.texts = self.df[text_column]
        self.labels = self.df[label_column]
        
        # build vocabulary
        self.vocab = Vocabulary(freq_threshold, vocab_size)
        self.vocab.build_vocabulary(self.texts.tolist())


    def __len__(self):
        return len(self.df)
    
    
    def __getitem__(self, index):
        text = self.texts[int(index)]
        label = self.labels[index]
            
        # numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1,12,2,9,24,2]
        numerialized_text = [self.vocab.stoi["<SOS>"]]
        numerialized_text += self.vocab.numericalize(text)
        numerialized_text.append(self.vocab.stoi["<EOS>"])
    
        label = [float(label)]
        
        return torch.tensor(numerialized_text), torch.tensor(label, requires_grad = True) 

In [13]:
train_dataset = TrainDataset(
    df = train_df, text_column = 'text', label_column = 'label', 
    freq_threshold = 10, vocab_size = 25000
)

print(f'{df.loc[1]}\n')

text, label = train_dataset[1]
print(text)
print(label)

text     when i was younger this movie always aired on ...
label                                                    1
Name: 1, dtype: object

tensor([    1,    54,    12,    16,  1068,    13,    20,   205,  3141,    23,
         2495,   317,    10,     4,  1491,    23,  1176,  1902,    13,    16,
            4,   155,   160,  1769,    16,     6,  2310,     5,   538,   130,
            4,  7957,    12,   205,   564,   978,     8,    11,   460,   142,
        10870,  6472,    19,    57,   735,    94,   844,   188,    19,    57,
            3,   243,  2755,     5,     6,   799,   984,     3,    64,    63,
            7,     4,  1281,    12,    16,  1506,     8,  2803, 23424,     5,
          345,   250,     4,   234,     9,  6302,   540,  1008,    42,   116,
        21516,    21,    32,     6,   247,    20,     5,    12,   422,  8191,
            5, 17492,   982,  4136,     3,    19,   116,   791,    10,     4,
           20,    21,    30,     9,  1244,  3155,   114, 18036,     5,   146,


In [14]:
class TestDataset(torch.utils.data.Dataset):

    def __init__(self, train_dataset, df, text_column, label_column):
        self.train_dataset = train_dataset
        self.df = df
        
        # get texts and labels
        self.texts = self.df[text_column]
        self.labels = self.df[label_column]
        
        # utilizing vocabulary created using training set
        self.vocab = self.train_dataset.vocab


    def __len__(self):
        return len(self.df)
    
    
    def __getitem__(self, index):
        text = self.texts[int(index)]
        label = self.labels[index]
            
        # numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1,12,2,9,24,2]
        numerialized_text = [self.vocab.stoi["<SOS>"]]
        numerialized_text += self.vocab.numericalize(text)
        numerialized_text.append(self.vocab.stoi["<EOS>"])
    
        label = [float(label)]
        
        return torch.tensor(numerialized_text), torch.tensor(label, requires_grad = True) 

In [15]:
test_dataset = TestDataset(
    train_dataset = train_dataset, df = test_df, text_column = 'text', label_column = 'label'
)

print(f'{df.loc[100]}\n')

text, label = test_dataset[100]
print(text)
print(label)

text     i cant say that this movie deserves a ten beca...
label                                                    1
Name: 100, dtype: object

tensor([    1,   679,  2393,   829,    17,    44,   158,  6242,     7,   116,
          377,   569,  2592,     6,   692,   364,    19,   679, 14823,  1029,
        24300,  2393,    48,    27,   922,   376,   258,     8,   380,    99,
           10,   372,     5,  3110,    99,    10,    67, 20639, 13815,    52,
          504,     4,    22,   185,     9,     4,  2193,  1264,     7,     6,
          170,   362,   346,    50,  1402,   132,     4,    91,  7917,   204,
          160,   576,  2401, 15082,     4,   196,   290,    39,     4,    63,
            4,   454,  8531,   839,     7,  2674, 19919,   305,  5588,   709,
           33,  2222,     3,  8800,  2525,  4734, 22298,    27, 22191,     5,
         4074,  4102,    23,     4,   354,     7,     4,   181,   679,  2393,
          459,    76,    56,    74,    44,     6,  2266,  7382,    15,   157

In [16]:
class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
        
    def __call__(self, batch):
        source = [item[0] for item in batch] 
        source = pad_sequence(source, batch_first=True, padding_value = self.pad_idx) 
        
        target = torch.tensor([item[1].item() for item in batch])
        return source, target

In [17]:
train_loader = torch.utils.data.DataLoader(
    dataset = train_dataset, batch_size = 32, num_workers = 1, shuffle = True, pin_memory = True, drop_last = True,
    collate_fn = MyCollate(pad_idx = train_dataset.vocab.stoi["<PAD>"])
)

In [18]:
test_loader = torch.utils.data.DataLoader(
    dataset = test_dataset, batch_size = 64, num_workers = 1, shuffle = True, pin_memory = True, 
    collate_fn = MyCollate(pad_idx = train_dataset.vocab.stoi["<PAD>"])
)

In [19]:
for idx, (texts, labels) in enumerate(train_loader):
    print(texts.shape, labels.shape)
    if idx >= 4:
        break

torch.Size([32, 903]) torch.Size([32])
torch.Size([32, 747]) torch.Size([32])
torch.Size([32, 764]) torch.Size([32])
torch.Size([32, 1002]) torch.Size([32])
torch.Size([32, 971]) torch.Size([32])


In [20]:
for (text, label) in train_loader:
    print(f"{text.shape}\n{type(text)}\n{text}")
    print(f"\n{label.shape}\n{type(label)}\n{label}")
    break

torch.Size([32, 806])
<class 'torch.Tensor'>
tensor([[   1,   32,    4,  ...,    0,    0,    0],
        [   1,    4,  875,  ...,    0,    0,    0],
        [   1, 1603, 4192,  ...,    0,    0,    0],
        ...,
        [   1,    3,    9,  ...,    0,    0,    0],
        [   1,   58,   18,  ...,    0,    0,    0],
        [   1,   12,   16,  ...,    0,    0,    0]])

torch.Size([32])
<class 'torch.Tensor'>
tensor([0., 0., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0.,
        0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0.])


In [21]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):

        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
        
        #embedded = [sent len, batch size, emb dim]
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))

In [22]:
class BagOfWords(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.fc = nn.Linear(embedding_dim, output_dim)
    
    def forward(self, ids):
        # ids = [batch size, seq len]
        embedded = self.embedding(ids)
        # embedded = [batch size, seq len, embedding dim]
        pooled = embedded.mean(dim=1)
        # pooled = [batch size, embedding dim]
        prediction = self.fc(pooled)
        # prediction = [batch size, output dim]
        return prediction


In [23]:
vocab_size = train_dataset.vocab.__len__()
embedding_dim = 256
output_dim = 2
pad_index = train_dataset.vocab.stoi["<PAD>"]

model = BagOfWords(vocab_size, embedding_dim, output_dim, pad_index)

In [24]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,400,514 trainable parameters


In [25]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
criterion = criterion.to(device)

In [27]:
def get_accuracy(prediction, label):
    batch_size, _ = prediction.shape
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / batch_size
    return accuracy

In [28]:
import tqdm, sys

def train(dataloader, model, criterion, optimizer, device):

    model.train()
    epoch_losses = []
    epoch_accs = []

    for batch in tqdm.tqdm(dataloader, desc='training...', file=sys.stdout):
        ids, label = batch
        ids, label = ids.to(device), label.type(torch.LongTensor).to(device)
        prediction = model(ids)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())

    return epoch_losses, epoch_accs


In [29]:
def evaluate(dataloader, model, criterion, device):
    
    model.eval()
    epoch_losses = []
    epoch_accs = []

    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc='evaluating...', file=sys.stdout):
            ids, label = batch
            ids, label = ids.to(device), label.type(torch.LongTensor).to(device)
            prediction = model(ids)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())

    return epoch_losses, epoch_accs

In [30]:
import numpy as np

n_epochs = 20
best_valid_loss = float('inf')

train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(n_epochs):

    train_loss, train_acc = train(train_loader, model, criterion, optimizer, device)
    valid_loss, valid_acc = evaluate(test_loader, model, criterion, device)

    train_losses.extend(train_loss)
    train_accs.extend(train_acc)
    valid_losses.extend(valid_loss)
    valid_accs.extend(valid_acc)
    
    epoch_train_loss = np.mean(train_loss)
    epoch_train_acc = np.mean(train_acc)
    epoch_valid_loss = np.mean(valid_loss)
    epoch_valid_acc = np.mean(valid_acc)
    
    if epoch_valid_loss < best_valid_loss:
        best_valid_loss = epoch_valid_loss
        torch.save(model.state_dict(), 'nbow.pt')
    
    print(f'epoch: {epoch+1}')
    print(f'train_loss: {epoch_train_loss:.3f}, train_acc: {epoch_train_acc:.3f}')
    print(f'valid_loss: {epoch_valid_loss:.3f}, valid_acc: {epoch_valid_acc:.3f}')


training...: 100%|██████████| 1250/1250 [00:14<00:00, 86.68it/s] 
evaluating...: 100%|██████████| 157/157 [00:02<00:00, 68.95it/s]
epoch: 1
train_loss: 0.518, train_acc: 0.777
valid_loss: 0.388, valid_acc: 0.864
training...: 100%|██████████| 1250/1250 [00:12<00:00, 103.11it/s]
evaluating...: 100%|██████████| 157/157 [00:02<00:00, 67.97it/s]
epoch: 2
train_loss: 0.303, train_acc: 0.892
valid_loss: 0.298, valid_acc: 0.892
training...: 100%|██████████| 1250/1250 [00:11<00:00, 108.37it/s]
evaluating...: 100%|██████████| 157/157 [00:02<00:00, 68.70it/s]
epoch: 3
train_loss: 0.237, train_acc: 0.917
valid_loss: 0.273, valid_acc: 0.900
training...: 100%|██████████| 1250/1250 [00:11<00:00, 107.29it/s]
evaluating...: 100%|██████████| 157/157 [00:02<00:00, 68.78it/s]
epoch: 4
train_loss: 0.202, train_acc: 0.931
valid_loss: 0.256, valid_acc: 0.906
training...: 100%|██████████| 1250/1250 [00:11<00:00, 107.89it/s]
evaluating...: 100%|██████████| 157/157 [00:02<00:00, 68.34it/s]
epoch: 5
train_loss: 

In [31]:
def predict_sentiment(text, model, device):
    ids = train_dataset.vocab.numericalize(text)
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
    prediction = model(tensor).squeeze(dim=0)
    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    print(f"{'Negative' if predicted_class == 0 else 'Positive'} | probability score = {predicted_probability:.4f}")

In [32]:
predict_sentiment('This film is terrible', model, device)

Negative | probability score = 1.0000


In [33]:
predict_sentiment('This film is great', model, device)

Positive | probability score = 1.0000
