In [1]:
!pip install text-preprocessing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting text-preprocessing
  Downloading text_preprocessing-0.1.1-py2.py3-none-any.whl (9.6 kB)
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting unittest-xml-reporting
  Downloading unittest_xml_reporting-3.2.0-py2.py3-none-any.whl (20 kB)
Collecting names-dataset==2.1
  Downloading names_dataset-2.1.0-py3-none-any.whl (62.6 MB)
[K     |████████████████████████████████| 62.6 MB 1.1 MB/s 
[?25hCollecting pyspellchecker
  Downloading pyspellchecker-0.7.1-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 60.3 MB/s 
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (110 kB)
[K     |████████████████████████████████| 110 kB 70.9 MB/s 
[?25hCollecting anyasci

In [2]:
from text_preprocessing import preprocess_text
from text_preprocessing import to_lower, remove_email, remove_url, remove_punctuation

preprocess_functions = [to_lower, remove_email, remove_url, remove_punctuation]

def clean_text(text):
    return preprocess_text(text, preprocess_functions)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [3]:
def sentiment2label(sentiment):
    return 0 if sentiment == 'negative' else 1

In [4]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/PyTorch/PyTorch-NLP-Tutorial/Corpus/IMDB Dataset.csv')
df['text'] = df['review'].apply(clean_text)
df['label'] = df['sentiment'].apply(sentiment2label)
df = df[['text', 'label']]
df.dropna(inplace=True) 
df.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,text,label
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production br br the filmin...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1
...,...,...
49995,i thought this movie did a down right good job...,1
49996,bad plot bad dialogue bad acting idiotic direc...,0
49997,i am a catholic taught in parochial elementary...,0
49998,im going to have to disagree with the previous...,0


In [5]:
class Vocabulary: 
    def __init__(self, freq_threshold=10, max_size=100000):
        '''
        freq_threshold : the minimum times a word must occur in corpus to be included in vocabulary
        max_size : max vocab size
        '''
        self.freq_threshold = freq_threshold
        self.max_size = max_size

        self.itos = {0: '<PAD>', 1:'<SOS>', 2:'<EOS>', 3: '<UNK>'}
        self.stoi = {k:j for j, k in self.itos.items()} 
          
    
    def __len__(self):
        return len(self.itos)
    

    @staticmethod
    def tokenizer(text):
        return [tok.lower().strip() for tok in text.split(' ')]

    
    @staticmethod
    def generate_bigrams(x):
        n_grams = set(zip(*[x[i:] for i in range(2)]))
        for n_gram in n_grams:
            x.append(' '.join(n_gram))
        return x

    
    def build_vocabulary(self, sentence_list):
        '''
        build the vocabulary: create a dictionary mapping of index to string (itos) and string to index (stoi)
        (itos) -> {5:'the', 6:'a', 7:'an'} | (stoi) -> {'the':5, 'a':6, 'an':7}
        '''
        frequencies = {} 
        idx = 4  # because 4 tokens already added -> (itos) -> {0: '<PAD>', 1:'<SOS>', 2:'<EOS>', 3: '<UNK>'}
        
        # # calculate the freq of words
        # for sentence in sentence_list:
        #     for word in self.tokenizer(sentence):
        #         if word not in frequencies.keys():
        #             frequencies[word] = 1
        #         else:
        #             frequencies[word] += 1
        

        # calculate the freq of words
        for sentence in sentence_list:

            sentence = self.tokenizer(sentence)
            sentence_n_bigrams = self.generate_bigrams(sentence)

            for word in sentence_n_bigrams:
                if word not in frequencies.keys():
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1
                    
                    
        # limit vocab by removing low freq words
        frequencies = {k:v for k,v in frequencies.items() if v >= self.freq_threshold} 
        
        # limit vocab to the max_size specified
        frequencies = dict(sorted(frequencies.items(), key = lambda x: -x[1])[:self.max_size-idx]) # idx = 4 for pad, start, end , unk
            
        # create vocab
        for word in frequencies.keys():
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1

 
    # def numericalize(self, text):
    #     '''
    #     convert the list of words to a list of corresponding indexes
    #     eg. cat and a dog -> [4, 5, 6, 3]
    #     '''   
    #     tokenized_text = self.tokenizer(text)  # tokenize text 
    #     numericalized_text = []

    #     for token in tokenized_text:
    #         if token in self.stoi.keys():
    #             numericalized_text.append(self.stoi[token])
    #         else: # out-of-vocab (OOV) words are represented by UNK token index
    #             numericalized_text.append(self.stoi['<UNK>'])
                
    #     return numericalized_text

    def numericalize(self, text):
        tokenized_text = self.tokenizer(text)  # tokenize text 
        sentence_n_bigrams = self.generate_bigrams(tokenized_text)

        numericalized_text = []

        for token in sentence_n_bigrams:
            if token in self.stoi.keys():
                numericalized_text.append(self.stoi[token])
            else: # out-of-vocab (OOV) words are represented by UNK token index
                numericalized_text.append(self.stoi['<UNK>'])
                
        return numericalized_text

In [6]:
# vocab = Vocabulary(1, 100)
# vocab.build_vocabulary(
#     ['Joe waited for the train', 'The train was late', 'This is a cat', 'Dogs are friendly']
# )

# print(vocab.numericalize('The train was late'))
# print(vocab.stoi)

In [7]:
import torch
from torch.nn.utils.rnn import pad_sequence

In [8]:
class TrainDataset(torch.utils.data.Dataset):

    def __init__(self, df, text_column, label_column, freq_threshold=5, vocab_size=10000):
        self.df = df
        
        # get texts and labels
        self.texts = self.df[text_column]
        self.labels = self.df[label_column]
        
        # build vocabulary
        self.vocab = Vocabulary(freq_threshold, vocab_size)
        self.vocab.build_vocabulary(self.texts.tolist())


    def __len__(self):
        return len(self.df)
    
    
    def __getitem__(self, index):
        text = self.texts[int(index)]
        label = self.labels[index]
            
        # numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1,12,2,9,24,2]
        numerialized_text = [self.vocab.stoi["<SOS>"]]
        numerialized_text += self.vocab.numericalize(text)
        numerialized_text.append(self.vocab.stoi["<EOS>"])
    
        label = [float(label)]
        
        return torch.tensor(numerialized_text), torch.tensor(label, requires_grad = True) 

In [9]:
train_dataset = TrainDataset(
    df = df, text_column = 'text', label_column = 'label', 
    freq_threshold = 10, vocab_size = 25000
)

print(f'{df.loc[1]}\n')

text, label = train_dataset[1]
print(text)
print(label)

text     a wonderful little production br br the filmin...
label                                                    1
Name: 1, dtype: object

tensor([    1,     6,   531,   134,   493,    15,    15,     4,  2429,  6444,
            9,    57,     3,    57,     3,  3052,     5,   544,     6,     3,
            5,   757,     3,   373,     7,  3649,     8,     4,   587,   566,
           15,    15,     4,   182,    26,   811,    79,  4561,   707, 10824,
           24,    66,    48,   231,    34,     4,     3,    21,    30,    48,
           34,     4,  4436,   232,  7214,   110,    25,    74,   482,    70,
            4,     3,  1265,     3,    36,     4,  3442,     8,  3135, 20097,
        17593,    24,    66,     9,    11,    79,   354,     4,   176,    21,
           11,     9,     6,     3,   596,     5,  4817,   566,     6, 10040,
          493,    46,    31,     7,     4,    87,  6974,     7,   270,     5,
           27,   142,    15,    15,     4,  3649,    68,   344,   472,    18,


In [10]:
class TestDataset(torch.utils.data.Dataset):

    def __init__(self, train_dataset, df, text_column, label_column):
        self.train_dataset = train_dataset
        self.df = df
        
        # get texts and labels
        self.texts = self.df[text_column]
        self.labels = self.df[label_column]
        
        # utilizing vocabulary created using training set
        self.vocab = self.train_dataset.vocab


    def __len__(self):
        return len(self.df)
    
    
    def __getitem__(self, index):
        text = self.texts[int(index)]
        label = self.labels[index]
            
        # numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1,12,2,9,24,2]
        numerialized_text = [self.vocab.stoi["<SOS>"]]
        numerialized_text += self.vocab.numericalize(text)
        numerialized_text.append(self.vocab.stoi["<EOS>"])
    
        label = [float(label)]
        
        return torch.tensor(numerialized_text), torch.tensor(label, requires_grad = True) 

In [11]:
test_dataset = TestDataset(
    train_dataset = train_dataset, df = df, text_column = 'text', label_column = 'label'
)

print(f'{df.loc[100]}\n')

text, label = test_dataset[100]
print(text)
print(label)

text     this short film that inspired the soontobe ful...
label                                                    1
Name: 100, dtype: object

tensor([    1,    13,   456,    22,    14,  3021,     4,     3,   511,  3647,
         1290,    35,     3,  6216,    35,     9,     6,   888,   566,    14,
            3,   609,  1128,  4584,     3,  5296,  1439,     4,   456,    22,
          577,  4802,     4,     3,    40,   111,   123,  8629,    39,    27,
          385, 12397,    10,     4,   804,   609,     4,   658,     3,    13,
         1350,  4187,    19,    51,   189,  2635,  1219,     4,   628,    21,
            9,  1973,  1690,    19,     4, 23138,     7,     4,  1290,    13,
          106,     3,   954,    65,     9, 23414,    36,    51,     3,   270,
            5,     6,   550,  1060,   917,   663,    24,    78,   447,  2891,
         3564,    11,     4,   133,    14,     9,   375,    62,   178,    68,
           28,    65,     8,   138,    21,    32,  2416,  1851,    11,   133

In [12]:
class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
        
    def __call__(self, batch):
        source = [item[0] for item in batch] 
        source = pad_sequence(source, batch_first=False, padding_value = self.pad_idx) 
        
        target = torch.tensor([item[1].item() for item in batch])
        return source, target

In [13]:
train_loader = torch.utils.data.DataLoader(
    dataset = train_dataset, batch_size = 32, num_workers = 1, shuffle = True, pin_memory = True, drop_last = True,
    collate_fn = MyCollate(pad_idx = train_dataset.vocab.stoi["<PAD>"])
)

In [14]:
test_loader = torch.utils.data.DataLoader(
    dataset = test_dataset, batch_size = 64, num_workers = 1, shuffle = True, pin_memory = True,
    collate_fn = MyCollate(pad_idx = train_dataset.vocab.stoi["<PAD>"])
)

In [15]:
for idx, (texts, labels) in enumerate(train_loader):
    print(texts.shape, labels.shape)
    if idx >= 4:
        break

torch.Size([1185, 32]) torch.Size([32])
torch.Size([993, 32]) torch.Size([32])
torch.Size([1876, 32]) torch.Size([32])
torch.Size([1372, 32]) torch.Size([32])
torch.Size([1204, 32]) torch.Size([32])


In [16]:
for (text, label) in train_loader:
    print(f"{text.shape}\n{type(text)}\n{text}")
    print(f"\n{label.shape}\n{type(label)}\n{label}")
    break

torch.Size([1151, 32])
<class 'torch.Tensor'>
tensor([[   1,    1,    1,  ...,    1,    1,    1],
        [1670,   12, 1248,  ...,   52,   79,   12],
        [  12,   68,    3,  ...,   26,   11,  127],
        ...,
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0]])

torch.Size([32])
<class 'torch.Tensor'>
tensor([0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 1.,
        0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.])


In [17]:
import torch.nn as nn
import torch.nn.functional as F

class FastText(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx) 
        self.fc = nn.Linear(embedding_dim, output_dim)
    
    def forward(self, text):
        # text = [sent len, batch size]
        embedded = self.embedding(text)
        # embedded = [sent len, batch size, emb dim]
        embedded = embedded.permute(1, 0, 2)
        # embedded = [batch size, sent len, emb dim]
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)
        # pooled = [batch size, embedding_dim]
        return self.fc(pooled)

In [18]:
INPUT_DIM = train_dataset.vocab.__len__()
EMBEDDING_DIM = 100
OUTPUT_DIM = 1
PAD_IDX = train_dataset.vocab.stoi["<PAD>"]

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)

In [19]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,500,101 trainable parameters


In [20]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
criterion = criterion.to(device)

In [22]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [23]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()

        text, label = batch
        text, label = text.to(device), label.to(device)
                
        predictions = model(text).squeeze(1)
        
        loss = criterion(predictions, label)
        
        acc = binary_accuracy(predictions, label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [24]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, label = batch
            text, label = text.to(device), label.to(device)

            predictions = model(text).squeeze(1)
            
            loss = criterion(predictions, label)
            
            acc = binary_accuracy(predictions, label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [25]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [26]:
N_EPOCHS = 20

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, test_loader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 45s
	Train Loss: 0.613 | Train Acc: 72.36%
	 Val. Loss: 0.495 |  Val. Acc: 85.06%
Epoch: 02 | Epoch Time: 0m 42s
	Train Loss: 0.390 | Train Acc: 87.36%
	 Val. Loss: 0.342 |  Val. Acc: 89.45%
Epoch: 03 | Epoch Time: 0m 42s
	Train Loss: 0.296 | Train Acc: 90.01%
	 Val. Loss: 0.278 |  Val. Acc: 91.11%
Epoch: 04 | Epoch Time: 0m 44s
	Train Loss: 0.252 | Train Acc: 91.32%
	 Val. Loss: 0.242 |  Val. Acc: 92.13%
Epoch: 05 | Epoch Time: 0m 42s
	Train Loss: 0.223 | Train Acc: 92.26%
	 Val. Loss: 0.217 |  Val. Acc: 93.00%
Epoch: 06 | Epoch Time: 0m 42s
	Train Loss: 0.200 | Train Acc: 93.08%
	 Val. Loss: 0.195 |  Val. Acc: 93.73%
Epoch: 07 | Epoch Time: 0m 42s
	Train Loss: 0.183 | Train Acc: 93.77%
	 Val. Loss: 0.178 |  Val. Acc: 94.37%
Epoch: 08 | Epoch Time: 0m 42s
	Train Loss: 0.169 | Train Acc: 94.36%
	 Val. Loss: 0.164 |  Val. Acc: 94.92%
Epoch: 09 | Epoch Time: 0m 43s
	Train Loss: 0.155 | Train Acc: 94.87%
	 Val. Loss: 0.151 |  Val. Acc: 95.48%
Epoch: 10 | Epoch T

In [27]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model, test_loader, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.066 | Test Acc: 98.73%


In [28]:
def predict_sentiment(text, model, device):
    ids = train_dataset.vocab.numericalize(text)
    tensor = torch.LongTensor(ids).unsqueeze(dim=1).to(device)
    prediction = model(tensor).squeeze(dim=1)
    predicted_probability = torch.sigmoid(prediction).item()
    predicted_class = torch.round(torch.tensor(predicted_probability))
    print(f"{'Negative' if predicted_class == 0 else 'Positive'} | probability score = {predicted_probability:.4f}")

In [29]:
predict_sentiment("This film is terrible", model, device)

Negative | probability score = 0.0000


In [30]:
predict_sentiment("This film is great", model, device)

Positive | probability score = 1.0000
