## Clone the Repository

In [1]:
!git clone https://github.com/mehedihasanbijoy/PyTorch-NLP-Tutorial.git

fatal: destination path 'PyTorch-NLP-Tutorial' already exists and is not an empty directory.


## Preparing Custom Dataset

In [2]:
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('/content/PyTorch-NLP-Tutorial/1. Text Classification/corpus/TweetSentiment.csv')
df.dropna(inplace=True)
df.reset_index(inplace=True)
df.sample(2)

Unnamed: 0,index,text,cleaned_text,sentiment,label
967,1011,Think I`ll go enjoy the sun`s rays again...I L...,think ill enjoy the suns rays againi love bein...,positive,2.0
17320,17809,Now I`m off to bed - HAPPY MOTHER`S DAY ALL - ...,now off bed happy mothers day all have great one,positive,2.0


In [4]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=32)
train_df.reset_index(inplace=True, drop=True)
test_df.reset_index(inplace=True, drop=True)

In [5]:
class Vocabulary: 
    def __init__(self, freq_threshold=10, max_size=10000):
        '''
        freq_threshold : the minimum times a word must occur in corpus to be included in vocabulary
        max_size : max vocab size
        '''
        self.freq_threshold = freq_threshold
        self.max_size = max_size

        self.itos = {0: '', 1:'', 2:'', 3: ''}
        self.stoi = {k:j for j, k in self.itos.items()} 
          
    
    def __len__(self):
        return len(self.itos)
    

    @staticmethod
    def tokenizer(text):
        return [tok.lower().strip() for tok in text.split(' ')]
    
    
    def build_vocabulary(self, sentence_list):
        '''
        build the vocabulary: create a dictionary mapping of index to string (itos) and string to index (stoi)
        (itos) -> {5:'the', 6:'a', 7:'an'} | (stoi) -> {'the':5, 'a':6, 'an':7}
        '''
        frequencies = {} 
        idx = 4  # because 4 tokens already added -> (itos) -> {0: '', 1:'', 2:'', 3: ''}
        
        # calculate the freq of words
        for sentence in sentence_list:
            for word in self.tokenizer(sentence):
                if word not in frequencies.keys():
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1
                    
                    
        # limit vocab by removing low freq words
        frequencies = {k:v for k,v in frequencies.items() if v > self.freq_threshold} 
        
        # limit vocab to the max_size specified
        frequencies = dict(sorted(frequencies.items(), key = lambda x: -x[1])[:self.max_size-idx]) # idx = 4 for pad, start, end , unk
            
        # create vocab
        for word in frequencies.keys():
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1

 
    def numericalize(self, text):
        '''
        convert the list of words to a list of corresponding indexes
        eg. cat and a dog -> [4, 5, 6, 3]
        '''   
        tokenized_text = self.tokenizer(text)  # tokenize text 
        numericalized_text = []

        for token in tokenized_text:
            if token in self.stoi.keys():
                numericalized_text.append(self.stoi[token])
            else: # out-of-vocab (OOV) words are represented by UNK token index
                numericalized_text.append(self.stoi[''])
                
        return numericalized_text

In [6]:
class TrainDataset(Dataset):

    def __init__(self, df, text_column, label_column, freq_threshold=5, vocab_size=10000):
        self.df = df
        
        # get texts and labels
        self.texts = self.df[text_column]
        self.labels = self.df[label_column]
        
        # build vocabulary
        self.vocab = Vocabulary(freq_threshold, vocab_size)
        self.vocab.build_vocabulary(self.texts.tolist())


    def __len__(self):
        return len(self.df)
    
    
    def __getitem__(self, index):
        text = self.texts[int(index)]
        label = self.labels[index]
            
        # numericalize texts ['','cat', 'in', 'a', 'bag',''] -> [1,12,2,9,24,2]
        numerialized_text = [self.vocab.stoi[""]]
        numerialized_text += self.vocab.numericalize(text)
        numerialized_text.append(self.vocab.stoi[""])
    
        label = [int(label)]
        
        return torch.tensor(numerialized_text), torch.tensor(label)

In [7]:
class TestDataset(Dataset):

    def __init__(self, train_dataset, df, text_column, label_column):
        self.train_dataset = train_dataset
        self.df = df
        
        # get texts and labels
        self.texts = self.df[text_column]
        self.labels = self.df[label_column]
        
        # utilizing vocabulary created using training set
        self.vocab = self.train_dataset.vocab


    def __len__(self):
        return len(self.df)
    
    
    def __getitem__(self, index):
        text = self.texts[int(index)]
        label = self.labels[index]
            
        # numericalize texts ['','cat', 'in', 'a', 'bag',''] -> [1,12,2,9,24,2]
        numerialized_text = [self.vocab.stoi[""]]
        numerialized_text += self.vocab.numericalize(text)
        numerialized_text.append(self.vocab.stoi[""])
    
        label = [int(label)]
        
        return torch.tensor(numerialized_text), torch.tensor(label)

In [8]:
train_dataset = TrainDataset(
    df = train_df, text_column = 'cleaned_text', label_column = 'label', 
    freq_threshold = 5, vocab_size = 10000
)

test_dataset = TestDataset(
    train_dataset = train_dataset, df = test_df, text_column = 'cleaned_text', label_column = 'label'
)

In [9]:
class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
        
    def __call__(self, batch):
        source = [item[0] for item in batch] 
        source = pad_sequence(source, batch_first=False, padding_value = self.pad_idx) 
        
        target = torch.tensor([item[1].item() for item in batch])
        return source, target

In [10]:
train_loader = DataLoader(
    dataset = train_dataset, batch_size = 128, num_workers = 1, shuffle = True, pin_memory = True, drop_last = True,
    collate_fn = MyCollate(pad_idx = train_dataset.vocab.stoi[""])
)

test_loader = DataLoader(
    dataset = test_dataset, batch_size = 256, num_workers = 1, shuffle = True, pin_memory = True, 
    collate_fn = MyCollate(pad_idx = train_dataset.vocab.stoi[""])
)

## Build the Model

In [11]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

In [12]:
class RNN(nn.Module):

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings = input_dim, embedding_dim = embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(in_features = hidden_dim, out_features = output_dim)
    

    def forward(self, text):  # text → [sentence length x batch size]
        embedded_text = self.embedding(text)  
        # embedded_text → [sentence length x batch size x embedding dim]

        output, hidden_state = self.rnn(embedded_text)  
        # output → [sentence length x batch size x embedding dim]
        # hidden_state → [1 x batch size x embedding dim]
        
        # checking last hidden state
        assert torch.equal(output[-1, :, :], hidden_state.squeeze(0))
        
        prediction = self.fc(hidden_state.squeeze(0))  # hidden_state.squeeze(0) → [batch size x embedding dim]
        # prediction → [batch size x output dim]

        return prediction

In [13]:
# hyperparameters
INPUT_DIM = train_dataset.vocab.__len__()
EMBEDDING_DIM = 64
HIDDEN_DIM = 128
OUTPUT_DIM = len(set(df['label'].values))

In [14]:
model = RNN(
    INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM
)

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 238,915 trainable parameters


In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [17]:
def train(model, dataloader, optimizer, criterion, device):
    epoch_loss, epoch_acc = 0, 0
    model.train()

    for (texts, labels) in tqdm(dataloader):
        texts, labels = texts.to(device), labels.to(device) 

        predictions = model(texts)
        predictions = torch.argmax(predictions, 1)

        loss = criterion(predictions.to(float), labels.to(float))
        loss.requires_grad = True
        acc = (predictions == labels).float().sum() / len(labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)

In [18]:
def evaluate(model, dataloader, criterion, device):  
    epoch_loss, epoch_acc = 0, 0
    model.eval()
    
    with torch.no_grad():
        for (texts, labels) in tqdm(dataloader):
            texts, labels = texts.to(device), labels.to(device) 

            predictions = model(texts)
            predictions = torch.argmax(predictions, 1)
            
            loss = criterion(predictions.to(float), labels.to(float))
            acc = (predictions == labels).float().sum() / len(labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)

In [19]:
N_EPOCHS = 10

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc*100:.2f}% | Test Loss: {test_loss:.4f}, Test Acc: {test_acc*100:.2f}%\n")

100%|██████████| 188/188 [00:02<00:00, 70.52it/s]
100%|██████████| 24/24 [00:00<00:00, 39.41it/s]


Train Loss: 0.6906, Train Acc: 40.01% | Test Loss: 0.6910, Test Acc: 39.62%



100%|██████████| 188/188 [00:03<00:00, 51.35it/s]
100%|██████████| 24/24 [00:01<00:00, 17.18it/s]


Train Loss: 0.6909, Train Acc: 39.98% | Test Loss: 0.6916, Test Acc: 39.52%



100%|██████████| 188/188 [00:07<00:00, 26.03it/s]
100%|██████████| 24/24 [00:01<00:00, 14.02it/s]


Train Loss: 0.6906, Train Acc: 40.10% | Test Loss: 0.6908, Test Acc: 39.41%



100%|██████████| 188/188 [00:06<00:00, 28.26it/s]
100%|██████████| 24/24 [00:01<00:00, 20.72it/s]


Train Loss: 0.6903, Train Acc: 40.08% | Test Loss: 0.6929, Test Acc: 39.35%



100%|██████████| 188/188 [00:03<00:00, 61.28it/s]
100%|██████████| 24/24 [00:00<00:00, 43.23it/s]


Train Loss: 0.6903, Train Acc: 40.06% | Test Loss: 0.6918, Test Acc: 39.51%



100%|██████████| 188/188 [00:02<00:00, 78.57it/s]
100%|██████████| 24/24 [00:00<00:00, 42.35it/s]


Train Loss: 0.6906, Train Acc: 40.03% | Test Loss: 0.6917, Test Acc: 39.54%



100%|██████████| 188/188 [00:02<00:00, 79.05it/s]
100%|██████████| 24/24 [00:00<00:00, 43.02it/s]


Train Loss: 0.6900, Train Acc: 40.11% | Test Loss: 0.6921, Test Acc: 39.46%



100%|██████████| 188/188 [00:02<00:00, 78.72it/s]
100%|██████████| 24/24 [00:00<00:00, 42.94it/s]


Train Loss: 0.6894, Train Acc: 40.06% | Test Loss: 0.6904, Test Acc: 39.58%



100%|██████████| 188/188 [00:02<00:00, 77.22it/s]
100%|██████████| 24/24 [00:00<00:00, 42.72it/s]


Train Loss: 0.6905, Train Acc: 40.08% | Test Loss: 0.6924, Test Acc: 39.43%



100%|██████████| 188/188 [00:02<00:00, 78.09it/s]
100%|██████████| 24/24 [00:00<00:00, 42.04it/s]

Train Loss: 0.6910, Train Acc: 40.03% | Test Loss: 0.6914, Test Acc: 39.43%




