In [1]:
import torch
from torch import nn
import torchtext
from torch.utils.data import DataLoader
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torch.utils.data import DataLoader, BatchSampler, RandomSampler
import pandas as pd
import numpy as np
import re
from tqdm import tqdm


In [2]:
# Read the IMDB train and test splits from local CSV files into DataFrames.
# Later cells read the 'text' and 'label' columns of the training frame.
train_data = pd.read_csv(filepath_or_buffer="data/imdb/Train.csv")
test_data = pd.read_csv(filepath_or_buffer="data/imdb/Test.csv")

In [3]:
def yield_tokens(data_iter):
    """Yield one token list per document, for building the vocabulary.

    Each text is lowercased before tokenization because ``collate_batch``
    lowercases every document (``_text.lower()``) before looking tokens up.
    Building the vocabulary from cased text would leave capitalized-only
    tokens unreachable and map their lowercase forms to ``<unk>``.

    Args:
        data_iter: Iterable of raw text strings.

    Yields:
        The token list returned by the module-level ``tokenizer`` for the
        lowercased text.
    """
    for text in data_iter:
        yield tokenizer(text.lower())
# Tokenizer used both for vocabulary building and for encoding batches.
tokenizer = get_tokenizer(tokenizer='spacy')
# Pretrained GloVe vectors: the "6B" set at 100 dimensions.
embed = torchtext.vocab.GloVe(name="6B", dim=100)



In [4]:
# Build the token -> index vocabulary from the training texts, with "<unk>"
# as the only special token.
vocab = torchtext.vocab.build_vocab_from_iterator(
    iterator=yield_tokens(train_data['text'].values),
    specials=["<unk>"],
)
# Tokens that are not in the vocabulary resolve to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])

In [5]:
def text_pipeline(x):
    """Tokenize a raw string and map its tokens to vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a raw label value to a Python int."""
    return int(x)

In [6]:
from torch.utils.data import DataLoader
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def collate_batch(batch):
    """Turn a list of (label, text) samples into padded tensors on ``device``.

    Args:
        batch: Sequence of ``(label, text)`` pairs, as produced by the dataset.

    Returns:
        A ``(labels, texts)`` tuple: ``labels`` is a float32 tensor reshaped
        to ``(batch, 1)``; ``texts`` is an int64 tensor of token indices,
        batch-first, padded to the longest document in the batch.
    """
    labels = [label_pipeline(raw_label) for raw_label, _ in batch]
    # Each document is lowercased, tokenized and mapped to vocabulary indices.
    encoded = [
        torch.tensor(text_pipeline(raw_text.lower()), dtype=torch.int64)
        for _, raw_text in batch
    ]
    # Shorter documents are padded up to the longest sequence in this batch.
    padded_text = torch.nn.utils.rnn.pad_sequence(encoded, batch_first=True)
    label_column = torch.tensor(labels, dtype=torch.float32).reshape(-1, 1)
    return label_column.to(device), padded_text.to(device)


In [7]:
class SentTM(nn.Module):
    """Bidirectional LSTM binary classifier over frozen pretrained embeddings.

    Args:
        embedding_vector: 2-D float tensor of pretrained embeddings, one row
            per token index. Its second dimension sets the LSTM input size.
        hidden_size: Hidden units per LSTM direction (default 100).
        num_layers: Number of stacked LSTM layers (default 2).
    """

    def __init__(self, embedding_vector, hidden_size=100, num_layers=2) -> None:
        super().__init__()

        self.embed = torch.nn.Embedding.from_pretrained(embedding_vector, freeze=True)

        # Derive the input width from the embedding matrix instead of
        # hard-coding it, so embeddings of any dimensionality work.
        self.rnn = nn.LSTM(
            input_size=self.embed.embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Not used in forward(); kept so the module's attributes are unchanged.
        self.relu = nn.ReLU()
        # The classifier sees the forward and backward final states concatenated.
        self.fc1 = nn.Linear(2 * hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Classify a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices, batch-first.

        Returns:
            ``(logits, probs)``: raw scores of shape ``(batch, 1)`` and their
            sigmoid. Use ``logits`` with ``BCEWithLogitsLoss``.
        """
        x = self.embed(x)

        # NOTE: the sequence is not packed, so any padded positions are fed
        # through the LSTM like real tokens.
        _, (hidden, _) = self.rnn(x)

        # The last two entries of the final hidden state are the top layer's
        # forward and backward directions.
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=-1)
        logits = self.fc1(hidden)

        probs = self.sigmoid(logits)

        return logits, probs

In [8]:
class Imdb(torch.utils.data.Dataset):
    """Map-style dataset over a DataFrame with 'text' and 'label' columns."""

    def __init__(self, data):
        # Keep the raw column values as arrays for positional indexing.
        self._label = data['label'].values
        self._text = data['text'].values

    def __len__(self):
        """Return the number of samples."""
        return len(self._text)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a ``(label, text)`` pair."""
        return self._label[idx], self._text[idx]

In [9]:
# `embed.vectors` is ordered by GloVe's own vocabulary, but the token ids fed
# to the model come from `vocab`. Build an embedding matrix whose row i is the
# GloVe vector of vocab token i, so each index looks up the right word.
# Tokens missing from GloVe get its unknown-token initialisation (zeros by
# default). lower_case_backup retries the lowercase form of a cased token.
embedding_matrix = embed.get_vecs_by_tokens(vocab.get_itos(), lower_case_backup=True)
model = SentTM(embedding_matrix).to(device)
# The model returns raw logits first, which is what BCEWithLogitsLoss expects.
loss_fn = nn.BCEWithLogitsLoss().to(device)
optim = torch.optim.Adam(model.parameters(), lr=1e-4)
batch_size = 64
dataset = Imdb(train_data)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

In [10]:
# Train for 10 passes over the dataloader, recording per-batch loss and accuracy.
model.train()
total_acc, total_count = 0, 0  # total_count is not used in this cell
log_interval = 10  # not used in this cell
epoch = 0  # not used in this cell
acc = []  # per-batch accuracy history
l = []  # per-batch loss history
for i in range(10):
    for (label, text) in (t := tqdm(dataloader)):

        optim.zero_grad()
        # forward pass
        logits, probs = model(text)

        # Compute loss on the raw logits
        loss = loss_fn(logits, label)
        loss.backward()
        optim.step()

        # Count correct predictions at a 0.5 threshold.
        total_acc = (probs.round() == label).sum().item()
        # Divide by the actual batch size rather than a hard-coded 64, so the
        # figure stays right for a short final batch or a changed batch_size.
        batch_acc = total_acc / label.size(0)
        acc.append(batch_acc)
        l.append(loss.item())
        t.set_description(f"Loss: {loss.item():.2f}, Acc: {batch_acc:.2f}")

    


Loss: 0.69, Acc: 0.53:   2%|▏         | 12/625 [00:18<15:59,  1.56s/it]


KeyboardInterrupt: 