In [1]:
import torch
from torch import nn
import torchtext
from torch.utils.data import DataLoader
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torch.utils.data import DataLoader, BatchSampler, RandomSampler
import pandas as pd
import numpy as np
import re
import tqdm

In [2]:
# Training split of IMDB, stored under the local "data" directory.
train_data = IMDB(split="train", root="data")
# spaCy-backed tokenizer; shared by vocabulary building and batch collation.
tokenizer = get_tokenizer("spacy")
# Pre-trained GloVe vectors: the "6B" release at 100 dimensions.
embed = torchtext.vocab.GloVe(name="6B", dim=100)



In [3]:
def yield_tokens(data_iter):
    """Yield one token list per example, for vocabulary construction.

    Args:
        data_iter: Iterable of ``(label, text)`` pairs; the label is ignored.

    Yields:
        The result of applying the module-level ``tokenizer`` to each
        lower-cased ``text``.
    """
    for _, text in data_iter:
        # collate_batch lower-cases every review before looking tokens up, so
        # the vocabulary has to be built from lower-cased text as well;
        # otherwise words that only occur capitalised fall back to "<unk>".
        yield tokenizer(text.lower())

# Vocabulary over the training tokens, with "<unk>" as the only special symbol.
vocab = torchtext.vocab.build_vocab_from_iterator(
    yield_tokens(train_data),
    specials=["<unk>"],
)
# Tokens missing from the vocabulary resolve to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])

In [4]:
def text_pipeline(x):
    """Tokenize the string ``x`` and look the resulting tokens up in ``vocab``."""
    return vocab(tokenizer(x))

In [5]:
from torch.utils.data import DataLoader
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# nn.BCEWithLogitsLoss expects targets in [0, 1]. torchtext's IMDB dataset is
# documented to yield the integer labels 1 (negative) and 2 (positive); older
# releases yield the strings "neg" / "pos". Both forms are mapped to 0.0 / 1.0.
_LABEL_TO_TARGET = {1: 0.0, 2: 1.0, "neg": 0.0, "pos": 1.0}


def collate_batch(batch):
    """Turn a list of ``(label, text)`` examples into padded batch tensors.

    Args:
        batch: Sequence of ``(label, text)`` pairs, where ``label`` is one of
            the keys of ``_LABEL_TO_TARGET`` and ``text`` is a raw string.

    Returns:
        ``(labels, texts)``, both moved to ``device``:
        ``labels`` is a float32 tensor of shape ``(batch, 1)`` holding 0.0 for
        negative and 1.0 for positive reviews; ``texts`` is an int64 tensor of
        shape ``(batch, longest_sequence)`` holding vocabulary indices.

    Raises:
        KeyError: If a label is not a recognised IMDB label.
    """
    label_list, text_list = [], []
    for _label, _text in batch:
        label_list.append(_LABEL_TO_TARGET[_label])
        token_ids = text_pipeline(_text.lower())
        text_list.append(torch.tensor(token_ids, dtype=torch.int64))
    # pad_sequence right-pads shorter sequences with its default value 0.
    # NOTE(review): index 0 is presumably "<unk>" (the only special token), so
    # padding and unknown words share one embedding row - confirm this is intended.
    text_list = torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    label_list = torch.tensor(label_list, dtype=torch.float32).reshape(-1, 1)
    return label_list.to(device), text_list.to(device)


In [6]:
class SentTM(nn.Module):
    """Bidirectional LSTM binary classifier over frozen pre-trained embeddings.

    Args:
        embedding_vector: 2-D float tensor of shape ``(vocab_size, embed_dim)``
            used to initialise the frozen embedding table. The LSTM input size
            is taken from ``embed_dim``, so any embedding width is accepted.
        hidden_size: Hidden units per LSTM direction (default 100).
        num_layers: Number of stacked LSTM layers (default 2).
    """

    def __init__(self, embedding_vector, hidden_size=100, num_layers=2) -> None:
        super().__init__()

        self.embed = torch.nn.Embedding.from_pretrained(embedding_vector, freeze=True)

        self.rnn = nn.LSTM(
            # Derived from the embedding table rather than hard-coded, so the
            # model keeps working when the embedding dimensionality changes.
            input_size=self.embed.embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Not applied in forward(); retained so existing attribute access keeps working.
        self.relu = nn.ReLU()
        # Forward and backward final states are concatenated, hence 2 * hidden_size.
        self.fc1 = nn.Linear(2 * hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of token-index sequences.

        Args:
            x: Integer tensor of embedding indices, laid out batch-first
                as ``(batch, seq_len)``.

        Returns:
            ``(logits, probs)``, each of shape ``(batch, 1)``: the raw scores
            and their sigmoid.
        """
        x = self.embed(x)

        # NOTE(review): sequences are not packed, so any padded positions are
        # processed like real tokens and influence the final hidden states.
        _, (hidden, _) = self.rnn(x)

        # The last two slices of the final hidden state belong to the top layer:
        # [-2] is its forward direction, [-1] its backward direction.
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=-1)
        logits = self.fc1(hidden)

        probs = self.sigmoid(logits)

        return logits, probs

In [7]:
# Token ids fed to the model come from `vocab`, so the embedding table must be
# laid out in `vocab` order. Passing `embed.vectors` directly would index the
# table in GloVe's own row order and return unrelated vectors for every token.
# Tokens absent from GloVe receive the vectors' unknown-token initialisation;
# `lower_case_backup=True` retries a lower-cased form before giving up.
embedding_matrix = embed.get_vecs_by_tokens(vocab.get_itos(), lower_case_backup=True)

model = SentTM(embedding_matrix).to(device)
loss_fn = nn.BCEWithLogitsLoss().to(device)
optim = torch.optim.Adam(model.parameters(), lr=1e-4)
batch_size = 64
# Same root as the vocabulary-building split, so one on-disk copy is shared.
train_iter = IMDB(root="data", split="train")
dataloader = DataLoader(train_iter, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

In [9]:
# Single training pass over `dataloader`, recording per-batch metrics.
model.train()
total_acc, total_count = 0, 0  # running correct predictions / examples seen
log_interval = 10  # refresh the progress-bar summary every N batches
epoch = 0  # only one pass is run; NOTE(review): not read anywhere in this cell
acc = []  # per-batch accuracy as a fraction in [0, 1]
l = []  # per-batch loss
progress = tqdm.tqdm(dataloader)
for idx, (label, text) in enumerate(progress):
    optim.zero_grad()

    # Forward pass
    logits, probs = model(text)

    # Compute loss on the raw logits and take one optimiser step
    loss = loss_fn(logits, label)
    loss.backward()
    optim.step()

    # Normalise by the actual batch size so a shorter final batch is
    # comparable with the others, and accumulate running totals instead of
    # overwriting them on every step.
    batch_correct = (probs.round() == label).sum().item()
    batch_count = label.size(0)
    total_acc += batch_correct
    total_count += batch_count
    acc.append(batch_correct / batch_count)
    l.append(loss.item())

    if idx % log_interval == 0:
        progress.set_postfix(loss=l[-1], acc=total_acc / total_count)
    


47it [12:34, 17.89s/it]

In [None]:
import matplotlib.pyplot as plt

# The two series have different units, so each gets its own labelled panel
# instead of sharing one unlabelled axis.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))
ax_acc.plot(acc)
ax_acc.set(title="Training accuracy metric per batch", xlabel="batch", ylabel="acc")
ax_loss.plot(l)
ax_loss.set(title="Training loss per batch", xlabel="batch", ylabel="loss")
fig.tight_layout()
plt.show()