In [1]:
#pip uninstall torchtext

In [2]:
#pip install torchtext==0.15.1

In [3]:
! pip install datasets



In [4]:
import collections
from torchtext.data import get_tokenizer
import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import os
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm

In [5]:
# Fix the random seeds so that repeated runs of the notebook are comparable.
seed = 1234

np.random.seed(seed)  # NumPy global generator
torch.manual_seed(seed)  # PyTorch default generator
torch.cuda.manual_seed(seed)  # CUDA generator of the current device
torch.backends.cudnn.deterministic = True  # ask cuDNN for deterministic algorithms

In [6]:
# Load the "train" and "test" splits of the IMDB dataset via the `datasets` library.
train_data, test_data = datasets.load_dataset("imdb", split=["train", "test"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
train_data

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [8]:
# torchtext's "basic_english" tokenizer; shared by dataset preprocessing and predict_sentiment.
tokenizer = get_tokenizer("basic_english")

In [9]:
def tokenize_example(example, tokenizer, max_length):
    """Tokenize one example and wrap it in ``<bos>`` / ``<eos>`` markers.

    Args:
        example: Mapping with a ``"text"`` entry holding the raw string.
        tokenizer: Callable that turns a string into a list of tokens.
        max_length: Upper bound on the number of returned tokens, counting
            the two markers. The markers are always kept, so the result has
            at least two tokens even when ``max_length`` is smaller than 2.

    Returns:
        dict with ``"tokens"`` (list of str) and ``"length"`` (its size).
    """
    # Two slots are reserved for the markers. Clamp at zero: a negative slice
    # bound would count from the END of the list and let almost the whole
    # text through instead of truncating it.
    n_text_tokens = max(max_length - 2, 0)
    tokens = ["<bos>"] + tokenizer(example["text"])[:n_text_tokens] + ["<eos>"]
    return {"tokens": tokens, "length": len(tokens)}

In [10]:
# Maximum number of tokens kept per review, counting the <bos>/<eos> markers
# that tokenize_example adds.
max_length = 256

# Add "tokens" and "length" columns to both splits.
train_data = train_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)
test_data = test_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)

In [11]:
# Fraction of the original training split that is held out for validation.
test_size = 0.25

# NOTE(review): no explicit seed is passed to train_test_split. The vocabulary
# below is built from the resulting training portion, so confirm that this
# split is reproducible before changing anything here.
train_valid_data = train_data.train_test_split(test_size=test_size)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]

In [12]:
# Minimum token frequency passed to the vocabulary builder.
min_freq = 5
# Special entries: <unk> becomes the default index, <pad> is used to pad batches.
special_tokens = ["<unk>", "<pad>"]

# The vocabulary is built from the training portion only.
vocab =  torchtext.vocab.build_vocab_from_iterator(
    train_data["tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

In [13]:
# Integer ids of the special tokens; reused for the default index and for padding.
unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]

In [14]:
# Use the <unk> id as the vocabulary's default index.
vocab.set_default_index(unk_index)

In [15]:
def numericalize_example(example, vocab):
    """Look up the vocabulary index of every token in ``example["tokens"]``.

    Returns:
        dict with a single ``"ids"`` entry holding the looked-up indices.
    """
    return {"ids": vocab.lookup_indices(example["tokens"])}

In [16]:
# Add an "ids" column (vocabulary indices of the tokens) to all three splits.
train_data = train_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
valid_data = valid_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
test_data = test_data.map(numericalize_example, fn_kwargs={"vocab": vocab})

In [17]:
# Expose only the columns the model needs, returned as torch tensors.
train_data = train_data.with_format(type="torch", columns=["ids", "label", "length"])
valid_data = valid_data.with_format(type="torch", columns=["ids", "label", "length"])
test_data = test_data.with_format(type="torch", columns=["ids", "label", "length"])

In [18]:
train_data[0]

{'label': tensor(1),
 'length': tensor(137),
 'ids': tensor([   26,   182,     4,    14,    10,   193,   840,    36,     7,     2,
           123,   100,   818,    52,     8,  1528,     2,   382,     7,     2,
           187,     3,     8,   806,    14,    18,    10,     8,   806,  2257,
           431,     3,    12,   349,    15,     2,   390,  1022,     9,    16,
            78,     5,    66,     7, 11170,     3,     2,  9198,    17,   762,
             4,  7791,  5447,   200,    39,     4,     6,  3651,  3679,    17,
           229,     5, 11865,     3,    55,     7,    15,  3298,   356,     5,
           688,     4,     0,  5457,     3,  2726,  3815,     9,    16,   248,
            13,    14,    18,  1111,    35,    17,   129,   190,     8,    37,
           599,   183,    19,     5,    36,  3934,  1388,    13,     2,  5361,
             3,    34,  4214,    10,    43,  2815,     8,    39,     3,     2,
          4475,  1148,   215,  2646,     6, 16912,  3719,   591,  4827,     8,


In [19]:
def get_collate_fn(pad_index):
    """Build a ``collate_fn`` that pads variable-length examples into one batch.

    Args:
        pad_index: Value used to fill sequences shorter than the longest one
            in the batch.

    Returns:
        A function that takes a list of examples (dicts with tensor-valued
        ``"ids"``, ``"length"`` and ``"label"``) and returns a dict of batched
        tensors under the same three keys; ``"ids"`` is batch-first.
    """

    def collate_fn(batch):
        padded_ids = nn.utils.rnn.pad_sequence(
            [example["ids"] for example in batch],
            batch_first=True,
            padding_value=pad_index,
        )
        lengths = torch.stack([example["length"] for example in batch])
        labels = torch.stack([example["label"] for example in batch])
        return {"ids": padded_ids, "length": lengths, "label": labels}

    return collate_fn

In [20]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a ``DataLoader`` that pads each batch with ``pad_index``.

    Args:
        dataset: Dataset whose items are accepted by the function returned
            from ``get_collate_fn``.
        batch_size: Number of examples per batch.
        pad_index: Padding value forwarded to ``get_collate_fn``.
        shuffle: Forwarded to ``DataLoader``; off by default.

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=get_collate_fn(pad_index),
    )

In [21]:
# Number of examples per batch for all three loaders.
batch_size = 128

# Only the training loader shuffles; validation and test keep dataset order.
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [22]:
def fun2(data, input_lens):
    """Length-masked mean over the second axis of a 3-D tensor.

    Args:
        data: Tensor of shape (N, C, H); axis 1 is the (padded) step axis.
        input_lens: Tensor of shape (N,) with the number of valid steps per row.

    Returns:
        Tensor of shape (N, H): for row ``i``, the sum of its first
        ``input_lens[i]`` steps divided by ``input_lens[i]``.
    """
    _, n_steps, _ = data.shape
    # `data.device` is valid for CPU and CUDA tensors alike. `get_device()`
    # returns -1 for CPU tensors, which `.to()` rejects.
    device = data.device
    # Put the lengths on the same device before comparing them with the mask.
    input_lens = input_lens.to(device)

    steps = torch.arange(n_steps, device=device)
    valid = steps.unsqueeze(0) < input_lens.unsqueeze(1)  # (N, C)
    valid = valid.unsqueeze(2).float()  # (N, C, 1), broadcast over H

    return (data * valid).sum(1) / input_lens.unsqueeze(1).float()

class LSTM(nn.Module):
    """LSTM language model: embedding -> unidirectional LSTM -> vocabulary projection.

    Besides next-token logits (``forward``), the module can produce one
    fixed-size vector per sequence (``sentence_embedding``).

    Args:
        vocab_size: Number of rows of the embedding table and size of the
            output projection.
        embedding_dim: Size of the token embeddings.
        hidden_dim: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        dropout_rate: Dropout probability applied to the embeddings and, when
            ``n_layers > 1``, between LSTM layers.
        pad_index: Index of the padding token in the embedding table.
    """

    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_dim,
        n_layers,
        dropout_rate,
        pad_index,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            n_layers,
            bidirectional=False,
            # nn.LSTM applies dropout only between stacked layers. With a
            # single layer a non-zero rate has no effect and only triggers a
            # UserWarning, so pass 0 in that case.
            dropout=dropout_rate if n_layers > 1 else 0.0,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, vocab_size, bias=True)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, ids):
        """Return per-position vocabulary logits.

        Args:
            ids: Token ids, [batch size, seq len].

        Returns:
            Tensor [batch size, seq len, vocab size].
        """
        embedded = self.dropout(self.embedding(ids))
        output, _ = self.lstm(embedded)
        return self.fc(output)

    def sentence_embedding(self, ids, lengths):
        """Mean of the LSTM outputs over the first ``lengths[i]`` positions of each row.

        Args:
            ids: Token ids, [batch size, seq len].
            lengths: Number of valid positions per row, [batch size]
                (tensor on any device, or a sequence of ints).

        Returns:
            Tensor [batch size, hidden dim].
        """
        embedded = self.dropout(self.embedding(ids))
        output, _ = self.lstm(embedded)  # [batch size, seq len, hidden dim]

        # Vectorised masked mean: zero every position at or beyond the row's
        # length, sum over the sequence axis, and divide by the length. This
        # replaces a per-sample Python loop of slices.
        lengths = torch.as_tensor(lengths, device=output.device)
        positions = torch.arange(output.size(1), device=output.device)
        is_padding = positions.unsqueeze(0) >= lengths.unsqueeze(1)  # [batch size, seq len]
        summed = output.masked_fill(is_padding.unsqueeze(-1), 0.0).sum(dim=1)
        return summed / lengths.unsqueeze(1)

In [23]:
class CLASSIFIER(nn.Module):
    """Linear classification head on top of sentence embeddings from ``LSTM``.

    ``forward`` evaluates the encoder inside ``torch.no_grad()``, so gradients
    only reach ``self.fc``; ``self.text_encoder`` is used as a fixed feature
    extractor.
    """

    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_dim,
        output_dim,
        n_layers,
        dropout_rate,
        pad_index,
    ):
        super().__init__()
        # Encoder first, head second (same construction order as before).
        self.text_encoder = LSTM(
            vocab_size,
            embedding_dim,
            hidden_dim,
            n_layers,
            dropout_rate,
            pad_index,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, ids, lengths):
        """Return class scores of shape [batch size, output dim].

        Args:
            ids: Token ids passed to the encoder.
            lengths: Per-row lengths passed to ``sentence_embedding``.
        """
        with torch.no_grad():
            sentence_vectors = self.text_encoder.sentence_embedding(ids, lengths)
        return self.fc(sentence_vectors.detach())

In [24]:
# Model hyper-parameters.
vocab_size = len(vocab)
embedding_dim = 256
hidden_dim = 256
# Number of distinct label values in the training split.
output_dim = len(train_data.unique("label"))
n_layers = 1
dropout_rate = 0.25

# Classifier = LSTM text encoder + linear head (see CLASSIFIER).
model = CLASSIFIER(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    n_layers,
    dropout_rate,
    pad_index,
)



In [25]:
def count_parameters(model):
    """Return the total element count of all parameters with ``requires_grad`` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 11,611,241 trainable parameters


In [26]:


# NOTE(review): every parameter of `model` is handed to Adam, but
# CLASSIFIER.forward runs `text_encoder` under torch.no_grad(), so only
# `model.fc` ever receives gradients.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

device(type='cuda')

In [None]:
!gdown --id 1xrvosUfmglb01gnMWOSXYwvdecYwtxB0

Downloading...
From (original): https://drive.google.com/uc?id=1xrvosUfmglb01gnMWOSXYwvdecYwtxB0
From (redirected): https://drive.google.com/uc?id=1xrvosUfmglb01gnMWOSXYwvdecYwtxB0&confirm=t&uuid=59e3afac-4627-43ec-b9fc-ce8ea7c27d89
To: /content/pretrained_lstm.pth
  0% 0.00/46.4M [00:00<?, ?B/s]

In [None]:
# Load the pretrained language-model weights into the encoder.
if os.path.isfile("pretrained_lstm.pth"):
    print('Loading a Pretrained Language Model')
    # map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only
    # machine; load_state_dict then copies the values into the existing
    # parameters wherever they live.
    model.text_encoder.load_state_dict(
        torch.load("pretrained_lstm.pth", map_location="cpu", weights_only=True)
    )
else:
    # CLASSIFIER never updates text_encoder, so make a missing checkpoint
    # visible instead of silently training on random features.
    print("WARNING: pretrained_lstm.pth not found; text_encoder keeps its random initialization")

In [None]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def train(dataloader, model, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: Iterable of batches with ``"ids"``, ``"length"`` and ``"label"``.
        model: Called as ``model(ids, length)``; put into training mode here.
        criterion: Loss called as ``criterion(prediction, label)``.
        optimizer: Optimizer stepped once per batch.
        device: Device that ids and labels are moved to.

    Returns:
        Tuple (mean of the per-batch losses, mean of the per-batch accuracies).
    """
    model.train()
    losses, accuracies = [], []
    for batch in tqdm.tqdm(dataloader, desc="training..."):
        ids = batch["ids"].to(device)
        label = batch["label"].to(device)
        # Lengths are forwarded as loaded; they are not moved to `device`.
        prediction = model(ids, batch["length"])
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        accuracies.append(accuracy.item())
    return np.mean(losses), np.mean(accuracies)

In [None]:
def evaluate(dataloader, model, criterion, device):
    """Score ``model`` on ``dataloader`` without updating it.

    The model is switched to eval mode and the whole pass runs under
    ``torch.no_grad()``.

    Returns:
        Tuple (mean of the per-batch losses, mean of the per-batch accuracies).
    """
    model.eval()
    losses, accuracies = [], []
    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc="evaluating..."):
            ids, label = batch["ids"].to(device), batch["label"].to(device)
            # Lengths are forwarded as loaded; they are not moved to `device`.
            prediction = model(ids, batch["length"])
            losses.append(criterion(prediction, label).item())
            accuracies.append(get_accuracy(prediction, label).item())
    return np.mean(losses), np.mean(accuracies)

In [None]:
def get_accuracy(prediction, label):
    """Fraction of rows whose arg-max class equals the label.

    Args:
        prediction: 2-D tensor of class scores, one row per example.
        label: Tensor of target classes, one per row of ``prediction``.

    Returns:
        0-dim tensor: number of matches divided by the number of rows.
    """
    n_examples, _ = prediction.shape
    n_correct = torch.eq(prediction.argmax(dim=-1), label).sum()
    return n_correct / n_examples

In [None]:
# Number of passes over the training data.
n_epochs = 20
# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float("inf")

# Per-epoch history, used by the plots below.
metrics = collections.defaultdict(list)

for epoch in range(n_epochs):
    train_loss, train_acc = train(
        train_data_loader, model, criterion, optimizer, device
    )
    valid_loss, valid_acc = evaluate(valid_data_loader, model, criterion, device)
    metrics["train_losses"].append(train_loss)
    metrics["train_accs"].append(train_acc)
    metrics["valid_losses"].append(valid_loss)
    metrics["valid_accs"].append(valid_acc)
    # Keep only the weights of the epoch with the best validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "lstm.pt")
    print(f"epoch: {epoch}")
    print(f"train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}")
    print(f"valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}")

In [None]:
# Training vs. validation loss per epoch.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
ax.plot(metrics["train_losses"], label="train loss")
ax.plot(metrics["valid_losses"], label="valid loss")
ax.set_xlabel("epoch")
ax.set_ylabel("loss")
ax.set_xticks(range(n_epochs))  # one tick per epoch
ax.legend()
ax.grid()

In [None]:
# Training vs. validation accuracy per epoch.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
ax.plot(metrics["train_accs"], label="train accuracy")
ax.plot(metrics["valid_accs"], label="valid accuracy")
ax.set_xlabel("epoch")
# This figure shows accuracies, so the y-axis must not be labelled "loss".
ax.set_ylabel("accuracy")
ax.set_xticks(range(n_epochs))  # one tick per epoch
ax.legend()
ax.grid()

In [None]:
# Restore the checkpoint written at the epoch with the lowest validation loss.
# map_location keeps the load working when the checkpoint was saved on a
# different device. weights_only=True restricts unpickling to tensor data,
# matching how the pretrained encoder checkpoint is loaded in this notebook.
model.load_state_dict(torch.load("lstm.pt", map_location=device, weights_only=True))

test_loss, test_acc = evaluate(test_data_loader, model, criterion, device)

In [None]:
print(f"test_loss: {test_loss:.3f}, test_acc: {test_acc:.3f}")

In [None]:
def predict_sentiment(text, model, tokenizer, vocab, device):
    """Classify a single raw string.

    Args:
        text: Raw input string.
        model: Module called as ``model(ids, lengths)`` that returns class
            scores of shape [1, n classes].
        tokenizer: Callable that turns a string into a list of tokens.
        vocab: Object providing ``lookup_indices(tokens)``.
        device: Device the id tensor is moved to.

    Returns:
        Tuple ``(predicted_class, predicted_probability)``: the arg-max class
        index and its softmax probability.
    """
    # Mirror tokenize_example: the sequences seen during training are wrapped
    # in <bos>/<eos>. This also keeps the sequence non-empty for empty text,
    # so the mean over the sequence never divides by zero.
    tokens = ["<bos>"] + tokenizer(text) + ["<eos>"]
    ids = vocab.lookup_indices(tokens)
    length = torch.LongTensor([len(ids)])
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)

    # Predict with dropout disabled and without autograd bookkeeping, then
    # restore whatever mode the caller left the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            prediction = model(tensor, length).squeeze(dim=0)
    finally:
        model.train(was_training)

    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    return predicted_class, predicted_probability

In [None]:
text = "This film is terrible!"

predict_sentiment(text, model, tokenizer, vocab, device)

In [None]:
text = "This film is great!"

predict_sentiment(text, model, tokenizer, vocab, device)

In [None]:
text = "a bad movie"

predict_sentiment(text, model, tokenizer, vocab, device)

In [None]:
text = "just amazing"

predict_sentiment(text, model, tokenizer, vocab, device)