In [64]:
import collections

import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm

In [65]:
seed = 1234

In [66]:
np.random.seed(seed=seed)

In [67]:
torch.manual_seed(seed)

<torch._C.Generator at 0x16a35fdf0>

In [68]:
torch.cuda.manual_seed(seed)

In [69]:
torch.backends.cudnn.deterministic = True

In [70]:
train_data, test_data = datasets.load_dataset("imdb", split=["train", "test"])

In [71]:
train_data, test_data

(Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 }))

In [72]:
train_data[0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [73]:
test_data[0]

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

In [74]:
tokenizer = torchtext.data.utils.get_tokenizer("basic_english")

In [75]:
tokenizer

<function torchtext.data.utils._basic_english_normalize(line)>

In [76]:
tokenizer("This is cool!")

['this', 'is', 'cool', '!']

In [77]:
def tokenize_example(example, tokenizer, max_length):
    tokens = tokenizer(example["text"][:max_length])
    return {"tokens": tokens}

In [78]:
max_length = 256

train_data = train_data.map(tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length})
test_data = test_data.map(tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length})

In [79]:
train_data

Dataset({
    features: ['text', 'label', 'tokens'],
    num_rows: 25000
})

In [80]:
test_data

Dataset({
    features: ['text', 'label', 'tokens'],
    num_rows: 25000
})

In [81]:
test_size = 0.25

In [82]:
train_valid_data = train_data.train_test_split(test_size=test_size)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]

In [83]:
train_data

Dataset({
    features: ['text', 'label', 'tokens'],
    num_rows: 18750
})

In [84]:
test_data

Dataset({
    features: ['text', 'label', 'tokens'],
    num_rows: 25000
})

In [85]:
min_freq = 5
special_tokens = ["<unk>", "<pad>"]

In [86]:
vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["tokens"],
    min_freq=min_freq,
    specials=special_tokens
)

In [87]:
vocab

Vocab()

In [88]:
len(vocab)

9651

In [89]:
vocab.get_itos()

['<unk>',
 '<pad>',
 'the',
 '.',
 ',',
 'a',
 'i',
 'of',
 'and',
 "'",
 'to',
 'this',
 'is',
 'it',
 'in',
 'movie',
 'that',
 'was',
 's',
 'film',
 'as',
 'with',
 'for',
 'but',
 't',
 'on',
 '(',
 'have',
 'one',
 'not',
 ')',
 'you',
 'be',
 '!',
 'an',
 'are',
 'at',
 'about',
 'so',
 'all',
 'by',
 'his',
 'my',
 'from',
 'like',
 'who',
 'has',
 'he',
 'just',
 'good',
 'what',
 'very',
 'there',
 'when',
 'they',
 'first',
 'some',
 'or',
 'out',
 'if',
 'story',
 'can',
 'had',
 'time',
 'me',
 'really',
 '?',
 'movies',
 'bad',
 'seen',
 'see',
 'no',
 'great',
 'only',
 'would',
 'were',
 'more',
 'made',
 'even',
 'been',
 'up',
 'ever',
 'people',
 'most',
 'films',
 'because',
 'her',
 'which',
 'much',
 'how',
 'well',
 'after',
 'plot',
 'don',
 'we',
 'its',
 '-',
 'acting',
 'than',
 've',
 'best',
 'get',
 'saw',
 'think',
 'she',
 'm',
 'other',
 'love',
 'many',
 'into',
 'show',
 'could',
 'watch',
 'do',
 'their',
 'watching',
 'two',
 'say',
 'will',
 'years

In [90]:
vocab["if"]

59

In [91]:
"yes" in vocab

True

In [92]:
vocab.lookup_indices(["this", "good", "cool", "<pad>"])

[11, 49, 606, 1]

In [93]:
unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]

vocab.set_default_index(unk_index)

#numericalizing data


def numericalizing_data(example, vocab):
    ids = vocab.lookup_indices(example["tokens"])
    return {"ids": ids}


In [94]:
train_data = train_data.map(numericalizing_data, fn_kwargs={"vocab": vocab })

Map: 100%|██████████| 18750/18750 [00:00<00:00, 20816.54 examples/s]


In [95]:
test_data = test_data.map(numericalizing_data, fn_kwargs={"vocab": vocab})
valid_data = valid_data.map(numericalizing_data, fn_kwargs={"vocab": vocab})

Map: 100%|██████████| 25000/25000 [00:01<00:00, 23780.58 examples/s]
Map: 100%|██████████| 6250/6250 [00:00<00:00, 20252.23 examples/s]


In [96]:
train_data[0]

{'text': "Look, this is quite possibly one of the best movies America has to offer the rest of the world. To hate this movie is to hate freedom itself. I remember that the early 80's were a time of uncertainty. The economy was weak, communism threatened us all, and nuclear destruction was almost a certainty. Out of that confusion came a hero, Stroker Ace. Ned Beatty's performance in this movie showed he was never again to be type cast as a one dimensional victim in the wilderness. His triumph is an inspiration to all. The on-screen chemistry between Burt and Loni draws obvious comparisons to Brad and Jennifer. Jim Nabors is a poet. Go see this movie tonight!",
 'label': 1,
 'tokens': ['look',
  ',',
  'this',
  'is',
  'quite',
  'possibly',
  'one',
  'of',
  'the',
  'best',
  'movies',
  'america',
  'has',
  'to',
  'offer',
  'the',
  'rest',
  'of',
  'the',
  'world',
  '.',
  'to',
  'hate',
  'this',
  'movie',
  'is',
  'to',
  'hate',
  'freedom',
  'itself',
  '.',
  'i',
 

In [97]:
#transform numericalized data into tensors
train_data = train_data.with_format(type="torch", columns=["ids", "label"])
valid_data = valid_data.with_format(type="torch", columns=["ids", "label"])
test_data = test_data.with_format(type="torch", columns={"ids", "label"})

In [98]:
#creating data loaders
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_ids = [i["ids"] for i in batch]
        batch_ids = nn.utils.rnn.pad_sequence(
            batch_ids, padding_value=pad_index, batch_first=True
        )
        batch_label = [i["label"] for i in batch]
        batch_label = torch.stack(batch_label)
        batch = {"ids": batch_ids, "label": batch_label}
        return batch
    
    return collate_fn

In [99]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index=pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [100]:
batch_size = 512

train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)

In [101]:
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [102]:
#create model class
class NBoW(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, ids):
        # ids = [batch size, seq len]
        embedded = self.embedding(ids)
        # embedded = [batch size, seq len, embedding dim]
        pooled = embedded.mean(dim=1)
        # pooled = [batch size, embedding dim]
        prediction = self.fc(pooled)
        return prediction
        
        

In [103]:
vocab_size = len(vocab)
embedding_dim = 300
output_dim = len(train_data.unique("label"))

model = NBoW(vocab_size, embedding_dim,output_dim, pad_index)

In [104]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 2,895,902 trainable parameters


In [105]:
vectors = torchtext.vocab.GloVe(name='6B', dim=50)

.vector_cache/glove.6B.zip: 862MB [09:29, 1.51MB/s]                               
100%|█████████▉| 399999/400000 [00:03<00:00, 120131.47it/s]


In [106]:
hello_vector = vectors.get_vecs_by_tokens("hello")
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())

In [107]:
pretrained_embedding.shape

torch.Size([9651, 50])

In [108]:
model.embedding.weight.data = pretrained_embedding

In [109]:
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

criterion = criterion.to(device)

In [110]:
def get_accuracy(prediction, label):
    batch_size, _ = prediction.shape
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / batch_size
    return accuracy

def train(data_loader, model, criterion, optimizer, device):
    model.train()
    epoch_losses = []
    epoch_accs = []
    for batch in tqdm.tqdm(data_loader, desc="training..."):
        ids = batch["ids"].to(device)
        label = batch["label"].to(device)
        prediction = model(ids)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())
    return np.mean(epoch_losses), np.mean(epoch_accs)

In [111]:
def evaluate(data_loader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_accs = []
    with torch.no_grad():
        for batch in tqdm.tqdm(data_loader, desc="evaluating..."):
            ids = batch["ids"].to(device)
            label = batch["label"].to(device)
            prediction = model(ids)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())
    return np.mean(epoch_losses), np.mean(epoch_accs)

In [112]:
n_epochs = 10
best_valid_loss = float("inf")

metrics = collections.defaultdict(list)

for epoch in range(n_epochs):
    train_loss, train_acc = train(
        train_data_loader, model, criterion, optimizer, device
    )
    valid_loss, valid_acc = evaluate(valid_data_loader, model, criterion, device)
    metrics["train_losses"].append(train_loss)
    metrics["train_accs"].append(train_acc)
    metrics["valid_losses"].append(valid_loss)
    metrics["valid_accs"].append(valid_acc)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "nbow.pt")
    print(f"epoch: {epoch}")
    print(f"train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}")
    print(f"valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}")

training...:   0%|          | 0/37 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (512x50 and 300x2)