In [358]:
%%capture
!pip install transformers datasets

11347.22s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


In [359]:
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
from torch.utils.data import DataLoader
from tabulate import tabulate
from datasets import load_dataset

from tqdm.notebook import tqdm
from transformers import BertTokenizer

This is a template of the notebook that you should complete and enrich with your own code.

First cells will be the same than the ones of the lab on text convolution.

# Data loading


In [360]:
dataset = load_dataset("scikit-learn/imdb", split="train")
print(dataset)

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})


# Pre-processing / Tokenization

This is a very important step. It maybe boring but very important. In this session we will be lazy, but in real life, the time spent on inspecting and cleaning data is never wasted. It is true for text, but also for everything.



In PyTorch, everything is tensor. Words are replaced by indices. A sentence, is therefore a sequence of indices (long integers). In the first HW, you constructed a `WhiteSpaceTokenizer`. Here we will use an already built tokenizer. It is more appropriate to transformers. It relies on sub-word units, and converts everything in lower case. This is not always the best choice, but here it will be sufficient. To quote the documentation, this tokenizer allows you to:
- Tokenize (splitting strings in sub-word token strings), converttokens strings to ids and back, and encoding/decoding (i.e., tokenizing and converting to integers).
- Add new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece…).
- Manage special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the tokenizer for easy access and making sure they are not split during tokenization.

Here we are going to use the tokenizer from the well known Bert model, that we can directly download.

In [361]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

In [362]:
def preprocessing_fn(x, tokenizer):
    x["review_ids"] = tokenizer(
        x["review"],
        add_special_tokens=False,
        truncation=True,
        max_length=256,
        padding=False,
        return_attention_mask=False,
    )["input_ids"]
    x["label"] = 0 if x["sentiment"] == "negative" else 1
    return x


print(preprocessing_fn(dataset[1], tokenizer).keys())
print(preprocessing_fn(dataset[1], tokenizer)["review_ids"])

dict_keys(['review', 'sentiment', 'review_ids', 'label'])
[1037, 6919, 2210, 2537, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 7467, 6028, 2003, 2200, 14477, 4757, 24270, 1011, 2200, 2214, 1011, 2051, 1011, 4035, 4827, 1998, 3957, 1037, 16334, 1010, 1998, 2823, 17964, 2075, 1010, 3168, 1997, 15650, 2000, 1996, 2972, 3538, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5889, 2024, 5186, 2092, 4217, 1011, 2745, 20682, 2025, 2069, 1000, 2038, 2288, 2035, 1996, 11508, 2072, 1000, 2021, 2002, 2038, 2035, 1996, 5755, 2091, 6986, 2205, 999, 2017, 2064, 5621, 2156, 1996, 25180, 3238, 9260, 8546, 2011, 1996, 7604, 2000, 3766, 1005, 9708, 10445, 1010, 2025, 2069, 2003, 2009, 2092, 4276, 1996, 3666, 2021, 2009, 2003, 1037, 27547, 2135, 2517, 1998, 2864, 3538, 1012, 1037, 3040, 3993, 2537, 2055, 2028, 1997, 1996, 2307, 3040, 1005, 1055, 1997, 4038, 1998, 2010, 2166, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 15650, 2428, 3310, 2188, 2007, 1996, 2210, 2

Same celel than in the lab session.

🚧 **TODO** 🚧

Read the documentation about HuggingFace dataset and complete the code below.
You should:
- Shuffle the dataset
- For computational reasons, use only a total of **5000 samples**.
- Tokenize the dataset with the `preprocessing_fn`. (*Hint: use the `Dataset.map` method from HuggingFace*).
- Keep only columns `review_ids` and `label`.
- Make a train/validation split, (**80% / 20%**). Call these dataset `train_set` and `valid_set`.


#### Question 1

In [363]:
n_samples = 5000  # the number of training example

# We first shuffle the data !
dataset = dataset.shuffle(seed=42)

# Select 5000 samples
dataset = dataset.select(range(n_samples))

# Tokenize the dataset
dataset = dataset.map(lambda x: preprocessing_fn(x, tokenizer))

# Remove useless columns
dataset = dataset.remove_columns(["review", "sentiment"])

# Split the train and validation
train_test_split = dataset.train_test_split(test_size=0.2)
document_train_set = train_test_split["train"]
document_test_set = train_test_split["test"]

#### Question 2

Our strategy to handle borders is to look further to the right when on the left border and look further to the left on the right border. For instance, when w is the first word of the review, we look at the 2R following words for positive context. 

In [364]:
from typing import List, Tuple


def extract_word_context(
    radius: int, tokens: List[int]
) -> Tuple[List[int], List[List[int]]]:
    assert radius > 0 and radius < len(
        tokens
    ), "The radius should be > 0 and < the number of tokens"
    words = []
    contexts = []
    for i in range(0, radius):
        words.append(tokens[i])
        context = tokens[:i] + tokens[i + 1 : radius * 2 + 1]
        contexts.append(context)
        assert len(context) == 2 * radius, f"Context length mismatch at index {i}"
    for i in range(radius, len(tokens) - radius):
        words.append(tokens[i])
        context = tokens[i - radius : i] + tokens[i + 1 : i + radius + 1]
        contexts.append(context)
        assert len(context) == 2 * radius, f"Context length mismatch at index {i}"
    for i in range(len(tokens) - radius, len(tokens)):
        words.append(tokens[i])
        context = tokens[len(tokens) - 2 * radius - 1 : i] + tokens[i + 1 :]
        contexts.append(context)
        assert len(context) == 2 * radius, f"Context length mismatch at index {i}"
    return words, contexts

In [365]:
# Sanity check that all the contexts are of the same length
print(len(dataset[3]["review_ids"]))
contexts = extract_word_context(6, dataset[3]["review_ids"])[1]
print(len(contexts[0]), len(contexts[10]), len(contexts[-1]))

253
12 12 12


#### Question 3

In [366]:
def flatten_dataset_to_list(dataset, radius: int) -> List:
    words, contexts = [], []
    for i in tqdm(range(len(dataset)), desc="Processing dataset"):
        words_, contexts_ = extract_word_context(radius, dataset["review_ids"][i])
        words.extend(words_)
        contexts.extend(contexts_)
    return words, contexts

#### Question 4

In [367]:
### REMOVE LATER. FOR FASTER PRCCOESSING
document_train_set = document_train_set.select(range(400))
document_test_set = document_test_set.select(range(100))

In [368]:
train_dataset_words, train_dataset_contexts = flatten_dataset_to_list(
    document_train_set, 6
)
test_dataset_words, test_dataset_contexts = flatten_dataset_to_list(
    document_test_set, 6
)

Processing dataset:   0%|          | 0/400 [00:00<?, ?it/s]

Processing dataset:   0%|          | 0/100 [00:00<?, ?it/s]

#### Question 5

In [369]:
class Word2VecDataset(torch.utils.data.Dataset):
    def __init__(self, words, contexts):
        assert len(words) == len(
            contexts
        ), "Words and contexts should have the same length"
        self.words = words
        self.contexts = contexts

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        return self.words[idx], self.contexts[idx]


train_dataset = Word2VecDataset(train_dataset_words, train_dataset_contexts)
test_dataset = Word2VecDataset(test_dataset_words, test_dataset_contexts)

In [370]:
# Sanity check
print(train_dataset[3])

(2003, [2348, 2049, 5436, 2579, 2013, 1996, 2381, 1997, 3418, 4199, 1010, 1005])


#### Question 6

In [371]:
import random


# TODO: Values of the dict should be torch tensors
# TODO: Handle batch size > 1
def collate_fn(batch, scaling_factor: int, vocab_size: int):
    batch_size = len(batch)
    word_ids = torch.tensor([b[0] for b in batch])
    positive_context_ids = torch.tensor([b[1] for b in batch])
    positive_context_ids_set = set(positive_context_ids.flatten().tolist())
    negative_candidates = list(set(range(vocab_size)) - positive_context_ids_set)
    negative_context_ids = [
        random.sample(
            negative_candidates, scaling_factor * positive_context_ids.size(1)
        )
        for _ in range(batch_size)
    ]
    negative_context_ids = torch.tensor(negative_context_ids)
    result = {
        "word_ids": word_ids,
        "positive_context_ids": positive_context_ids,
        "negative_context_ids": negative_context_ids,
    }
    return result

In [372]:
print(tokenizer.vocab_size)

30522


#### Question 7

In [373]:
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=256,
    shuffle=True,
    collate_fn=lambda batch: collate_fn(
        batch, scaling_factor=4, vocab_size=tokenizer.vocab_size
    ),
)

#### Question 8

In [374]:
context_length = len(combined_dataset[0][1])
for i in range(1, len(combined_dataset)):
    if len(combined_dataset[i][1]) != context_length:
        print(i)
        print(combined_dataset[i])
        print(len(combined_dataset[i][1]))
        raise ValueError("All the context should have the same length")

In [375]:
batch = next(iter(dataloader))
print("batch ", batch)
print("batch word_id size", batch["word_ids"].size())
print("batch positive_context_ids size", batch["positive_context_ids"].size())
print("batch negative_context_ids size", batch["negative_context_ids"].size())

batch  {'word_ids': tensor([ 1012, 13558,  3367,  2138,  1028,  2733,  2147,  2031,  1012,  2801,
          999,  2009, 10151,  2439,  5691,  3772,  2011,  4895,  2079, 24840,
         2196,  2178,  1012,  3233,  2083,  2172,  3422,  3772,  2507, 14556,
         2318,  2529,  1996,  1037,  1007, 25815,  2041,  4138,  1999,  3600,
         1028,  1012,  4332,  3185,  2022,  1012,  1010,  1055,  1026,  2018,
         3733,  8087,  1012,  1060,  2208,  1996,  3087,  2070,  2027,  2003,
         2892,   999,  4050,  6298,  2023,  1000,  1996,  2070,  1026,  2007,
         2043,  2062,  1006,  3749,   999,  1000,  2572,  1012,  2022,  2173,
         2044, 14203, 11463, 19986,  1028,  4150,  2001,  2022,  2017, 13433,
         2043,  5751, 25377, 24897, 11321,  1012,  1012,  2051,  2769, 22732,
         1010,  1025,  1028,  1006,  2035,  2204,  2842,  4834,  1999,  9252,
         6501,  7987,  8790,  2081, 10885, 28493,  2293,  3138,  6359,  7987,
         5691,  2033,  5573,  2127,  1012,  

#### Question 9

In [376]:
class Word2Vec(nn.Module):
    def __init__(self, embedding_dim: int, vocab_size: int):
        super(Word2Vec, self).__init__()
        self.in_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.out_embedding = nn.Embedding(vocab_size, embedding_dim)

#### Question 10

In [377]:
def ce_loss(
    word_embedding: torch.Tensor,
    positive_context_embeddings: torch.Tensor,
    negative_context_embeddings: torch.Tensor,
) -> torch.Tensor:
    """
    Args:
        word_embedding (torch.Tensor): Shape (batch_size, embedding_dim,)
        positive_context_embeddings (torch.Tensor): Shape (batch_size, 2R, embedding_dim)
        negative_context_embeddings (torch.Tensor): Shape (batch_size, 2KR, embedding_dim)
    Returns:
        torch.Tensor: The loss value
    """
    positive_similarity = torch.log(
        torch.sigmoid(
            torch.bmm(positive_context_embeddings, word_embedding.unsqueeze(-1) + 1e-10)
        )
    )
    negative_similarity = torch.log(
        torch.sigmoid(
            1
            - torch.bmm(negative_context_embeddings, word_embedding.unsqueeze(-1))
            + 1e-10
        )
    )
    return -torch.sum(positive_similarity) - torch.sum(negative_similarity)


model = Word2Vec(embedding_dim=128, vocab_size=tokenizer.vocab_size)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [378]:
def train(n_epochs: int) -> None:
    for epoch in tqdm(range(n_epochs), desc="Epochs"):
        total_loss = 0
        for i, batch in enumerate(tqdm(train_dataloader, desc="Batches", leave=False)):
            word_embedding = model.in_embedding(batch["word_ids"])
            positive_context_embeddings = model.out_embedding(
                batch["positive_context_ids"]
            )
            negative_context_embeddings = model.out_embedding(
                batch["negative_context_ids"]
            )

            optimizer.zero_grad()
            loss = ce_loss(
                word_embedding, positive_context_embeddings, negative_context_embeddings
            )
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch} - Loss: {total_loss/len(train_dataloader)}")

In [379]:
train(n_epochs=10)

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/318 [00:00<?, ?it/s]

Epoch 0 - Loss: 26112.686830916493


Batches:   0%|          | 0/318 [00:00<?, ?it/s]

Epoch 1 - Loss: 16238.584628267108


Batches:   0%|          | 0/318 [00:00<?, ?it/s]

Epoch 2 - Loss: 11306.088389414661


Batches:   0%|          | 0/318 [00:00<?, ?it/s]

Epoch 3 - Loss: 8131.693448048718


Batches:   0%|          | 0/318 [00:00<?, ?it/s]

Epoch 4 - Loss: 6370.757311695027


Batches:   0%|          | 0/318 [00:00<?, ?it/s]

Epoch 5 - Loss: 5232.012038800702


Batches:   0%|          | 0/318 [00:00<?, ?it/s]

Epoch 6 - Loss: 4396.276004839244


Batches:   0%|          | 0/318 [00:00<?, ?it/s]

Epoch 7 - Loss: 3795.34054728574


Batches:   0%|          | 0/318 [00:00<?, ?it/s]

Epoch 8 - Loss: 3327.2300416958406


Batches:   0%|          | 0/318 [00:00<?, ?it/s]

Epoch 9 - Loss: 2969.847161634913


In [None]:
def test_collate_fn(batch):
    word_ids = torch.tensor([b[0] for b in batch])
    return {"word_ids": word_ids}

In [380]:
test_dataloader = DataLoader(
    test_dataset,
    batch_size=256,
    shuffle=False,
)

In [None]:
def evaluate() -> None:
    total_loss = 0
    for _, batch in enumerate(tqdm(test_dataloader, desc="Batches")):
        word_embedding = model.in_embedding(batch["word_ids"])
        positive_context_embeddings = model.out_embedding(batch["positive_context_ids"])
        negative_context_embeddings = model.out_embedding(batch["negative_context_ids"])
        loss = ce_loss(
            word_embedding, positive_context_embeddings, negative_context_embeddings
        )
        total_loss += loss.item()
    print(f"Loss {total_loss / len(test_dataloader)}")

In [381]:
evaluate()

Batches:   0%|          | 0/82 [00:00<?, ?it/s]

Loss 7472.958478229802


In [382]:
def test

SyntaxError: expected '(' (3323627844.py, line 1)