In [83]:
from torch import nn
import torch
import yaml
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from datasets import load_dataset
import math
import torch.nn.functional as F
from tqdm.notebook import tqdm

In [84]:
with open("config.yaml", "r") as file:
    config = yaml.safe_load(file)

In [85]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
dataset = load_dataset("scikit-learn/imdb", split="train")

In [86]:
def preprocessing_fn(x, tokenizer):
    x["review_ids"] = tokenizer(
        x["review"],
        add_special_tokens=False,
        truncation=True,
        max_length=256,
        padding=False,
        return_attention_mask=False,
    )["input_ids"]
    x["label"] = 0 if x["sentiment"] == "negative" else 1
    return x


print(preprocessing_fn(dataset[1], tokenizer).keys())
print(preprocessing_fn(dataset[1], tokenizer)["review_ids"])

dict_keys(['review', 'sentiment', 'review_ids', 'label'])
[1037, 6919, 2210, 2537, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 7467, 6028, 2003, 2200, 14477, 4757, 24270, 1011, 2200, 2214, 1011, 2051, 1011, 4035, 4827, 1998, 3957, 1037, 16334, 1010, 1998, 2823, 17964, 2075, 1010, 3168, 1997, 15650, 2000, 1996, 2972, 3538, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5889, 2024, 5186, 2092, 4217, 1011, 2745, 20682, 2025, 2069, 1000, 2038, 2288, 2035, 1996, 11508, 2072, 1000, 2021, 2002, 2038, 2035, 1996, 5755, 2091, 6986, 2205, 999, 2017, 2064, 5621, 2156, 1996, 25180, 3238, 9260, 8546, 2011, 1996, 7604, 2000, 3766, 1005, 9708, 10445, 1010, 2025, 2069, 2003, 2009, 2092, 4276, 1996, 3666, 2021, 2009, 2003, 1037, 27547, 2135, 2517, 1998, 2864, 3538, 1012, 1037, 3040, 3993, 2537, 2055, 2028, 1997, 1996, 2307, 3040, 1005, 1055, 1997, 4038, 1998, 2010, 2166, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 15650, 2428, 3310, 2188, 2007, 1996, 2210, 2

In [87]:
# If required, n_samples can be set to 2000 to speed up training
n_samples = 5000  # the number of training example

# We first shuffle the data !
dataset = dataset.shuffle(seed=42)

# Select 5000 samples
dataset = dataset.select(range(n_samples))

# Tokenize the dataset
dataset = dataset.map(lambda x: preprocessing_fn(x, tokenizer))

# Remove useless columns
dataset = dataset.remove_columns(["review", "sentiment"])

# Split the train and validation
train_test_split = dataset.train_test_split(test_size=0.2)
document_train_set = train_test_split["train"]
document_test_set = train_test_split["test"]

## Classification task

In [88]:
class Word2Vec(nn.Module):
    def __init__(self, embedding_dim: int, vocab_size: int):
        super(Word2Vec, self).__init__()
        self.in_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.out_embedding = nn.Embedding(vocab_size, embedding_dim)

    def predict(self, context_indices):
        """
        Predicts the most likely word given the context words.

        Args:
            context_indices (List[int]): Indices of the context words.

        Returns:
            int: Index of the predicted word.
        """
        # Get embeddings for each word in the context
        context_embeddings = self.in_embedding(torch.tensor(context_indices))

        # Aggregate the context embeddings (e.g., by averaging)
        context_vector = context_embeddings.mean(dim=0)

        # Compute the similarity scores between the context vector and all word embeddings
        similarity_scores = torch.matmul(self.out_embedding.weight, context_vector)

        # Find the index of the word with the highest similarity score
        predicted_word_index = similarity_scores.argmax().item()
        return predicted_word_index

### Question 1

In [89]:
def load_model(model, path):
    model.load_state_dict(torch.load(path))
    return model

In [90]:
model = Word2Vec(embedding_dim=config["embedding_dim"], vocab_size=tokenizer.vocab_size)
load_model(model, "model_dim-128-radius-6-ratio-4-batch-256-epoch-15.ckpt")

  model.load_state_dict(torch.load(path))


Word2Vec(
  (in_embedding): Embedding(30522, 128)
  (out_embedding): Embedding(30522, 128)
)

In [91]:
class Conv1dClassifier(nn.Module):
    """A text classifier:
    - input = minibatch
    - output = probability associated to a binary classification task
    - vocab_size: the number of words in the vocabulary we want to embed
    - embedding_dim: size of the word vectors
    """

    def __init__(self, vocab_size, embedding_dim, feature_size=100, kernel_size=3):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.feature_size = feature_size
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The number of padding symbols depends on the kernel size.
        # It is important to ensure that we have always a sequence
        # as long as the kernel size.
        # ex: if ks=3, we add 1 padding before and one after.
        # The sentence "Great" becomes "<pad> Great <pad>"
        self.conv = nn.Conv1d(
            embedding_dim,
            feature_size,
            kernel_size,
            padding=math.floor(kernel_size / 2),
        )
        # The parameter for AdaptiveMaxPool1d is the "output size"
        # or the number of output values for a dimension.
        # Here it is one: we want to get the max for every components.
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(feature_size, 1)
        self.out_activation = nn.Sigmoid()

    def forward(self, input_ids):
        # In pytorch, convolution expects (B,d,L)
        # B: the batch dimension
        # d: the embedding dimension
        # L: the length of the sequence
        hidden_states = self.embeddings(input_ids).permute(0, 2, 1)
        hidden_states = F.relu(self.conv(hidden_states))
        hidden_states = self.pool(hidden_states)  # --> (B,d,1)
        # Before the linear, do something with dims the dimension
        # Because Linear works on the final dim
        # (B,d,1) -> (B,d)
        hidden_states = hidden_states.squeeze(dim=2)
        hidden_states = self.dropout(hidden_states)
        logits = self.linear(hidden_states)
        return self.out_activation(logits)

    def train(self, n_epochs: int = 10):
        criterion = nn.BCELoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        for epoch in tqdm(range(n_epochs), desc="Epochs"):
            total_loss = 0
            for i, batch in enumerate(
                tqdm(classification_train_dataloader, desc="Batches", leave=False)
            ):
                optimizer.zero_grad()
                output = self(batch["review_ids"])
                loss = criterion(output, batch["label"].float())
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            print(f"Epoch {epoch} - Loss: {total_loss/n_train}")

    def test(self):
        correct_predictions = 0
        with torch.no_grad():
            for i, batch in enumerate(
                tqdm(classification_test_dataloader, desc="Processing batches")
            ):
                output = self(batch["review_ids"])
                predicted_labels = (output > 0.5).int()
                correct_predictions += (predicted_labels == batch["label"]).sum().item()
        accuracy = correct_predictions / n_test * 100
        accuracy = round(accuracy, 4)
        print(f"Accuracy: {accuracy}%")
        return accuracy

In [92]:
class DataCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        # `batch` is a list of dictionary with keys "review_ids" and "label".
        features = [{"input_ids": x["review_ids"]} for x in batch]
        features = self.tokenizer.pad(
            features, padding="max_length", max_length=256, return_tensors="pt"
        )
        label = torch.tensor([x["label"] for x in batch])[:, None]
        return {"review_ids": features["input_ids"], "label": label}

In [93]:
data_collator = DataCollator(tokenizer)

In [94]:
batch_size = 32

classification_train_dataloader = DataLoader(
    document_train_set, batch_size=batch_size, collate_fn=data_collator
)
classification_test_dataloader = DataLoader(
    document_test_set, batch_size=batch_size, collate_fn=data_collator
)

n_test = len(document_test_set)
n_train = len(document_train_set)

In [95]:
batch = next(iter(classification_train_dataloader))
print("batch is a dictionnary with keys:", batch.keys())
print("Size of different elements:", batch["review_ids"].shape, batch["label"].shape)

batch is a dictionnary with keys: dict_keys(['review_ids', 'label'])
Size of different elements: torch.Size([32, 256]) torch.Size([32, 1])


Test the classifier on a random sequence.

In [96]:
VOCSIZE = tokenizer.vocab_size
random_inputs = torch.randint(0, VOCSIZE, (4, 100))
# Test the class: is everything in place:
# A first classifier is built like :
model = Conv1dClassifier(
    vocab_size=VOCSIZE, embedding_dim=25
)  # The parameters of the classifier are randomly initialize, but we
# can use it on a sequence :
out = model(random_inputs)
print(out.shape)  # the output has 2 dimensions
print(out)

torch.Size([4, 1])
tensor([[0.4438],
        [0.5006],
        [0.4052],
        [0.6701]], grad_fn=<SigmoidBackward0>)


### Question 2

For the pretrained embeddings, we use the in_embeddings matrix, which makes more sense as it is the one used to capture a word when it is part of a context. This is better for semantic classification.

In [97]:
pretrained_model = Conv1dClassifier(tokenizer.vocab_size, config["embedding_dim"])
pretrained_word2vec = load_model(
    Word2Vec(embedding_dim=config["embedding_dim"], vocab_size=tokenizer.vocab_size),
    "model_dim-128-radius-6-ratio-4-batch-256-epoch-15.ckpt",
)
pretrained_model.embeddings = nn.Embedding.from_pretrained(
    pretrained_word2vec.in_embedding.weight
)
pretrained_model.train(n_epochs=15)

  model.load_state_dict(torch.load(path))


Epochs:   0%|          | 0/15 [00:00<?, ?it/s]

Batches:   0%|          | 0/125 [00:00<?, ?it/s]

NameError: name 'train_dataloader' is not defined

In [None]:
pretrained_model.test()

Processing batches:   0%|          | 0/32 [00:00<?, ?it/s]

NameError: name 'classification_test_dataset' is not defined

### Question 3

In [None]:
classifier = Conv1dClassifier(
    vocab_size=tokenizer.vocab_size, embedding_dim=config["embedding_dim"]
)
classifier.train(n_epochs=15)

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 0 - Loss: 0.026903515173314855


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 1 - Loss: 0.02231757625479463


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

In [None]:
classifier.test()