In [1]:
!pip install transformers datasets



In [2]:
import numpy as np

import torch

import torch.nn.functional as F

import torch.nn as nn

import math

from torch.utils.data import DataLoader

from tabulate import tabulate

from datasets import load_dataset



from tqdm.notebook import tqdm

from transformers import BertTokenizer

This is a template of the notebook that you should complete and enrich with your own code.



First cells will be the same than the ones of the lab on text convolution.



# Data loading


In [3]:
dataset = load_dataset("scikit-learn/imdb", split="train")

print(dataset)

README.md:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

IMDB Dataset.csv:   0%|          | 0.00/66.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})


# Pre-processing / Tokenization



This is a very important step. It maybe boring but very important. In this session we will be lazy, but in real life, the time spent on inspecting and cleaning data is never wasted. It is true for text, but also for everything.







In PyTorch, everything is tensor. Words are replaced by indices. A sentence, is therefore a sequence of indices (long integers). In the first HW, you constructed a `WhiteSpaceTokenizer`. Here we will use an already built tokenizer. It is more appropriate to transformers. It relies on sub-word units, and converts everything in lower case. This is not always the best choice, but here it will be sufficient. To quote the documentation, this tokenizer allows you to:

- Tokenize (splitting strings in sub-word token strings), converttokens strings to ids and back, and encoding/decoding (i.e., tokenizing and converting to integers).

- Add new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece…).

- Manage special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the tokenizer for easy access and making sure they are not split during tokenization.



Here we are going to use the tokenizer from the well known Bert model, that we can directly download.

In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
def preprocessing_fn(x, tokenizer):

    x["review_ids"] = tokenizer(

        x["review"],

        add_special_tokens=False,

        truncation=True,

        max_length=256,

        padding=False,

        return_attention_mask=False,

    )["input_ids"]

    x["label"] = 0 if x["sentiment"] == "negative" else 1

    return x


Same celel than in the lab session.



🚧 **TODO** 🚧



Read the documentation about HuggingFace dataset and complete the code below.

You should:

- Shuffle the dataset

- For computational reasons, use only a total of **5000 samples**.

- Tokenize the dataset with the `preprocessing_fn`. (*Hint: use the `Dataset.map` method from HuggingFace*).

- Keep only columns `review_ids` and `label`.

- Make a train/validation split, (**80% / 20%**). Call these dataset `train_set` and `valid_set`.


In [6]:
n_samples = 5000  # the number of training example

# We first shuffle the data !

dataset = dataset.shuffle()

# Select 5000 samples

small_dataset = dataset.select(range(n_samples))


# Tokenize the dataset

tokenized_dataset = small_dataset.map(lambda x: preprocessing_fn(x, tokenizer))


# Remove useless columns

tokenized_dataset = tokenized_dataset.remove_columns(['review', 'sentiment'])

# Split the train and validation

train_test_split = tokenized_dataset.train_test_split(test_size=0.2)

document_train_set = train_test_split['train']
document_valid_set = train_test_split['test']

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [7]:
document_train_set

Dataset({
    features: ['review_ids', 'label'],
    num_rows: 4000
})

In [8]:
def extract_words_contexts(ids, R):
    words = []
    contexts = []
    
    for i, w in enumerate(ids):
        context = ids[max(0, i - R): i] + ids[i + 1: i + R + 1]
        
        if len(context) < 2 * R:
            context += [0] * (2 * R - len(context))
        
        words.append(w)
        contexts.append(context)
    
    return words, contexts


In [9]:
def flatten_dataset_to_list(dataset, R):
    all_words = []
    all_contexts = []
    
    for doc in dataset:
        words, contexts = extract_words_contexts(doc['review_ids'], R)
        all_words.extend(words)
        all_contexts.extend(contexts)
    
    return all_words, all_contexts

In [10]:
R = 10
train_words, train_contexts = flatten_dataset_to_list(document_train_set, R)
valid_words, valid_contexts = flatten_dataset_to_list(document_valid_set, R)

print(f"Training set: {len(train_words)} words and {len(train_contexts)} contexts.")
print(f"Validation set: {len(valid_words)} words and {len(valid_contexts)} contexts.")

Training set: 821044 words and 821044 contexts.
Validation set: 207163 words and 207163 contexts.


In [11]:
from torch.utils.data import Dataset

class WordContextDataset(Dataset):
    def __init__(self, words, contexts):
        self.words = words
        self.contexts = contexts

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        word = self.words[idx]
        context = self.contexts[idx]
        return torch.tensor(word), torch.tensor(context)


In [12]:
train_set = WordContextDataset(train_words, train_contexts)
valid_set = WordContextDataset(valid_words, valid_contexts)

print(f"Training dataset size: {len(train_set)}")
print(f"Validation dataset size: {len(valid_set)}")

Training dataset size: 821044
Validation dataset size: 207163


In [13]:
import torch
import random

def collate_fn(batch, vocab, K, R):
    word_ids_batch = []
    positive_context_ids_batch = []
    negative_context_ids_batch = []

    vocab_size = len(vocab)

    max_positive_context_len = max([len(positive_context_ids) for _, positive_context_ids in batch])
    
    for word_ids, positive_context_ids in batch:
        word_ids_batch.append(word_ids)

        positive_context_ids = positive_context_ids.tolist()
        padded_positive_context = positive_context_ids + [0] * (2 * R - len(positive_context_ids))
        positive_context_ids_batch.append(padded_positive_context[:2 * R])

        negative_context_ids = [random.randint(0, vocab_size - 1) for _ in range(2 * K)]
        negative_context_ids_batch.append(negative_context_ids)

    word_ids_tensor = torch.stack(word_ids_batch)
    positive_context_ids_tensor = torch.tensor(positive_context_ids_batch)
    negative_context_ids_tensor = torch.tensor(negative_context_ids_batch)

    return {
        "word_id": word_ids_tensor,
        "positive_context_ids": positive_context_ids_tensor,
        "negative_context_ids": negative_context_ids_tensor
    }

In [14]:
len(tokenizer.vocab)

30522

In [15]:
import random
from torch.utils.data import DataLoader

R = 2
K = 5

vocab = tokenizer.get_vocab()

train_loader = DataLoader(
    train_set,
    batch_size=256,
    shuffle=True,
    collate_fn=lambda x: collate_fn(x, vocab, K, R)
)

valid_loader = DataLoader(
    valid_set,
    batch_size=256,
    shuffle=False,
    collate_fn=lambda x: collate_fn(x, vocab, K, R)
)

for i, batch in enumerate(train_loader):
    print(f"Batch {i + 1} (R={R}, K={K}):")
    print(f"word_id tensor shape: {batch['word_id'].shape}")
    print(f"positive_context_ids tensor shape: {batch['positive_context_ids'].shape}")
    print(f"negative_context_ids tensor shape: {batch['negative_context_ids'].shape}")
    print("-" * 50)

    if i == 2:
        break


Batch 1 (R=2, K=5):
word_id tensor shape: torch.Size([256])
positive_context_ids tensor shape: torch.Size([256, 4])
negative_context_ids tensor shape: torch.Size([256, 10])
--------------------------------------------------
Batch 2 (R=2, K=5):
word_id tensor shape: torch.Size([256])
positive_context_ids tensor shape: torch.Size([256, 4])
negative_context_ids tensor shape: torch.Size([256, 10])
--------------------------------------------------
Batch 3 (R=2, K=5):
word_id tensor shape: torch.Size([256])
positive_context_ids tensor shape: torch.Size([256, 4])
negative_context_ids tensor shape: torch.Size([256, 10])
--------------------------------------------------


In [16]:
import torch
import torch.nn as nn

class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)
    
    def forward(self, word_ids, context_ids, negative_context_ids):
        word_embeds = self.word_embeddings(word_ids) 
        positive_context_embeds = self.context_embeddings(context_ids)  
        negative_context_embeds = self.context_embeddings(negative_context_ids)

        positive_scores = (word_embeds.unsqueeze(1) * positive_context_embeds).sum(dim=2) 
        
        negative_scores = (-word_embeds.unsqueeze(1) * negative_context_embeds).sum(dim=2)  

        positive_scores = positive_scores.mean(dim=1)  
        negative_scores = negative_scores.mean(dim=1)  
        return positive_scores, negative_scores

In [17]:
import torch

def evaluate_model(model, valid_loader, device):
    model.eval()
    
    correct_predictions = 0
    total_predictions = 0
    
    with torch.no_grad():
        for batch in valid_loader:
            word_ids = batch["word_id"].to(device)
            positive_context_ids = batch["positive_context_ids"].to(device)
            negative_context_ids = batch["negative_context_ids"].to(device)

            word_embeds = model.word_embeddings(word_ids)
            positive_context_embeds = model.context_embeddings(positive_context_ids)
            negative_context_embeds = model.context_embeddings(negative_context_ids)

            positive_scores = (word_embeds.unsqueeze(1) * positive_context_embeds).sum(dim=2)
            negative_scores = (-word_embeds.unsqueeze(1) * negative_context_embeds).sum(dim=2)

            positive_scores = positive_scores.mean(dim=1)
            negative_scores = negative_scores.mean(dim=1)

            positive_probs = torch.sigmoid(positive_scores)
            negative_probs = torch.sigmoid(negative_scores)

            predicted_positive = (positive_probs >= 0.5).sum()
            predicted_negative = (negative_probs < 0.5).sum()
            
            correct_predictions += predicted_positive.item() + predicted_negative.item()
            total_predictions += len(word_ids) * 2
        
    accuracy = correct_predictions / total_predictions
    return accuracy

In [18]:
from tqdm import tqdm 

def train_word2vec(model, train_loader, optimizer, criterion, E, device, valid_loader):
    model.train()
    for epoch in range(E):
        total_loss = 0
        
        progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f'Epoch {epoch + 1}/{E}')
        
        for batch_idx, batch in progress_bar:
            word_ids = batch["word_id"].to(device)
            positive_context_ids = batch["positive_context_ids"].to(device)
            negative_context_ids = batch["negative_context_ids"].to(device)
            
            optimizer.zero_grad()
            
            positive_score, negative_score = model(word_ids, positive_context_ids, negative_context_ids)

            positive_loss = criterion(positive_score, torch.ones_like(positive_score, device=device))
            negative_loss = criterion(negative_score, torch.zeros_like(negative_score, device=device))
            loss = positive_loss + negative_loss
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            progress_bar.set_postfix(loss=total_loss / (batch_idx + 1))
        
        print(f"Epoch {epoch + 1}/{E}, Loss: {total_loss / len(train_loader):.4f}")
        
        val_accuracy = evaluate_model(model, valid_loader, device)
        print(f"Validation Accuracy after epoch {epoch + 1}: {val_accuracy:.4f}")

In [19]:
import torch.optim as optim  
model = None
optimizer = None
criterion = None

In [20]:
vocab_size = len(vocab)
embedding_dim = 256
B = 100           
E = 3
learning_rate = 0.001

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model = Word2Vec(vocab_size, embedding_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.BCEWithLogitsLoss()

train_word2vec(model, train_loader, optimizer, criterion, E, device, valid_loader)

cuda


Epoch 1/3: 100%|██████████| 3208/3208 [01:02<00:00, 50.93it/s, loss=2.78]


Epoch 1/3, Loss: 2.7799
Validation Accuracy after epoch 1: 0.8416


Epoch 2/3: 100%|██████████| 3208/3208 [01:02<00:00, 51.40it/s, loss=0.73]


Epoch 2/3, Loss: 0.7299
Validation Accuracy after epoch 2: 0.9328


Epoch 3/3: 100%|██████████| 3208/3208 [01:02<00:00, 51.47it/s, loss=0.268]


Epoch 3/3, Loss: 0.2681
Validation Accuracy after epoch 3: 0.9631


In [21]:
import os

def save_model(model, save_dir="models"):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    file_name = f"model.ckpt"
    file_path = os.path.join(save_dir, file_name)

    torch.save(model.state_dict(), file_path)
    print(f"Model saved as {file_path}")

In [22]:
save_model(model)

Model saved as models/model.ckpt
