<a href="https://colab.research.google.com/github/mohamedshouaib/iti/blob/main/NLP/tasks/task5_NLP_W2V.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook introduces a complete implementation of a Word2Vec-like Skip-Gram model using PyTorch, the Hugging Face Datasets library, and a custom tokenizer. The dataset used is "yelp_review_full".

# Install and Import Dependencies

In [None]:
!pip install datasets transformers torch tqdm

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupt

In [None]:
import random
import re
from collections import Counter

import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Load and Preprocess the Dataset

In [None]:
# Load the first 10% of the Yelp training split and keep only the review text.
dataset = load_dataset("yelp_review_full", split="train[:10%]")
texts = [item['text'] for item in dataset]

# Tokenise with a regex instead of whitespace splitting so punctuation is not
# glued onto words (e.g. "perfectly," and "perfectly" become the same token).
# A token is a run of letters/digits, optionally with inner apostrophes ("don't").
token_pattern = re.compile(r"[a-z0-9]+(?:'[a-z0-9]+)*")


def tokenize(text):
    """Lowercase `text` and return its list of word tokens."""
    # NOTE(review): the raw reviews appear to contain literal backslash-n
    # sequences; they are blanked so they do not produce stray "n" tokens.
    # This is a no-op if the sequence is absent - confirm against the data.
    return token_pattern.findall(text.lower().replace("\\n", " "))


tokenized_texts = [tokenize(text) for text in texts]
# Single flat token stream, used only for frequency counting.
flat_tokens = [word for sentence in tokenized_texts for word in sentence]

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/299M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

# Build Vocabulary

In [None]:
# Vocabulary limits: at most `vocab_size` entries (including the two special
# tokens), and only words that occur at least `min_freq` times in the corpus.
vocab_size = 10000
min_freq = 5
word_freq = Counter(flat_tokens)

# `most_common` is sorted by descending count; apply the frequency floor that
# was previously declared but never used.
most_common = [
    (word, count)
    for word, count in word_freq.most_common(vocab_size - 2)
    if count >= min_freq
]

# Indices 0 and 1 are reserved for the special tokens; real words start at 2.
word2idx = {'<UNK>': 0, '<PAD>': 1}
for i, (word, _) in enumerate(most_common, start=2):
    word2idx[word] = i
# Reverse lookup used when printing results.
idx2word = {idx: word for word, idx in word2idx.items()}


# Generate Skip-Gram Pairs

In [None]:
def generate_skipgram_pairs(tokenized_sentences, window_size=2, vocab=None):
    """Build (center, context) index pairs for skip-gram training.

    Args:
        tokenized_sentences: iterable of token lists (one list per sentence).
        window_size: number of neighbours considered on each side of the
            center word. Values <= 0 produce no pairs.
        vocab: mapping from token to integer index. When omitted, the
            notebook-level `word2idx` is used, which keeps existing calls
            working unchanged.

    Returns:
        A list of `(center_index, context_index)` tuples. Tokens missing from
        the vocabulary are mapped to the `<UNK>` index (0 if the vocabulary
        has no `<UNK>` entry). Pairs never cross sentence boundaries.
    """
    if vocab is None:
        vocab = word2idx  # fall back to the notebook's global vocabulary
    unk_idx = vocab.get('<UNK>', 0)

    pairs = []
    for sentence in tokenized_sentences:
        indices = [vocab.get(word, unk_idx) for word in sentence]
        sentence_len = len(indices)
        for center_pos, center_idx in enumerate(indices):
            # Clamp the window to the sentence instead of testing every offset.
            start = max(0, center_pos - window_size)
            stop = min(sentence_len, center_pos + window_size + 1)
            for context_pos in range(start, stop):
                if context_pos != center_pos:
                    pairs.append((center_idx, indices[context_pos]))
    return pairs

# Materialise every (center, context) training pair for the whole corpus,
# using the default two-word window on each side.
pairs = generate_skipgram_pairs(tokenized_texts, window_size=2)

# Dataset and DataLoader

In [None]:
class SkipGramDataset(Dataset):
    """Map-style dataset over precomputed (center, context) index pairs."""

    def __init__(self, pairs):
        # The sequence is stored by reference; nothing is copied or converted.
        self.pairs = pairs

    def __len__(self):
        """Number of training pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return the pair at `idx` as two 0-d tensors (center, context)."""
        center_idx, context_idx = self.pairs[idx]
        return (torch.tensor(center_idx), torch.tensor(context_idx))

# Wrap the pair list in a dataset and serve it in shuffled mini-batches.
train_dataset = SkipGramDataset(pairs)
batch_size = 256
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Word2Vec Skip-Gram Model

In [None]:
class Word2Vec(nn.Module):
    """Skip-gram scorer with separate center and context embedding tables.

    Args:
        vocab_size: number of rows in each embedding table.
        embedding_dim: length of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.center_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_words, context_words):
        """Return the dot-product score (a logit) for each center/context pair.

        Args:
            center_words: integer tensor of center-word indices.
            context_words: integer tensor of context-word indices whose shape
                matches or broadcasts against `center_words`.

        Returns:
            Tensor of raw scores with the (broadcast) shape of the inputs.
        """
        center_embeds = self.center_embeddings(center_words)
        context_embeds = self.context_embeddings(context_words)
        # Reduce over the embedding axis (always the last one). For 1-D index
        # batches this equals the previous dim=1; it additionally supports
        # scalar indices and (batch, k) index tensors such as k negatives.
        return torch.sum(center_embeds * context_embeds, dim=-1)

# Size the embedding tables from the vocabulary and place the model on the GPU
# when one is available, otherwise on the CPU.
embedding_dim = 200
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Word2Vec(vocab_size=len(word2idx), embedding_dim=embedding_dim).to(device)

# Training the Model

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
# Scores from the model are raw logits, so use the logit-based binary loss.
loss_fn = nn.BCEWithLogitsLoss()

epochs = 20
num_words = len(word2idx)  # upper bound (exclusive) for negative sampling
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for center, context in tqdm(train_loader):
        center, context = center.to(device), context.to(device)

        # One uniformly sampled negative context per positive pair. It is drawn
        # directly on the target device to avoid a CPU allocation plus a
        # host-to-device copy on every batch.
        # NOTE(review): uniform sampling can pick the true context word or a
        # special token (<UNK>/<PAD>) as the "negative" - confirm acceptable.
        negative_context = torch.randint(0, num_words, context.size(), device=device)

        # Forward pass
        pos_scores = model(center, context)
        neg_scores = model(center, negative_context)

        # Observed pairs are labelled 1 and sampled pairs 0; the *_like helpers
        # create the labels with the scores' shape, dtype and device.
        loss = (
            loss_fn(pos_scores, torch.ones_like(pos_scores))
            + loss_fn(neg_scores, torch.zeros_like(neg_scores))
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # The reported value is the sum of per-batch losses over the epoch.
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

100%|██████████| 13355/13355 [01:22<00:00, 161.95it/s]


Epoch 1/20, Loss: 90971.1613


100%|██████████| 13355/13355 [01:23<00:00, 160.64it/s]


Epoch 2/20, Loss: 46661.9170


100%|██████████| 13355/13355 [01:23<00:00, 160.65it/s]


Epoch 3/20, Loss: 30855.2078


100%|██████████| 13355/13355 [01:22<00:00, 161.11it/s]


Epoch 4/20, Loss: 23525.2784


100%|██████████| 13355/13355 [01:23<00:00, 160.81it/s]


Epoch 5/20, Loss: 19235.0791


100%|██████████| 13355/13355 [01:22<00:00, 161.17it/s]


Epoch 6/20, Loss: 16391.2795


100%|██████████| 13355/13355 [01:23<00:00, 160.57it/s]


Epoch 7/20, Loss: 14326.2613


100%|██████████| 13355/13355 [01:22<00:00, 161.00it/s]


Epoch 8/20, Loss: 12737.8142


100%|██████████| 13355/13355 [01:23<00:00, 160.30it/s]


Epoch 9/20, Loss: 11480.3478


100%|██████████| 13355/13355 [01:23<00:00, 160.43it/s]


Epoch 10/20, Loss: 10494.2649


100%|██████████| 13355/13355 [01:23<00:00, 160.66it/s]


Epoch 11/20, Loss: 9709.4226


100%|██████████| 13355/13355 [01:22<00:00, 161.29it/s]


Epoch 12/20, Loss: 9074.4453


100%|██████████| 13355/13355 [01:23<00:00, 160.64it/s]


Epoch 13/20, Loss: 8542.2664


100%|██████████| 13355/13355 [01:23<00:00, 160.20it/s]


Epoch 14/20, Loss: 8126.1015


100%|██████████| 13355/13355 [01:23<00:00, 159.15it/s]


Epoch 15/20, Loss: 7787.4119


100%|██████████| 13355/13355 [01:23<00:00, 160.48it/s]


Epoch 16/20, Loss: 7503.0659


100%|██████████| 13355/13355 [01:23<00:00, 160.00it/s]


Epoch 17/20, Loss: 7273.2559


100%|██████████| 13355/13355 [01:23<00:00, 159.99it/s]


Epoch 18/20, Loss: 7057.2664


100%|██████████| 13355/13355 [01:23<00:00, 160.12it/s]


Epoch 19/20, Loss: 6877.6071


100%|██████████| 13355/13355 [01:23<00:00, 160.46it/s]

Epoch 20/20, Loss: 6728.2118





# Save and Load the Model

In [None]:
# Persist the trained weights and the token-to-index mapping to the working directory.
model_path = "skipgram_model.pt"
vocab_path = "word2idx.pt"
torch.save(model.state_dict(), model_path)
torch.save(word2idx, vocab_path)

# To restore later (construct Word2Vec with the same sizes first):
# word2idx = torch.load("word2idx.pt")
# model.load_state_dict(torch.load("skipgram_model.pt", map_location=device))
# model.eval()

# Inference – Get Similar Words

In [None]:
def get_similar_words(query_word, top_n=5):
    """Print the `top_n` vocabulary words most similar to `query_word`.

    Similarity is the cosine similarity between rows of the model's center
    embedding table. The query word itself and the special tokens
    (`<UNK>`, `<PAD>`) are never listed.

    Args:
        query_word: word to look up; must be a key of `word2idx`.
        top_n: maximum number of neighbours to print. It is capped at the
            number of eligible vocabulary entries.

    Returns:
        None. Results (or a "not in vocabulary" message) are printed.
    """
    model.eval()
    if query_word not in word2idx:
        print(f"'{query_word}' not in vocabulary.")
        return

    with torch.no_grad():
        query_idx = word2idx[query_word]

        # Unit-normalise every row so the dot product is cosine similarity.
        # With raw dot products, vectors with a large norm dominate the
        # ranking and the query is not guaranteed to be its own top match.
        embeddings = torch.nn.functional.normalize(model.center_embeddings.weight, dim=1)
        similarities = torch.matmul(embeddings, embeddings[query_idx])

        # Mask out the query and the special tokens explicitly, rather than
        # assuming the query is ranked first and slicing it off.
        excluded = {query_idx}
        for special in ('<UNK>', '<PAD>'):
            if special in word2idx:
                excluded.add(word2idx[special])
        similarities[list(excluded)] = float("-inf")

        # topk raises if k exceeds the number of elements; clamp it.
        k = max(0, min(top_n, similarities.numel() - len(excluded)))
        similar_indices = similarities.topk(k).indices.tolist()

    print(f"Words similar to '{query_word}':")
    for idx in similar_indices:
        print(f"- {idx2word[idx]}")

# Example: list the five nearest neighbours of "good"
get_similar_words("good", top_n=5)

Words similar to 'good':
- machine
- clothing
- perfectly,
- classy
- quiet
