<a href="https://colab.research.google.com/github/mohamedshouaib/iti/blob/main/NLP/tasks/task5_NLP_W2V.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#This notebook introduce a complete implementation of a Word2Vec-like Skip-Gram model using PyTorch, the Hugging Face Datasets library, and a custom tokenizer. The used dataset is "yelp_review_full".

# Install and Import Dependencies

In [53]:
!pip install datasets transformers torch tqdm



In [54]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import random
from collections import Counter
import numpy as np
from tqdm import tqdm

# Load and Preprocess the Dataset

In [55]:
dataset = load_dataset("yelp_review_full", split="train[:5%]")
texts = [item['text'] for item in dataset]

tokenized_texts = [text.lower().split() for text in texts]
flat_tokens = [word for sentence in tokenized_texts for word in sentence]

# Build Vocabulary

In [56]:
vocab_size = 10000
min_freq = 5
word_freq = Counter(flat_tokens)
most_common = word_freq.most_common(vocab_size - 2)

word2idx = {'<UNK>': 0, '<PAD>': 1}
for i, (word, _) in enumerate(most_common, start=2):
    word2idx[word] = i
idx2word = {idx: word for word, idx in word2idx.items()}


# Generate Skip-Gram Pairs

In [57]:
def generate_skipgram_pairs(tokenized_sentences, window_size=2):
    pairs = []
    for sentence in tokenized_sentences:
        indices = [word2idx.get(word, 0) for word in sentence]
        for center_pos in range(len(indices)):
            for w in range(-window_size, window_size + 1):
                context_pos = center_pos + w
                if w != 0 and 0 <= context_pos < len(indices):
                    pairs.append((indices[center_pos], indices[context_pos]))
    return pairs

pairs = generate_skipgram_pairs(tokenized_texts)

# Dataset and DataLoader

In [58]:
class SkipGramDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        center, context = self.pairs[idx]
        return torch.tensor(center), torch.tensor(context)

batch_size = 512
train_dataset = SkipGramDataset(pairs)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Word2Vec Skip-Gram Model

In [59]:
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.center_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_words, context_words):
        center_embeds = self.center_embeddings(center_words)
        context_embeds = self.context_embeddings(context_words)
        scores = torch.sum(center_embeds * context_embeds, dim=1)
        return scores

embedding_dim = 200
model = Word2Vec(vocab_size=len(word2idx), embedding_dim=embedding_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Training the Model

In [60]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.BCEWithLogitsLoss()

epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for center, context in tqdm(train_loader):
        center, context = center.to(device), context.to(device)

        # Create positive and negative samples
        positive_labels = torch.ones(center.size(0)).to(device)
        negative_context = torch.randint(0, len(word2idx), context.size()).to(device)
        negative_labels = torch.zeros(center.size(0)).to(device)

        # Forward pass
        pos_scores = model(center, context)
        neg_scores = model(center, negative_context)

        # Compute loss
        loss = loss_fn(pos_scores, positive_labels) + loss_fn(neg_scores, negative_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

100%|██████████| 32413/32413 [07:07<00:00, 75.82it/s]


Epoch 1/10, Loss: 90320.9652


100%|██████████| 32413/32413 [07:04<00:00, 76.27it/s]


Epoch 2/10, Loss: 31988.3264


100%|██████████| 32413/32413 [07:05<00:00, 76.17it/s]


Epoch 3/10, Loss: 24043.9943


100%|██████████| 32413/32413 [07:02<00:00, 76.79it/s]


Epoch 4/10, Loss: 20856.5922


100%|██████████| 32413/32413 [07:04<00:00, 76.40it/s]


Epoch 5/10, Loss: 19274.4050


100%|██████████| 32413/32413 [07:01<00:00, 76.89it/s]


Epoch 6/10, Loss: 18368.6513


100%|██████████| 32413/32413 [07:02<00:00, 76.63it/s]


Epoch 7/10, Loss: 17822.0861


100%|██████████| 32413/32413 [07:02<00:00, 76.78it/s]


Epoch 8/10, Loss: 17448.2099


100%|██████████| 32413/32413 [06:58<00:00, 77.39it/s]


Epoch 9/10, Loss: 17183.3344


100%|██████████| 32413/32413 [06:59<00:00, 77.20it/s]

Epoch 10/10, Loss: 16982.3353





# Save and Load the Model

In [61]:
# Save model and vocab
torch.save(model.state_dict(), "skipgram_model.pt")
torch.save(word2idx, "word2idx.pt")

# To load later:
# model.load_state_dict(torch.load("skipgram_model.pt"))
# model.eval()

# Inference – Get Similar Words

In [62]:
def get_similar_words(query_word, top_n=5):
    model.eval()
    if query_word not in word2idx:
        print(f"'{query_word}' not in vocabulary.")
        return

    with torch.no_grad():
        query_idx = word2idx[query_word]
        query_vec = model.center_embeddings(torch.tensor([query_idx]).to(device))

        all_embeddings = model.center_embeddings.weight.data
        similarities = torch.matmul(query_vec, all_embeddings.T).squeeze(0)
        similar_indices = similarities.topk(top_n + 1).indices.tolist()[1:]

        print(f"Words similar to '{query_word}':")
        for idx in similar_indices:
            print(f"- {idx2word[idx]}")

# Example
get_similar_words("good")

Words similar to 'good':
- superior
- thai,
- entertaining
- alright
- divine.
