## Skip Gram Model

P(word|context) -> CBOW
skip gram -> P(Wcontext | Wword) it provides the word embeddings , given the words it predicts context for the word embeddings. Why do we require the embeddings? These embeddings captures the semantic relationships among the words because one hot encoding doesn't captures the semantic relationship 
    - because one-hot encoding is a sparse matrix
    - the euclidean distance of root(2) doesn't gurantees the far away 
Skip gram model thus captures the:
    - semantic relationship
    - syntactic relationship (explain it later)

## Question
1. Word Representation using Word2Vec: Implement the skip-gram model with negative
sampling loss function for word embedding generation. Your implementation should include: [20
 Marks]
 (b) Implement the skip-gram model from scratch with negative sampling loss. [4]
 (c) Derive and implement the gradients for backpropagation. [4]
 (d) Train your model on the text8 dataset with appropriate hyperparameters (specify your choices
 and justify them). [3]
 (e) Evaluate the quality of your embeddings through: [4]
 • Visualization using SVD to project the embeddings to 2D space.
 • Word similarity analysis for semantically related words (e.g., “king”- “man” + “woman”
 ≈ “queen”).
 (f) Discuss the impact of key hyperparameters (e.g., embedding dimension, context window size,
 number of negative samples) on the quality of the learned representations. [2]

In [1]:
import torch.nn.functional as F
import torch
import numpy as np
import torch.nn as nn

class SkipGram:
    def __init__(self,input,hidden,output=1):
        self.hidden=hidden
        self.output=output
        self.input=input
        self.W1 = torch.tensor([[np.random.normal() for _ in range(self.output)] for _ in range(self.input)]) #how do i initialize a tensor of the size input X hidden
        self.W2 = torch.tensor([[np.random.normal() for _ in range(self.output)] for _ in range(self.input)])
        self.b1 = torch.tensor([np.random.normal() for _ in range(self.hidden)])
        self.b2 = torch.tensor([np.random.normal() for _ in range(self.output)])
        
        self.eta =0.01
        
    def forward(self,input_tensor):
        f = torch.tanh(torch.add(torch.matmul(self.W1.T,input_tensor),self.b1))
        g = torch.add(torch.matmul(self.W2.T,f),self.b2)
        output= F.softmax(g)
        # how do we find the log of this 
        return output , f
    
    def cross_entropy(self,X,y):
        m = y.shape[0]
        p = F.log_softmax(X,dim=1)
        loss = -p[torch.arange(m), y].mean()
        return loss
    
    def delta_cross_entropy(self,X,y):
        m = y.shape[0]
        grad = F.softmax(X).clone()
        grad[torch.arange(m), y] -= 1
        grad = grad/m
        return grad
    
    def gradient_descent(self,X,y):
        n_epoch = 10
        
        for _ in range(n_epoch):
            prediction,f = self.forward(X)
            loss = self.cross_entropy(prediction,y)
            
            # calculate the gradients of loss wrt crossentropy function
            d_g = prediction.clone()
            d_g[torch.arange(y.shape[0]), y] -= 1
            d_g /= y.shape[0]

            # calculate the deltaW1, deltaW2 and deltab1, deltab2 
            deltaW2 = torch.matmul(f.T,d_g)
            db2 = torch.sum(d_g,dim=0)
            
            d_f = torch.matmul(d_g, self.W2.T) * (1 - f**2) 
            deltaW1 = torch.matmul(X.T,d_f)
            db1 = torch.sum(d_f,dim=0)
            self.W1 -= self.eta*deltaW1
            self.W2 -= self.eta*deltaW2
            self.b1 -= self.eta*db1
            self.b2 -= self.eta*db2
            print(f'Epoch {_+1}, Loss: {loss.item():.4f}')


## Negative Sampling in the code

In [2]:
import torch
import numpy as np

class SkipGram:
    def __init__(self, input, hidden, output=1, neg_sample_size=5):
        self.input = input
        self.hidden = hidden
        self.output = output
        self.neg_sample_size = neg_sample_size  # Number of negative samples
        
        # Initialize weights and biases
        self.W1 = torch.tensor(np.random.randn(input, hidden), dtype=torch.float32)  # Input × Hidden
        self.W2 = torch.tensor(np.random.randn(hidden, output), dtype=torch.float32)  # Hidden × Output
        self.b1 = torch.tensor(np.random.randn(hidden), dtype=torch.float32)  # Hidden bias
        self.b2 = torch.tensor(np.random.randn(output), dtype=torch.float32)  # Output bias
        
        self.eta = 0.01  # Learning rate

    def forward(self, input_tensor):
        # Forward pass (same as before)
        f = torch.tanh(torch.matmul(input_tensor, self.W1) + self.b1)  # Hidden layer activation
        return f  # Return only hidden activations
    
    def sample_negatives(self, vocab_size, exclude_indices, n_samples):
        """
        Negative sampling: Draw 'n_samples' random indices, excluding 'exclude_indices'.
        """
        neg_indices = []
        while len(neg_indices) < n_samples:
            sample = np.random.randint(0, vocab_size)
            if sample not in exclude_indices:  # Avoid sampling positive targets
                neg_indices.append(sample)
        return torch.tensor(neg_indices, dtype=torch.long)

    def calculate_loss(self, f, target_word_idx, neg_sample_indices, W2, b2):
        """
        Compute the binary cross-entropy loss for positive and negative samples.
        """
        # Positive sample score
        pos_score = torch.sigmoid(torch.matmul(f, W2[:, target_word_idx]) + b2[target_word_idx])
        pos_loss = -torch.log(pos_score + 1e-9)  # Add small epsilon for numerical stability

        # Negative samples score
        neg_scores = torch.sigmoid(torch.matmul(f, W2[:, neg_sample_indices]) + b2[neg_sample_indices])
        neg_loss = -torch.sum(torch.log(1 - neg_scores + 1e-9))  # Sum over negative samples

        return pos_loss + neg_loss

    def gradient_descent(self, input_tensor, target_indices, vocab_size):
        """
        Perform gradient descent using negative sampling.
        """
        n_epoch = 10
        for epoch in range(n_epoch):
            total_loss = 0.0
            for i, input_vec in enumerate(input_tensor):
                # Forward pass
                f = self.forward(input_vec.unsqueeze(0))

                # Positive target index
                target_idx = target_indices[i]

                # Negative sampling
                neg_sample_indices = self.sample_negatives(vocab_size, [target_idx], self.neg_sample_size)

                # Loss computation
                loss = self.calculate_loss(f, target_idx, neg_sample_indices, self.W2, self.b2)
                total_loss += loss.item()

                # Gradients for W2 and b2
                pos_score = torch.sigmoid(torch.matmul(f, self.W2[:, target_idx]) + self.b2[target_idx])
                self.W2[:, target_idx] -= self.eta * (pos_score - 1) * f.squeeze()
                self.b2[target_idx] -= self.eta * (pos_score - 1)

                neg_scores = torch.sigmoid(torch.matmul(f, self.W2[:, neg_sample_indices]) + self.b2[neg_sample_indices])
                for j, neg_idx in enumerate(neg_sample_indices):
                    self.W2[:, neg_idx] -= self.eta * neg_scores[j] * f.squeeze()
                    self.b2[neg_idx] -= self.eta * neg_scores[j]

                # Gradients for W1 and b1
                grad_f = (pos_score - 1) * self.W2[:, target_idx] + torch.sum(neg_scores.unsqueeze(1) * self.W2[:, neg_sample_indices], dim=1)
                self.W1 -= self.eta * torch.matmul(input_vec.unsqueeze(1), grad_f.unsqueeze(0))
                self.b1 -= self.eta * grad_f.squeeze()

            print(f'Epoch {epoch+1}, Loss: {total_loss:.4f}')

## Data preprocessing

In [1]:
def data_preprocessing(file_path):
    with open(file_path,'r') as file:
        data=file.read()
        print("read data")
    return data

file_path = "C:/users/pantm/Downloads/text8/text8.txt"
data = data_preprocessing(file_path)
#print(data)

read data


In [4]:
import random
import torch
from collections import Counter
def preprocess_text(corpus, min_count=5):
    """Preprocess text corpus into vocabulary and word indices"""
    words = corpus.lower().split()
    word_counts = Counter(words)
    # Filter words that appear less than min_count times
    vocab = {word: i for i, (word, count) in enumerate(word_counts.items()) if count >= min_count}
    # Add UNK token for unknown words
    vocab['UNK'] = len(vocab)
    # Create reverse mapping
    idx_to_word = {i: word for word, i in vocab.items()}
    # Convert corpus to word indices
    word_indices = [vocab.get(word, vocab['UNK']) for word in words]
    return vocab, idx_to_word, word_indices

def generate_skip_grams(word_indices, window_size=5):
    """Generate skip-gram pairs with context window"""
    skip_grams = []
    for i in range(len(word_indices)):
        target_word = word_indices[i]
        # Get context words within window
        context_range = range(max(0, i - window_size), min(len(word_indices), i + window_size + 1))
        for j in context_range:
            if i != j:  # Skip the target word itself
                skip_grams.append((target_word, word_indices[j]))
    return skip_grams

# Example usage with a small corpus
vocab, idx_to_word, word_indices = preprocess_text(data, min_count=1)
skip_grams = generate_skip_grams(word_indices, window_size=2)

# Convert to tensors for model input
input_tensor = torch.tensor([pair[0] for pair in skip_grams], dtype=torch.long)
target_indices = torch.tensor([pair[1] for pair in skip_grams], dtype=torch.long)

In [13]:
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")
#skip_grams = generate_skip_grams(word_indices, window_size=5)
model = SkipGram(input=vocab_size, hidden=300, neg_sample_size=10)
model.gradient_descent(input_tensor, target_indices,vocab_size)

Vocabulary size: 253855


RuntimeError: bad allocation

In [3]:

import torch
import numpy as np
from collections import Counter

class SkipGram:
    def __init__(self, input_dim, hidden_dim, vocab_size, neg_sample_size=5):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.neg_sample_size = neg_sample_size  # Number of negative samples
        
        # Initialize weights and biases
        self.W1 = torch.tensor(np.random.randn(input_dim, hidden_dim), dtype=torch.float32)  # Input × Hidden
        self.W2 = torch.tensor(np.random.randn(hidden_dim, vocab_size), dtype=torch.float32)  # Hidden × Output
        self.b1 = torch.tensor(np.zeros(hidden_dim), dtype=torch.float32)  # Hidden bias
        self.b2 = torch.tensor(np.zeros(vocab_size), dtype=torch.float32)  # Output bias
        
        self.eta = 0.01  # Learning rate

    def forward(self, input_vec):
        """Forward pass: Compute hidden layer activations and output scores."""
        f = torch.tanh(torch.matmul(input_vec, self.W1) + self.b1)  # Hidden layer activation
        output = torch.matmul(f, self.W2) + self.b2  # Output scores
        return f, output

    def calculate_loss(self, output, target_idx, neg_sample_indices):
        """Binary cross-entropy loss for positive and negative samples."""
        # Positive sample score
        pos_score = torch.sigmoid(output[0, target_idx])
        pos_loss = -torch.log(pos_score + 1e-9)  # Positive sample loss

        # Negative samples score
        neg_scores = torch.sigmoid(output[0, neg_sample_indices])
        neg_loss = -torch.sum(torch.log(1 - neg_scores + 1e-9))  # Negative sample loss

        return pos_loss + neg_loss

    def gradient_descent(self, input_tensor, target_indices, vocab_size):
        """Perform training with manual gradient calculation."""
        n_epoch = 25
        for epoch in range(n_epoch):
            total_loss = 0.0
            for i, input_idx in enumerate(input_tensor):
                input_vec = torch.zeros(1, self.input_dim)  # One-hot encoding for input word
                input_vec[0, input_idx] = 1.0

                # Forward pass
                f, output = self.forward(input_vec)

                # Negative sampling
                neg_sample_indices = self.sample_negatives(vocab_size, [target_indices[i]], self.neg_sample_size)

                # Loss computation
                loss = self.calculate_loss(output, target_indices[i], neg_sample_indices)
                total_loss += loss.item()

                # Gradients for W2 and b2
                pos_score = torch.sigmoid(output[0, target_indices[i]])
                grad_W2_pos = (pos_score - 1) * f.squeeze()  # Gradient for positive sample
                grad_b2_pos = (pos_score - 1)

                neg_scores = torch.sigmoid(output[0, neg_sample_indices])
                grad_W2_neg = torch.sum(neg_scores.unsqueeze(1) * f, dim=0)  # Gradient for negative samples
                grad_b2_neg = neg_scores

                grad_W2 = torch.zeros_like(self.W2)
                grad_W2[:, target_indices[i]] = grad_W2_pos
                for j, neg_idx in enumerate(neg_sample_indices):
                    grad_W2[:, neg_idx] += grad_W2_neg[j]

                grad_b2 = torch.zeros_like(self.b2)
                grad_b2[target_indices[i]] = grad_b2_pos
                for j, neg_idx in enumerate(neg_sample_indices):
                    grad_b2[neg_idx] += grad_b2_neg[j]

                # Gradients for W1 and b1
                grad_f = (pos_score - 1) * self.W2[:, target_indices[i]] + torch.sum(neg_scores.unsqueeze(0) * self.W2[:, neg_sample_indices], dim=1)
                grad_W1 = torch.matmul(input_vec.T, (1 - f**2) * grad_f.unsqueeze(0))  # Gradient for W1
                grad_b1 = (1 - f**2).squeeze() * grad_f  # Gradient for b1

                # Update parameters
                self.W1 -= self.eta * grad_W1
                self.W2 -= self.eta * grad_W2
                self.b1 -= self.eta * grad_b1
                self.b2 -= self.eta * grad_b2

            print(f'Epoch {epoch+1}, Loss: {total_loss:.4f}')

    def sample_negatives(self, vocab_size, exclude_indices, n_samples):
        """Negative sampling: Draw 'n_samples' random indices, excluding 'exclude_indices'."""
        neg_indices = []
        while len(neg_indices) < n_samples:
            sample = np.random.randint(0, vocab_size)
            if sample not in exclude_indices:  # Avoid sampling positive targets
                neg_indices.append(sample)
        return torch.tensor(neg_indices, dtype=torch.long)


# Additional utility functions for text processing remain the same
def preprocess_text(corpus, min_count=5):
    """Preprocess text corpus into vocabulary and word indices."""
    words = corpus.lower().split()
    word_counts = Counter(words)
    vocab = {word: i for i, (word, count) in enumerate(word_counts.items()) if count >= min_count}
    vocab['UNK'] = len(vocab)
    idx_to_word = {i: word for word, i in vocab.items()}
    word_indices = [vocab.get(word, vocab['UNK']) for word in words]
    return vocab, idx_to_word, word_indices

def generate_skip_grams(word_indices, window_size=5):
    """Generate skip-gram pairs with context window."""
    skip_grams = []
    for i in range(len(word_indices)):
        target_word = word_indices[i]
        context_range = range(max(0, i - window_size), min(len(word_indices), i + window_size + 1))
        for j in context_range:
            if i != j:
                skip_grams.append((target_word, word_indices[j]))
    return skip_grams

# Example usage with dummy data
vocab, idx_to_word, word_indices = preprocess_text(data, min_count=5)
skip_grams = generate_skip_grams(word_indices, window_size=2)

# Convert to tensors for model input
input_tensor = torch.tensor([pair[0] for pair in skip_grams], dtype=torch.long)
target_indices = torch.tensor([pair[1] for pair in skip_grams], dtype=torch.long)

# Initialize and train the SkipGram model
# model = SkipGram(input_dim=len(vocab), hidden_dim=50, vocab_size=len(vocab))
# model.gradient_descent(input_tensor, target_indices, vocab_size=len(vocab))

- debugging of the code let us try to do this code with 
- how skip gram uses the 
  - target, context and the negative samples

In [7]:
print(len(vocab),len(skip_grams)) 

71291 68020822


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader

# SkipGramDataset class
class SkipGramDataset(Dataset):
    def __init__(self, context_pairs, vocab_size, num_negative_samples=5):
        self.context_pairs = context_pairs
        self.vocab_size = vocab_size
        self.num_negative_samples = num_negative_samples

    def __len__(self):
        return len(self.context_pairs)

    def __getitem__(self, idx):
        target, context = self.context_pairs[idx]
        negative_samples = np.random.choice(
            self.vocab_size, self.num_negative_samples, replace=False
        )
        return target, context, torch.LongTensor(negative_samples)

# Define the Skip-Gram Model
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, target, context, negative_samples):
        # Positive pair score
        target_embedding = self.target_embeddings(target)  # [batch_size, embedding_dim]
        context_embedding = self.context_embeddings(context)  # [batch_size, embedding_dim]
        positive_score = torch.mul(target_embedding, context_embedding).sum(dim=1)  # Dot product
        positive_score = torch.sigmoid(positive_score)

        # Negative pair score
        negative_context_embeddings = self.context_embeddings(negative_samples)  # [batch_size, num_neg_samples, embedding_dim]
        negative_score = torch.bmm(negative_context_embeddings, target_embedding.unsqueeze(2)).squeeze()  # Dot product
        negative_score = torch.sigmoid(-negative_score).sum(dim=1)  # Sum of negative scores

        return positive_score, negative_score

# Loss Function
def negative_sampling_loss(positive_score, negative_score):
    loss = -torch.log(positive_score + 1e-8) - torch.log(negative_score + 1e-8)
    return loss.mean()



#vocab, idx_to_word, word_indices = preprocess_text(data, min_count=5)
#skip_grams = generate_skip_grams(word_indices, window_size=2)

# Generate Toy Data
vocab_size = len(vocab)
embedding_dim = 64
#context_pairs = [(np.random.randint(vocab_size), np.random.randint(vocab_size)) for _ in range(1000)]
dataset = SkipGramDataset(skip_grams, vocab_size)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Model, Optimizer
model = SkipGram(vocab_size, embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)

#Training Loop
for epoch in range(10):
    total_loss = 0
    for target, context, negative_samples in dataloader:
        target, context, negative_samples = target.long(), context.long(), negative_samples.long()

        positive_score, negative_score = model(target, context, negative_samples)
        loss = negative_sampling_loss(positive_score, negative_score)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

print("Training Complete!")

IndexError: index out of range in self

: 

In [2]:
def data_preprocessing(file_path):
    with open(file_path,'r') as file:
        data=file.read()
        print("read data")
    return data

file_path = "C:/users/pantm/Downloads/text8/text8.txt"
data = data_preprocessing(file_path)



read data


## New code with updated 

In [None]:
import torch
import torch.nn.functional as F
import numpy as np
from collections import Counter

class SkipGram:
    def __init__(self, vocab_size, embedding_dim, num_negative_samples=5):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.num_negative_samples = num_negative_samples
        
        # Initialize embeddings
        self.W1 = torch.randn(vocab_size, embedding_dim) * 0.1
        self.W2 = torch.randn(vocab_size, embedding_dim) * 0.1
        
        self.eta = 0.01
        
    def forward(self, target_word, context_word, negative_samples):
        # Get embeddings
        target_emb = self.W1[target_word]  # [batch_size, embedding_dim]
        context_emb = self.W2[context_word]  # [batch_size, embedding_dim]
        neg_emb = self.W2[negative_samples]  # [batch_size, num_neg_samples, embedding_dim]
        
        # Positive score
        pos_score = torch.sum(target_emb * context_emb, dim=1)
        pos_loss = -F.logsigmoid(pos_score)
        
        # Negative score
        neg_score = torch.bmm(neg_emb, target_emb.unsqueeze(2)).squeeze()
        neg_loss = -torch.sum(F.logsigmoid(-neg_score), dim=1)
        
        return pos_loss + neg_loss, target_emb, context_emb, neg_emb

    def gradient_descent(self, dataloader, n_epoch=10):
        for epoch in range(n_epoch):
            total_loss = 0
            for target_word, context_word, negative_samples in dataloader:
                loss, target_emb, context_emb, neg_emb = self.forward(target_word, context_word, negative_samples)
                total_loss += loss.mean().item()

                # Gradients for positive samples
                d_pos = -torch.sigmoid(-torch.sum(target_emb * context_emb, dim=1)).unsqueeze(1) * context_emb
                d_context = -torch.sigmoid(-torch.sum(target_emb * context_emb, dim=1)).unsqueeze(1) * target_emb

                # Gradients for negative samples
                d_neg = torch.sigmoid(torch.bmm(neg_emb, target_emb.unsqueeze(2)).squeeze()).unsqueeze(2) * neg_emb
                d_target_neg = torch.sum(torch.sigmoid(torch.bmm(neg_emb, target_emb.unsqueeze(2)).squeeze()).unsqueeze(2) * neg_emb, dim=1)

                # Update embeddings
                self.W1[target_word] -= self.eta * (d_pos + d_target_neg)
                self.W2[context_word] -= self.eta * d_context
                self.W2[negative_samples] -= self.eta * d_neg

            print(f'Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}')

    def get_word_embedding(self, word_idx):
        return self.W1[word_idx].detach().numpy()

class SkipGramDataset(torch.utils.data.Dataset):
    def __init__(self, text, window_size=2, num_negative_samples=5):
        words = text.lower().split()
        word_counts = Counter(words)
        self.vocab = {word: idx for idx, (word, _) in enumerate(word_counts.most_common())}
        self.vocab_size = len(self.vocab)
        self.idx_to_word = {idx: word for word, idx in self.vocab.items()}
        
        word_indices = [self.vocab[word] for word in words if word in self.vocab]
        
        self.skip_grams = []
        for i in range(len(word_indices)):
            for j in range(max(0, i - window_size), min(len(word_indices), i + window_size + 1)):
                if i != j:
                    self.skip_grams.append((word_indices[i], word_indices[j]))
        
        self.num_negative_samples = num_negative_samples
        
        word_freqs = np.array([count for _, count in word_counts.most_common()])
        word_freqs = word_freqs ** 0.75
        self.sampling_weights = word_freqs / np.sum(word_freqs)

    def __len__(self):
        return len(self.skip_grams)

    def __getitem__(self, idx):
        target_word, context_word = self.skip_grams[idx]
        negative_samples = np.random.choice(
            self.vocab_size, 
            size=self.num_negative_samples, 
            p=self.sampling_weights, 
            replace=False
        )
        return torch.tensor(target_word), torch.tensor(context_word), torch.tensor(negative_samples)

def train_skip_gram_with_negative_sampling(text, embedding_dim=100, window_size=2, 
                                          num_negative_samples=5, batch_size=32, epochs=5):
    dataset = SkipGramDataset(text, window_size, num_negative_samples)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    model = SkipGram(dataset.vocab_size, embedding_dim, num_negative_samples)
    model.gradient_descent(dataloader, n_epoch=epochs)
    
    return model, dataset.vocab, dataset.idx_to_word

def find_similar_words(word, model, vocab, idx_to_word, top_k=5):
    if word not in vocab:
        return []
    
    word_idx = vocab[word]
    word_vector = model.get_word_embedding(word_idx)
    
    similarities = []
    for idx in range(len(vocab)):
        if idx != word_idx:
            vector = model.get_word_embedding(idx)
            similarity = np.dot(word_vector, vector) / (np.linalg.norm(word_vector) * np.linalg.norm(vector))
            similarities.append((idx_to_word[idx], similarity))
    
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_k]

# Example usage
if __name__ == "__main__":
    text = "The wide road shimmered in the hot sun. The algorithm was developed to produce software."
    model, vocab, idx_to_word = train_skip_gram_with_negative_sampling(
        text, embedding_dim=50, epochs=10
    )
    
    similar_words = find_similar_words("the", model, vocab, idx_to_word)
    print(f"Words similar to 'the': {similar_words}")


In [None]:


def generate_skip_grams(word_indices, window_size=5):
    """Generate skip-gram pairs with context window"""
    skip_grams = []
    for i in range(len(word_indices)):
        target_word = word_indices[i]
        # Get context words within window
        context_range = range(max(0, i - window_size), min(len(word_indices), i + window_size + 1))
        for j in context_range:
            if i != j:  # Skip the target word itself
                skip_grams.append((target_word, word_indices[j]))
    return skip_grams

# Example usage with a small corpus
#data = "This is a sample text for skip-gram model with negative sampling. The model learns word embeddings by predicting context words."
 # Using min_count=1 for small example
skip_grams = generate_skip_grams(word_indices, window_size=2)

def generate_negative_samples(target_word, context_word, vocab_size, num_samples):
    """Generate negative samples that are different from the target-context pair"""
    negative_samples = []
    while len(negative_samples) < num_samples:
        # Generate a random context word
        neg_context = np.random.randint(0, vocab_size)
        # Make sure it's not the same as the positive context
        if neg_context != context_word and (target_word, neg_context) not in negative_samples:
            negative_samples.append((target_word, neg_context))
    return negative_samples

# Generate negative samples
num_negative_samples = 5
negTrainSet = []



totalWords = sum([freq**(3/4) for freq in vocab.values()])
wordProb = {word:(freq/totalWords)**(3/4) for word, freq in vocab.items()}

# Example Usage
probabilities = list(wordProb.values())#[0.1, 0.2, 0.4, 0.3]  # Example probability distribution
prob_table, alias_table = create_alias_table(probabilities)


# Sample from the alias table
samples = [alias_sample(prob_table, alias_table) for _ in range(1000)]
print("Generated samples:", samples)




for target_word, context_word in skip_grams:
    # Generate negative samples for this target word
    neg_samples = generate_negative_samples(target_word, context_word, len(vocab), num_negative_samples)
    negTrainSet.extend(neg_samples)

print(f"Number of positive examples: {len(skip_grams)}")
print(f"Number of negative examples: {len(negTrainSet)}")
# # Convert to tensors for model input
# input_tensor = torch.tensor([pair[0] for pair in skip_grams], dtype=torch.long)
# target_indices = torch.tensor([pair[1] for pair in skip_grams], dtype=torch.long)
#print(data)

(0, 1, tensor([68, 43,  9, 86, 56]))

## Implementing the Vision Transformer
- first understand the concept and then try to write the code.