Word2Vec Skip-gram Implementation



In [97]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import random

In [46]:
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        # Initialize the Word2Vec model
        super(Word2Vec, self).__init__()

        # Store the vocabulary size and embedding dimension
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        # Define the embedding matrix (W). This is a learnable parameter.
        # It maps each word to an embedding of size 'embedding_dim'
        # Shape: (embedding_dim, vocab_size)
        self.W = nn.Parameter(torch.rand(self.embedding_dim, self.vocab_size), requires_grad=True)

        # Define the output layer which transforms the embedding vector back to a vector of size vocab_size
        # This will output the probabilities for each word in the vocabulary
        self.output_layer = nn.Linear(self.embedding_dim, self.vocab_size)

        # Softmax function to convert logits into probabilities
        self.softmax = F.softmax

    def forward(self, x, device):
        # x is a one-hot vector of the context word, passed as input

        # Move input tensor to the specified device (GPU or CPU)
        x = torch.tensor(x, dtype=torch.float32).to(device)

        # Perform a matrix multiplication between the input vector and the transpose of the embedding matrix (W)
        # This gives us the embedding for the context word of size (1, embedding_dim)
        x = torch.matmul(x, self.W.t())

        # Now, x is the embedding vector for the context word of size (1, embedding_dim)

        # Pass the embedding through a linear layer to get a vector of size (1, vocab_size)
        x = self.output_layer(x)

        # Apply softmax to get probabilities for each word in the vocabulary
        x = self.softmax(x, dim=-1)

        # Return the final output (probabilities for each word in the vocabulary)
        return x

Importing the text corpus ( Text8 )

In [2]:
import zipfile
import urllib.request

# Download and extract Text8
url = "http://mattmahoney.net/dc/text8.zip"
file_name = "text8.zip"
urllib.request.urlretrieve(url, file_name)

with zipfile.ZipFile(file_name, 'r') as zip_ref:
    zip_ref.extractall()  # Extracts 'text8' to the current directory

# Read the content
with open("text8", 'r') as file:
    corpus = file.read()

print("First 500 characters of the corpus:")
print(corpus[:500])

First 500 characters of the corpus:
 anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philoso


In [3]:
# Tokenize the text into words
words = corpus.split()

print(f"Total words: {len(words)}")
print(f"First 10 words: {words[:10]}")

Total words: 17005207
First 10 words: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [4]:
from collections import Counter

# Define the maximum vocabulary size (top 10,000 most frequent words)
vocab_size = 10000

# Count the frequency of each word in the corpus (list 'words')
word_counts = Counter(words)

# Get the most common words in the corpus, limited to 'vocab_size' (10000 in this case)
most_common_words = word_counts.most_common(vocab_size)

# Create the vocabulary dictionary, where each word is assigned a unique index (starting from 0)
# The most common words are assigned indices first
vocab = {word: idx for idx, (word, _) in enumerate(most_common_words)}

# Replace any word not in the vocabulary with the special token <UNK> (for "unknown" words)
processed_corpus = [word if word in vocab else "<UNK>" for word in words]


In [5]:
vocab["<UNK>"] = 10000

In [6]:
print(list(vocab.keys())[list(vocab.values()).index(10000)])  # Prints <UNK>

<UNK>


Generating training pairs

In [50]:
def generate_skipgram_pairs(words, window_size):
    # Initialize an empty list to store the generated word pairs
    pairs = []

    # Iterate through each word in the list 'words'
    for i, target_word in enumerate(words):
        # Calculate the start and end indices for the context window, ensuring they are within bounds
        start = max(i - window_size, 0)
        end = min(i + window_size + 1, len(words))

        # Iterate through the context window, avoiding the target word itself (i != j)
        for j in range(start, end):
            if i != j:  # Skip the target word itself
                # Add the (target_word, context_word) pair to the pairs list
                pairs.append((target_word, words[j]))

    # Return the generated list of (target_word, context_word) pairs
    return pairs

# Define the window size for context words (1 means the context window is one word on each side)
window_size = 1

# Generate the training pairs using the processed corpus and the defined window size
training_pairs = generate_skipgram_pairs(processed_corpus, window_size)

# Print the first 5 training pairs for inspection
print(f"First 5 training pairs: {training_pairs[:5]}")

First 5 training pairs: [('anarchism', 'originated'), ('originated', 'anarchism'), ('originated', 'as'), ('as', 'originated'), ('as', 'a')]


Defining a loss function

In [36]:
class LogLoss(nn.Module):
    def __init__(self):
        super(LogLoss, self).__init__()

    def forward(self, inputs, targets, device):
        # Ensure inputs and targets are on the same device
        inputs = inputs.to(device)
        targets = torch.tensor(targets, dtype=torch.float32).to(device)  # Make sure targets are float32

        # Safe logarithm computation with a small epsilon to avoid log(0)
        epsilon = 1e-10  # Small value to prevent log(0)
        log_vector = torch.log(inputs + epsilon)  # Apply log with epsilon to avoid log(0)

        # Compute the final log loss (dot product of one-hot targets and log vector)
        final_vector = torch.sum(targets * log_vector, dim=-1)  # Sum over vocab_size dimension

        # Return the log loss value
        return -torch.mean(final_vector)  # Negative because we want to minimize loss

In [87]:
def train_embedding_model(model, training_pairs,loss_fn, opt, epochs):
    # Iterate over the number of epochs for training
    for epoch in range(epochs):
        # Iterate over each (context, target) pair in the training dataset
        for training_pair in training_pairs:
            # Unpack the context (input) and target (output) words from the pair
            context, target = training_pair

            # Create a one-hot vector for the context word
            X = np.zeros(shape=(len(vocab),))  # Initialize a vector of size vocab_size filled with zeros
            X[vocab[context]] = 1  # Set the position corresponding to the context word to 1

            # Create a one-hot vector for the target word
            y = np.zeros(shape=(len(vocab),))  # Initialize a vector of size vocab_size filled with zeros
            y[vocab[target]] = 1  # Set the position corresponding to the target word to 1

            # Convert the input and target one-hot vectors to PyTorch tensors
            X = torch.tensor(X, dtype=torch.float32).to(device)
            y = torch.tensor(y, dtype=torch.float32).to(device)

            # Get the model's prediction by passing the one-hot context vector (X) to the model
            output = model(X, device)

            # Calculate the loss using the model's output and the target vector (y)
            loss = loss_fn(output, y, device)

            # Zero the gradients of the optimizer (clear previous gradients)
            opt.zero_grad()

            # Perform backpropagation to compute gradients
            loss.backward()

            # Update model parameters using the optimizer
            opt.step()

        # Print the progress after each epoch
        print(f"Epoch {epoch + 1} training done \n")

Setting up training with GPU

In [98]:
# Set the learning rate for optimization
learning_rate = 0.01

# Define the size of the vocabulary and the dimension of the word embeddings
vocab_size = 10001  # Vocabulary size (including the <UNK> token)
embedding_dim = 300  # Size of each word embedding vector

# Set the device to CUDA if available, otherwise fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the Word2Vec model with vocab size and embedding dimension
embedding_model = Word2Vec(vocab_size=vocab_size, embedding_dim=embedding_dim)

# Move the model to the appropriate device (GPU or CPU)
embedding_model.to(device)

# Use Adam optimizer for updating model parameters
optimizer = torch.optim.Adam(embedding_model.parameters(), lr=learning_rate)

# Define the loss function (Log Loss, typically used in classification tasks)
criterion = LogLoss()

# Select a subset of training pairs (size may vary) from the whole training set
training_size = 1000
train_pairs = random.sample(training_pairs, training_size)

# Train the embedding model for 5 epochs
train_embedding_model(embedding_model, train_pairs, criterion, optimizer, 5)

  x = torch.tensor(x,dtype=torch.float32).to(device)
  targets = torch.tensor(targets, dtype=torch.float32).to(device)  # Make sure targets are float32


Epoch 1 training done 

Epoch 2 training done 

Epoch 3 training done 

Epoch 4 training done 

Epoch 5 training done 



In [99]:
print(embedding_model.W)

Parameter containing:
tensor([[-0.0146,  0.0907, -0.2002,  ...,  0.5800,  0.6810, -0.1679],
        [ 0.0408, -0.1323,  0.2040,  ...,  0.1933,  0.5413, -0.1152],
        [-0.0913,  0.0943,  0.0371,  ...,  0.4337,  0.6969,  0.0116],
        ...,
        [-0.0133,  0.4184, -0.0737,  ...,  0.3026,  0.6211,  0.2891],
        [-0.1414, -0.0880,  0.1212,  ...,  0.9408,  0.8237,  0.1081],
        [ 0.0022, -0.2043, -0.1563,  ...,  0.7861,  0.6502, -0.1336]],
       device='cuda:0', requires_grad=True)


In [109]:
def get_embedding(word, model, vocab):
  # Check if the word is in the vocabulary
  if word in vocab:
    index = vocab[word]  # Retrieve the index of the word from the vocab
  else:
    # If the word is not found, assign the index for the <UNK> token (typically a predefined index like 10000)
    index = 10000

  # Set the index value to 1 (this seems to override the index logic, which might be unintended)
  index = 1  # (Note: this line overwrites the previous logic, and you might want to remove or adjust this)

  # Create a one-hot vector of length equal to the size of the vocabulary
  X = np.zeros(shape=(len(vocab),))  # Initialize a vector of zeros of length vocab_size
  X[index] = 1  # Set the position corresponding to the word's index to 1 (one-hot encoding)

  # Convert the one-hot vector to a PyTorch tensor and move it to the appropriate device (GPU or CPU)
  X = torch.tensor(X, dtype=torch.float32).to(device)

  # Multiply the one-hot vector with the model's weight matrix (W) to get the word embedding
  embedding = torch.matmul(X, model.W.t())  # Compute the embedding by matrix multiplication

  # Print the shape of the resulting embedding (for debugging purposes)
  print(f"dimension of embedding is : {embedding.shape}")

  # Print the computed embedding for the given word (for debugging purposes)
  print(f"embedding for word '{word}' is : {embedding}")

In [111]:
get_embedding("wawa",embedding_model,vocab)

dimension of embedding is : torch.Size([300])
embedding for word 'wawa' is : tensor([ 9.0698e-02, -1.3226e-01,  9.4261e-02,  1.1339e-01,  1.1698e-01,
         8.1681e-02,  1.1637e-01, -1.0738e-01, -1.2629e-01,  7.6360e-02,
         9.2827e-02,  2.3592e-02, -5.0850e-02, -4.1952e-02, -4.4760e-02,
         1.1629e-01,  5.4650e-02,  1.2056e-01, -4.5607e-02,  1.0455e-01,
         2.3274e-02,  1.1124e-01,  3.6648e-02, -9.7998e-02,  6.4069e-02,
         7.9149e-02,  1.5498e-01, -1.7485e-01, -1.3088e-02, -1.2657e-01,
        -4.7069e-03, -1.6915e-01, -4.4700e-02,  1.5338e-01,  7.9100e-02,
         3.2009e-02, -6.2296e-02,  3.6395e-02,  9.3597e-02,  3.9470e-02,
        -1.1973e-01,  6.0814e-02, -6.7003e-02, -1.4318e-02, -7.3098e-02,
         7.0064e-02, -7.7794e-02,  5.6612e-02,  2.9793e-02,  1.4543e-01,
        -2.2176e-02,  2.0141e-02, -4.4536e-02,  1.1531e-01,  7.7149e-02,
         1.4333e-01, -5.6273e-02, -2.9748e-02, -3.0252e-02,  1.1383e-01,
        -5.7922e-02, -1.1934e-01,  2.1869e-02, 