In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import numpy as np
import re

In [2]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Step 1: Load and Read the Text Data
def load_text(file_path):
    """Read a UTF-8 text file and return its entire contents lower-cased.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents as a single lower-case string.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_contents = handle.read()
    return raw_contents.lower()

In [4]:
# Step 2: Tokenize the Text
def tokenize(text):
    """Strip punctuation from ``text`` and split it into whitespace-separated tokens.

    Every character that is not an ASCII letter, a digit, or whitespace is
    deleted (not replaced by a space), so "it's" becomes "its".

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens in their original order.
    """
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", '', text)
    return cleaned.split()

In [5]:
# Step 3: Create Vocabulary
def build_vocab(words):
    """Assign each distinct word an integer id in order of first appearance.

    Args:
        words: Iterable of hashable tokens.

    Returns:
        A ``(vocabulary, idx_to_word)`` tuple: ``vocabulary`` maps word -> id
        (0-based, first-seen order) and ``idx_to_word`` is the inverse mapping.
    """
    vocabulary = {}
    for token in words:
        # Only the first sighting of a token claims the next free id.
        if token not in vocabulary:
            vocabulary[token] = len(vocabulary)
    idx_to_word = {position: token for token, position in vocabulary.items()}
    return vocabulary, idx_to_word


In [6]:
# Step 4: Prepare Input Data for Skip-Gram Model
def prepare_data(words, vocabulary, window_size=2):
    """Build skip-gram training pairs of (center id, context id).

    For each position in ``words`` every other position within
    ``window_size`` steps on either side contributes one pair. Pairs are
    emitted in order of center position, then context position.

    Args:
        words: Sequence of tokens.
        vocabulary: Mapping from token to integer id; must contain every token
            that ends up in a pair (a missing token raises ``KeyError``).
        window_size: Maximum distance between a center word and its context.

    Returns:
        A list of ``(center_id, context_id)`` tuples.
    """
    total = len(words)
    pairs = []
    for center_pos, center in enumerate(words):
        # Clamp the window to the sequence bounds instead of testing each offset.
        first = max(0, center_pos - window_size)
        stop = min(total, center_pos + window_size + 1)
        for context_pos in range(first, stop):
            if context_pos != center_pos:
                pairs.append((vocabulary[center], vocabulary[words[context_pos]]))
    return pairs

In [7]:
# Step 5: Define the Neural Network Model
class SkipGramModel(nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear layer.

    Given center-word indices, the model returns one score (logit) per
    vocabulary entry for each index.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: Number of rows in the embedding table and number of
                output scores per input index.
            embedding_dim: Width of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, center_word):
        """Return vocabulary-sized scores for each index in ``center_word``."""
        return self.output_layer(self.embeddings(center_word))


In [8]:
# Function to create batches
def create_batches(data, batch_size):
    """Shuffle ``data`` and split it into consecutive batches.

    The shuffle is applied to a shallow copy, so the caller's sequence keeps
    its original order and re-running the calling cell stays idempotent.
    Randomness comes from NumPy's global RNG (seed it with ``np.random.seed``
    for reproducible batches).

    Args:
        data: Sequence of training examples.
        batch_size: Maximum number of examples per batch; must be positive.

    Returns:
        A list of lists; every batch has ``batch_size`` items except possibly
        the last one. Empty ``data`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is not a positive number.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Copy first: np.random.shuffle works in place and would otherwise
    # reorder the list owned by the caller.
    shuffled = list(data)
    np.random.shuffle(shuffled)

    # Split the shuffled copy into batches
    return [shuffled[start:start + batch_size] for start in range(0, len(shuffled), batch_size)]


In [9]:
# Training the Model with Batching
def train(skip_gram_model, data, vocab_size, embedding_dim, batch_size=64, epochs=10, learning_rate=0.01):
    """Train a skip-gram model in place on (center, context) index pairs.

    The model is moved to the module-level ``device`` and optimised with Adam
    against a cross-entropy loss whose targets are the context-word indices.
    After each epoch the loss summed over all batches is printed.

    Args:
        skip_gram_model: Module mapping a LongTensor of center-word indices
            to one score per vocabulary entry.
        data: Sequence of ``(center_idx, context_idx)`` pairs.
        vocab_size: Unused; kept so existing calls keep working.
        embedding_dim: Unused; kept so existing calls keep working.
        batch_size: Number of pairs per optimisation step.
        epochs: Number of passes over ``data``.
        learning_rate: Learning rate passed to Adam.

    Returns:
        None. The model's parameters are updated in place.
    """
    # Move the model to the device (GPU or CPU) *before* building the
    # optimizer, as the torch.optim documentation recommends.
    skip_gram_model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(skip_gram_model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        total_loss = 0

        # Re-shuffle and re-batch every epoch so the optimizer does not see
        # the exact same batch composition and order on every pass.
        for batch in create_batches(data, batch_size):
            # Separate center words and context words in the batch
            center_ids = [pair[0] for pair in batch]
            context_ids = [pair[1] for pair in batch]

            # Build the index tensors directly on the target device
            center_batch = torch.tensor(center_ids, dtype=torch.long, device=device)
            context_batch = torch.tensor(context_ids, dtype=torch.long, device=device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            output = skip_gram_model(center_batch)

            # Compute loss
            loss = criterion(output, context_batch)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss}")

In [10]:

# Read the training corpus; load_text returns the whole file lower-cased.
file_path = 'sample_text.txt'  # Relative path to the corpus; replace with your own .txt file
text = load_text(file_path)

In [11]:
# Tokenize the text: punctuation is stripped and the result is split on whitespace
words = tokenize(text)

In [12]:
# Create the vocabulary (word -> id) and its inverse (id -> word)
vocabulary, idx_to_word = build_vocab(words)
# Number of distinct words; used to size the model's embedding and output layers
vocab_size = len(vocabulary)

In [13]:
# Prepare training data: (center id, context id) pairs for every word and
# each neighbour at most 2 positions away
data = prepare_data(words, vocabulary, window_size=2)

In [14]:
# Define the Skip-Gram model (freshly initialised; re-running this cell discards any training)
embedding_dim = 50  # Width of each word vector; adjust as needed
skip_gram_model = SkipGramModel(vocab_size, embedding_dim)

In [15]:
# Train the model with batching; train() prints the summed batch loss after each epoch
batch_size = 64  # Number of (center, context) pairs per optimisation step
train(skip_gram_model, data, vocab_size, embedding_dim, batch_size=batch_size, epochs=50, learning_rate=0.01)

Epoch 1, Loss: 17.392043113708496
Epoch 2, Loss: 15.375497341156006
Epoch 3, Loss: 14.320851802825928
Epoch 4, Loss: 13.572887420654297
Epoch 5, Loss: 12.954366445541382
Epoch 6, Loss: 12.481980562210083
Epoch 7, Loss: 12.137553453445435
Epoch 8, Loss: 11.87052297592163
Epoch 9, Loss: 11.650390386581421
Epoch 10, Loss: 11.470478057861328
Epoch 11, Loss: 11.329429864883423
Epoch 12, Loss: 11.216997146606445
Epoch 13, Loss: 11.124237298965454
Epoch 14, Loss: 11.046266794204712
Epoch 15, Loss: 10.980922937393188
Epoch 16, Loss: 10.9269540309906
Epoch 17, Loss: 10.882646560668945
Epoch 18, Loss: 10.846586227416992
Epoch 19, Loss: 10.817251443862915
Epoch 20, Loss: 10.792980432510376
Epoch 21, Loss: 10.772388935089111
Epoch 22, Loss: 10.754582166671753
Epoch 23, Loss: 10.7390878200531
Epoch 24, Loss: 10.725602626800537
Epoch 25, Loss: 10.713829517364502
Epoch 26, Loss: 10.703450202941895
Epoch 27, Loss: 10.694203853607178
Epoch 28, Loss: 10.685911655426025
Epoch 29, Loss: 10.678441286087036

In [16]:
# After training, move the model back to the CPU; later cells read the
# embedding weights as a NumPy array. As the last expression of the cell,
# the call also displays the model's layer summary.
skip_gram_model.cpu()

SkipGramModel(
  (embeddings): Embedding(32, 50)
  (output_layer): Linear(in_features=50, out_features=32, bias=True)
)

In [17]:
# Get Embeddings
# detach() drops the autograd graph without the legacy `.data` accessor, and
# cpu() makes the NumPy conversion work even if the model is still on the GPU.
word_embeddings = skip_gram_model.embeddings.weight.detach().cpu().numpy()
for word, idx in vocabulary.items():
    print(f"Word: {word}, Embedding: {word_embeddings[idx]}")

Word: the, Embedding: [-0.14251852 -0.94881225 -0.5413972  -1.2912344  -0.13615991 -0.97734463
  0.22281218 -1.1646893   0.41605696  0.5624783  -1.215951    0.5887805
  0.43272135 -0.46438175 -1.5950586   0.42580098 -0.26277116 -0.9383325
 -0.37033722  0.41089806  0.35435766  0.87726593 -0.00665058 -0.65032375
 -0.1435545   1.3600202  -0.5079986  -1.5001851   0.48574454 -0.08523705
 -0.01757985  0.7027134   0.687775    1.7696964  -0.3522051   0.9554862
 -0.24144818  0.12313724 -0.24975005  0.24052599 -1.318385   -1.3535063
 -0.26460585  1.0759817  -0.6620209  -0.4765183   1.3182598  -0.8869704
  1.6224129  -1.7740642 ]
Word: cat, Embedding: [ 2.1221316  -0.32849133  1.2511393  -0.38994083  0.96975464 -0.8020615
  0.8452099   1.6009694   0.1035975   0.98592174 -1.2350359   0.07670219
  0.628      -0.16801563  0.98632765 -2.680964   -0.12100766 -1.3112888
 -2.2745821   0.427327   -0.14534713  0.19691992  0.25935045 -0.8978313
  0.6531155  -0.13397418 -0.5384055   1.2730356  -0.68806404 -

In [18]:
# Sanity check: expect (vocab_size, embedding_dim), i.e. one row per vocabulary word
word_embeddings.shape

(32, 50)

In [22]:
# Compute the cosine similarity between two words
# Look up the vocabulary indices of the two words being compared
word1 = 'cat'
word2 = 'dog'
idx_word1 = vocabulary.get(word1)
idx_word2 = vocabulary.get(word2)

# Check if both words are in the vocabulary
if idx_word1 is None or idx_word2 is None:
    print("One or both words are not in the vocabulary.")
else:
    # Get the embeddings for the two words
    embedding_word1 = word_embeddings[idx_word1]
    embedding_word2 = word_embeddings[idx_word2]

    # Compute the cosine similarity. The result is deliberately NOT stored in
    # a variable called `cosine_similarity`: a function of that name is
    # defined further down, and re-running this cell would overwrite it.
    pair_similarity = np.dot(embedding_word1, embedding_word2) / (
        np.linalg.norm(embedding_word1) * np.linalg.norm(embedding_word2)
    )

    # Print the cosine similarity
    print("Cosine Similarity between",word1,"and",word2,":", pair_similarity)

Cosine Similarity between cat and dog : 0.41268414


In [26]:
# Function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First 1-D vector (NumPy array or array-like).
        vec2: Second 1-D vector of the same length.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``. If either vector has zero
        length the similarity is undefined; ``0.0`` is returned instead of
        NaN so callers can still sort and compare scores.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # Avoid the 0/0 division (NaN plus a RuntimeWarning) for zero vectors.
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# Function to get the nearest k words in the embedding space
def get_nearest_k_words(word, word_embeddings, vocabulary, idx_to_word, k=5):
    """Print the ``k`` vocabulary words most similar to ``word``.

    Similarity is the cosine similarity between embedding rows. The query
    word itself is excluded, and ties keep vocabulary order.

    Args:
        word: Query word.
        word_embeddings: Array indexed by vocabulary id, one embedding per row.
        vocabulary: Mapping from word to its row index in ``word_embeddings``.
        idx_to_word: Not used by this function.
        k: Number of neighbours to print.

    Returns:
        None. Results (or a not-in-vocabulary message) are printed.
    """
    # Unknown query word: report it and stop.
    if word not in vocabulary:
        print(f"'{word}' is not in the vocabulary.")
        return

    query_vector = word_embeddings[vocabulary[word]]

    # Score every other vocabulary entry against the query vector.
    scored = [
        (candidate, cosine_similarity(query_vector, word_embeddings[candidate_idx]))
        for candidate, candidate_idx in vocabulary.items()
        if candidate != word
    ]

    # Highest similarity first; the sort is stable, so ties stay in vocabulary order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top_matches = scored[:k]

    print(f"Nearest {k} words to '{word}':")
    for candidate, score in top_matches:
        print(f"{candidate}: {score:.4f}")

In [31]:
word = 'tree'  # Query word whose nearest neighbours are printed
k = 5  # Number of nearest neighbours to print
# Prints the k words with the highest cosine similarity to `word`
get_nearest_k_words(word, word_embeddings, vocabulary, idx_to_word, k)

Nearest 5 words to 'tree':
tall: 0.4421
trees: 0.2252
sang: 0.2089
flew: 0.1869
together: 0.1708
