In [54]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict
import random

class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_negative_samples):
        super(Word2Vec, self).__init__()
        self.embed_dim = embed_dim
        self.num_negative_samples = num_negative_samples
        
        # Embedding matrices: target (M) and context (C)
        self.target_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embed_dim)
        
        # Initialize embeddings uniformly
        self.init_emb()

    def init_emb(self):
        # Initialize embeddings using uniform distribution
        initrange = 0.5 / self.embed_dim
        self.target_embeddings.weight.data.uniform_(-initrange, initrange)
        self.context_embeddings.weight.data.uniform_(-initrange, initrange)

    def forward(self, target_word, context_word, negative_samples):
        target_embedding = self.target_embeddings(target_word)  # (batch_size, embed_dim)
        context_embedding = self.context_embeddings(context_word)  # (batch_size, embed_dim)
        neg_embeddings = self.context_embeddings(negative_samples).view(negative_samples.size(0), -1, self.embed_dim)  # (batch_size, neg_samples, embed_dim)

        pos_score = torch.mul(target_embedding, context_embedding)  # Element-wise multiplication
        pos_score = torch.sum(pos_score, dim=1)  # Sum across the embedding dimensions (batch_size)

        target_embedding = target_embedding.unsqueeze(2)  # Add an extra dimension
        neg_score = torch.bmm(neg_embeddings, target_embedding).squeeze(2)  # (batch_size, neg_samples)

        return pos_score, neg_score




In [55]:
def read_corpus(file_path):
    corpus = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            words = line.strip().replace('<s>', '').replace('</s>', '').split()
            if words:  # Only add non-empty sentences
                corpus.append(words)
    return corpus

# Read the corpus
file_path = 'data/La_Reine_Margot.tok'
corpus = read_corpus(file_path)
print("Sample Corpus: ", corpus[:5])


Sample Corpus:  [['le', 'lundi', ',', 'dix', '-', 'huitième', 'jour', 'du', 'mois', 'd’', 'août', '1572', ',', 'il', 'y', 'avait', 'grande', 'fête', 'au', 'louvre', '.'], ['les', 'fenêtres', 'de', 'la', 'vieille', 'demeure', 'royale', ',', 'ordinairement', 'si', 'sombres', ',', 'étaient', 'ardemment', 'éclairées', ';', 'les', 'places', 'et', 'les', 'rues', 'attenantes', ',', 'habituellement', 'si', 'solitaires', 'dès', 'que', 'neuf', 'heures', 'sonnaient', 'à', 'saint', '-', 'germain', '-', 'l’', 'auxerrois', ',', 'étaient', ',', 'quoiqu’', 'il', 'fût', 'minuit', ',', 'encombrées', 'de', 'populaire', '.'], ['tout', 'ce', 'concours', 'menaçant', ',', 'pressé', ',', 'bruyant', ',', 'ressemblait', ',', 'dans', 'l’', 'obscurité', ',', 'à', 'une', 'mer', 'sombre', 'et', 'houleuse', 'dont', 'chaque', 'flot', 'faisait', 'une', 'vague', 'grondante', ';', 'cette', 'mer', ',', 'épandue', 'sur', 'le', 'quai', ',', 'où', 'elle', 'se', 'dégorgeait', 'par', 'la', 'rue', 'des', 'fossés', '-', 'saint'

In [61]:
def build_vocab(corpus, min_count):
    word_counts = defaultdict(int)
    for sentence in corpus:
        for word in sentence:
            word_counts[word] += 1
            
    vocab = [word for word, count in word_counts.items() if count >= min_count]
    word_to_index = {word: i for i, word in enumerate(vocab)}
    index_to_word = {i: word for i, word in enumerate(vocab)}
    
    return vocab, word_to_index, index_to_word

# Build vocabulary
vocab, word_to_index, index_to_word = build_vocab(corpus, min_count=5)

print("Vocabulary Size:", len(vocab))


['le', ',', 'dix', '-', 'jour', 'du', 'mois', 'd’', 'août', '1572', 'il', 'y', 'avait', 'grande', 'fête', 'au', 'louvre', '.', 'les', 'fenêtres', 'de', 'la', 'vieille', 'demeure', 'royale', 'ordinairement', 'si', 'sombres', 'étaient', ';', 'et', 'rues', 'dès', 'que', 'neuf', 'heures', 'sonnaient', 'à', 'saint', 'germain', 'l’', 'auxerrois', 'quoiqu’', 'fût', 'minuit', 'tout', 'ce', 'menaçant', 'pressé', 'ressemblait', 'dans', 'obscurité', 'une', 'sombre', 'dont', 'chaque', 'flot', 'faisait', 'vague', 'cette', 'sur', 'quai', 'où', 'elle', 'se', 'par', 'rue', 'des', 'fossés', 'venait', 'battre', 'son', 'pied', 'murs', 'hôtel', 'bourbon', 'qui', 's’', 'élevait', 'en', 'face', 'malgré', 'même', 'peut', 'être', 'cause', 'quelque', 'chose', 'peuple', 'car', 'ne', 'doutait', 'pas', 'laquelle', 'comme', 'n’', 'était', 'autre', 'remise', 'serait', 'cœur', 'cour', 'noces', 'madame', 'marguerite', 'valois', 'fille', 'roi', 'henri', 'ii', 'sœur', 'charles', 'ix', 'avec', 'navarre', 'effet', 'matin

In [57]:
def train_model(model, corpus, word_to_index, num_epochs, learning_rate, context_size, num_negative_samples, batch_size):
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    loss_fn = nn.BCEWithLogitsLoss()  # For binary classification (positive vs. negative samples)

    for epoch in range(num_epochs):
        total_loss = 0
        for sentence in corpus:
            # Generate training examples for each target word in the sentence
            for i, target_word in enumerate(sentence):
                if target_word not in word_to_index:
                    continue

                target_idx = torch.tensor([word_to_index[target_word]], dtype=torch.long)

                # Context words
                context_range = list(range(max(0, i - context_size), i)) + list(range(i + 1, min(len(sentence), i + context_size + 1)))
                context_words = [sentence[j] for j in context_range if sentence[j] in word_to_index]

                if not context_words:
                    continue

                # For each context word, perform forward and backward pass
                for context_word in context_words:
                    context_idx = torch.tensor([word_to_index[context_word]], dtype=torch.long)

                    # Negative sampling
                    neg_samples = torch.tensor([random.choice(list(word_to_index.values())) for _ in range(num_negative_samples)], dtype=torch.long)

                    # Forward pass
                    pos_score, neg_score = model(target_idx, context_idx, neg_samples)

                    # Labels: 1 for positive, 0 for negative
                    pos_labels = torch.ones(pos_score.size(), dtype=torch.float32)
                    neg_labels = torch.zeros(neg_score.size(), dtype=torch.float32)

                    # Compute loss
                    pos_loss = loss_fn(pos_score, pos_labels)
                    neg_loss = loss_fn(neg_score, neg_labels)
                    loss = pos_loss + neg_loss

                    # Backward pass
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    total_loss += loss.item()

        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}')


In [58]:
# Hyperparameters
embedding_dim = 100
context_size = 2
num_negative_samples = 5
learning_rate = 0.01
num_epochs = 5

# Initialize model
vocab_size = len(vocab)
model = Word2Vec(vocab_size, embedding_dim, num_negative_samples)



# Train the model
train_model(model, corpus, word_to_index, num_epochs, learning_rate, context_size, num_negative_samples, batch_size=64)


RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [5, 100] but got: [1, 100].

In [42]:
def save_embeddings(model, index_to_word, file_name):
    with open(file_name, 'w', encoding='utf-8') as f:
        for idx, word in index_to_word.items():
            embedding = model.target_embeddings.weight.data[idx].numpy()
            embedding_str = ' '.join(map(str, embedding))
            f.write(f"{word} {embedding_str}\n")

# Save embeddings to a file
save_embeddings(model, index_to_word, 'embeddings.txt')


In [72]:
import torch
import torch.nn.functional as F
from collections import defaultdict
import random

class Word2Vec(torch.nn.Module):
    def __init__(self, vocab_size, emb_dim, context_size, num_neg_samples, lr, min_count, num_epochs):
        super(Word2Vec, self).__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.context_size = context_size
        self.num_neg_samples = num_neg_samples
        self.lr = lr
        self.min_count = min_count
        self.num_epochs = num_epochs

        self.word_counts = defaultdict(int)
        self.vocab = []
        self.word_to_index = {}

        self.embeddings = torch.nn.Embedding(vocab_size, emb_dim)
        self.context_embeddings = torch.nn.Embedding(vocab_size, emb_dim)

    def read_corpus(self, file_path):
        corpus = []
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Remove <s> tags and split into words
                words = line.strip().replace('<s>', '').replace('</s>', '').split()
                if words:  # Only add non-empty sentences
                    corpus.append(words)
        return corpus

    def build_vocab(self, corpus):
        for sentence in corpus:
            for word in sentence:
                self.word_counts[word] += 1
        self.vocab = [word for word, count in self.word_counts.items() if count >= self.min_count]
        self.word_to_index = {word: i for i, word in enumerate(self.vocab)}

        print(f"Vocabulary size: {len(self.vocab)}")
        return self.vocab

    def forward(self, target_word, context_word, neg_samples):
    # Compute positive context
        target_emb = self.embeddings(target_word)
        context_emb = self.context_embeddings(context_word)
        pos_score = torch.sum(target_emb * context_emb, dim=1)

        # Compute negative samples
        neg_emb = self.context_embeddings(neg_samples)
        neg_emb = neg_emb.unsqueeze(1)  # Add a dimension to make it 3D
        target_emb = target_emb.unsqueeze(1)  # Add a dimension to make it 3D
        neg_score = torch.bmm(neg_emb, target_emb.transpose(1, 2)).squeeze(2)

        # Compute the loss
        pos_loss = -torch.log(torch.sigmoid(pos_score)).mean()
        neg_loss = -torch.log(1 - torch.sigmoid(neg_score)).mean()
        loss = pos_loss + neg_loss
        return loss

    def train(self, corpus):
        self.build_vocab(corpus)
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)

        for _ in range(self.num_epochs):
            for sentence in corpus:
                for i, target_word in enumerate(sentence):
                    if target_word not in self.word_to_index:
                        continue
                    
                    context_words = sentence[max(0, i-self.context_size):i] + sentence[i+1:min(len(sentence), i+self.context_size+1)]
                    context_words = [w for w in context_words if w in self.word_to_index]
                    
                    for context_word in context_words:
                        neg_samples = torch.tensor(self.get_negative_samples(len(context_words)))
                        loss = self.forward(
                            torch.tensor([self.word_to_index[target_word]]),
                            torch.tensor([self.word_to_index[context_word]]),
                            neg_samples
                        )
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

        print("Training completed.")

    def get_negative_samples(self, num_samples):
        word_freq = torch.tensor([self.word_counts[word] for word in self.vocab])
        word_freq = word_freq ** 0.75
        word_prob = word_freq / word_freq.sum()
        
        negative_samples = []
        while len(negative_samples) < num_samples:
            sample = torch.multinomial(word_prob, 1, replacement=True).item()
            if sample < self.vocab_size and self.vocab[sample] in self.word_to_index:
                negative_samples.append(self.word_to_index[self.vocab[sample]])
                if len(negative_samples) == num_samples:
                    break
        
        return negative_samples

    def save_embeddings(self, filename):
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"{len(self.vocab)} {self.emb_dim}\n")
            for word, idx in self.word_to_index.items():
                embedding = ' '.join(map(str, self.embeddings.weight[idx].tolist()))
                f.write(f"{word} {embedding}\n")

        print(f"Embeddings saved to {filename}")

# Example usage
file_path = 'data/La_Reine_Margot.tok'
w2v = Word2Vec(vocab_size=1000, emb_dim=100, context_size=2, num_neg_samples=10, lr=0.001, min_count=5, num_epochs=5)
corpus = w2v.read_corpus(file_path)
w2v.build_vocab(corpus)
w2v.train(corpus)
w2v.save_embeddings("embeddings.txt")

Vocabulary size: 3288
Vocabulary size: 5021


RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [3, 100] but got: [1, 100].

: 

data/La_Reine_Margot.tok