In [6]:
import pandas as pd
 
def get_wordlist(lang, column=None):
    """Load one text column of a parquet dataset as a flat list of lowercase words.

    Args:
        lang: Language code used to build the dataset path
            ``./datasets/CL_{lang}-en.parquet``.
        column: Name of the text column to read. Defaults to ``lang``.

    Returns:
        list: The whitespace-separated, lower-cased tokens of every row of
        ``column``, in row order. Rows that are missing or that contain no
        tokens contribute nothing to the result.
    """
    if column is None:  # set default value for column
        column = lang

    dataset_path = f'./datasets/CL_{lang}-en.parquet'
    df = pd.read_parquet(dataset_path)

    # Preprocess the text: lower-case, split on whitespace, one token per row.
    tokens = df[column].str.lower().str.split().explode()

    # explode() emits NaN for missing rows and for rows that split to an empty
    # list; drop those so the result holds only real tokens.
    wordlist = tokens.dropna().tolist()

    return wordlist


# Build one lowercase word list per language. The English list is read from
# the 'en' column of the French-English dataset.
words_fr, words_en = get_wordlist('fr'), get_wordlist('fr', column='en')
words_es, words_de, words_it = map(get_wordlist, ('es', 'de', 'it'))

import itertools

# Concatenate the per-language word lists into a single corpus, in the order
# French, Italian, German, Spanish, English.
words_all = list(
    itertools.chain.from_iterable(
        (words_fr, words_it, words_de, words_es, words_en)
    )
)

In [13]:
# Spot check: print the ten consecutive entries of words_all starting at index 9000010.
print(words_all[9000010:9000020])

['empresas', 'e', 'individuos', 'se', 'comportan', 'y', 'actúan', 'entre', 'sí.', 'giulio']


In [15]:
import torch

# Word2Vec Dataset Class
class W2VData(torch.utils.data.Dataset):
    """Dataset of (context, target) token-id pairs for word2vec training.

    Every distinct entry of ``wordlist`` is assigned an integer id in order of
    first appearance. For each position that has ``window_size`` neighbours on
    both sides, one sample is stored: the ids of the ``2 * window_size``
    surrounding tokens (context) and the id of the token itself (target).

    Attributes:
        wordlist: The token sequence passed to the constructor.
        word2idx: Dict mapping each distinct token to its integer id.
        idx2word: List mapping each integer id back to its token.
        data: List of ``(context_ids, target_id)`` tuples.
    """

    def __init__(self, wordlist, window_size=2):
        """Build the vocabulary and the training pairs.

        Args:
            wordlist: Sequence of hashable tokens (e.g. strings).
            window_size: Number of neighbours taken on each side of a target.
        """
        self.wordlist = wordlist
        # torch.tensor cannot hold strings, so tokens are encoded as integer
        # ids; ids follow first-appearance order to stay deterministic.
        self.word2idx = {}
        for word in wordlist:
            self.word2idx.setdefault(word, len(self.word2idx))
        self.idx2word = list(self.word2idx)
        self.data = []
        # Create training samples using the window size provided
        self.create_tuples(window_size)

    def create_tuples(self, window_size):
        """Append one (context_ids, target_id) pair per full-window position to ``self.data``."""
        tokens = [self.word2idx[word] for word in self.wordlist]
        for i, target in enumerate(tokens):
            context = tokens[max(0, i - window_size):i] + tokens[i + 1:i + window_size + 1]
            # Only consider cases where context is of the defined window size
            # (positions too close to either end of the sequence are skipped).
            if len(context) != 2 * window_size:
                continue
            self.data.append((context, target))

    def __len__(self):
        """Return the number of stored (context, target) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as ``(context_tensor, target_tensor)`` of token ids."""
        context, target = self.data[idx]
        return torch.tensor(context), torch.tensor(target)

In [17]:
# SkipGram Model
class SkipGram(torch.nn.Module):
    """Skip-gram model: an embedding lookup followed by a linear projection to the vocabulary."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the layers.

        Args:
            vocab_size: Number of distinct token ids.
            embedding_dim: Size of each embedding vector.
        """
        super().__init__()
        # Define embeddings layer: maps word indices to embedding vectors
        self.embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
        # Linear layer: maps a single embedding to vocabulary size (used for predicting the context words)
        self.linear = torch.nn.Linear(embedding_dim, vocab_size)

    def forward(self, target_word):
        """Return log-probabilities over the vocabulary for each input token id.

        Args:
            target_word: Integer tensor of token ids, of any shape (a scalar
                id, a 1-D batch, or a higher-rank tensor of ids).

        Returns:
            Tensor of shape ``target_word.shape + (vocab_size,)`` whose last
            axis holds log-probabilities.
        """
        # Get the embedding of the target word
        embeds = self.embeddings(target_word)
        # Pass the embedding through the linear layer
        out = self.linear(embeds)
        # Normalise over the vocabulary axis, which is always the last one;
        # a fixed dim=1 would fail for a scalar id and pick the wrong axis
        # for inputs with more than one dimension.
        log_probs = torch.nn.functional.log_softmax(out, dim=-1)
        return log_probs
