In [28]:
import pandas as pd

 
def read_and_preprocess(lang, sample_fraction=0.5):
    dataset_path = f'./datasets/CL_{lang}-en.parquet'
    df_with_en = pd.read_parquet(dataset_path)
    df_lang = df_with_en[[lang]].rename(columns={lang: 'text'})
    df_lang['language'] = lang
    # Randomly sample a fraction of the data
    df_lang = df_lang.sample(frac=sample_fraction, random_state=42)  # You can set a random seed for reproducibility

    df_lang = df_lang.dropna(subset=['text'])
    
#     df_lang['text'] = df_lang['text'].apply(lambda x: ' '.join(x.lower().split()))
        # Explode paragraphs into separate words using whitespace-based tokenization
    df_lang['text'] = df_lang['text'].apply(lambda x: x.lower().split())

    return df_lang 


In [29]:
def create_context_target_pairs(words, context_window_size=2):
    pairs = []
    
    for i, target_word in enumerate(words):
        # Define a context window around the target word
        start = max(0, i - context_window_size)
        end = min(len(words), i + context_window_size + 1)
        
        # Extract context words within the window
        context_words = [words[j] for j in range(start, end) if j != i]
        
        # Create context-target pairs
        for context_word in context_words:
            pairs.append((target_word, context_word))
    

    return pairs

In [30]:
print("processing frech")
df_fr = read_and_preprocess('fr')
print("processed french")

import numpy as np

print("getting al words")

all_words_fr = df_fr['text'].sum()  # Combine all words into a single list

print("got all words")

# Split the list of words into smaller chunks
chunk_size = 1000
chunks = [all_words_fr[i:i+chunk_size] for i in range(0, 10000, chunk_size)]

print("chunks ready")
context_target_pairs_fr = []

# Process each chunk and create pairs
for chunk in chunks:
    pairs = create_context_target_pairs(chunk, context_window_size=2)
    context_target_pairs_fr.extend(pairs)
    print("chunk done")

# Display a few pairs as an example
print(context_target_pairs_fr[:10])



processing frech
processed french
getting al words
got all words
chunks ready
chunk done
chunk done
chunk done
chunk done
chunk done
chunk done
chunk done
chunk done
chunk done
chunk done
[('très', 'peu'), ('très', 'de'), ('peu', 'très'), ('peu', 'de'), ('peu', 'facteurs'), ('de', 'très'), ('de', 'peu'), ('de', 'facteurs'), ('de', 'ont'), ('facteurs', 'peu')]


In [31]:
import torch
import torch.nn as nn

class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target):
        embedded = self.embeddings(target)
        output = self.linear(embedded)
        return output

In [32]:
import torch
import torch.optim as optim
import torch.nn as nn


# Build a vocabulary and word-to-index mapping
vocab = set(all_words_fr)  # Assuming 'all_words_fr' contains all unique words
word_to_index = {word: i for i, word in enumerate(vocab)}

# Define hyperparameters
embedding_dim = 100
vocab_size = len(vocab)  # Replace with the actual vocabulary size
learning_rate = 0.001
num_epochs = 10

# Create the Word2Vec model
model = Word2Vec(vocab_size, embedding_dim)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
# for epoch in range(num_epochs):
#     total_loss = 0.0
    
#     for target, context in context_target_pairs_fr:
#         # Convert target and context words to indices
#         target_index = word_to_index[target]
#         context_index = word_to_index[context]
        
#         # Zero the gradients
#         optimizer.zero_grad()
        
#         # Forward pass
#         output = model(torch.tensor([target_index], dtype=torch.long))
        
#         # Calculate loss
#         loss = criterion(output, torch.tensor([context_index], dtype=torch.long))
        
#         # Backpropagation
#         loss.backward()
#         optimizer.step()
        
#         total_loss += loss.item()
    
#     print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss}")


In [33]:
# Define batch size
batch_size = 64

# Training loop
for epoch in range(num_epochs):
    total_loss = 0.0
    batch_losses = []

    for i in range(0, len(context_target_pairs_fr), batch_size):
        batch = context_target_pairs_fr[i:i+batch_size]
        
        # Prepare inputs and targets for the batch
        targets, contexts = zip(*batch)
        target_indices = torch.tensor([word_to_index[target] for target in targets], dtype=torch.long)
        context_indices = torch.tensor([word_to_index[context] for context in contexts], dtype=torch.long)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(target_indices)
        
        # Calculate loss
        loss = criterion(outputs, context_indices)
        
        # Backpropagation
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        batch_losses.append(loss.item())
    
    # Print average loss for the epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Avg. Loss: {total_loss / len(batch_losses)}")


Epoch 1/10, Avg. Loss: 9.906805409240723
Epoch 2/10, Avg. Loss: 8.172392933654786
Epoch 3/10, Avg. Loss: 7.212105236816406
Epoch 4/10, Avg. Loss: 6.5407790786743165
Epoch 5/10, Avg. Loss: 6.056113903808594
Epoch 6/10, Avg. Loss: 5.6935494606018064
Epoch 7/10, Avg. Loss: 5.4135796133041385
Epoch 8/10, Avg. Loss: 5.191718505096436
Epoch 9/10, Avg. Loss: 5.012246129989624
Epoch 10/10, Avg. Loss: 4.864613343048096


In [48]:
import torch
import torch.nn.functional as F


# Assuming you have a list of words in your vocabulary
your_vocab = all_words_fr

# Retrieve word vectors for specific words
word_vectors_dict = {}
for word in your_vocab:
    # Convert the word to its corresponding index in the vocabulary
    word_index = torch.tensor([word_to_index[word]], dtype=torch.long)
    
    # Pass the word index through your Word2Vec model to get the word vector
    word_vector = model.embeddings(word_index)
    
    # Convert the word vector to a numpy array
    word_vector = word_vector.squeeze().detach().numpy()
    
    # Store the word vector in a dictionary
    word_vectors_dict[word] = word_vector

# # Perform vector arithmetic
# roi_vector = word_vectors_dict['roi']
# homme_vector = word_vectors_dict['homme']
# femme_vector = word_vectors_dict['femme']
# result_vector = roi_vector - homme_vector + femme_vector


# Calculate cosine similarity between a vector and a list of vectors while excluding input words
def find_closest_words_excluding_input(vector, vectors_dict, input_words, num_results=5):
    # Convert the input vector to a PyTorch tensor (if it's not already)
    vector = torch.tensor(vector, dtype=torch.float32)

    # Calculate cosine similarities between the input vector and all vectors in the dictionary
    similarities = {word: F.cosine_similarity(vector, torch.tensor(vec), dim=0).item()
                    for word, vec in vectors_dict.items()}

    # Exclude the input words from the closest words list
    closest_words = [word for word in sorted(similarities, key=similarities.get, reverse=True)
                     if word not in input_words]

    # Take the top 'num_results' closest words (excluding input words)
    closest_words = closest_words[:num_results]

    return closest_words

# ...



True


In [53]:

def find_analogy(word1, word2, word3, word_vectors_dict, input_words, num_results=5):
    # Get the word vectors for the input words
    word_vector1 = word_vectors_dict.get(word1)
    word_vector2 = word_vectors_dict.get(word2)
    word_vector3 = word_vectors_dict.get(word3)

    # Check if any of the input words are not in the vocabulary
    if any(vector is None for vector in [word_vector1, word_vector2, word_vector3]):
        return None

    # Calculate the result vector using vector arithmetic
    result_vector = word_vector2 - word_vector1 + word_vector3

    # Find closest words to the result vector (excluding input words)
    closest_words = find_closest_words_excluding_input(result_vector, word_vectors_dict, input_words, num_results)

    return closest_words

# Example usage:
input_words = ['paris', 'france', 'rome']
result = find_analogy('paris', 'france', 'rome', word_vectors_dict, input_words, num_results=5)

if result is not None:
    print(f"Words: {result}")
else:
    print("One or more input words not found in the vocabulary.")

Words: ["d'altitude", 'dire.', 'insuline', 'fonds', "d'imiter"]


In [50]:
# Print the closest words (excluding input words)
print("vieux" in all_words_fr)
print("jeune" in all_words_fr)

True
True


In [56]:
petit_vector = word_vectors_dict['france']
grand_vector = word_vectors_dict['paris']
vieux_vector = word_vectors_dict['italie']

# Calculate the word vector for 'jeune' (young) as 'grand - petit + vieux'
jeune_vector = grand_vector - petit_vector + vieux_vector

# Find closest words to the 'jeune' vector
closest_words_jeune = find_closest_words_excluding_input(jeune_vector, word_vectors_dict, ['petit', 'grand', 'vieux'], num_results=5)

# Print the closest words to 'jeune'
print(f"Words similar to 'jeune': {closest_words_jeune}")

Words similar to 'jeune': ['italie', 'paris', 'awareness', "d'information.", 'japon;']


In [57]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define your language detection model
class LanguageDetectionModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LanguageDetectionModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input):
        embedded = self.embedding(input)
        output = self.fc(embedded.mean(dim=1))  # Example: Average word embeddings
        return self.softmax(output)

# Define your training loop
def train_language_detection_model(model, train_loader, criterion, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")

# Load your word embeddings and prepare data
# (Assuming you have word embeddings in 'word_vectors_dict', train_loader, criterion, optimizer, and labels)

# Initialize and train the language detection model
input_dim = embedding_dim  # Adjust based on your word embedding dimension
hidden_dim = 256  # Adjust based on your architecture
output_dim = num_languages  # Adjust based on the number of languages
model = LanguageDetectionModel(input_dim, hidden_dim, output_dim)
train_language_detection_model(model, train_loader, criterion, optimizer, num_epochs)


NameError: name 'num_languages' is not defined