In [1]:
import random
import torch
import numpy as np

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
set_seed()


In [2]:
context_size = 5
embedding_dim = 100
hidden_dim = 128
num_epochs = 15
learning_rate = 0.001
batch_size = 100

vocab_size = 2908




In [3]:
from collections import Counter

def replace_rare_words(text_file, output_file, threshold=5):
    # Read the text file
    with open(text_file, 'r') as f:
        text = f.read().split()
    
    # Count the frequency of each word
    word_counts = Counter(text)
    
    # Replace words that appear less than the threshold with <unk>
    modified_text = [word if word_counts[word] >= threshold else '<unk>' for word in text]
    
    # Write the modified text to the output file
    with open(output_file, 'w') as f:
        f.write(' '.join(modified_text))

# Example usage
replace_rare_words('Le_comte_de_Monte_Cristo.train.tok', 'Le_comte_de_Monte_Cristo.train.100.unk5.tok')

In [4]:
from Vocab import Vocab

def get_word_and_next_k_indexes(text, vocab_dict, k=3):
    with open(text, 'r') as f:
        words = f.read().split()
    
    indexed_data = []
    
    for i in range(len(words) - k):
        temp = []
        current_word_index = vocab_dict.get(words[i], vocab_dict['<unk>'])
        
        next_k_indexes = [vocab_dict.get(words[i + j + 1], vocab_dict['<unk>']) for j in range(k)]
        temp = next_k_indexes
        temp.insert(0, current_word_index)
        indexed_data.append(temp)
    
    return indexed_data

def transform_to_embeddings(data, embeddings_file):
    vocab = Vocab(emb_filename=embeddings_file)
    
    res = []
    for i in range(len(data)):
        res.append([vocab.get_emb_torch(word) for word in data[i]])
    return res




In [5]:
from Vocab import Vocab
with open("embeddings-word2vecofficial.train.unk5.txt", 'r') as f:
    text = f.read().split()

vocab = Vocab(emb_filename="embeddings-word2vecofficial.train.unk5.txt")
print(vocab.get_emb('the'))
vocab_dict = vocab.get_vocab_dict()
print(vocab_dict)
result = get_word_and_next_k_indexes("Le_comte_de_Monte_Cristo.train.100.unk5.tok", vocab_dict, k=context_size)
print(result[:100])
print(len(result))
result_tensor = torch.tensor(result, dtype=torch.long)
print(result_tensor.shape)

######""
embeddings = transform_to_embeddings(result, "embeddings-word2vecofficial.train.unk5.txt")
print(embeddings[:2]) #OKKKKKKKKKKK on l'a!!!!

None
{'</s>': 0, '<s>': 1, '<unk>': 2, ',': 3, 'de': 4, '.': 5, 'le': 6, 'et': 7, 'la': 8, 'à': 9, 'il': 10, '—': 11, 'l’': 12, '-': 13, 'que': 14, 'un': 15, 'vous': 16, ';': 17, 'en': 18, 'd’': 19, 'les': 20, 'qui': 21, 'une': 22, 'je': 23, 'qu’': 24, 'est': 25, 'ce': 26, '?': 27, 'pas': 28, 'son': 29, 'dit': 30, '!': 31, 'du': 32, 'était': 33, 'dans': 34, 'ne': 35, 's’': 36, 'lui': 37, 'se': 38, 'au': 39, 'avait': 40, 'pour': 41, 'des': 42, 'n’': 43, 'mais': 44, 'sur': 45, 'on': 46, 'c’': 47, ':': 48, 'dantès': 49, 'comme': 50, 'plus': 51, 'sa': 52, 'cette': 53, 'avec': 54, 'a': 55, 'tout': 56, 'bien': 57, 'par': 58, 'si': 59, 'nous': 60, 'homme': 61, 'ses': 62, 'me': 63, 'j’': 64, 'mon': 65, 'deux': 66, 'y': 67, 'elle': 68, 'même': 69, 'franz': 70, 'moi': 71, 'ces': 72, 'monsieur': 73, 'ai': 74, 'ou': 75, 'être': 76, 'sans': 77, 'fait': 78, 'donc': 79, 'm’': 80, 'où': 81, 'votre': 82, 'oui': 83, 'cela': 84, 'faire': 85, 'jeune': 86, 'ils': 87, 'encore': 88, 'morrel': 89, 'villefort'

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultilayerPerceptron(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size=3, hidden_dim=128):
        """
        Args:
        - vocab_size (int): Taille du vocabulaire (|V|).
        - embedding_dim (int): Dimension des plongements de mots (d).
        - context_size (int): Nombre de mots de contexte (k).
        - hidden_dim (int): Dimension de la couche cachée (dh).
        """
        super(MultilayerPerceptron, self).__init__()
        
        # Taille de la couche d'entrée (dx = k * d)
        self.input_dim = context_size * embedding_dim
        
        # Embedding layer pour transformer les indices de mots en vecteurs
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Matrice W et vecteur b1 pour la transformation de la couche cachée
        self.fc1 = nn.Linear(self.input_dim, hidden_dim)  # h' = xW + b1
        
        # Matrice U et vecteur b2 pour la transformation de la couche de sortie
        self.fc2 = nn.Linear(hidden_dim, vocab_size)  # y' = hU + b2

    def forward(self, x):
        # x est une séquence d'indices de mots (batch_size, context_size)
        
        # Transformer les indices en embeddings
        x = self.embedding(x)  # (batch_size, context_size, embedding_dim)
        
        # Aplatir la dimension de contexte pour former la couche d'entrée
        x = x.view(x.size(0), -1)  # (batch_size, context_size * embedding_dim)
        
        # Calcul de la couche cachée avec ReLU
        h = F.relu(self.fc1(x))  # (batch_size, hidden_dim)
        
        # Calcul de la couche de sortie avec Softmax pour obtenir une distribution de probabilité
        y = F.softmax(self.fc2(h), dim=1)  # (batch_size, vocab_size)
        
        return y

# Exemple d'instanciation du modèle


# Exemple d'usage
model = MultilayerPerceptron(vocab_size, embedding_dim, context_size, hidden_dim)
#output = model(torch.randint(0, 2908, (32, 4)))  # 32 est le batch_size et 3 est le nombre de mots en contexte


In [7]:
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

def train_model(model, data, vocab, epochs, batch_size, learning_rate, context_size):
    """
    Entraîne le modèle en utilisant l'entropie croisée comme fonction de perte.

    Args:
    - model (nn.Module): Le modèle neuronal à entraîner.
    - data (List[List[int]]): Données d'entraînement, liste de séquences d'indices de mots.
    - vocab (Vocab): Instance de la classe Vocab pour le vocabulaire.
    - epochs (int): Nombre d'époques pour l'entraînement.
    - batch_size (int): Taille du lot.
    - learning_rate (float): Taux d'apprentissage.
    - context_size (int): Nombre de mots de contexte.

    Returns:
    - nn.Module: Le modèle entraîné.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Training on {device}")

    model.to(device)

    # Préparation des données
    data = torch.tensor(data, dtype=torch.long)
    inputs, targets = data[:, :-1], data[:, -1]  # Dernier mot de chaque séquence est la cible
    dataset = TensorDataset(inputs, targets)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Définition de la fonction de perte (entropie croisée) et de l'optimiseur
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Boucle d'entraînement
    for epoch in range(epochs):
        model.train()  # Mise en mode entraînement
        total_loss = 0
        
        for batch_inputs, batch_targets in train_loader:
            batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

            optimizer.zero_grad()  # Remise à zéro des gradients
            
            # Calcul de la sortie du modèle
            outputs = model(batch_inputs)  # Prédictions du modèle
            
            # Calcul de la perte (entropie croisée)
            loss = criterion(outputs, batch_targets)
            total_loss += loss.item()
            
            # Rétropropagation et optimisation
            loss.backward()
            optimizer.step()
        
        # Affichage de la perte moyenne par époque
        average_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {average_loss:.4f}")

    return model
#stacked_embeddings = [torch.stack(embedding) for embedding in embeddings]

# Convert the list of stacked tensors to a single tensor
#embeddings_tensor = torch.stack(stacked_embeddings)

#print(embeddings_tensor.shape)






# Create random indices for training data
randomized_embeddings = torch.randint(0, len(embeddings), (1000, 4)).tolist()

# Train the model with randomized embeddings
trained_model = train_model(model, result, vocab, epochs=5, batch_size=32, learning_rate=0.001, context_size=5)

Training on cuda
Epoch 1/5, Loss: 7.6296
Epoch 2/5, Loss: 7.6266
Epoch 3/5, Loss: 7.6264
Epoch 4/5, Loss: 7.6261
Epoch 5/5, Loss: 7.6238


In [8]:
def save_model(model, path):
    torch.save(model.state_dict(), path)
    
def load_model(path, vocab_size, embedding_dim, hidden_dim):
    model = MultilayerPerceptron(vocab_size, embedding_dim, hidden_dim)
    model.load_state_dict(torch.load(path))
    return model




In [9]:
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

def calculate_perplexity(model, data_loader):
    model.eval()
    total_log_prob = 0
    total_words = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            outputs = outputs.view(-1, outputs.size(-1))  # Reshape outputs to (batch_size * sequence_length, vocab_size)
            targets = targets.view(-1)  # Reshape targets to (batch_size * sequence_length)
            log_prob = torch.log(F.softmax(outputs, dim=-1))
            log_prob = log_prob[range(len(targets)), targets]
            total_log_prob += log_prob.sum().item()
            total_words += len(targets)
    perplexity = torch.exp(torch.tensor(-total_log_prob / total_words))
    print("Perplexity:", perplexity.item())
    return perplexity.item()

vocab_size = 2908
embedding_dim = 100
hidden_dim = 128
batch_size= 32
data = torch.tensor(result, dtype=torch.long)
inputs, targets = data[:, :-1], data[:, -1]  # Dernier mot de chaque séquence est la cible
dataset = TensorDataset(inputs, targets)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

print(calculate_perplexity(model, data_loader))




Perplexity: 2058.09033203125
2058.09033203125


In [None]:
import torch
import torch.nn.functional as F


def generate_text(model, seed_text, vocab_dict, idx_to_word, max_len=50, context_size=3):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    generated_text = seed_text.copy()
    input_seq = torch.tensor([vocab_dict.get(word, vocab_dict['<unk>']) for word in seed_text], dtype=torch.long).unsqueeze(0).to(device)
    
    # Ensure input_seq has the correct context_size
    if input_seq.size(1) < context_size:
        padding = torch.zeros((1, context_size - input_seq.size(1)), dtype=torch.long).to(device)
        input_seq = torch.cat([padding, input_seq], dim=1)
    else:
        input_seq = input_seq[:, -context_size:]
    
    taille = 0
    for _ in range(max_len):
        with torch.no_grad():
            output = model(input_seq)
            output = F.softmax(output[0, :], dim=-1)  # Apply softmax to get probabilities
            next_word_idx = torch.multinomial(output, 1).item()
            next_word = idx_to_word.get(next_word_idx, '<unk>')
            generated_text.append(next_word)
            input_seq = torch.cat([input_seq, torch.tensor([[next_word_idx]], dtype=torch.long).to(device)], dim=1)
            input_seq = input_seq[:, -context_size:]  # Keep only the last context_size elements
            taille+=1
            if next_word == '<end>':
                break
    
    return (' '.join(generated_text),taille)

# Example usage (assuming model, vocab_dict, and idx_to_word are defined)

idx_to_word = {idx: word for word, idx in vocab_dict.items()}


# Ensure the seed text is correctly encoded
seed_text = ["<start>"]
print(generate_text(model, seed_text, vocab_dict, idx_to_word, max_len=50, context_size=3))
print(generate_text(model, seed_text, vocab_dict, idx_to_word, max_len=50, context_size=3))

print(generate_text(model, seed_text, vocab_dict, idx_to_word, max_len=50, context_size=3))

print(generate_text(model, seed_text, vocab_dict, idx_to_word, max_len=50, context_size=3))

# Generate multiple sentences
for _ in range(5):
    generated_sentence, length = generate_text(model, seed_text, vocab_dict, idx_to_word, max_len=50, context_size=context_size)
    print(f"Generated Sentence: {generated_sentence}")
    print(f"Length: {length}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x300 and 500x128)

: 

In [None]:
from torch.utils.data import DataLoader, TensorDataset

import torch.optim as optim

def train_model(model, data, vocab, epochs, batch_size, learning_rate, context_size):
    """
    Entraîne le modèle en utilisant l'entropie croisée comme fonction de perte.

    Args:
    - model (nn.Module): Le modèle neuronal à entraîner.
    - data (List[List[int]]): Données d'entraînement, liste de séquences d'indices de mots.
    - vocab (Vocab): Instance de la classe Vocab pour le vocabulaire.
    - epochs (int): Nombre d'époques pour l'entraînement.
    - batch_size (int): Taille du lot.
    - learning_rate (float): Taux d'apprentissage.
    - context_size (int): Nombre de mots de contexte.

    Returns:
    - nn.Module: Le modèle entraîné.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Training on {device}")

    model.to(device)

    # Préparation des données
    data = torch.tensor(data, dtype=torch.long)
    inputs, targets = data[:, :-1], data[:, -1]  # Dernier mot de chaque séquence est la cible
    dataset = TensorDataset(inputs, targets)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Définition de la fonction de perte (entropie croisée) et de l'optimiseur
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Boucle d'entraînement
    for epoch in range(epochs):
        model.train()  # Mise en mode entraînement
        total_loss = 0
        
        for batch_inputs, batch_targets in train_loader:
            batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

            optimizer.zero_grad()  # Remise à zéro des gradients
            
            # Calcul de la sortie du modèle
            outputs = model(batch_inputs)  # Prédictions du modèle
            
            # Calcul de la perte (entropie croisée)
            loss = criterion(outputs, batch_targets)
            total_loss += loss.item()
            
            # Rétropropagation et optimisation
            loss.backward()
            optimizer.step()
        
        # Affichage de la perte moyenne par époque
        average_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {average_loss:.4f}")

    return model

# Exemple d'utilisation
batch_size = 32 # Vous pouvez ajuster cette valeur pour trouver la valeur optimale
model = MultilayerPerceptron(vocab_size, embedding_dim, context_size, hidden_dim)
trained_model = train_model(model, result, vocab, epochs=15, batch_size=batch_size, learning_rate=0.001, context_size=context_size)

Training on cuda
Epoch 1/15, Loss: 7.6291
Epoch 2/15, Loss: 7.6209
Epoch 3/15, Loss: 7.6172
Epoch 4/15, Loss: 7.6156
Epoch 5/15, Loss: 7.6147
Epoch 6/15, Loss: 7.6137
Epoch 7/15, Loss: 7.6130
Epoch 8/15, Loss: 7.6124
Epoch 9/15, Loss: 7.6119
Epoch 10/15, Loss: 7.6118
Epoch 11/15, Loss: 7.6110
Epoch 12/15, Loss: 7.6111
Epoch 13/15, Loss: 7.6103
Epoch 14/15, Loss: 7.6100
Epoch 15/15, Loss: 7.6099


In [None]:
for _ in range(5):
    generated_sentence, length = generate_text(trained_model, seed_text, vocab_dict, idx_to_word, max_len=10, context_size=context_size)
    print(f"Generated Sentence: {generated_sentence}")
    print(f"Length: {length}")

Generated Sentence: <start> ici noirtier laissait étais absence fusil chut prix assiette îles
Length: 10
Generated Sentence: <start> veulent parti prière élégant meurs teresa crosse encore étendant calèche
Length: 10
Generated Sentence: <start> tracés balle confetti base coup que garçon fâché ordre sortant
Length: 10
Generated Sentence: <start> mouvements ferme voici individu rocher langues bureau amer cet nu
Length: 10
Generated Sentence: <start> préoccupation prenait dirigea village vaut passée hier désirait balbutia selon
Length: 10
