<a href="https://colab.research.google.com/github/olfabre/amsProjetMaster1/blob/olivier/Generation_prenoms_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
try:
    import unidecode
except ModuleNotFoundError:
    !pip install unidecode
    import unidecode

import requests
import torch
import torch.nn as nn
from torch.autograd import Variable
import time
import math
import string
import random
import os

# GPU check: use the first CUDA device when one is available, else the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Device utilisé: {device}")

# Data download: the corpus is fetched once and cached under data/.
url = "https://olivier-fabre.com/passwordgenius/russian.txt"
data_dir = "data"
os.makedirs(data_dir, exist_ok=True)
data_path = os.path.join(data_dir, "russian.txt")

if not os.path.exists(data_path):
    print("Chargement des données encours...")
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail before the cache file is created: otherwise an HTTP error page
    # would be saved as the dataset and reused on every later run.
    response.raise_for_status()
    with open(data_path, 'w', encoding='utf-8') as f:
        f.write(response.text)

# Chargement des données
def unicode_to_ascii(s):
    """Transliterate *s* with unidecode and keep only the allowed characters.

    Allowed characters are the ASCII letters plus space and ``.,;'-``;
    every other character of the transliterated text is dropped.
    """
    allowed = string.ascii_letters + " .,;'-"
    transliterated = unidecode.unidecode(s)
    kept = [ch for ch in transliterated if ch in allowed]
    return ''.join(kept)

def read_lines(filename):
    """Read *filename* (UTF-8) and return its cleaned, non-empty names.

    Each line is stripped, lower-cased and passed through
    ``unicode_to_ascii``. Lines that end up empty (blank lines, or lines
    made only of filtered-out characters) are skipped: an empty name yields
    a zero-length training sequence, which the training step cannot
    back-propagate through.
    """
    names = []
    with open(filename, encoding='utf-8') as f:
        for line in f:
            name = unicode_to_ascii(line.strip().lower())
            if name:
                names.append(name)
    return names

# Load the cleaned corpus once; `lines` is the list passed to training().
lines = read_lines(data_path)
print(f"Nombre de noms: {len(lines)}")

# Global parameters
# Alphabet used for one-hot encoding: ASCII letters plus a few punctuation marks.
all_letters = string.ascii_letters + " .,;'-"
n_letters = len(all_letters) + 1  # EOS marker occupies the last index
hidden_size = 128  # hidden units per direction, passed to RNNLight
# NOTE(review): n_layers is not referenced anywhere in the visible code;
# RNNLight hard-codes num_layers=1 — confirm whether it should be wired in.
n_layers = 2
lr = 0.005  # learning rate handed to the Adam optimizer
bidirectional = True  # read by RNNLight.__init__
max_length = 20  # maximum number of characters sample() will generate

# Fonctions utilitaires
def char_tensor(string):
    """Return a 1-D long tensor holding the ``all_letters`` index of each character.

    Raises:
        ValueError: if a character is not part of ``all_letters``
            (propagated from ``str.index``).
    """
    size = len(string)
    indices = torch.zeros(size).long()
    for position, symbol in enumerate(string):
        indices[position] = all_letters.index(symbol)
    return indices

def input_tensor(line):
    """One-hot encode *line* as a ``(len(line), 1, n_letters)`` float tensor.

    Row ``i`` has a single 1 at the ``all_letters`` index of ``line[i]``.

    Raises:
        ValueError: if a character is not part of ``all_letters``.
            ``str.find`` returns -1 for such characters, which would
            otherwise silently set the last slot — the one reserved for
            the EOS marker.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for position, letter in enumerate(line):
        index = all_letters.find(letter)
        if index < 0:
            raise ValueError(
                f"character {letter!r} is not in the model alphabet"
            )
        tensor[position][0][index] = 1
    return tensor

def target_tensor(line):
    """Return the next-character targets for *line* as a 1-D ``LongTensor``.

    The targets are the ``all_letters`` indexes of ``line[1:]`` followed by
    the EOS index (``n_letters - 1``), so the result has ``len(line)``
    entries for a non-empty line.

    Raises:
        ValueError: if a character of ``line[1:]`` is not part of
            ``all_letters``. ``str.find`` returns -1 for such characters,
            which is not a valid class index and would only fail later
            inside the loss computation.
    """
    letter_indexes = []
    for letter in line[1:]:
        index = all_letters.find(letter)
        if index < 0:
            raise ValueError(
                f"character {letter!r} is not in the model alphabet"
            )
        letter_indexes.append(index)
    letter_indexes.append(n_letters - 1)  # EOS
    return torch.LongTensor(letter_indexes)

def random_training_example(lines):
    """Pick one random entry of *lines* and return its (input, target) tensors.

    The input comes from ``input_tensor`` and the target from
    ``target_tensor``, both built from the same randomly chosen line.
    """
    chosen = random.choice(lines)
    encoded_input = input_tensor(chosen)
    encoded_target = target_tensor(chosen)
    return encoded_input, encoded_target

# Définition du modèle
class RNNLight(nn.Module):
    """Single-layer ``nn.RNN`` that scores the next character one step at a time.

    Args:
        input_size: size of the one-hot input vector.
        hidden_size: number of hidden units per direction.
        output_size: number of output classes.
        bidirectional: whether the RNN is bidirectional. ``None`` (default)
            falls back to the module-level ``bidirectional`` setting, which
            is what the original constructor always read.
    """

    def __init__(self, input_size, hidden_size, output_size, bidirectional=None):
        super().__init__()
        if bidirectional is None:
            # Backward compatibility: use the module-level setting when the
            # caller does not choose explicitly.
            bidirectional = globals().get("bidirectional", True)
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.bidirectional = bidirectional
        self.num_directions = 2 if self.bidirectional else 1
        self.rnn = nn.RNN(
            input_size=input_size, hidden_size=hidden_size,
            num_layers=1, bidirectional=self.bidirectional, batch_first=True
        )
        self.out = nn.Linear(self.num_directions * hidden_size, output_size)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one time step.

        Args:
            input: tensor for one character; the callers in this file pass
                a ``(1, input_size)`` one-hot row.
            hidden: hidden state of shape ``(num_directions, 1, hidden_size)``.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` is 2-D with
            ``output_size`` columns and ``hidden`` is the updated state.
        """
        _, hidden = self.rnn(input.unsqueeze(0), hidden)
        # Concatenate the per-direction states into one 2-D feature matrix.
        # The unidirectional case previously passed the 3-D state straight
        # to the linear layer, so LogSoftmax(dim=1) normalised over a
        # size-1 axis and returned all zeros.
        features = torch.cat(
            [hidden[d] for d in range(self.num_directions)], 1
        )
        output = self.out(features)
        output = self.dropout(output)
        return self.softmax(output), hidden

    def init_hidden(self):
        """Return a zero hidden state on the same device as the model weights."""
        model_device = next(self.parameters()).device
        return torch.zeros(
            self.num_directions, 1, self.hidden_size, device=model_device
        )

# Entraînement
def train(input_line_tensor, target_line_tensor, decoder, decoder_optimizer, criterion):
    """Run one optimisation step on a single name and return its mean loss.

    Args:
        input_line_tensor: per-character inputs, indexed along dim 0.
        target_line_tensor: 1-D tensor of target class indexes, one per
            input character. It is no longer modified in place.
        decoder: model exposing ``init_hidden()``, ``zero_grad()`` and a
            ``decoder(input, hidden) -> (output, hidden)`` call.
        decoder_optimizer: optimizer stepping the decoder's parameters.
        criterion: loss taking ``(output, target)``.

    Returns:
        The summed loss divided by the number of characters, as a float;
        ``0.0`` for a zero-length sequence (nothing to learn from, and there
        would be no tensor loss to back-propagate).
    """
    n_steps = input_line_tensor.size(0)
    if n_steps == 0:
        return 0.0

    # Out-of-place unsqueeze: the caller's tensor keeps its shape.
    targets = target_line_tensor.unsqueeze(-1)
    hidden = decoder.init_hidden()
    # Work on whatever device the decoder's hidden state lives on.
    run_device = hidden.device
    decoder.zero_grad()
    loss = 0
    for i in range(n_steps):
        output, hidden = decoder(input_line_tensor[i].to(run_device), hidden)
        loss += criterion(output, targets[i].to(run_device))
    loss.backward()
    decoder_optimizer.step()
    return loss.item() / n_steps

def training(n_epochs, lines, decoder, decoder_optimizer, criterion):
    """Train *decoder* for *n_epochs* steps, one random name per step.

    Every 500 steps a progress line is printed with the elapsed time and
    the loss averaged over all steps run so far.
    """
    print("\n-----------\n|  Entrainement  |\n-----------\n")
    started_at = time.time()
    running_loss = 0
    for step in range(1, n_epochs + 1):
        example = random_training_example(lines)
        running_loss += train(*example, decoder, decoder_optimizer, criterion)
        if step % 500:
            continue
        print(f"{time_since(started_at)} ({step}/{n_epochs}) Perte: {running_loss / step:.4f}")

# Génération de noms
def sample(decoder, start_letter='A'):
    """Greedily generate a name that starts with *start_letter*.

    At each step the highest-scoring next character is appended, until the
    EOS index is predicted or ``max_length`` characters have been generated.

    The decoder is put in evaluation mode for the duration of the call so
    that its dropout layer is inactive (``torch.no_grad()`` alone does not
    disable dropout); its previous train/eval mode is restored afterwards.

    Args:
        decoder: trained model exposing ``init_hidden()``.
        start_letter: first character of the generated name.

    Returns:
        The generated name, including *start_letter*.
    """
    was_training = decoder.training
    decoder.eval()
    try:
        with torch.no_grad():
            hidden = decoder.init_hidden()
            current = input_tensor(start_letter)
            output_name = start_letter
            for _ in range(max_length):
                output, hidden = decoder(current[0].to(device), hidden.to(device))
                top_index = output.topk(1)[1][0][0].item()
                if top_index == n_letters - 1:  # EOS predicted
                    break
                letter = all_letters[top_index]
                output_name += letter
                current = input_tensor(letter)
            return output_name
    finally:
        decoder.train(was_training)

def time_since(since):
    """Return the time elapsed since *since* formatted as ``"<m>m <s.ss>s"``.

    *since* is a ``time.time()`` timestamp; minutes are whole numbers and
    seconds are shown with two decimals.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return f"{minutes}m {seconds:.2f}s"

# Exécution principale
if __name__ == "__main__":
    # Build the model with one-hot input and output both of size n_letters,
    # then move it to the selected device.
    decoder = RNNLight(n_letters, hidden_size, n_letters).to(device)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=lr)
    # NOTE(review): RNNLight.forward already returns LogSoftmax output;
    # nn.NLLLoss is the usual pairing for log-probabilities — confirm that
    # CrossEntropyLoss is intended here.
    criterion = nn.CrossEntropyLoss()
    n_epochs = 5000  # number of single-name training steps

    print("Demarrage entrainement...")
    training(n_epochs, lines, decoder, decoder_optimizer, criterion)

    print("\nGénération de noms:")
    # NOTE(review): read_lines lower-cases the corpus, so these uppercase
    # start letters never occur as inputs during training — confirm intended.
    for letter in "ABC":
        print(sample(decoder, letter))


Device utilisé: cpu
Nombre de noms: 9408
Demarrage entrainement...

-----------
|  Entrainement  |
-----------

0m 6.73s (500/5000) Perte: 2.7826
0m 11.77s (1000/5000) Perte: 2.7370
0m 17.63s (1500/5000) Perte: 2.7514
0m 22.59s (2000/5000) Perte: 2.7429
0m 27.80s (2500/5000) Perte: 2.7622
0m 33.45s (3000/5000) Perte: 2.7708
0m 39.10s (3500/5000) Perte: 2.7737
0m 45.77s (4000/5000) Perte: 2.7749
0m 50.77s (4500/5000) Perte: 2.7745
0m 56.96s (5000/5000) Perte: 2.7726

Génération de noms:
Aelen
Belon
Cilenkov
