<a href="https://colab.research.google.com/github/olfabre/amsProjetMaster1/blob/olivier/ShakeSpeare_v4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
try:
    import unidecode
except ModuleNotFoundError:
    !pip install unidecode
    import unidecode
import string
import random
import re
import os
import requests

import torch
import torch.nn as nn
from torch.autograd import Variable

import time
import math
import matplotlib.pyplot as plt

# Select the compute device: the first CUDA GPU when one is available,
# otherwise the CPU. Every tensor and the model are later moved to `device`.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("CUDA AVAILABLE" if device.type == "cuda" else "ONLY CPU AVAILABLE")

# Global parameters
# Vocabulary: a character's position in this string is its class index
# (char_tensor looks characters up with all_characters.index).
all_characters = string.printable
# Vocabulary size; used as both the input and output size of the model.
n_characters = len(all_characters)
# Number of input characters per training sample (random_chunk draws
# chunk_len + 1 characters so that inputs and targets are offset by one).
chunk_len = 13

# Tunable hyperparameters
# n_epochs: number of iterations of the outer loop in training().
# hidden_size / n_layers: embedding + GRU width and number of stacked GRU layers.
# lr: learning rate handed to the AdamW optimizer.
n_epochs = 5000
hidden_size = 768
n_layers = 4
lr = 0.002
dropout_rate = 0.2  # dropout value forwarded to the GRU

# Téléchargement des données depuis une URL
def download_data(url, filename):
    """Download ``url`` and store its text body in ``filename`` (UTF-8).

    Args:
        url: HTTP(S) address of the text resource to fetch.
        filename: Destination path; overwritten if it already exists.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # A timeout prevents the notebook from hanging forever on a dead server.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of silently saving an HTML error page as the corpus
    # (the caller only downloads when the file is missing, so a bad file
    # would otherwise be cached permanently).
    response.raise_for_status()
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)

# Load the corpus: download it once, then reuse the copy cached on disk.
url = "https://olivier-fabre.com/passwordgenius/shakespeare2.txt"
data_dir = "data"
os.makedirs(data_dir, exist_ok=True)
data_path = os.path.join(data_dir, "shakespeare2.txt")

if not os.path.exists(data_path):
    print("Téléchargement des données...")
    download_data(url, data_path)

# Read the file and pass it through unidecode so the text can be indexed
# against `all_characters`. The context manager closes the handle promptly
# instead of leaving it to the garbage collector.
with open(data_path, "r", encoding="utf-8") as _corpus_handle:
    file = unidecode.unidecode(_corpus_handle.read())
file_len = len(file)
print(f"Longueur du corpus : {file_len}")

# Fonctions de préparation des données
def random_chunk(file):
    """Return a random substring of ``file`` that is ``chunk_len + 1`` long.

    The extra character lets the caller build an input sequence and a target
    sequence (shifted by one) that are both ``chunk_len`` characters long.

    Args:
        file: The corpus text to sample from.

    Returns:
        str: A slice of ``file`` containing exactly ``chunk_len + 1`` characters.

    Raises:
        ValueError: If ``file`` has fewer than ``chunk_len + 1`` characters.
    """
    # random.randint is inclusive on both ends, so the last valid start index
    # is the one that leaves exactly chunk_len + 1 characters after it.
    last_start = len(file) - chunk_len - 1
    if last_start < 0:
        raise ValueError(
            f"text of length {len(file)} is too short for a chunk of "
            f"{chunk_len + 1} characters"
        )
    start_index = random.randint(0, last_start)
    end_index = start_index + chunk_len + 1
    return file[start_index:end_index]

def char_tensor(string):
    """Encode ``string`` as a 1-D long tensor of vocabulary indices.

    Each character is mapped to its position in ``all_characters``; a
    character that is not in the vocabulary raises ``ValueError``.
    """
    indices = [all_characters.index(ch) for ch in string]
    encoded = torch.tensor(indices, dtype=torch.long)
    return Variable(encoded)

def random_training_set(file):
    """Draw one random (input, target) training pair from ``file``.

    The target is the input shifted by one character: the input is the
    sampled chunk without its last character, the target is the chunk
    without its first. Both tensors are moved to ``device``.
    """
    sample = random_chunk(file)
    source_text, target_text = sample[:-1], sample[1:]
    inp = char_tensor(source_text).to(device)
    target = char_tensor(target_text).to(device)
    return inp, target

# Définition du modèle
class RNN(nn.Module):
    """Character-level language model: embedding -> stacked GRU -> linear layer.

    The model processes one symbol per call to ``forward`` and carries its
    recurrent state explicitly through the ``hidden`` argument.

    Args:
        input_size: Number of distinct input symbols (rows of the embedding).
        hidden_size: Width of the embedding and of the GRU hidden state.
        output_size: Number of output classes (logits produced per step).
        n_layers: Number of stacked GRU layers.
        dropout: Dropout value forwarded to ``nn.GRU``.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers=1, dropout=0.0):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Advance the model by one symbol.

        Args:
            input: Long tensor holding a single symbol index.
            hidden: Recurrent state of shape ``(n_layers, 1, hidden_size)``.

        Returns:
            tuple: ``(logits, hidden)`` where ``logits`` has shape
            ``(1, output_size)`` and ``hidden`` is the updated state.
        """
        embedded = self.encoder(input.view(1, -1))
        # Reshape to (seq_len=1, batch=1, hidden_size) as expected by nn.GRU.
        output, hidden = self.gru(embedded.view(1, 1, -1), hidden)
        output = self.decoder(output.view(1, -1))
        return output, hidden

    def init_hidden(self):
        """Return a zero initial state of shape ``(n_layers, 1, hidden_size)``.

        The state is allocated on the device that holds the model's own
        parameters, so it always matches the model even if the model was
        moved after construction (no reliance on a module-level global).
        """
        model_device = next(self.parameters()).device
        return torch.zeros(self.n_layers, 1, self.hidden_size, device=model_device)

# Fonctions d'entraînement et d'évaluation
def train(inp, target):
    """Run one optimisation step on a single (input, target) sequence.

    Uses the module-level ``decoder``, ``criterion`` and ``decoder_optimizer``.

    Args:
        inp: 1-D long tensor of input symbol indices.
        target: 1-D long tensor of the same length holding the expected
            next symbol for every input position.

    Returns:
        float: The loss summed over the sequence, divided by the sequence
        length (i.e. the mean per-character loss).
    """
    seq_len = inp.size(0)
    # Make sure dropout is active even if the model was left in eval mode.
    decoder.train()
    hidden = decoder.init_hidden()
    decoder.zero_grad()
    loss = 0
    for c in range(seq_len):
        output, hidden = decoder(inp[c], hidden)
        loss += criterion(output, target[c].unsqueeze(0))
    loss.backward()
    # Gradient clipping keeps the recurrent gradients from exploding.
    torch.nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=5)
    decoder_optimizer.step()
    # Normalise by the real sequence length rather than the global chunk_len,
    # so the reported value stays a true mean if a sequence is shorter.
    return loss.item() / seq_len

def training(n_epochs, file, chunk_count=10, print_every=10, eval_every=50):
    """Train the module-level ``decoder`` and checkpoint it along the way.

    Every epoch trains on ``chunk_count`` random chunks of ``file``. Progress
    is printed every ``print_every`` epochs, a text sample is generated every
    ``eval_every`` epochs, and the model weights are written to
    ``<data_dir>/best_model.pth`` whenever the running mean loss improves.

    Args:
        n_epochs: Number of epochs to run.
        file: Corpus text to sample training chunks from.
        chunk_count: Number of random chunks trained on per epoch.
        print_every: Print a progress line every this many epochs.
        eval_every: Print a generated sample every this many epochs.

    Returns:
        list[float]: The mean loss of each epoch, in order (e.g. for plotting).
    """
    print("\n-----------")
    print("|  TRAIN  |")
    print("-----------\n")

    start = time.time()
    all_losses = []  # per-epoch mean loss, returned to the caller
    loss_avg = 0     # running SUM of the per-epoch means (divided by epoch on use)
    best_loss = float("inf")
    model_save_path = os.path.join(data_dir, "best_model.pth")

    for epoch in range(1, n_epochs + 1):
        losses = [train(*random_training_set(file)) for _ in range(chunk_count)]

        # Mean over this epoch's chunks
        epoch_loss = sum(losses) / chunk_count
        all_losses.append(epoch_loss)
        loss_avg += epoch_loss
        running_avg = loss_avg / epoch

        if epoch % print_every == 0:
            print('[%s (%d/%d) Perte moyenne: %.4f Dernière perte: %.4f]' % (
                time_since(start), epoch, n_epochs, running_avg, losses[-1]))

        if epoch % eval_every == 0:
            print(f"\nÉvaluation à l'epoch {epoch}:")
            print(evaluate(decoder, prime_str='To be', predict_len=200, temperature=0.8))
            print()

        # Save the best model.
        # NOTE(review): the criterion is the running mean since epoch 1, which
        # keeps decreasing for as long as the current epoch beats that mean, so
        # a checkpoint is written on most epochs. Consider comparing a recent
        # window of `all_losses` instead — confirm the intended behaviour.
        if best_loss > running_avg:
            best_loss = running_avg
            torch.save(decoder.state_dict(), model_save_path)
            print('[%s (%d/%d) Nouvelle meilleure perte moyenne: %.4f Sauvegarde du modèle.]' % (
                time_since(start), epoch, n_epochs, best_loss))

    return all_losses

def evaluate(decoder, prime_str="To be", predict_len=200, temperature=0.8):
    """Generate text from ``decoder`` by sampling one character at a time.

    The model is first fed ``prime_str`` to build up its hidden state, then
    ``predict_len`` characters are sampled, each one being fed back as the
    next input.

    Args:
        decoder: The model to sample from; must provide ``init_hidden`` and be
            callable as ``decoder(input, hidden) -> (logits, hidden)``.
        prime_str: Non-empty seed text (an empty string raises ``IndexError``).
        predict_len: Number of characters to generate after the seed.
        temperature: Logits are divided by this value before sampling, so
            lower values make the output more conservative.

    Returns:
        str: ``prime_str`` followed by the ``predict_len`` sampled characters.
    """
    # Sample in eval mode (dropout off) and without building autograd graphs;
    # the previous train/eval mode is restored afterwards so that calling
    # this in the middle of training does not change the training behaviour.
    was_training = decoder.training
    decoder.eval()
    try:
        with torch.no_grad():
            hidden = decoder.init_hidden()
            prime_input = char_tensor(prime_str).to(device)

            # Warm up the hidden state on all but the last seed character.
            for p in range(len(prime_str) - 1):
                _, hidden = decoder(prime_input[p], hidden)
            inp = prime_input[-1]

            pieces = [prime_str]
            for _ in range(predict_len):
                output, hidden = decoder(inp, hidden)
                # Softmax is the normalised, overflow-safe equivalent of
                # exp(logits / temperature) as multinomial sampling weights.
                probs = torch.softmax(output.view(-1) / temperature, dim=0)
                top_i = torch.multinomial(probs, 1)[0]
                pieces.append(all_characters[top_i.item()])
                # The sampled class index is the next input symbol.
                inp = top_i
    finally:
        decoder.train(was_training)
    return "".join(pieces)

def time_since(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    ``since`` is a timestamp as returned by ``time.time()``; seconds are
    shown with two decimals.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '{}m {:.2f}s'.format(minutes, seconds)

# Lancement principal
if __name__ == "__main__":
    # `decoder`, `decoder_optimizer` and `criterion` are read as module-level
    # globals by train() and training(), so their names must stay as they are.
    criterion = nn.CrossEntropyLoss()
    decoder = RNN(
        input_size=n_characters,
        hidden_size=hidden_size,
        output_size=n_characters,
        n_layers=n_layers,
        dropout=dropout_rate,
    ).to(device)
    decoder_optimizer = torch.optim.AdamW(decoder.parameters(), lr=lr)

    # Train on the corpus, then print one final generated sample.
    print("Début de l'entraînement...")
    training(
        n_epochs=n_epochs,
        file=file,
        chunk_count=15,
        print_every=50,
        eval_every=100,
    )

    print("\nÉvaluation finale...")
    print(evaluate(
        decoder=decoder,
        prime_str="To be or not to be",
        predict_len=200,
        temperature=0.8,
    ))


Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8
CUDA AVAILABLE
Téléchargement des données...
Longueur du corpus : 314122
Début de l'entraînement...

-----------
|  TRAIN  |
-----------

[0m 2.04s (1/5000) Nouvelle meilleure perte moyenne: 4.1431 Sauvegarde du modèle.]
[0m 2.75s (2/5000) Nouvelle meilleure perte moyenne: 3.9035 Sauvegarde du modèle.]
[0m 3.48s (3/5000) Nouvelle meilleure perte moyenne: 3.7407 Sauvegarde du modèle.]
[0m 4.91s (5/5000) Nouvelle meilleure perte moyenne: 3.7240 Sauvegarde du modèle.]
[0m 5.82s (6/5000) Nouvelle meilleure perte moyenne: 3.6882 Sauvegarde d

KeyboardInterrupt: 

In [4]:
# Fetch the checkpoint written by training() (data/best_model.pth) from the
# notebook runtime. This uses the google.colab helper module, so it is only
# expected to work when the notebook runs inside Google Colab.
from google.colab import files
files.download("data/best_model.pth")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>