<a href="https://colab.research.google.com/github/olfabre/amsProjetMaster1/blob/olivier/ShakeSpeare_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# Version améliorée 1

In [6]:
try:
    import unidecode
except ModuleNotFoundError:
    !pip install unidecode
    import unidecode
import string
import random
import re
import os
import requests

import torch
import torch.nn as nn
from torch.autograd import Variable

import time
import math
import matplotlib.pyplot as plt

# Device selection: use the first CUDA GPU when one is available, else the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("CUDA AVAILABLE" if device.type == "cuda" else "ONLY CPU AVAILABLE")

# Global parameters
# Vocabulary: every character of string.printable; a character's position in
# this string is its integer id.
all_characters = string.printable
n_characters = len(all_characters)
# Number of input characters per training sample.
chunk_len = 13

# Training-loop settings.
n_epochs, print_every, plot_every = 200000, 10, 10
# Model and optimiser hyperparameters.
hidden_size, n_layers = 512, 3
lr = 0.005

# Téléchargement des données depuis une URL
def download_data(url, filename, timeout=30):
    """Download the text at ``url`` and save it to ``filename`` as UTF-8.

    Args:
        url: Address of the text resource to fetch.
        filename: Path of the file to create or overwrite.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the script forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail before touching the disk: otherwise an HTTP error page would be
    # written to ``filename`` and treated as the corpus.
    response.raise_for_status()
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)

# Load the data
url = "https://olivier-fabre.com/passwordgenius/shakespeare2.txt"
data_dir = "data"
os.makedirs(data_dir, exist_ok=True)
data_path = os.path.join(data_dir, "shakespeare2.txt")

# Download only when no local copy exists yet.
if not os.path.exists(data_path):
    print("Téléchargement des données...")
    download_data(url, data_path)

# Read and process the file. The context manager closes the handle right away
# instead of leaving that to the garbage collector. unidecode transliterates
# the text to ASCII.
with open(data_path, "r", encoding="utf-8") as corpus_file:
    file = unidecode.unidecode(corpus_file.read())
file_len = len(file)
print(f"Longueur du corpus : {file_len}")

# Fonctions de préparation des données
def random_chunk(file, length=None):
    """Return a random slice of ``length + 1`` consecutive characters of ``file``.

    The extra character lets the caller build an input sequence
    (``chunk[:-1]``) and a target sequence shifted by one (``chunk[1:]``),
    each ``length`` characters long.

    Args:
        file: The corpus text to sample from.
        length: Number of input characters wanted. Defaults to the
            module-level ``chunk_len``.

    Returns:
        A substring of ``file`` that is exactly ``length + 1`` characters long.

    Raises:
        ValueError: If ``file`` has fewer than ``length + 1`` characters.
    """
    if length is None:
        length = chunk_len
    # random.randint includes both bounds, so the last valid start must leave
    # room for length + 1 characters. Measuring ``file`` itself keeps the
    # function correct for any text passed in.
    last_start = len(file) - length - 1
    if last_start < 0:
        raise ValueError(
            f"text of length {len(file)} is too short for a chunk of {length + 1} characters"
        )
    start_index = random.randint(0, last_start)
    return file[start_index:start_index + length + 1]

def char_tensor(string):
    """Encode ``string`` as a 1-D long tensor of vocabulary indices.

    Each character is replaced by its position in ``all_characters``.
    ``str.index`` raises ValueError for a character that is not in the
    vocabulary.
    """
    indices = [all_characters.index(char) for char in string]
    encoded = torch.tensor(indices, dtype=torch.long)
    return Variable(encoded)

def random_training_set(file):
    """Draw a random chunk of ``file`` and return it as an ``(inp, target)`` pair.

    ``target`` is ``inp`` shifted one character to the right, so position ``i``
    of ``target`` is the character that follows position ``i`` of ``inp``.
    Both tensors are moved to ``device``.
    """
    chunk = random_chunk(file)
    source_chars, next_chars = chunk[:-1], chunk[1:]
    inp = char_tensor(source_chars).to(device)
    target = char_tensor(next_chars).to(device)
    return inp, target

# Définition du modèle
class RNN(nn.Module):
    """Character-level model: embedding -> multi-layer GRU -> linear decoder.

    The model processes one character per call and carries the GRU hidden
    state between calls.

    Args:
        input_size: Number of distinct input symbols (embedding rows).
        hidden_size: Width of the embedding and of every GRU layer.
        output_size: Number of scores produced per step.
        n_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Run one time step.

        Args:
            input: Long tensor expected to hold a single character index.
            hidden: Hidden state of shape ``(n_layers, 1, hidden_size)``.

        Returns:
            ``(output, hidden)``: unnormalised scores of shape
            ``(1, output_size)`` and the updated hidden state.
        """
        input = self.encoder(input.view(1, -1))
        # nn.GRU expects (seq_len, batch, features); here both are 1.
        output, hidden = self.gru(input.view(1, 1, -1), hidden)
        output = self.decoder(output.view(1, -1))
        return output, hidden

    def init_hidden(self):
        """Return a zero hidden state of shape ``(n_layers, 1, hidden_size)``.

        The state is created with the same device and dtype as the model's
        own parameters, so it always matches the model regardless of any
        module-level device setting.
        """
        reference = next(self.parameters())
        return reference.new_zeros(self.n_layers, 1, self.hidden_size)

# Fonctions d'entraînement et d'évaluation
def train(inp, target):
    """Run one optimisation step on a single (input, target) sequence pair.

    Uses the module-level ``decoder``, ``criterion`` and ``decoder_optimizer``.

    Args:
        inp: 1-D long tensor of input character indices.
        target: 1-D long tensor of the same length holding, for every
            position, the index of the character to predict.

    Returns:
        The summed loss divided by the number of characters processed.

    Raises:
        ValueError: If ``inp`` is empty or ``target`` has a different length.
    """
    n_steps = inp.size(0)
    if n_steps == 0:
        raise ValueError("train() needs at least one input character")
    if target.size(0) != n_steps:
        raise ValueError("inp and target must have the same length")

    hidden = decoder.init_hidden()
    decoder.zero_grad()
    loss = 0
    for inp_char, target_char in zip(inp, target):
        output, hidden = decoder(inp_char, hidden)
        loss += criterion(output, target_char.unsqueeze(0))
    loss.backward()
    decoder_optimizer.step()
    # Normalise by the real sequence length, not the global chunk_len, so the
    # figure stays a true per-character loss for sequences of any length.
    return loss.item() / n_steps

def training(n_epochs, file, chunk_count=10):
    """Train the module-level ``decoder`` and report progress on stdout.

    Every epoch trains on ``chunk_count`` random chunks of ``file``. About
    once per percent of progress the running mean loss is printed and a text
    sample is generated. A line is also printed whenever the running mean
    loss reaches a new minimum.

    Args:
        n_epochs: Number of epochs to run.
        file: Corpus text to sample training chunks from.
        chunk_count: Number of random chunks trained on per epoch.

    Raises:
        ValueError: If ``chunk_count`` is smaller than 1.
    """
    if chunk_count < 1:
        raise ValueError("chunk_count must be at least 1")

    print()
    print('-----------')
    print('|  TRAIN  |')
    print('-----------')
    print()

    start = time.time()
    loss_sum = 0    # Sum of the per-epoch mean losses; divided by epoch when reported
    best_loss = float("inf")
    # n_epochs // 100 is 0 for runs shorter than 100 epochs, which would make
    # the modulo tests below divide by zero; report every epoch in that case.
    print_every = max(1, n_epochs // 100)
    eval_every = max(1, n_epochs // 100)

    for epoch in range(1, n_epochs + 1):
        losses = [train(*random_training_set(file)) for _ in range(chunk_count)]

        # Mean over this epoch's chunks
        loss_sum += sum(losses) / chunk_count
        running_avg = loss_sum / epoch

        if epoch % print_every == 0:
            print('[%s (%d %d%%) Perte moyenne: %.4f Dernière perte: %.4f]' % (
                time_since(start), epoch, epoch / n_epochs * 100, running_avg, losses[-1]))

        if epoch % eval_every == 0:
            print()
            print(f"Évaluation à l'epoch {epoch}:")
            print(evaluate(decoder, prime_str='Wh', predict_len=100, temperature=0.8))
            print()

        if best_loss > running_avg:
            best_loss = running_avg
            print('[%s (%d %d%%) Nouvelle meilleure perte moyenne: %.4f]' % (
                time_since(start), epoch, epoch / n_epochs * 100, best_loss))

def evaluate(decoder, prime_str="A", predict_len=100, temperature=0.8):
    """Generate text with ``decoder``, starting from ``prime_str``.

    The hidden state is first built up by feeding every character of
    ``prime_str`` except the last. Then ``predict_len`` characters are
    sampled one at a time, each sampled character becoming the next input.

    Args:
        decoder: Model exposing ``init_hidden()`` and callable as
            ``decoder(inp, hidden) -> (output, hidden)``.
        prime_str: Non-empty seed text; it is included in the result.
        predict_len: Number of characters to generate after the seed.
        temperature: Divisor applied to the scores before exponentiation;
            lower values concentrate the sampling on the highest scores.

    Returns:
        ``prime_str`` followed by the ``predict_len`` generated characters.

    Raises:
        ValueError: If ``prime_str`` is empty.
    """
    if not prime_str:
        raise ValueError("prime_str must contain at least one character")

    hidden = decoder.init_hidden()
    prime_input = char_tensor(prime_str).to(device)
    pieces = [prime_str]

    # Sampling needs no gradients; no_grad avoids building an autograd graph
    # across all generated steps.
    with torch.no_grad():
        for prime_char in prime_input[:-1]:
            _, hidden = decoder(prime_char, hidden)
        inp = prime_input[-1]

        for _ in range(predict_len):
            output, hidden = decoder(inp, hidden)
            # Unnormalised weights; torch.multinomial accepts them as is.
            output_dist = output.view(-1).div(temperature).exp()
            top_i = torch.multinomial(output_dist, 1)[0]
            pieces.append(all_characters[top_i.item()])
            # The sampled index is already the encoded next input.
            inp = top_i

    return "".join(pieces)

def time_since(since):
    """Format the time elapsed since ``since`` as ``'<minutes>m <seconds>s'``.

    ``since`` is a timestamp as returned by ``time.time()``. Seconds are shown
    with two decimals.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '{}m {:.2f}s'.format(minutes, seconds)

# Main entry point
if __name__ == "__main__":
    # Input and output vocabularies are the same character set.
    decoder = RNN(
        input_size=n_characters,
        hidden_size=hidden_size,
        output_size=n_characters,
        n_layers=n_layers,
    ).to(device)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    print("Début de l'entraînement...")
    # Adjust the number of epochs to your needs
    training(1000, file)

    print("\nÉvaluation...")
    sample = evaluate(decoder, prime_str="To be or not to be", predict_len=200, temperature=0.8)
    print(sample)


CUDA AVAILABLE
Longueur du corpus : 314122
Début de l'entraînement...

-----------
|  TRAIN  |
-----------

[0m 0.24s (1 0%) Nouvelle meilleure perte moyenne: 4.2476]
[0m 0.72s (3 0%) Nouvelle meilleure perte moyenne: 4.0521]
[0m 1.23s (5 0%) Nouvelle meilleure perte moyenne: 3.9434]
[0m 1.49s (6 0%) Nouvelle meilleure perte moyenne: 3.8626]
[0m 1.74s (7 0%) Nouvelle meilleure perte moyenne: 3.8150]
[0m 1.98s (8 0%) Nouvelle meilleure perte moyenne: 3.7555]
[0m 2.46s (10 1%) Perte moyenne: 3.8066 Dernière perte: 4.7488]

Évaluation à l'epoch 10:
WhEIls
Iln I
N
 bdh

ahbm
sy
n 'iinI
a I lItIIgm
dia I
  s Ea:rMM a 
EEnl
iImpinIEE Eesa iMEtlIr EI   

[0m 5.71s (20 2%) Perte moyenne: 3.7528 Dernière perte: 3.3598]

Évaluation à l'epoch 20:
What  tohhn b ph ihn uo en yyiyylhOtAyb at 
ty  mueoN aihte sh hsh  t yy huyte xe thh b  ysout
 t   ty

[0m 5.82s (20 2%) Nouvelle meilleure perte moyenne: 3.7528]
[0m 6.36s (22 2%) Nouvelle meilleure perte moyenne: 3.7508]
[0m 6.84s (24 2%) Nouvelle mei