<a href="https://colab.research.google.com/github/olfabre/amsProjetSemestre2/blob/main/05_05_mod%C3%A8le_avec_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install unidecode

import unidecode
import string
import random
import torch
import torch.nn as nn
from torch.autograd import Variable
import time, math
import matplotlib.pyplot as plt
from os import path, makedirs

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Hyper-parameters ---
chunk_len = 12      # input length of one sample; chunks are cut one character longer
hidden_size = 256   # width passed to the model as its hidden size
n_layers = 2        # number of recurrent layers passed to the model
lr = 0.005          # learning rate handed to the optimizer
n_epochs = 3000     # number of training iterations (one random chunk each)

# --- Alphabets ---
# `selected` is the alphabet used to encode text and to decode sampled indices;
# its last symbol is the marker on which generation stops.
end_char = '&'
all_characters = ''.join((string.ascii_letters, string.digits, string.punctuation))
selected = ''.join((string.ascii_letters, string.digits, end_char))
n_characters = len(all_characters)
n_characters_selected = len(selected)


# Read the training and validation corpora and transliterate them to ASCII
# with unidecode. The context managers close each file handle as soon as its
# content has been read instead of leaving that to the garbage collector.
with open("sample_data/train2_with_&.txt") as train_fh:
  train_file = unidecode.unidecode(train_fh.read())
train_file_len = len(train_file)

with open("sample_data/validation2.txt") as val_fh:
  val_file = unidecode.unidecode(val_fh.read())
val_file_len = len(val_file)

def random_chunk(file, file_len):
  """Cut a random window of `chunk_len + 1` characters out of `file`.

  Args:
    file: text to sample from.
    file_len: length to sample within (callers pass `len(file)`).

  Returns:
    The slice of `file` starting at a uniformly drawn offset. The extra
    character lets callers derive an input and a one-step-shifted target.

  Raises:
    ValueError: from `random.randint` when `file_len` is not larger than
      `chunk_len`.
  """
  last_start = file_len - chunk_len - 1
  begin = random.randint(0, last_start)  # randint includes both bounds
  return file[begin:begin + chunk_len + 1]

def char_tensor(string):
  """Encode `string` as a 1-D long tensor of indices into `selected`.

  Characters that do not occur in `selected` keep the initial value 0, i.e.
  they share the index of `selected[0]`.

  Args:
    string: text to encode.

  Returns:
    A tensor of `len(string)` indices, moved to `device`.
  """
  codes = torch.zeros(len(string)).long()
  for pos, ch in enumerate(string):
    idx = selected.find(ch)
    if idx >= 0:  # unknown characters are left at 0
      codes[pos] = idx
  return codes.to(device)

def random_training_set(file, file_len):
  """Draw one random (input, target) pair of index tensors from `file`.

  The target is the input shifted by one character: the input encodes the
  chunk without its last character, the target the chunk without its first.

  Args:
    file: text to sample from.
    file_len: length of `file`.

  Returns:
    Tuple `(inp, target)` of tensors produced by `char_tensor`.
  """
  chunk = random_chunk(file, file_len)
  source_text, target_text = chunk[:-1], chunk[1:]
  return char_tensor(source_text), char_tensor(target_text)

class RNN(nn.Module):
    """Character-level model: embedding -> GRU -> linear projection.

    The model is stepped one character at a time; `forward` reshapes its
    input so that exactly one index is processed per call.

    Args:
        input_size: number of rows in the embedding table.
        hidden_size: embedding width and GRU hidden width.
        output_size: number of logits produced per step.
        n_layers: number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super().__init__()
        self.encoder = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.decoder = nn.Linear(hidden_size, output_size)
        self.hidden_size = hidden_size
        self.n_layers = n_layers

    def forward(self, input, hidden):
        """Advance the model by one character.

        Args:
            input: tensor holding a single character index.
            hidden: GRU state of shape (n_layers, 1, hidden_size).

        Returns:
            Tuple `(logits, hidden)`: logits of shape (1, output_size) and
            the updated GRU state.
        """
        embedded = self.encoder(input.view(1, -1))
        # The GRU is fed as (seq_len=1, batch=1, features).
        output, hidden = self.gru(embedded.view(1, 1, -1), hidden)
        logits = self.decoder(output.view(1, -1))
        return logits, hidden

    def init_hidden(self):
        """Return a zero GRU state of shape (n_layers, 1, hidden_size).

        The state is created with the device and dtype of the model's own
        parameters, so it always matches the model wherever it has been
        moved, independently of any module-level `device` variable.
        """
        reference = next(self.parameters())
        return torch.zeros(
            self.n_layers, 1, self.hidden_size,
            device=reference.device, dtype=reference.dtype,
        )


def train(inp, target):
  """Run one optimisation step on a single (input, target) sequence.

  The sequence is fed to `decoder` one character at a time, the per-step
  losses from `criterion` are summed, back-propagated, and `optimizer`
  takes one step. The mean per-character loss is also appended to
  `train_losses`.

  Args:
    inp: 1-D tensor of input character indices.
    target: 1-D tensor of expected next-character indices.

  Returns:
    The summed loss divided by the sequence length, as a Python float.
  """
  decoder.zero_grad()
  hidden = decoder.init_hidden()
  n_steps = inp.size(0)  # length of the current sequence

  summed_loss = 0
  for step in range(n_steps):
    logits, hidden = decoder(inp[step], hidden)
    summed_loss += criterion(logits, target[step].unsqueeze(0))

  summed_loss.backward()
  optimizer.step()

  mean_loss = summed_loss.item() / n_steps
  train_losses.append(mean_loss)
  return mean_loss


def evaluate(prime_str='A', predict_len=25, temperature=0.8):
  """Sample a string from `decoder`, seeded with `prime_str`.

  Args:
    prime_str: non-empty seed text; it is fed through the model first and
      forms the beginning of the result.
    predict_len: maximum number of characters to generate. `None` draws a
      random length between 5 and 12 (inclusive).
    temperature: the logits are divided by this value before the softmax.

  Returns:
    `prime_str` followed by the sampled characters. Sampling stops early
    when the end marker `end_char` is drawn; the marker is not appended.

  Raises:
    ValueError: if `prime_str` is empty.
  """
  if not prime_str:
    raise ValueError("prime_str must contain at least one character")
  if predict_len is None:
    predict_len = random.randint(5, 12)

  predicted = prime_str

  # Sampling needs no gradients; without no_grad every step would extend an
  # autograd graph through the hidden state.
  with torch.no_grad():
    hidden = decoder.init_hidden()
    prime_input = char_tensor(prime_str)

    # Warm up the hidden state on every seed character except the last one,
    # which becomes the first input of the sampling loop.
    for p in range(len(prime_str) - 1):
      _, hidden = decoder(prime_input[p], hidden)
    inp = prime_input[-1]

    for _ in range(predict_len):
      output, hidden = decoder(inp, hidden)
      output_dist = torch.softmax(output.view(-1) / temperature, dim=0)
      # Only the first len(selected) outputs map to a character of `selected`.
      top_i = int(torch.multinomial(output_dist[:len(selected)], 1)[0])
      predicted_char = selected[top_i]

      if predicted_char == end_char:
        break

      predicted += predicted_char
      inp = torch.tensor([top_i], device=device)

  return predicted


def time_since(since):
  """Format the time elapsed since `since` as '<minutes>m <seconds>s'.

  Args:
    since: a timestamp previously obtained from `time.time()`.

  Returns:
    The elapsed wall-clock time, with both parts truncated to integers.
  """
  elapsed = time.time() - since
  minutes = math.floor(elapsed / 60)
  seconds = elapsed - minutes * 60
  return '%dm %ds' % (minutes, seconds)

def exponential_moving_average(values, alpha=0.01):
  """Smooth a series with an exponential moving average.

  The running average is seeded with the first value, then updated as
  `avg = alpha * value + (1 - alpha) * avg` for every value (the first one
  included).

  Args:
    values: iterable of numbers; may be empty.
    alpha: weight given to each new value.

  Returns:
    A list with one smoothed value per input value; `[]` for empty input.
  """
  ema = []
  avg = None
  for value in values:
    if avg is None:
      avg = value  # seed with the first observation
    avg = alpha * value + (1 - alpha) * avg
    ema.append(avg)
  return ema

def evaluate_loss(n_samples=1000):
  """Estimate the mean per-character loss on random validation chunks.

  For each of `n_samples` chunks drawn from `val_file`, the chunk is fed to
  `decoder` one character at a time, the `criterion` losses of all steps are
  summed and divided by the chunk length; the result is the average of these
  per-chunk values.

  Args:
    n_samples: number of random validation chunks to score.

  Returns:
    The average per-character loss as a Python float.

  Raises:
    ValueError: if `n_samples` is not positive.
  """
  if n_samples <= 0:
    raise ValueError("n_samples must be a positive integer")

  total_loss = 0
  with torch.no_grad():  # evaluation only, no gradients needed
    for _ in range(n_samples):
      inp, target = random_training_set(val_file, val_file_len)
      hidden = decoder.init_hidden()
      seq_len = inp.size(0)
      loss = 0
      # Both the character loop and the accumulation must run once per
      # sample, so they are nested inside the sample loop.
      for c in range(seq_len):
        output, hidden = decoder(inp[c], hidden)
        loss += criterion(output, target[c].unsqueeze(0))
      total_loss += loss.item() / seq_len
  return total_loss / n_samples

# Loss, model and optimizer. The model's embedding table and output layer are
# both sized with n_characters.
criterion = nn.CrossEntropyLoss()
decoder = RNN(
  input_size=n_characters,
  hidden_size=hidden_size,
  output_size=n_characters,
  n_layers=n_layers,
).to(device)
optimizer = torch.optim.Adam(decoder.parameters(), lr=lr)

# Loss histories, one entry per training iteration.
train_losses, eval_losses = [], []
print("Entraînement sur le dataset de mots de passe...")

# Each iteration trains on one random chunk, then records a validation loss
# estimated on 50 random validation chunks.
start = time.time()
for epoch in range(1, n_epochs + 1):
  loss = train(*random_training_set(train_file, train_file_len))
  with torch.no_grad():
    eval_loss = evaluate_loss(n_samples=50)
    eval_losses.append(eval_loss)
  # Progress line: elapsed time, iteration, percentage done, last training loss.
  if epoch % 1000 == 0:
    print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / n_epochs * 100, loss))

# Smooth both loss histories before plotting; the raw per-iteration values are
# computed on single random chunks.
smoothed_train_losses = exponential_moving_average(train_losses, alpha=0.01)
smoothed_eval_losses = exponential_moving_average(eval_losses, alpha=0.01)

# Training loss curve.
plt.plot(smoothed_train_losses, label='Loss lissée (EMA)')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Courbe de la perte pendant l\'entraînement (lissée)')
plt.legend()
plt.show()

# Validation loss curve.
plt.plot(smoothed_eval_losses, label='Loss de validation')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Courbe de la perte de validation')
plt.legend()
plt.show()

# Save the trained model. torch.save is given the module object itself, so the
# whole model is pickled (not only its state_dict).
makedirs("models", exist_ok=True)  # no separate existence check needed
torch.save(decoder, f"models/password_rnn_{int(time.time())}.pt")

# Print ten samples, each seeded with one random character of `selected`.
print("\nGénération de mots de passe après entraînement :\n")
for _ in range(10):
  print(evaluate(prime_str=random.choice(selected), temperature=0.7))



Entraînement sur le dataset de mots de passe...
[0m 25s (1000 33%) 3.3694]
[0m 52s (2000 66%) 2.8599]


In [None]:
# Load the trained model.
# NOTE(review): the training cell saves to "models/password_rnn_<timestamp>.pt";
# confirm that the path below is the checkpoint that is meant to be evaluated.
model_path = "models/password_lstm_attention.pt"
if path.exists(model_path):
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
    # weights_only=False is needed to unpickle a whole module object on recent
    # PyTorch versions; unpickling can execute arbitrary code, so only load
    # checkpoint files you trust.
    decoder = torch.load(model_path, map_location=device, weights_only=False).to(device)
    decoder.eval()  # switch the model to evaluation mode
    print("✅ Modèle chargé avec succès !")
else:
    print("❌ Erreur : modèle introuvable.")
    exit()

# Load the validation data.
# `unidecode` is imported as a module, so the transliteration function has to
# be called as unidecode.unidecode(...); calling the module itself raises
# TypeError. The context manager closes the file right after reading.
try:
    with open("sample_data/validation2.txt") as val_fh:
        val_file = unidecode.unidecode(val_fh.read())
    val_file_len = len(val_file)
except FileNotFoundError:
    print("❌ Erreur : Le fichier validation2.txt est introuvable.")
    exit()

# Draw one random validation sample
def validation_set():
    """Return a random (input, target) pair of index tensors from `val_file`.

    This is the same sampling as `random_training_set`, applied to the
    validation text: a window of `chunk_len + 1` characters is cut at a random
    offset, the input drops its last character and the target its first.
    """
    return random_training_set(val_file, val_file_len)

# Evaluation routine
def evaluate_model(num_samples=500):
    """Score `decoder` on random validation samples.

    Args:
        num_samples: number of samples drawn with `validation_set`.

    Returns:
        Tuple `(avg_loss, accuracy, all_losses)`:
        - avg_loss: mean of the per-sample losses (each one is the summed
          `criterion` loss divided by the sample length);
        - accuracy: percentage of steps whose arg-max prediction equals the
          target character;
        - all_losses: list of the per-sample losses.
    """
    per_sample_losses = []
    running_loss = 0
    n_correct = 0
    n_predictions = 0

    with torch.no_grad():  # no gradients are needed for evaluation
        for _ in range(num_samples):
            inp, target = validation_set()
            hidden = decoder.init_hidden()
            seq_len = inp.size(0)

            summed = 0
            for step in range(seq_len):
                expected = target[step]
                output, hidden = decoder(inp[step].unsqueeze(0), hidden)
                summed += criterion(output, expected.unsqueeze(0))

                # Accuracy bookkeeping: compare the most likely class with the target.
                best = output.argmax(dim=1)
                n_correct += (best == expected).sum().item()
                n_predictions += 1

            sample_loss = summed.item() / seq_len
            running_loss += sample_loss
            per_sample_losses.append(sample_loss)

    avg_loss = running_loss / num_samples  # mean loss over the samples
    accuracy = n_correct / n_predictions * 100  # accuracy in percent

    return avg_loss, accuracy, per_sample_losses

# Run the evaluation
val_loss, val_accuracy, all_losses = evaluate_model(num_samples=500)

# The sample count is taken from the returned per-sample losses so that the
# message cannot drift from the value passed above.
print(f"\n📊 Résultats de la validation sur {len(all_losses)} échantillons :")
print(f"   - Validation Loss : {val_loss:.4f}")
print(f"   - Validation Accuracy : {val_accuracy:.2f}%")

# Show the results as a plot
plt.figure(figsize=(12, 5))

# Histogram of the per-sample losses
plt.hist(all_losses, bins=30, color="blue", alpha=0.7)
plt.xlabel("Loss par séquence")
plt.ylabel("Nombre d'occurrences")
plt.title("Distribution des pertes sur l'ensemble de validation")
plt.show()
