# In [None]:  (cell marker left over from the notebook export)
from tensorflow import keras
from keras import layers
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import make_interp_spline, BSpline
import os

os.environ['KMP_DUPLICATE_LIB_OK']='True'
from io import open

# Load the password list, one entry per line, into a single 'input' column.
# dtype=str and keep_default_na=False stop pandas from interpreting the text:
# without them all-digit entries can be parsed as numbers (dropping leading
# zeros) and entries such as "NA" or "null" are turned into NaN.
# NOTE(review): the default separator is still ',' and '"' is still treated as
# a quote character — confirm the file contains neither, or read it line by line.
train_data = pd.read_csv(
    "data/Ashley-Madison.txt",
    header=None,
    names=['input'],
    dtype=str,
    keep_default_na=False,
)
train_data['input'] = train_data['input'].astype(str)

print("La taille du dataframe est : ", len(train_data))

def count_words_by_length(filename):
    """Count the whitespace-separated tokens of a text file by length.

    Args:
        filename: Path of the text file to read.

    Returns:
        A dict mapping each token length to the number of tokens having that
        length, with keys in order of first appearance in the file.
    """
    with open(filename, 'r') as handle:
        tokens = handle.read().split()

    tally = {}
    for token in tokens:
        size = len(token)
        tally[size] = tally.get(size, 0) + 1
    return tally

# Tally the whitespace-separated tokens of the raw password file by length.
resultat = count_words_by_length("data/Ashley-Madison.txt")


# (length, count) pairs ordered by ascending length.
sorted_counts = sorted(resultat.items())


# Print the tally as a two-column text table.
print("Longueur  | Nombre de mots")
print("-------------------------")
for length, count in sorted_counts:
    print(f"{length:8} | {count}")

# Character count of every entry, stored as a new 'length' column.
train_data['length'] = train_data['input'].str.len()


# Average entry length over the whole dataframe.
mean_df = train_data["length"].mean()
print("La moyenne des MDP est : ", mean_df)


# Index label of the longest entry (idxmax returns the first one on ties).
index_max_length = train_data['length'].idxmax()


input_max_length = train_data.loc[index_max_length, 'input']

print("L'input le plus long est:", input_max_length, "et contient : ", train_data.loc[index_max_length, 'length'], "caractere")


# Number of entries per length, ordered by length.
word_count_by_length = train_data['length'].value_counts().sort_index()

# Fixed axis ranges: lengths 0-110 on x, counts 0-100000 on y.
plt.xlim(0, 110)
plt.ylim(0, 100000)


# One bar per length.
plt.bar(word_count_by_length.index, word_count_by_length.values)


plt.xlabel('Taille du mot')
plt.ylabel('Nombre de mots')
plt.title('Nombre de mots par taille')


plt.show()


# Flag entries made only of digits.
train_data['digit_only'] = train_data['input'].str.isdigit()


# Flag entries made only of letters.
train_data['letter_only'] = train_data['input'].str.isalpha()

# Flag entries that are not purely alphanumeric, i.e. str.isalnum() is False.
train_data['special'] = ~train_data['input'].str.isalnum()


# True/False frequencies of each flag.
digit_only_counts = train_data['digit_only'].value_counts()
letter_only_counts = train_data['letter_only'].value_counts()
special_counts = train_data['special'].value_counts()


# Report 0 when a flag is never True (the True label is then absent).
print("Nombre de mots contenant uniquement des chiffres :", digit_only_counts[True] if True in digit_only_counts else 0)
print("Nombre de mots contenant uniquement des lettres :", letter_only_counts[True] if True in letter_only_counts else 0)
print("Nombre de mots contenant des caractères spéciaux  :", special_counts[True] if True in special_counts else 0)

# Entries that repeat an earlier one (the first occurrence is not counted).
duplicated_count = train_data['input'].duplicated().sum()
print("Nombre de mots redondants :", duplicated_count)


# Keep only entries of 4 to 20 characters (both bounds inclusive).
# .copy() makes the result an independent frame, so adding columns to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
train_data = train_data[train_data["input"].str.len().between(4, 20)].copy()
print("La taille du dataframe apres modification est : ", len(train_data))


# Target sequence: the entry followed by "\n".
# Must be built before 'input' receives its "\t" prefix below.
train_data["target"] = train_data["input"] + "\n"


# Input sequence: the entry preceded by "\t". Position i of 'target' is
# therefore the character that follows position i of 'input'.
train_data["input"] = "\t" + train_data["input"]


# Boolean mask of the entries containing at least one space.
contains_space = train_data['input'].str.contains(' ')


# Show those rows for inspection.
rows_with_space = train_data[contains_space]
print(rows_with_space)

# Vocabulary: every distinct character of the input and target sequences,
# sorted by code point. Including the targets guarantees that the "\n" end
# marker is part of the vocabulary.
chars = sorted(set("".join(train_data["input"])) | set("".join(train_data["target"])))
vocab_size = len(chars)
print("Total chars:", vocab_size)

# Lookup tables between characters and their integer ids. The helper
# functions of this file (word2indices, indices2word, generate_word, ...)
# read these two module-level names.
char_indices = {char: index for index, char in enumerate(chars)}
indices_char = {index: char for index, char in enumerate(chars)}

# tf.one_hot needs integer indices, so each sequence is encoded as a list of
# character ids rather than a list of characters.
inputs = [[char_indices[char] for char in line] for line in train_data["input"]]
outputs = [[char_indices[char] for char in line] for line in train_data["target"]]

# Sequences have different lengths, hence ragged tensors; one-hot encoding
# adds a trailing axis of size vocab_size.
X_train = tf.one_hot(tf.ragged.constant(inputs), depth=vocab_size)
y_train = tf.one_hot(tf.ragged.constant(outputs), depth=vocab_size)

def onehot2indices(hotmax):
    """Return the argmax over the last axis of ``hotmax`` as a flat NumPy array."""
    best = tf.math.argmax(hotmax, axis=-1)
    return best.numpy().flatten()

def onehot2chars(hotmax):
    """Return the list of ``indices_char`` entries for the argmax indices of ``hotmax``."""
    decoded = []
    for position in onehot2indices(hotmax):
        decoded.append(indices_char[position])
    return decoded

def word2indices(word):
    """Return the list of ``char_indices`` ids for the characters of ``word``."""
    ids = []
    for letter in word:
        ids.append(char_indices[letter])
    return ids

def indices2word(indices):
    """Join the ``indices_char`` entries for ``indices`` into one string."""
    letters = (indices_char[position] for position in indices)
    return ''.join(letters)

# Build the one-hot matrix of a word
def mot_to_onehot(mot, char_indices):
    """Return one one-hot vector per character of ``mot``.

    Args:
        mot: The word to encode.
        char_indices: Mapping from character to class index; its size sets
            the length of each one-hot vector.

    Returns:
        A list with one ``keras.utils.to_categorical`` result per character.
    """
    n_classes = len(char_indices)
    rows = []
    for letter in mot:
        rows.append(keras.utils.to_categorical(char_indices[letter], num_classes=n_classes))
    return rows

# Round-trip demo: one-hot encode a sample word, print the vectors, then
# decode them back to characters.
mot = "salut"
matrice_onehot = mot_to_onehot(mot, char_indices)
print(matrice_onehot)

onehot2chars(matrice_onehot)

def get_rnn():
    """Build, compile and summarize a SimpleRNN character model.

    The stack is: input of shape (None, vocab_size), SimpleRNN with 128 units
    returning the full sequence, then a softmax Dense layer of size
    vocab_size. It is compiled with categorical cross-entropy and Adam.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    stack = (
        keras.Input(shape=(None, vocab_size)),
        layers.SimpleRNN(128, return_sequences=True),
        layers.Dense(vocab_size, activation="softmax"),
    )
    model = keras.Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(loss="categorical_crossentropy", optimizer="adam")
    model.summary()
    return model
# Instantiate the SimpleRNN variant. The builder defined just above is
# get_rnn(); get_lstm() is only defined further down the file.
rnn_model = get_rnn()

def get_lstm():
    """Build, compile and summarize a single-layer LSTM character model.

    The stack is: input of shape (None, vocab_size), LSTM with 128 units
    returning the full sequence, then a softmax Dense layer of size
    vocab_size. It is compiled with categorical cross-entropy and Adam.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    stack = (
        keras.Input(shape=(None, vocab_size)),
        layers.LSTM(128, return_sequences=True),
        layers.Dense(vocab_size, activation="softmax"),
    )
    model = keras.Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(loss="categorical_crossentropy", optimizer="adam")
    model.summary()
    return model

# Single-layer LSTM instance; this is the model trained and sampled below.
lstm_model = get_lstm()

def get_bidirectional_lstm():
    """Build, compile and summarize a bidirectional LSTM character model.

    The stack is: input of shape (None, vocab_size), a Bidirectional wrapper
    around an LSTM with 128 units returning the full sequence, then a softmax
    Dense layer of size vocab_size. It is compiled with categorical
    cross-entropy and Adam.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    stack = (
        keras.Input(shape=(None, vocab_size)),
        layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
        layers.Dense(vocab_size, activation="softmax"),
    )
    model = keras.Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(loss="categorical_crossentropy", optimizer="adam")
    model.summary()
    return model

# Bidirectional LSTM instance (built here, not trained in the visible code).
bidirectional_lstm = get_bidirectional_lstm()

def get_stacked_lstm():
    """Build, compile and summarize a two-layer LSTM character model.

    The stack is: input of shape (None, vocab_size), two LSTM layers with 128
    units each, both returning the full sequence, then a softmax Dense layer
    of size vocab_size. It is compiled with categorical cross-entropy and Adam.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    stack = (
        keras.Input(shape=(None, vocab_size)),
        layers.LSTM(128, return_sequences=True),
        layers.LSTM(128, return_sequences=True),
        layers.Dense(vocab_size, activation="softmax"),
    )
    model = keras.Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(loss="categorical_crossentropy", optimizer="adam")
    model.summary()
    return model
# Two-layer LSTM instance (built here, not trained in the visible code).
stacked_lstm = get_stacked_lstm()

epochs = 30  # Adjust as needed
# Train the single-layer LSTM. Despite its name, lstm_h128_b32_model holds the
# value returned by fit() (the training history), not a model.
lstm_h128_b32_model = lstm_model.fit(X_train, y_train, batch_size=32, epochs=epochs)

# Name used to build the paths of saved model files.
model_name = "lstm_h128_b32"

def save_model(model, file_path, epoch=15):
    """Save ``model`` under ``file_path`` and report where it was written.

    Args:
        model: Object exposing a ``save(path)`` method (a Keras model).
        file_path: Either a model file path ending in ``.h5`` or ``.keras``,
            or a directory. For a directory the file is written as
            ``<file_path>/<directory name>_e<epoch>.h5``.
        epoch: Epoch number embedded in the generated file name. The default
            of 15 keeps the file name this function always produced for the
            ``models/lstm_h128_b32`` directory.
    """
    if file_path.endswith((".h5", ".keras")):
        target = file_path
    else:
        name = os.path.basename(os.path.normpath(file_path))
        target = os.path.join(file_path, f"{name}_e{epoch}.h5")

    # Create the destination directory so the save does not fail on a fresh checkout.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    model.save(target)
    # Report the path actually written, not just the argument received.
    print(f"Model saved to {target}")

# Save the trained LSTM; the path given here is the model's directory.
file_path = 'models/lstm_h128_b32'
save_model(lstm_model, file_path)

def get_model(file=None, epoch=None):
    """Load a saved model, or build a fresh SimpleRNN when no file is given.

    Args:
        file: Model name; the file loaded is ``models/<file>/<file>_e<epoch>.h5``.
            When None, a new model from ``get_rnn()`` is returned instead.
        epoch: Epoch number of the checkpoint to load; required with ``file``.

    Returns:
        The loaded Keras model, or a newly built one.

    Raises:
        ValueError: If ``file`` is given without ``epoch``.
    """
    if file is None:
        # Swap in bidirectional_lstm or lstm_model here to default to another model.
        return get_rnn()
    if epoch is None:
        raise ValueError("epoch is required when loading a saved model")
    return keras.models.load_model("models/{}/{}_e{}.h5".format(file, file, epoch))

# Save the trained LSTM under its model name, tagged with the number of
# training epochs. lstm_model and epochs are the module-level names that
# exist at this point; the directory is created first.
os.makedirs("models/{}".format(model_name), exist_ok=True)
lstm_model.save("models/{}/{}_e{}.h5".format(model_name, model_name, epochs))

def generate_word(model, max_length=20):
    """Sample one word from ``model``, one character at a time.

    Generation starts from the "\t" token. At each step the whole prefix is
    fed to the model and the next character is drawn from the distribution
    predicted at the last position. It stops when "\n" is drawn or after
    ``max_length`` characters.

    Args:
        model: Keras model taking one-hot input with a last axis of size
            vocab_size and returning per-position probabilities.
        max_length: Maximum number of characters to generate. A value of 0 or
            less yields an empty string.

    Returns:
        The generated word, without the start and end tokens.
    """
    out = [char_indices["\t"]]  # Start with \t token
    for _ in range(max_length):
        # One-hot encode the prefix with a leading batch axis of size 1.
        batch = tf.one_hot([out], depth=vocab_size)
        # Probabilities predicted at the last position, kept two-dimensional.
        probs = model.predict(batch, verbose=0)[:, -1, :]
        # tf.random.categorical expects log-probabilities and returns one sample.
        next_id = int(tf.random.categorical(tf.math.log(probs), 1)[0, 0].numpy())
        if indices_char[next_id] == "\n":  # Stop if \n token is predicted
            break
        out.append(next_id)
    return indices2word(out[1:])  # Drop the \t token

# Sample a single word of at most 12 characters from the stacked LSTM.
generate_word(stacked_lstm, 12)

def generate_n_words(model, n, max_length=20):
    """Generate ``n`` words with ``generate_word``, printing progress.

    Args:
        model: Model passed through to ``generate_word``.
        n: Number of words to generate.
        max_length: Maximum length of each word.

    Returns:
        The list of generated words, in generation order.
    """
    words = []
    # Count from 1 so the progress line runs from 1/n to n/n.
    for position in range(1, n + 1):
        # "\r" makes each progress message overwrite the previous one.
        print("Generating word {}/{}".format(position, n), end="\r")
        words.append(generate_word(model, max_length))
    return words

def generate_and_save_words(model, num_words, max_length=20, output_file="generated_words.txt"):
    """Generate ``num_words`` words and write them to ``output_file``, one per line.

    Args:
        model: Model passed through to ``generate_n_words``.
        num_words: Number of words to generate.
        max_length: Maximum length of each word.
        output_file: Destination path; missing parent directories are created.
    """
    generated_words = generate_n_words(model, num_words, max_length)

    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Explicit UTF-8 keeps the output identical on every platform and avoids
    # encode errors on characters the locale's default codec cannot represent.
    with open(output_file, "w", encoding="utf-8") as file:
        file.writelines(word + "\n" for word in generated_words)
    print(f"{num_words} ont été générés et sauvegardés dans {output_file}")

# Generate three word lists of increasing size (10K, 100K and 1M words) with
# the trained LSTM and write each one to its own file.
generate_and_save_words(lstm_model, num_words=10000, max_length=20, output_file="rendu/10K.txt")

generate_and_save_words(lstm_model, num_words=100000, max_length=20, output_file="rendu/100K.txt")

generate_and_save_words(lstm_model, num_words=1000000, max_length=20, output_file="rendu/1M.txt")

def compare_passwords(file1_path, file2_path):
    """Count the distinct lines that two text files have in common.

    Both files are decoded as latin-1 and duplicate lines within a file count
    once.

    Args:
        file1_path: Path of the first file.
        file2_path: Path of the second file.

    Returns:
        The number of distinct lines present in both files.
    """
    unique_lines = []
    for path in (file1_path, file2_path):
        with open(path, "r", encoding="latin-1") as handle:
            unique_lines.append(set(handle.read().splitlines()))

    first, second = unique_lines
    return len(first & second)

# Count how many distinct generated words also appear in the password file.
file1_path = "rendu/100K.txt"
file2_path = "data/Ashley-Madison.txt"
result = compare_passwords(file1_path, file2_path)

print("Nombre de mots de passe identiques :", result)

def calculate_accuracy_and_matches(original_dataset_path, generated_file_path):
    """Measure which distinct generated lines also occur in a reference file.

    Both files are decoded as latin-1 and reduced to their sets of distinct
    lines before comparison.

    Args:
        original_dataset_path: Path of the reference password file.
        generated_file_path: Path of the generated password file.

    Returns:
        A tuple ``(accuracy, matching_passwords)``. ``accuracy`` is the
        percentage of distinct generated lines found in the reference file,
        or 0 when the generated file has no lines. ``matching_passwords`` is
        the set of lines present in both files.
    """
    def _read_unique_lines(path):
        # Whole-file read, then split into a set of distinct lines.
        with open(path, 'r', encoding='latin-1') as handle:
            return set(handle.read().splitlines())

    reference = _read_unique_lines(original_dataset_path)
    candidates = _read_unique_lines(generated_file_path)
    hits = reference & candidates

    # Guard against dividing by zero on an empty generated file.
    if not candidates:
        return 0, hits
    return (len(hits) / len(candidates)) * 100, hits

# Paths to the files
# NOTE(review): the generated file is read from "rendu/e30/", whereas the
# generation calls in this file write to "rendu/" — confirm the intended path.
original_dataset_path = "data/rockyou.txt"     # alternative: "data/Ashley-Madison.txt"
generated_file_path = "rendu/e30/100K.txt"

# Calculate accuracy and get matches
accuracy, matching_passwords = calculate_accuracy_and_matches(original_dataset_path, generated_file_path)
print(f"Accuracy: {accuracy:.2f}%")
print("Matching Passwords Number:", len(matching_passwords))
# Prints the entire set of matches.
print("Matching Passwords:", matching_passwords)
