In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

df_very_weak = pd.read_csv("D:/Etudes/2CS/Stage/PWLDS-main/pwlds_very_weak.csv")
df_weak = pd.read_csv("D:/Etudes/2CS/Stage/PWLDS-main/pwlds_weak.csv")
df_average = pd.read_csv("D:/Etudes/2CS/Stage/PWLDS-main/pwlds_average.csv")
df_strong = pd.read_csv("D:/Etudes/2CS/Stage/PWLDS-main/pwlds_strong.csv")

dataB = pd.concat([
    df_very_weak,
    df_weak,
    df_average,
    df_strong,
], ignore_index=True)
dataB = dataB.rename(columns={"Strength_Level": "strength"})

print("----------Appercu-------")
print(dataB.head())



----------Appercu-------
  Password  strength
0    7hqwv         0
1     cjml         0
2     asuy         0
3    kcyth         0
4     whcq         0


In [4]:
print("---------Nombre de donnees--------")
print(dataB["strength"].value_counts())

---------Nombre de donnees--------
strength
3    2000382
0    2000043
2    2000024
1    2000021
Name: count, dtype: int64


In [6]:
import numpy as np

min_class_size = 50000

# Indices équilibrés
balanced_indices = []

for cls in dataB["strength"].unique():
    cls_indices = dataB[dataB["strength"] == cls].index
    sampled = np.random.choice(cls_indices, size=min_class_size, replace=False)
    balanced_indices.extend(sampled)

# Créer dataset équilibré
data = dataB.loc[balanced_indices].reset_index(drop=True)

print("Taille après équilibrage :", data.shape)
print("----------Informations-------")
print(data.info())

Taille après équilibrage : (200000, 2)
----------Informations-------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Password  200000 non-null  object
 1   strength  200000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.1+ MB
None


In [8]:
# Supprimer les nuls s il y en a
data = data.dropna()

In [12]:
# Choix de l algorithme
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression  
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy.sparse import hstack, csr_matrix
import numpy as np

# ============================
# Préparation des données
# ============================
X = data["Password"]
y = data["strength"]

# Fonction lenTransform : calcule la longueur
def lenTransform(passwords):
    return np.array([len(pw) for pw in passwords]).reshape(-1, 1)

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF (n-grams caractères)
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    max_features=2000
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Longueur des mots de passe
X_train_len = lenTransform(X_train)
X_test_len = lenTransform(X_test)

# Normalisation de la longueur
scaler = StandardScaler()
X_train_len_scaled = scaler.fit_transform(X_train_len)
X_test_len_scaled = scaler.transform(X_test_len)

# Combinaison TF-IDF + feature "len"
X_train_combined = hstack([X_train_tfidf, csr_matrix(X_train_len_scaled)])
X_test_combined = hstack([X_test_tfidf, csr_matrix(X_test_len_scaled)])

# ============================
# Tests des modèles
# ============================

# ===== Logistic Regression =====
print("=== LogisticRegression ===")
clf_lr = LogisticRegression(max_iter=1000, n_jobs=-1, class_weight="balanced")
clf_lr.fit(X_train_combined, y_train)
y_pred_lr = clf_lr.predict(X_test_combined)
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("F1-macro:", f1_score(y_test, y_pred_lr, average="macro"))
print(classification_report(y_test, y_pred_lr))

# ===== Linear SVM =====
print("\n=== Linear SVM ===")
svm = LinearSVC()
svm.fit(X_train_combined, y_train)
y_pred_svm = svm.predict(X_test_combined)
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("F1-macro:", f1_score(y_test, y_pred_svm, average="macro"))
print(classification_report(y_test, y_pred_svm))



# ===== Random Forest =====
print("\n=== RandomForest ===")
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, class_weight="balanced",max_depth=10, random_state=42)
rf.fit(X_train_combined, y_train)
y_pred_rf = rf.predict(X_test_combined)
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("F1-macro:", f1_score(y_test, y_pred_rf, average="macro"))
print(classification_report(y_test, y_pred_rf))

# ===== XGBoost =====
print("\n=== XGBoost ===")
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42
)
xgb.fit(X_train_combined, y_train)
y_pred_xgb = xgb.predict(X_test_combined)
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("F1-macro:", f1_score(y_test, y_pred_xgb, average="macro"))
print(classification_report(y_test, y_pred_xgb))

# ===== Decision Tree =====
print("\n=== Decision Tree ===")
dt = DecisionTreeClassifier(
    max_depth=20,      
    class_weight="balanced",
    random_state=42
)
dt.fit(X_train_combined, y_train)
y_pred_dt = dt.predict(X_test_combined)

print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("F1-macro:", f1_score(y_test, y_pred_dt, average="macro"))
print(classification_report(y_test, y_pred_dt))


=== LogisticRegression ===
Accuracy: 0.92705
F1-macro: 0.9274101714452265
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10001
           1       0.99      0.97      0.98      9992
           2       0.83      0.90      0.87      9916
           3       0.89      0.84      0.86     10091

    accuracy                           0.93     40000
   macro avg       0.93      0.93      0.93     40000
weighted avg       0.93      0.93      0.93     40000


=== Linear SVM ===
Accuracy: 0.928375
F1-macro: 0.928881640464907
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     10001
           1       0.99      0.96      0.97      9992
           2       0.84      0.90      0.87      9916
           3       0.89      0.86      0.87     10091

    accuracy                           0.93     40000
   macro avg       0.93      0.93      0.93     40000
weighted avg       0.93      0.93      0.93 

In [14]:
import joblib

# Sauvegarder Logistic Regression
joblib.dump(clf_lr, "logistic_password_model.pkl")

# Sauvegarder TF-IDF vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

# Sauvegarder scaler
joblib.dump(scaler, "scaler_len.pkl")

['scaler_len.pkl']

In [16]:
# Test
import joblib
from scipy.sparse import hstack, csr_matrix
import numpy as np

# Charger le modèle, vectorizer et scaler
clf_lr = joblib.load("logistic_password_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")
scaler = joblib.load("scaler_len.pkl")

# Exemple de nouveaux mots de passe
new_passwords = ["12345", "AcinetaeU", "ChevalTrottinetteAbricotMaison", "!L5uraA1meLesAffr3uxBurgers","abstemiousy&"]

# TF-IDF
X_new_tfidf = vectorizer.transform(new_passwords)

# Longueur des mots de passe
X_new_len = np.array([len(pw) for pw in new_passwords]).reshape(-1, 1)

# Standardisation
X_new_len_scaled = scaler.transform(X_new_len)

# Combinaison
X_new_combined = hstack([X_new_tfidf, csr_matrix(X_new_len_scaled)])

# Prédiction
y_pred_new = clf_lr.predict(X_new_combined)
print("Résultat :", y_pred_new)


Résultat : [0 2 3 3 2]
