In [1]:
import pandas as pd
import string
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# load datasets

print("Reading datasets...")

# dataset 1: strength is given as text categories and encoded to 0/1/2 below.
# The password column is read as str explicitly: without it, pandas may infer
# digit-only passwords (e.g. "123456", "0000") as numbers, which loses leading
# zeros and makes the `.str.len()` filter further down silently drop them
# (it yields NaN for non-string entries).
df1 = pd.read_csv(
    "datasets/passwords_dataset.csv",
    dtype={"Password": str},
)[
    ["Password", "Strength"]
].rename(columns={
    "Password": "password",
    "Strength": "strength"
})

# categories missing from this map become NaN and are removed by dropna() below
strength_map = {"Weak": 0, "Medium": 1, "Strong": 2}
df1["strength"] = df1["strength"].map(strength_map)

# dataset 2: only the first column (password) and last column (strength) are
# used; malformed rows are skipped. The header is peeked first so that the
# password column can be pinned to str by name, for the same reason as above.
# NOTE(review): strength is presumably already encoded as 0/1/2 here -- the
# astype(int) below requires numeric values; confirm against the file.
df2_path = "datasets/data.csv"
df2_password_col = pd.read_csv(df2_path, nrows=0).columns[0]
df2 = pd.read_csv(
    df2_path,
    on_bad_lines="skip",
    dtype={df2_password_col: str},
)
df2 = df2.iloc[:, [0, -1]]
df2.columns = ["password", "strength"]

# combine datasets, then drop rows with a missing or empty password/label
df = pd.concat([df1, df2], ignore_index=True)
df = df.dropna()
df = df[df["password"].str.len() > 0]

X = df["password"].tolist()
y = df["strength"].astype(int).tolist()

print("Loaded entries:", len(df))


Reading datasets...
Loaded entries: 679639


In [3]:
# generate strong artificial passwords

def generateStrong(n=5000, min_len=10, max_len=16, seed=None):
    """Generate random passwords containing all four character classes.

    Candidates are drawn uniformly from letters, digits and the symbols
    ``!@#$%^&*()-_=+``; a candidate is kept only if it contains at least one
    lowercase letter, one uppercase letter, one digit and one punctuation
    character (rejection sampling).

    Args:
        n: Number of passwords to return. Values <= 0 yield an empty list.
        min_len: Minimum password length (inclusive). Lengths below 4 can
            never satisfy all four class requirements, so the effective
            minimum is ``max(min_len, 4)``.
        max_len: Maximum password length (inclusive).
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible and the global RNG state is
            left untouched. When ``None``, the module-level ``random``
            functions are used (so a prior ``random.seed(...)`` still applies).

    Returns:
        A list of ``n`` password strings.

    Raises:
        ValueError: If no length in ``[min_len, max_len]`` can hold all four
            character classes (previously this looped forever or failed
            inside ``random.randint``).
    """
    if n <= 0:
        return []

    # Four distinct character classes need at least four characters; shorter
    # candidates would always be rejected, so skip them up front.
    shortest = max(min_len, 4)
    if shortest > max_len:
        raise ValueError(
            f"no valid length in [{min_len}, {max_len}]: a password needs at "
            "least 4 characters to contain all four character classes"
        )

    rng = random if seed is None else random.Random(seed)
    chars = string.ascii_letters + string.digits + "!@#$%^&*()-_=+"
    strong_passwords = []

    while len(strong_passwords) < n:
        length = rng.randint(shortest, max_len)
        pw = ''.join(rng.choice(chars) for _ in range(length))

        if (any(c.islower() for c in pw) and
            any(c.isupper() for c in pw) and
            any(c.isdigit() for c in pw) and
            any(c in string.punctuation for c in pw)):
            strong_passwords.append(pw)

    return strong_passwords

# Seed the global RNG so the synthetic passwords -- and therefore the
# train/test split contents and the evaluation report -- are reproducible
# across kernel restarts.
random.seed(42)
strongPwds = generateStrong(n=5000)

# Rebuild X and y from `df` instead of extending them in place, so re-running
# this cell does not append another batch of synthetic samples.
# Synthetic passwords are all labelled 2 ("Strong" in strength_map).
X = df["password"].tolist() + strongPwds
y = df["strength"].astype(int).tolist() + [2] * len(strongPwds)


In [4]:
# train/test splits: 30% of the samples are held out, with a fixed random_state
split = train_test_split(X, y, test_size=0.3, random_state=2)
X_train, X_test, y_train, y_test = split

# vectorize data as counts of character n-grams (1 to 4 characters);
# the vocabulary is fitted on the training split only
print("Vectorizing data...")
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, 4))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# train model
print("Training model...")
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# model eval report on the held-out split
print("\nMODEL EVALUATION:\n")
y_pred = model.predict(X_test_vec)
report = classification_report(y_test, y_pred)
print(report)

Vectorizing data...
Training model...

MODEL EVALUATION:

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     27552
           1       0.99      1.00      0.99    149583
           2       0.99      0.96      0.97     28257

    accuracy                           0.99    205392
   macro avg       0.99      0.98      0.99    205392
weighted avg       0.99      0.99      0.99    205392



In [5]:
# prediction

# display names indexed by encoded strength class (0, 1, 2)
labels = ["Weak", "Medium", "Strong"]

def predict_strength(password):
    """Classify a single password with the fitted vectorizer and model.

    Args:
        password: The password string to score.

    Returns:
        A tuple ``(label, confidence, breakdown)`` where ``label`` is the name
        of the most probable class, ``confidence`` is that class's probability
        as a percentage rounded to 2 decimals, and ``breakdown`` maps each
        class name known to the model to its probability in percent.
    """
    vec = vectorizer.transform([password])
    probs = model.predict_proba(vec)[0]

    # predict_proba columns are ordered like model.classes_, not necessarily
    # 0..len(labels)-1 (e.g. if a class was absent from the training data), so
    # map each column to its name through classes_ instead of by position.
    class_names = [labels[int(c)] for c in model.classes_]

    best = int(probs.argmax())
    conf = float(probs[best]) * 100
    breakdown = {name: float(p * 100) for name, p in zip(class_names, probs)}

    return class_names[best], round(conf, 2), breakdown

In [11]:
# interactive password strength prediction

banner_rule = "-" * 60
print(banner_rule)
print(" " * 18 + "WEAK PASSWORD PREDICTOR")
print(banner_rule)
print()

# Keep prompting until the user submits an empty line (the iterator sentinel).
for pwd in iter(lambda: input("Enter a password (ENTER to exit): "), ""):
    label, conf, all_conf = predict_strength(pwd)
    print()
    print(f"Password: \t{pwd}")
    print(f"Prediction: \t{label}")
    print(f"Confidence: \t{conf}%")
    print("Breakdown:")
    # one line per class with its probability in percent
    for class_name, pct in all_conf.items():
        print(f"\t\t{class_name}: {pct:.2f}%")

------------------------------------------------------------
                  WEAK PASSWORD PREDICTOR
------------------------------------------------------------

