In [2]:
import pandas as pd
import numpy as np
import re
import hashlib
import math
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix  # Importing the necessary library

In [3]:
# Function to calculate TF-IDF features
def tfidf_features(passwords):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(passwords)
    return tfidf_matrix

In [4]:
def calculate_entropy(password):
    charset = 0
    charset += 26 if any(char.islower() for char in password) else 0
    charset += 26 if any(char.isupper() for char in password) else 0
    charset += 10 if any(char.isdigit() for char in password) else 0
    charset += 32 if bool(re.search("[!@#$%^&*()_+\-=\[\]{};:'|,.<>\/?]", password)) else 0
    
    if charset > 0:
        entropy = math.log2(charset) * len(password)
        return entropy
    else:
        return 0

In [5]:
def is_complex(password):
    complexity_threshold = 3
    complexity = 0
    complexity += 1 if any(char.islower() for char in password) else 0
    complexity += 1 if any(char.isupper() for char in password) else 0
    complexity += 1 if any(char.isdigit() for char in password) else 0
    complexity += 1 if bool(re.search("[!@#$%^&*()_+\-=\[\]{};:'|,.<>\/?]", password)) else 0

    return complexity >= complexity_threshold

In [6]:
# Function to load common passwords
def load_common_passwords(filename):
    with open(filename, 'r') as file:
        common_passwords = [line.strip() for line in file]
    return common_passwords

# Function to check if password is common
def is_common_password(password, common_passwords):
    hashed_password = hashlib.sha256(password.encode()).hexdigest()
    for common_pwd in common_passwords:
        if hashed_password == common_pwd:
            return True
    return False

In [7]:
# Function to check for repeating patterns
def has_repeating_patterns(password, threshold=3):
    if threshold <= 1:
        return False
    
    # Construct the regular expression pattern with the given threshold
    pattern = re.compile(r"(.+?)\1{" + str(threshold - 1) + r",}")
    
    # Search for repeating patterns in the password
    return bool(pattern.search(password))

In [8]:
# Function to check for Leet Speak substitutions
def has_leet_speak_substitutions(password):
    leet_substitutions = {
        'a': ['4', '@'],
        'b': ['8'],
        'e': ['3'],
        'g': ['9', '6'],
        'i': ['1', '!', '|'],
        'l': ['1', '|', '!'],
        'o': ['0'],
        's': ['$', '5'],
        't': ['7', '+']
        # Add more substitutions as needed
    }

    for char, substitutions in leet_substitutions.items():
        if any(sub in password.lower() for sub in substitutions):
            return 1  # 1 indicates the presence of Leet Speak substitutions
    
    return 0  # 0 indicates no Leet Speak substitutions

In [9]:
def load_leaked_credentials(filename):
    with open(filename, 'r') as file:
        leaked_credentials = set(file.read().splitlines())
    return leaked_credentials

# Function to check if password is in leaked credentials
def check_leaked_password(password, leaked_credentials):
    if password in leaked_credentials:
        return True
    return False

leaked_credentials = load_leaked_credentials("most common2023.txt")

# Function to check for repeated patterns
def check_repeated(password):
    threshold = 3  # Minimum length of repeated pattern
    pattern = password[:threshold]  # Initial pattern to compare

    for i in range(1, len(password) - threshold + 1):
        # Check if the next substring matches the initial pattern
        if password[i:i+threshold] == pattern:
            return True

    return False

In [12]:
# Load data
df = pd.read_csv("data_700 .csv", usecols=['password', 'strength'], low_memory=False)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Extract features
X = df['password'].apply(extract_features).apply(pd.Series)
y = df['strength']

# Split data into training and testing sets

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform on training data
X_train_tfidf = vectorizer.fit_transform()

# Transform testing data


# Train XGBoost classifier
model = XGBClassifier()
model.fit(X_train_tfidf, y_train)

# Save vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

# Save model
joblib.dump(model, 'xgboost_model.pkl')


['xgboost_model.pkl']

In [10]:
from scipy.sparse import hstack

# Input password from user
test_password = input("Enter the password to test: ")

# Check if password is in leaked credentials
if check_leaked_password(test_password, load_leaked_credentials("most common2023.txt")):
    print("The password is found in leaked credentials. It is a very weak password.")
elif check_repeated(test_password):
    print("The password contains a clear repeated pattern. It is weak.")
else:
    # Extract TF-IDF features for the input password
    password_tfidf = tfidf_features([test_password])
    
    # Ensure that the input data has the correct shape
    if password_tfidf.shape[1] != X_train_tfidf.shape[1]:
        # If the number of features doesn't match, create a sparse matrix of zeros to match the shape
        padding = csr_matrix((password_tfidf.shape[0], X_train_tfidf.shape[1] - password_tfidf.shape[1]))
        password_tfidf = hstack([password_tfidf, padding])
    
    # Predict strength for the input password
    predicted_strength = model.predict(password_tfidf)
    
    # Map predicted class labels to password complexity categories
    complexity_map = {0: "Weak", 1: "Medium", 2: "Strong"}
    predicted_complexity = complexity_map[predicted_strength[0]]  # Assuming predicted_strength is a list with one element
    
    print(f"Predicted complexity for password '{test_password}': {predicted_complexity}")



Predicted complexity for password 'clknqh[n3ml1': Strong
