In [1]:
import hashlib
import math
import re
import string

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [2]:
def calculate_entropy(password):
    """Estimate the entropy of ``password`` in bits.

    The estimate is ``len(password) * log2(pool)``, where ``pool`` is the sum
    of the sizes of the character classes that occur at least once in the
    password: lowercase (26), uppercase (26), digits (10) and ASCII
    punctuation (32).

    Args:
        password: The password string to score.

    Returns:
        The estimated entropy as a float, or 0 when the password is empty or
        contains none of the recognised character classes.
    """
    pool = 0
    if any(ch.islower() for ch in password):
        pool += 26
    if any(ch.isupper() for ch in password):
        pool += 26
    if any(ch.isdigit() for ch in password):
        pool += 10
    # A pool size of 32 corresponds to the full ASCII punctuation set, so the
    # membership test covers all 32 characters, including the double quote,
    # backslash, backtick and tilde.
    if any(ch in string.punctuation for ch in password):
        pool += 32

    if pool == 0:
        return 0
    return math.log2(pool) * len(password)

In [3]:
def is_complex(password):
    """Report whether ``password`` mixes enough character classes.

    Four classes are considered: lowercase letters, uppercase letters,
    digits and ASCII punctuation. The password is complex when at least
    three of them are present.

    Args:
        password: The password string to inspect.

    Returns:
        True if three or more character classes occur, otherwise False.
    """
    complexity_threshold = 3
    class_checks = (
        str.islower,
        str.isupper,
        str.isdigit,
        # Full ASCII punctuation set, including the double quote, backslash,
        # backtick and tilde.
        lambda ch: ch in string.punctuation,
    )
    complexity = sum(
        1 for check in class_checks if any(check(ch) for ch in password)
    )
    return complexity >= complexity_threshold

In [4]:
def load_common_passwords(filename):
    """Read ``filename`` and return its lines as a list.

    Each line has leading and trailing whitespace removed; blank lines are
    kept as empty strings.
    """
    entries = []
    with open(filename, 'r') as handle:
        for raw_line in handle:
            entries.append(raw_line.strip())
    return entries

def is_common_password(password, common_passwords):
    """Check whether the SHA-256 hex digest of ``password`` is listed.

    Args:
        password: Plain-text password; it is encoded and hashed before the
            comparison.
        common_passwords: Iterable of strings that the digest is compared
            against, entry by entry.

    Returns:
        True if any entry equals the digest, otherwise False.
    """
    digest = hashlib.sha256(password.encode()).hexdigest()
    return any(digest == known for known in common_passwords)


In [5]:
def has_repeating_patterns(password, threshold=3):
    """Detect a substring repeated back-to-back at least ``threshold`` times.

    Args:
        password: The string to search.
        threshold: Minimum number of consecutive occurrences of the same
            substring. Values of 1 or less always yield False.

    Returns:
        True if such a run exists anywhere in ``password``, otherwise False.
    """
    if threshold > 1:
        extra_copies = threshold - 1
        # One (shortest possible) group followed by at least `extra_copies`
        # immediate repetitions of whatever that group matched.
        repeat_re = re.compile(rf"(.+?)\1{{{extra_copies},}}")
        return repeat_re.search(password) is not None
    return False

In [6]:
def has_leet_speak_substitutions(password):
    """Flag passwords containing any character commonly used as a leet substitute.

    The check is purely presence-based: if any substitute character from the
    table below occurs anywhere in the lowercased password, the result is 1,
    regardless of surrounding characters.

    Args:
        password: The password string to inspect.

    Returns:
        1 if at least one substitute character is present, otherwise 0.
    """
    # Letter -> characters that may stand in for it. Only the values are
    # consulted; the keys document which letter each group replaces.
    leet_substitutions = {
        'a': ['4', '@'],
        'b': ['8'],
        'e': ['3'],
        'g': ['9', '6'],
        'i': ['1', '!', '|'],
        'l': ['1', '|', '!'],
        'o': ['0'],
        's': ['$', '5'],
        't': ['7', '+'],
    }

    lowered = password.lower()
    found = any(
        symbol in lowered
        for group in leet_substitutions.values()
        for symbol in group
    )
    return 1 if found else 0

In [7]:
def extract_features(password):
    """Compute length and character-class ratio features for one password.

    This function is applied to every row of the dataset, so it does no file
    I/O and only performs cheap string scans.

    Args:
        password: The password to describe. Non-string values are converted
            with ``str()`` first.

    Returns:
        dict with the keys ``length``, ``uppercase_ratio``,
        ``lowercase_ratio``, ``digit_ratio`` and ``special_ratio``, in that
        order. Each ratio is a character count divided by the password
        length; for an empty password every ratio is 0.0.
    """
    if not isinstance(password, str):
        password = str(password)  # Convert non-string values to string

    length = len(password)
    # Avoid ZeroDivisionError on an empty password: every count is 0 in that
    # case, so dividing by 1 yields the expected 0.0 ratios.
    denominator = length or 1

    return {
        'length': length,
        'uppercase_ratio': sum(c.isupper() for c in password) / denominator,
        'lowercase_ratio': sum(c.islower() for c in password) / denominator,
        'digit_ratio': sum(c.isdigit() for c in password) / denominator,
        # "Special" here means anything outside ASCII letters and digits.
        'special_ratio': len(re.findall(r'[^a-zA-Z0-9]', password)) / denominator,
    }

In [8]:
# Load only the two columns used below: the password text and its strength label.
df = pd.read_csv(
    "data_700 .csv",
    usecols=['password', 'strength'],
    low_memory=False,
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 679769 entries, 0 to 679768
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   password  679769 non-null  object
 1   strength  679769 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 10.4+ MB


In [9]:
# Count missing values per column.
df.isnull().sum()

password    0
strength    0
dtype: int64

In [10]:
# Drop rows with missing values, then exact duplicate rows. The result is
# rebound to `df` instead of mutating the frame in place, which keeps the
# step chainable and avoids in-place surprises on re-execution.
df = df.dropna().drop_duplicates()



In [11]:
# Re-inspect row count and dtypes after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 679140 entries, 0 to 679768
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   password  679140 non-null  object
 1   strength  679140 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 15.5+ MB


In [12]:
# Number of rows per strength label (class distribution).
df['strength'].value_counts()

strength
1    496738
0     99265
2     83137
Name: count, dtype: int64

In [13]:
# TF-IDF vectorizer for the raw password strings, created with library defaults.
# NOTE(review): the defaults tokenize at word level and lowercase the input —
# confirm this is intended for passwords (a character-level analyzer may fit better).
vectorizer = TfidfVectorizer()

In [14]:
# Handcrafted per-password features, one row per password, aligned to df's index.
# The frame is built once from the list of feature dicts; expanding each row
# with `.apply(pd.Series)` creates one Series per row and is far slower on a
# dataset of this size. dtype=float keeps every column float64.
t = pd.DataFrame(
    df['password'].apply(extract_features).tolist(),
    index=df.index,
    dtype=float,
)
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so test-set vocabulary and IDF statistics leak into
# training — consider fitting on the training rows only.
tf = vectorizer.fit_transform(df['password'])
# Design matrix layout: [TF-IDF columns | handcrafted feature columns].
X = hstack([tf, t])
y = df['strength']


In [15]:
# Hold out 40% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=45,
)


In [16]:
# Gradient-boosted classifier with default hyperparameters, fitted on the training split.
model = XGBClassifier()
model.fit(X_train, y_train)

In [17]:
# Persist the fitted model and the fitted vectorizer; both are needed to
# score new passwords, since the model expects the vectorizer's column layout.
joblib.dump(model,'new_train.pkl')
joblib.dump(vectorizer,'new_tfidf.pkl')

['new_tfidf.pkl']

In [18]:
# Predict strength labels for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# Overall fraction of correct predictions on the held-out split.
# NOTE(review): the strength classes are imbalanced (see the value_counts
# output), so accuracy alone may overstate quality — consider per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9989251111700092


In [19]:
def load_leaked_credentials(filename):
    """Read ``filename`` and return the set of its distinct lines.

    Lines are split with ``str.splitlines`` and are not otherwise stripped,
    so surrounding spaces are part of each entry.
    """
    with open(filename, 'r') as handle:
        contents = handle.read()
    return set(contents.splitlines())

def check_leaked_password(password, leaked_credentials):
    """Return True when ``password`` is a member of ``leaked_credentials``.

    The comparison is an exact match on the plain-text password.
    """
    return password in leaked_credentials
# Load the leaked/common password list once, for use by the interactive check.
leaked_credentials = load_leaked_credentials("most common2023.txt")

In [20]:
def check_repeated(password):
    """Report whether the password's leading three characters occur again later.

    Only the prefix is used as the pattern: a repetition that does not
    involve the first three characters is not detected. Overlapping
    occurrences count (e.g. "aaaa"). Passwords with three or fewer
    characters always yield False.

    Args:
        password: The password string to inspect.

    Returns:
        True if the three-character prefix reappears at any later offset,
        otherwise False.
    """
    window = 3  # Length of the prefix that must reappear
    prefix = password[:window]
    last_start = len(password) - window

    # Slide over every later start offset and compare against the prefix.
    return any(
        password[start:start + window] == prefix
        for start in range(1, last_start + 1)
    )

In [None]:
# Interactive check: cheap rule-based rejections first, model prediction last.
test_password = input("Enter the password to test: ")

if not test_password:
    # An empty entry has nothing to score, and length-normalised features
    # are undefined for it, so stop before feature extraction.
    print("No password entered.")
elif check_leaked_password(test_password, leaked_credentials):
    print("The password is found in leaked credentials. It is a very weak password.")
elif check_repeated(test_password):
    print("The password contains a clear repeated pattern. It is weak.")
else:
    # Rebuild the training layout for a single row:
    # [TF-IDF columns | handcrafted feature columns].
    password_frame = pd.DataFrame({"password": [test_password]})
    tfidf = vectorizer.transform(password_frame['password'])
    password_features = password_frame['password'].apply(extract_features).apply(pd.Series)
    pas = hstack([tfidf, password_features])
    predicted_strength = model.predict(pas)
    print(f"Predicted strength for password '{test_password}': {predicted_strength}")


Predicted strength for password 'P$ut#;;;03': [1]
