In [2]:
import pandas as pd
import math
from itertools import tee

Import txt dataset

In [4]:
#used only one time at the start
# df = pd.read_fwf('E:/Rockyou/rockyou.txt', header=None, names=['password'], encoding='latin-1')


convert to csv

In [None]:
#used only one time at the start
# df.to_csv('my_dataframe.csv', index=False)

In [None]:
# print(df.head())

    password
0     123456
1      12345
2  123456789
3   password
4   iloveyou


Import the new CSV file

In [5]:
df = pd.read_csv("rockyou.csv", encoding="latin-1")

In [6]:
print(df.head())

    password
0     123456
1      12345
2  123456789
3   password
4   iloveyou


Extract charactaristic features

num_of_features

In [7]:
#num of letters
df["num_letters"] = df["password"].str.count(r"[A-Za-z]")
df["num_upper"] = df["password"].str.count(r"[A-Z]")
df["num_lower"] = df["password"].str.count(r"[a-z]")
#num of digits 
df["num_digits"] = df["password"].str.count(r"[0-9]")  
#num of special characters
df["num_special_char"] = df["password"].str.count(r"[^A-Za-z0-9]")

In [8]:
print(df.head())

    password  num_letters  num_upper  num_lower  num_digits  num_special_char
0     123456          0.0        0.0        0.0         6.0               0.0
1      12345          0.0        0.0        0.0         5.0               0.0
2  123456789          0.0        0.0        0.0         9.0               0.0
3   password          8.0        0.0        8.0         0.0               0.0
4   iloveyou          8.0        0.0        8.0         0.0               0.0


Boolean_features

In [10]:
#general
df["has_upper"] = df["num_upper"] > 0
df["has_num"] = df["num_digits"] > 0
df["has_special"] = df["num_special_char"] > 0
#first letter
df["first_is_upper"] = df["password"].str.match(r"^[A-Z]")
df["first_is_digit"] = df["password"].str.match(r"^[0-9]")
df["first_is_special"] = df["password"].str.match(r"^[^A-Za-z0-9]")
#last letter
df["last_is_upper"] = df["password"].str[-1].str.match(r"[A-Z]")
df["last_is_digit"] = df["password"].str[-1].str.match(r"[0-9]")
df["last_is_special"] = df["password"].str[-1].str.match(r"[^A-Za-z0-9]")

In [13]:
print(df.head())

    password  num_letters  num_upper  num_lower  num_digits  num_special_char  \
0     123456          0.0        0.0        0.0         6.0               0.0   
1      12345          0.0        0.0        0.0         5.0               0.0   
2  123456789          0.0        0.0        0.0         9.0               0.0   
3   password          8.0        0.0        8.0         0.0               0.0   
4   iloveyou          8.0        0.0        8.0         0.0               0.0   

   has_upper  has_num  has_special first_is_upper first_is_digit  \
0      False     True        False          False           True   
1      False     True        False          False           True   
2      False     True        False          False           True   
3      False    False        False          False          False   
4      False    False        False          False          False   

  first_is_special last_is_upper last_is_digit last_is_special  
0            False         False       

ratio_features

In [None]:
df["ratio_letters"] = df["num_letters"] / df["length"]
df["ratio_uppercase"] = df["num_upper"] / df["length"]
df["ratio_lowercase"] = df["num_lower"] / df["length"]

df["ratio_digits"] = df["num_didgits"] / df["length"]

df["ratio_symbols"] = df["num_special_char"] / df["length"]

In [None]:
# print(df.head())

    password
0     123456
1      12345
2  123456789
3   password
4   iloveyou


entropy_features

functions

In [None]:
def shannon_entropy(pwd):
    if not pwd:
        return 0
    freq = {c: pwd.count(c)/len(pwd) for c in set(pwd)}
    return -sum(p * math.log2(p) for p in freq.values())

def ngrams(seq, n=2):
    a, b = tee(seq)
    for _ in range(1, n):
        b = tee(b)[1]
        next(b, None)
    return zip(*a, *b)

def bigram_entropy(pwd):
    pwd = str(pwd)
    if len(pwd) < 2:
        return 0
    bigrams = list(zip(pwd, pwd[1:]))
    freq = {bg: bigrams.count(bg)/len(bigrams) for bg in set(bigrams)}
    return -sum(p * math.log2(p) for p in freq.values())

sequences = ["abcdefghijklmnopqrstuvwxyz", "0123456789", "qwertyuiop", "asdfghjkl", "zxcvbnm"]
def pattern_entropy(pwd):
    pwd = pwd.lower()
    penalty = 0
    for seq in sequences:
        for i in range(len(seq)-2):
            pattern = seq[i:i+3]
            if pattern in pwd:
                penalty += 1
    return max(0, shannon_entropy(pwd) - penalty*0.2)

keyboard_sequences = ["qwertyuiop", "asdfghjkl", "zxcvbnm"]
def keyboard_entropy(pwd):
    pwd = pwd.lower()
    penalty = 0
    for seq in keyboard_sequences:
        for i in range(len(seq)-2):
            if seq[i:i+3] in pwd:
                penalty += 1
    return max(0, shannon_entropy(pwd) - penalty*0.2)

features

In [None]:
df["shannon_entropy"] = df["password"].apply(shannon_entropy)
df["length_adjusted_entropy"] = df["shannon_entropy"] * df["password"].str.len()
df["bigram_entropy"] = df["password"].apply(bigram_entropy)
df["pattern_entropy"] = df["password"].apply(pattern_entropy)
df["keyboard_entropy"] = df["password"].apply(keyboard_entropy)


In [None]:
# print(df.head())

    password
0     123456
1      12345
2  123456789
3   password
4   iloveyou


PCA combination

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

ent_cols = [
    "shannon_entropy",
    "length_adjusted_entropy",
    "bigram_entropy",
    "pattern_entropy",
    "keyboard_entropy",
]

#check all num
mask = df[ent_cols].notna().all(axis=1)
X = df.loc[mask, ent_cols].astype(float)

#standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#PCA
pca = PCA(n_components=1)
pc1 = pca.fit_transform(X_scaled)  # shape (n_rows, 1)

#add new column

df.loc[mask, "combined_entropy_pca"] = pc1.ravel()
df.loc[~mask, "combined_entropy_pca"] = np.nan

#normaliz
df["combined_entropy_pca_norm"] = (df["combined_entropy_pca"] - df["combined_entropy_pca"].min()) / (
    df["combined_entropy_pca"].max() - df["combined_entropy_pca"].min()
)

In [None]:
# print(df.head())

    password
0     123456
1      12345
2  123456789
3   password
4   iloveyou
