In [63]:
import pandas as pd
import math
from itertools import tee

Import the new CSV file

In [None]:
df = pd.read_csv("rockyou_100k_with_zxcvbn.csv", encoding="utf-8",keep_default_na=False)

In [65]:
df.head()

Unnamed: 0,password
0,ever973crow222
1,WARD23
2,allison012
3,fy-fy92
4,cabot04


In [66]:
df["password"] = df["password"].fillna("").astype(str)
df = df[df["password"].str.len() > 0].reset_index(drop=True)

In [67]:
df.head()

Unnamed: 0,password
0,ever973crow222
1,WARD23
2,allison012
3,fy-fy92
4,cabot04


Extract charactaristic features

num_of_features

In [68]:
# total length
df["length"] = df["password"].astype(str).str.len()
#num of letters
df["num_letters"] = df["password"].str.count(r"[A-Za-z]")
df["num_upper"] = df["password"].str.count(r"[A-Z]")
df["num_lower"] = df["password"].str.count(r"[a-z]")
#num of digits 
df["num_digits"] = df["password"].str.count(r"[0-9]")
#num of special characters
df["num_special_char"] = df["password"].str.count(r"[^A-Za-z0-9]")

In [69]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char
0,ever973crow222,14,8,0,8,6,0
1,WARD23,6,4,4,0,2,0
2,allison012,10,7,0,7,3,0
3,fy-fy92,7,4,0,4,2,1
4,cabot04,7,5,0,5,2,0


Boolean_features

In [70]:
#general
df["has_upper"] = df["num_upper"] > 0
df["has_num"] = df["num_digits"] > 0
df["has_special"] = df["num_special_char"] > 0
# First character
df["first_is_upper"] = df["password"].str.match(r"^[A-Z]")
df["first_is_digit"] = df["password"].str.match(r"^[0-9]")
df["first_is_special"] = df["password"].str.match(r"^[^A-Za-z0-9]")
# Last character
df["last_is_upper"] = df["password"].str[-1].str.match(r"[A-Z]")
df["last_is_digit"] = df["password"].str[-1].str.match(r"[0-9]")
df["last_is_special"] = df["password"].str[-1].str.match(r"[^A-Za-z0-9]")

In [71]:
# Convert all boolean columns to 0/1
bool_cols = ['has_upper', 'has_num', 'has_special', 
             'first_is_upper', 'first_is_digit',
             'first_is_special', 'last_is_upper', 'last_is_digit', 'last_is_special']

df[bool_cols] = df[bool_cols].astype(int)

In [72]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,first_is_upper,first_is_digit,first_is_special,last_is_upper,last_is_digit,last_is_special
0,ever973crow222,14,8,0,8,6,0,0,1,0,0,0,0,0,1,0
1,WARD23,6,4,4,0,2,0,1,1,0,1,0,0,0,1,0
2,allison012,10,7,0,7,3,0,0,1,0,0,0,0,0,1,0
3,fy-fy92,7,4,0,4,2,1,0,1,1,0,0,0,0,1,0
4,cabot04,7,5,0,5,2,0,0,1,0,0,0,0,0,1,0


ratio_features

In [73]:
#letters
df["ratio_letters"] = df["num_letters"] / df["length"]
df["ratio_uppercase"] = df["num_upper"] / df["length"]
df["ratio_lowercase"] = df["num_lower"] / df["length"]
#digits
df["ratio_digits"] = df["num_digits"] / df["length"]
#special
df["ratio_symbols"] = df["num_special_char"] / df["length"]

In [74]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,first_is_digit,first_is_special,last_is_upper,last_is_digit,last_is_special,ratio_letters,ratio_uppercase,ratio_lowercase,ratio_digits,ratio_symbols
0,ever973crow222,14,8,0,8,6,0,0,1,0,...,0,0,0,1,0,0.571429,0.0,0.571429,0.428571,0.0
1,WARD23,6,4,4,0,2,0,1,1,0,...,0,0,0,1,0,0.666667,0.666667,0.0,0.333333,0.0
2,allison012,10,7,0,7,3,0,0,1,0,...,0,0,0,1,0,0.7,0.0,0.7,0.3,0.0
3,fy-fy92,7,4,0,4,2,1,0,1,1,...,0,0,0,1,0,0.571429,0.0,0.571429,0.285714,0.142857
4,cabot04,7,5,0,5,2,0,0,1,0,...,0,0,0,1,0,0.714286,0.0,0.714286,0.285714,0.0


entropy_features

functions

In [75]:
def shannon_entropy(pwd):
    if not pwd:
        return 0
    freq = {c: pwd.count(c)/len(pwd) for c in set(pwd)}
    return -sum(p * math.log2(p) for p in freq.values())

def ngrams(seq, n=2):
    a, b = tee(seq)
    for _ in range(1, n):
        b = tee(b)[1]
        next(b, None)
    return zip(*a, *b)

def bigram_entropy(pwd):
    pwd = str(pwd)
    if len(pwd) < 2:
        return 0
    bigrams = list(zip(pwd, pwd[1:]))
    freq = {bg: bigrams.count(bg)/len(bigrams) for bg in set(bigrams)}
    return -sum(p * math.log2(p) for p in freq.values())

sequences = ["abcdefghijklmnopqrstuvwxyz", "0123456789", "qwertyuiop", "asdfghjkl", "zxcvbnm"]
def pattern_entropy(pwd):
    pwd = pwd.lower()
    penalty = 0
    for seq in sequences:
        for i in range(len(seq)-2):
            pattern = seq[i:i+3]
            if pattern in pwd:
                penalty += 1
    return max(0, shannon_entropy(pwd) - penalty*0.2)

keyboard_sequences = ["qwertyuiop", "asdfghjkl", "zxcvbnm"]
def keyboard_entropy(pwd):
    pwd = pwd.lower()
    penalty = 0
    for seq in keyboard_sequences:
        for i in range(len(seq)-2):
            if seq[i:i+3] in pwd:
                penalty += 1
    return max(0, shannon_entropy(pwd) - penalty*0.2)

features

In [76]:
df["shannon_entropy"] = df["password"].apply(shannon_entropy)
df["length_adjusted_entropy"] = df["shannon_entropy"] * df["password"].str.len()
df["bigram_entropy"] = df["password"].apply(bigram_entropy)
df["pattern_entropy"] = df["password"].apply(pattern_entropy)
df["keyboard_entropy"] = df["password"].apply(keyboard_entropy)

In [77]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,ratio_letters,ratio_uppercase,ratio_lowercase,ratio_digits,ratio_symbols,shannon_entropy,length_adjusted_entropy,bigram_entropy,pattern_entropy,keyboard_entropy
0,ever973crow222,14,8,0,8,6,0,0,1,0,...,0.571429,0.0,0.571429,0.428571,0.0,3.182006,44.548081,3.546594,3.182006,3.182006
1,WARD23,6,4,4,0,2,0,1,1,0,...,0.666667,0.666667,0.0,0.333333,0.0,2.584963,15.509775,2.321928,2.584963,2.584963
2,allison012,10,7,0,7,3,0,0,1,0,...,0.7,0.0,0.7,0.3,0.0,3.121928,31.219281,3.169925,2.921928,3.121928
3,fy-fy92,7,4,0,4,2,1,0,1,1,...,0.571429,0.0,0.571429,0.285714,0.142857,2.235926,15.651484,2.251629,2.235926,2.235926
4,cabot04,7,5,0,5,2,0,0,1,0,...,0.714286,0.0,0.714286,0.285714,0.0,2.807355,19.651484,2.584963,2.807355,2.807355


PCA combination

In [78]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

ent_cols = [
    "shannon_entropy",
    "length_adjusted_entropy",
    "bigram_entropy",
    "pattern_entropy",
    "keyboard_entropy",
]

#check all num
mask = df[ent_cols].notna().all(axis=1)
X = df.loc[mask, ent_cols].astype(float)

#standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#PCA
pca = PCA(n_components=1)
pc1 = pca.fit_transform(X_scaled)  # shape (n_rows, 1)

#add new column

df.loc[mask, "combined_entropy_pca"] = pc1.ravel()
df.loc[~mask, "combined_entropy_pca"] = np.nan

#normaliz
df["combined_entropy_pca_norm"] = (df["combined_entropy_pca"] - df["combined_entropy_pca"].min()) / (
    df["combined_entropy_pca"].max() - df["combined_entropy_pca"].min()
)

In [79]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,ratio_lowercase,ratio_digits,ratio_symbols,shannon_entropy,length_adjusted_entropy,bigram_entropy,pattern_entropy,keyboard_entropy,combined_entropy_pca,combined_entropy_pca_norm
0,ever973crow222,14,8,0,8,6,0,0,1,0,...,0.571429,0.428571,0.0,3.182006,44.548081,3.546594,3.182006,3.182006,3.04277,0.251627
1,WARD23,6,4,4,0,2,0,1,1,0,...,0.0,0.333333,0.0,2.584963,15.509775,2.321928,2.584963,2.584963,-1.077407,0.185687
2,allison012,10,7,0,7,3,0,0,1,0,...,0.7,0.3,0.0,3.121928,31.219281,3.169925,2.921928,3.121928,1.840019,0.232378
3,fy-fy92,7,4,0,4,2,1,0,1,1,...,0.571429,0.285714,0.142857,2.235926,15.651484,2.251629,2.235926,2.235926,-2.33521,0.165557
4,cabot04,7,5,0,5,2,0,0,1,0,...,0.714286,0.285714,0.0,2.807355,19.651484,2.584963,2.807355,2.807355,0.067028,0.204003


Structural Features

Position Features

In [80]:
def calculate_position_spread(pwd):
    """Calculate how spread out each character type is (0-1 scale)."""
    pwd = str(pwd)
    length = len(pwd)
    
    if length == 0:
        return {'digit_spread': 0.0, 'letter_spread': 0.0, 'special_spread': 0.0}
    
    digit_positions = [i for i, c in enumerate(pwd) if c.isdigit()]
    letter_positions = [i for i, c in enumerate(pwd) if c.isalpha()]
    special_positions = [i for i, c in enumerate(pwd) if not c.isalnum()]
    
    def spread(positions):
        n = len(positions)
        
        # Edge cases
        if n <= 1:
            return 0.0  # Single or no characters can't be "spread"
        
        # If all characters are of this type, they're clustered
        if n == length:
            return 0.0
        
        # Calculate actual spread
        actual_spread = (max(positions) - min(positions)) / (n - 1)
        
        # Calculate maximum possible spread for n items in length L
        # Maximum spread happens when items are evenly spaced
        max_possible_spread = (length - 1) / (n - 1)
        
        # Normalize to 0-1
        return min(actual_spread / max_possible_spread, 1.0)
    
    return {
        'digit_spread': spread(digit_positions),
        'letter_spread': spread(letter_positions),
        'special_spread': spread(special_positions)
    }

def calculate_position_centered(pwd):
    """Calculate how centered each character type is (0-1 scale)."""
    pwd = str(pwd)
    length = len(pwd)
    
    if length == 0:
        return {'digit_centered': 0.0, 'letter_centered': 0.0}
    
    digit_positions = [i for i, c in enumerate(pwd) if c.isdigit()]
    letter_positions = [i for i, c in enumerate(pwd) if c.isalpha()]
    
    def centered(positions, length):
        if not positions:
            return 0.0
        avg_pos = sum(positions) / len(positions)
        centered_score = 1 - abs(avg_pos/length - 0.5) * 2
        return max(0.0, centered_score)
    
    return {
        'digit_centered': centered(digit_positions, length),
        'letter_centered': centered(letter_positions, length)
    }

In [81]:
position_spread_df = pd.DataFrame(df['password'].apply(calculate_position_spread).tolist())
df = pd.concat([df, position_spread_df], axis=1)

position_centered_df = pd.DataFrame(df['password'].apply(calculate_position_centered).tolist())
df = pd.concat([df, position_centered_df], axis=1)

In [82]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,bigram_entropy,pattern_entropy,keyboard_entropy,combined_entropy_pca,combined_entropy_pca_norm,digit_spread,letter_spread,special_spread,digit_centered,letter_centered
0,ever973crow222,14,8,0,8,6,0,0,1,0,...,3.546594,3.182006,3.182006,3.04277,0.251627,0.692308,0.769231,0.0,0.785714,0.714286
1,WARD23,6,4,4,0,2,0,1,1,0,...,2.321928,2.584963,2.584963,-1.077407,0.185687,0.2,0.6,0.0,0.5,0.5
2,allison012,10,7,0,7,3,0,0,1,0,...,3.169925,2.921928,3.121928,1.840019,0.232378,0.222222,0.666667,0.0,0.4,0.6
3,fy-fy92,7,4,0,4,2,1,0,1,1,...,2.251629,2.235926,2.235926,-2.33521,0.165557,0.166667,0.666667,0.0,0.428571,0.571429
4,cabot04,7,5,0,5,2,0,0,1,0,...,2.584963,2.807355,2.807355,0.067028,0.204003,0.166667,0.666667,0.0,0.428571,0.571429


Consecutive Character Features

In [83]:
def max_consecutive_counts(pwd):
    """Calculate maximum consecutive runs for all character types."""
    import re
    
    pwd = str(pwd)
    
    patterns = {
        'digit': r'\d+',
        'letter': r'[A-Za-z]+',
        'upper': r'[A-Z]+',
        'lower': r'[a-z]+',
        'special': r'[^A-Za-z0-9]+'
    }
    
    result = {}
    for name, pattern in patterns.items():
        matches = re.findall(pattern, pwd)
        result[f'max_consecutive_{name}'] = max([len(m) for m in matches], default=0)
        result[f'num_consecutive_{name}_runs'] = len(matches)
        if matches:
            result[f'avg_consecutive_{name}_run'] = sum(len(m) for m in matches) / len(matches)
        else:
            result[f'avg_consecutive_{name}_run'] = 0.0
    
    return result


In [84]:
consecutive_df = pd.DataFrame(df['password'].apply(max_consecutive_counts).tolist())
df = pd.concat([df, consecutive_df], axis=1)

In [85]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,avg_consecutive_letter_run,max_consecutive_upper,num_consecutive_upper_runs,avg_consecutive_upper_run,max_consecutive_lower,num_consecutive_lower_runs,avg_consecutive_lower_run,max_consecutive_special,num_consecutive_special_runs,avg_consecutive_special_run
0,ever973crow222,14,8,0,8,6,0,0,1,0,...,4.0,0,0,0.0,4,2,4.0,0,0,0.0
1,WARD23,6,4,4,0,2,0,1,1,0,...,4.0,4,1,4.0,0,0,0.0,0,0,0.0
2,allison012,10,7,0,7,3,0,0,1,0,...,7.0,0,0,0.0,7,1,7.0,0,0,0.0
3,fy-fy92,7,4,0,4,2,1,0,1,1,...,2.0,0,0,0.0,2,2,2.0,1,1,1.0
4,cabot04,7,5,0,5,2,0,0,1,0,...,5.0,0,0,0.0,5,1,5.0,0,0,0.0


Transition Features 

In [86]:
def get_character_type(char):
    """Helper: classify character type."""
    if char.isdigit():
        return 'digit'
    elif char.isalpha():
        return 'upper' if char.isupper() else 'lower'
    else:
        return 'special'

def calculate_transitions(pwd):
    """Calculate all transition-related features."""
    pwd = str(pwd)
    length = len(pwd)
    
    if length < 2:
        return {
            'num_transitions': 0,
            'transition_frequency': 0.0,
            'letter_to_digit': 0,
            'digit_to_letter': 0,
            'to_special': 0,
            'from_special': 0,
            'alternating_pattern_score': 0.0
        }
    
    # Basic transitions
    transitions = 0
    prev_type = get_character_type(pwd[0])
    
    # Specific transition counters
    letter_to_digit = 0
    digit_to_letter = 0
    to_special = 0
    from_special = 0
    
    for char in pwd[1:]:
        current_type = get_character_type(char)
        
        if current_type != prev_type:
            transitions += 1
            
            # Count specific transitions
            if prev_type in ['upper', 'lower'] and current_type == 'digit':
                letter_to_digit += 1
            elif prev_type == 'digit' and current_type in ['upper', 'lower']:
                digit_to_letter += 1
            if current_type == 'special':
                to_special += 1
            if prev_type == 'special':
                from_special += 1
        
        prev_type = current_type
    
    # Alternating pattern score (like a1b2c3)
    alternating_patterns = 0
    for i in range(length - 2):
        type1 = get_character_type(pwd[i])
        type2 = get_character_type(pwd[i+1])
        type3 = get_character_type(pwd[i+2])
        
        if type1 == type3 and type1 != type2:
            alternating_patterns += 1
    
    alternating_score = alternating_patterns / (length - 2) if length > 2 else 0.0
    
    return {
        'num_transitions': transitions,
        'transition_frequency': transitions / (length - 1),
        'letter_to_digit': letter_to_digit,
        'digit_to_letter': digit_to_letter,
        'to_special': to_special,
        'from_special': from_special,
        'alternating_pattern_score': alternating_score
    }

In [87]:
transitions_df = pd.DataFrame(df['password'].apply(calculate_transitions).tolist())
df = pd.concat([df, transitions_df], axis=1)

df['transitions_to_length_ratio'] = df['num_transitions'] / df['length']
df['transitions_to_length_ratio'] = df['transitions_to_length_ratio'].fillna(0)

In [88]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,num_consecutive_special_runs,avg_consecutive_special_run,num_transitions,transition_frequency,letter_to_digit,digit_to_letter,to_special,from_special,alternating_pattern_score,transitions_to_length_ratio
0,ever973crow222,14,8,0,8,6,0,0,1,0,...,0,0.0,3,0.230769,2,1,0,0,0.0,0.214286
1,WARD23,6,4,4,0,2,0,1,1,0,...,0,0.0,1,0.2,1,0,0,0,0.0,0.166667
2,allison012,10,7,0,7,3,0,0,1,0,...,0,0.0,1,0.111111,1,0,0,0,0.0,0.1
3,fy-fy92,7,4,0,4,2,1,0,1,1,...,1,1.0,3,0.5,1,0,1,1,0.2,0.428571
4,cabot04,7,5,0,5,2,0,0,1,0,...,0,0.0,1,0.166667,1,0,0,0,0.0,0.142857


longest_char streak and didgit letter mixing Features

In [89]:
def longest_same_char_streak(pwd):
    """Find longest streak of the same character."""
    pwd = str(pwd)
    if not pwd:
        return 0
    
    max_streak = 1
    current_streak = 1
    
    for i in range(1, len(pwd)):
        if pwd[i] == pwd[i-1]:
            current_streak += 1
            max_streak = max(max_streak, current_streak)
        else:
            current_streak = 1
    
    return max_streak

def digit_letter_mixing_score(pwd):
    """Calculate how mixed digits and letters are (0-1)."""
    pwd = str(pwd)
    
    digit_positions = [i for i, c in enumerate(pwd) if c.isdigit()]
    letter_positions = [i for i, c in enumerate(pwd) if c.isalpha()]
    
    if not digit_positions or not letter_positions:
        return 0.0
    
    # Check if digits and letters are interspersed
    min_digit = min(digit_positions)
    max_digit = max(digit_positions)
    min_letter = min(letter_positions)
    max_letter = max(letter_positions)
    
    # Calculate overlap
    overlap_start = max(min_digit, min_letter)
    overlap_end = min(max_digit, max_letter)
    overlap = max(0, overlap_end - overlap_start)
    
    if overlap <= 0:  # No overlap - completely separated
        return 0.0
    else:  # Some overlap - mixed
        total_span = max(max_digit, max_letter) - min(min_digit, min_letter)
        return overlap / total_span if total_span > 0 else 0.0

In [90]:
df['longest_same_char_streak'] = df['password'].apply(longest_same_char_streak)
df['digit_letter_mixing_score'] = df['password'].apply(digit_letter_mixing_score)

In [91]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,num_transitions,transition_frequency,letter_to_digit,digit_to_letter,to_special,from_special,alternating_pattern_score,transitions_to_length_ratio,longest_same_char_streak,digit_letter_mixing_score
0,ever973crow222,14,8,0,8,6,0,0,1,0,...,3,0.230769,2,1,0,0,0.0,0.214286,3,0.461538
1,WARD23,6,4,4,0,2,0,1,1,0,...,1,0.2,1,0,0,0,0.0,0.166667,1,0.0
2,allison012,10,7,0,7,3,0,0,1,0,...,1,0.111111,1,0,0,0,0.0,0.1,2,0.0
3,fy-fy92,7,4,0,4,2,1,0,1,1,...,3,0.5,1,0,1,1,0.2,0.428571,1,0.0
4,cabot04,7,5,0,5,2,0,0,1,0,...,1,0.166667,1,0,0,0,0.0,0.142857,1,0.0


Check all features until now

In [92]:
for col in df.columns: print(col)

password
length
num_letters
num_upper
num_lower
num_digits
num_special_char
has_upper
has_num
has_special
first_is_upper
first_is_digit
first_is_special
last_is_upper
last_is_digit
last_is_special
ratio_letters
ratio_uppercase
ratio_lowercase
ratio_digits
ratio_symbols
shannon_entropy
length_adjusted_entropy
bigram_entropy
pattern_entropy
keyboard_entropy
combined_entropy_pca
combined_entropy_pca_norm
digit_spread
letter_spread
special_spread
digit_centered
letter_centered
max_consecutive_digit
num_consecutive_digit_runs
avg_consecutive_digit_run
max_consecutive_letter
num_consecutive_letter_runs
avg_consecutive_letter_run
max_consecutive_upper
num_consecutive_upper_runs
avg_consecutive_upper_run
max_consecutive_lower
num_consecutive_lower_runs
avg_consecutive_lower_run
max_consecutive_special
num_consecutive_special_runs
avg_consecutive_special_run
num_transitions
transition_frequency
letter_to_digit
digit_to_letter
to_special
from_special
alternating_pattern_score
transitions_to_leng

Save dataset

In [None]:
# df.to_csv("sample_rockyou_before_zxcvbn.csv", index=False)

zxcvbn

In [94]:
from zxcvbn import zxcvbn
#test
print(zxcvbn("P@ssw0rd123")) 



In [None]:
from zxcvbn import zxcvbn
import numpy as np
import pandas as pd

# make sure get_zxcvbn_features(self) returns a 3-tuple (score, guesses, crack_time)
def get_zxcvbn_features(password):
    try:
        res = zxcvbn(str(password))
        return (
            res.get('score', np.nan),
            res.get('guesses', np.nan),
            res.get('crack_times_seconds', {}).get('offline_slow_hashing_1e4_per_second', np.nan)
        )
    except Exception:
        return (np.nan, np.nan, np.nan)


In [None]:
# 1) produce a clean list of 3-tuples (protect against broken returns)
features = []
for p in df['password'].astype(str):
    t = get_zxcvbn_features(p)
    if isinstance(t, (list, tuple)) and len(t) == 3:
        features.append(tuple(t))
    else:
        # fallback if something unexpected was returned
        features.append((np.nan, np.nan, np.nan))

# 2) make a DataFrame from the list and give it explicit column names
features_df = pd.DataFrame(features, index=df.index,
                           columns=['zxcvbn_score', 'zxcvbn_guesses', 'zxcvbn_crack_time_seconds'])

# 3) safely drop any pre-existing zxcvbn columns to avoid duplication problems
drop_cols = [c for c in ['zxcvbn_score','zxcvbn_guesses','zxcvbn_crack_time_seconds','zxcvbn_log10_guesses'] if c in df.columns]
if drop_cols:
    df = df.drop(columns=drop_cols)

# 4) concat the new features
df = pd.concat([df, features_df], axis=1)

# 5) coerce guesses -> numeric and compute log10 safely
df['zxcvbn_guesses'] = pd.to_numeric(df['zxcvbn_guesses'], errors='coerce')  # NaN if non-numeric
df['zxcvbn_log10_guesses'] = np.log10(df['zxcvbn_guesses'].clip(lower=1))   # clip prevents -inf


In [105]:
print(df[['password','zxcvbn_score','zxcvbn_guesses','zxcvbn_crack_time_seconds','zxcvbn_log10_guesses']].head(10))

         password  zxcvbn_score  zxcvbn_guesses zxcvbn_crack_time_seconds  \
0  ever973crow222           4.0    3.541000e+11                  35410000   
1          WARD23           1.0    2.280000e+04                      2.28   
2      allison012           1.0    3.250000e+04                      3.25   
3         fy-fy92           2.0    1.000000e+07                 1000.0001   
4         cabot04           1.0    5.718000e+05                     57.18   
5     metalgear!2           2.0    1.878200e+06                    187.82   
6      pirouline!           3.0    1.076800e+09                    107680   
7        mach5114           2.0    7.885000e+07                      7885   
8      abner20390           2.0    7.569640e+07                   7569.64   
9       chabakaew           3.0    4.956100e+08                     49561   

   zxcvbn_log10_guesses  
0             11.549126  
1              4.357935  
2              4.511883  
3              7.000000  
4              5.75724

In [None]:
# # Save to new CSV
# df.to_csv("rockyou_100k_with_zxcvbn.csv", index=False)

Extract sampled passwords for PCFG and OMEN

In [107]:
df['password'].astype(str).to_csv(
    "sampled_passwords.txt",
    index=False,
    header=False
)

Merge the new PCFG OMEN csv

In [108]:
pcfg_omen_df = pd.read_csv("pcfg_omen_results.csv")

merged_df = df.merge(
    pcfg_omen_df[['password', 'PCFG_probability', 'OMEN_level']], 
    on='password', 
    how='left'  
)

merged_df.to_csv("rockyou_100k_merged_only.csv", index=False)

In [110]:
merged_df.head(10)

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,alternating_pattern_score,transitions_to_length_ratio,longest_same_char_streak,digit_letter_mixing_score,zxcvbn_score,zxcvbn_guesses,zxcvbn_crack_time_seconds,zxcvbn_log10_guesses,PCFG_probability,OMEN_level
0,ever973crow222,14,8,0,8,6,0,0,1,0,...,0.0,0.214286,3,0.461538,4.0,354100000000.0,35410000.0,11.549126,5.74224e-15,24.0
1,WARD23,6,4,4,0,2,0,1,1,0,...,0.0,0.166667,1,0.0,1.0,22800.0,2.28,4.357935,3.390362e-09,6.0
2,allison012,10,7,0,7,3,0,0,1,0,...,0.0,0.1,2,0.0,1.0,32500.0,3.25,4.511883,2.776556e-09,12.0
3,fy-fy92,7,4,0,4,2,1,0,1,1,...,0.2,0.428571,1,0.0,2.0,10000000.0,1000.0001,7.0,9.319015e-15,6.0
4,cabot04,7,5,0,5,2,0,0,1,0,...,0.0,0.142857,1,0.0,1.0,571800.0,57.18,5.757244,2.807601e-08,9.0
5,metalgear!2,11,9,0,9,1,1,0,1,1,...,0.0,0.181818,1,0.0,2.0,1878200.0,187.82,6.273742,1.814969e-10,17.0
6,pirouline!,10,9,0,9,0,1,0,0,1,...,0.0,0.1,1,0.0,3.0,1076800000.0,107680.0,9.032135,9.075658e-09,16.0
7,mach5114,8,4,0,4,4,0,0,1,0,...,0.0,0.125,2,0.0,2.0,78850000.0,7885.0,7.896802,5.505244e-10,9.0
8,abner20390,10,5,0,5,5,0,0,1,0,...,0.0,0.1,1,0.0,2.0,75696400.0,7569.64,7.879075,3.49549e-11,13.0
9,chabakaew,9,9,0,9,0,0,0,0,0,...,0.0,0.0,1,0.0,3.0,495610000.0,49561.0,8.69514,3.241715e-06,14.0


In [112]:
for col in merged_df.columns: print(col)

password
length
num_letters
num_upper
num_lower
num_digits
num_special_char
has_upper
has_num
has_special
first_is_upper
first_is_digit
first_is_special
last_is_upper
last_is_digit
last_is_special
ratio_letters
ratio_uppercase
ratio_lowercase
ratio_digits
ratio_symbols
shannon_entropy
length_adjusted_entropy
bigram_entropy
pattern_entropy
keyboard_entropy
combined_entropy_pca
combined_entropy_pca_norm
digit_spread
letter_spread
special_spread
digit_centered
letter_centered
max_consecutive_digit
num_consecutive_digit_runs
avg_consecutive_digit_run
max_consecutive_letter
num_consecutive_letter_runs
avg_consecutive_letter_run
max_consecutive_upper
num_consecutive_upper_runs
avg_consecutive_upper_run
max_consecutive_lower
num_consecutive_lower_runs
avg_consecutive_lower_run
max_consecutive_special
num_consecutive_special_runs
avg_consecutive_special_run
num_transitions
transition_frequency
letter_to_digit
digit_to_letter
to_special
from_special
alternating_pattern_score
transitions_to_leng

Omen based features

In [114]:
# Handle missing values
merged_df['OMEN_level'] = merged_df['OMEN_level'].fillna(0).astype(int)

# OMEN numeric features
max_omen = merged_df['OMEN_level'].max() if 'OMEN_level' in merged_df else 25
merged_df['omen_level_norm'] = merged_df['OMEN_level'] / float(max_omen)
merged_df['omen_log10'] = np.log10(merged_df['OMEN_level'] + 1)  # add 1 to avoid log(0)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [115]:
print(merged_df[['password','OMEN_level', 'omen_level_norm', 'omen_log10']].head())

         password  OMEN_level  omen_level_norm  omen_log10
0  ever973crow222          24         0.558140    1.397940
1          WARD23           6         0.139535    0.845098
2      allison012          12         0.279070    1.113943
3         fy-fy92           6         0.139535    0.845098
4         cabot04           9         0.209302    1.000000


PCFG based features

In [116]:
# Handle missing values
merged_df['PCFG_probability'] = merged_df['PCFG_probability'].fillna(1e-50)  # avoid log(0)

# PCFG log features
merged_df['pcfg_log10_prob'] = np.log10(merged_df['PCFG_probability'])
merged_df['pcfg_neglog10_prob'] = -merged_df['pcfg_log10_prob']  # “surprisal” measure

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [117]:
print(merged_df[['password', 'PCFG_probability', 'pcfg_log10_prob', 'pcfg_neglog10_prob',]].head())

         password  PCFG_probability  pcfg_log10_prob  pcfg_neglog10_prob
0  ever973crow222      5.742240e-15       -14.240919           14.240919
1          WARD23      3.390362e-09        -8.469754            8.469754
2      allison012      2.776556e-09        -8.556494            8.556494
3         fy-fy92      9.319015e-15       -14.030630           14.030630
4         cabot04      2.807601e-08        -7.551665            7.551665


In [118]:
for col in merged_df.columns: print(col)

password
length
num_letters
num_upper
num_lower
num_digits
num_special_char
has_upper
has_num
has_special
first_is_upper
first_is_digit
first_is_special
last_is_upper
last_is_digit
last_is_special
ratio_letters
ratio_uppercase
ratio_lowercase
ratio_digits
ratio_symbols
shannon_entropy
length_adjusted_entropy
bigram_entropy
pattern_entropy
keyboard_entropy
combined_entropy_pca
combined_entropy_pca_norm
digit_spread
letter_spread
special_spread
digit_centered
letter_centered
max_consecutive_digit
num_consecutive_digit_runs
avg_consecutive_digit_run
max_consecutive_letter
num_consecutive_letter_runs
avg_consecutive_letter_run
max_consecutive_upper
num_consecutive_upper_runs
avg_consecutive_upper_run
max_consecutive_lower
num_consecutive_lower_runs
avg_consecutive_lower_run
max_consecutive_special
num_consecutive_special_runs
avg_consecutive_special_run
num_transitions
transition_frequency
letter_to_digit
digit_to_letter
to_special
from_special
alternating_pattern_score
transitions_to_leng

Save dataset before adding target column

In [120]:
# Save updated dataset
merged_df.to_csv("rockyou_100k_PCFG_OMEN_no_target.csv", index=False)

Target label

In [8]:
import pandas as pd
import numpy as np

# Load
df = pd.read_csv("rockyou_100k_PCFG_OMEN_no_target.csv", encoding="utf-8", keep_default_na=False)

cols = ['zxcvbn_log10_guesses', 'omen_log10', 'pcfg_neglog10_prob']

# 1) Convert to numeric (coerce bad strings -> NaN)
for c in cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

# 2) Replace -inf / +inf with NaN, then decide on fill/cap strategy
#    (np.isfinite is True for finite numbers, False for NaN/inf)
finite_mask = df[cols].applymap(np.isfinite)
# Optional: inspect how many infs/NaNs
print("Per-column finite counts:\n", finite_mask.sum())

# 3) Replace NaN with a conservative low value (e.g., 0 = very weak)
#    but first handle +inf specially: cap them to CAP_LOG10
CAP_LOG10 = 16.0   # cap infinite or extremely large log10(guesses) to 1e16 guesses (tune as needed)

# Replace +inf with CAP_LOG10, -inf with 0
for c in cols:
    # set posinf -> CAP_LOG10, neginf -> 0
    posinf_mask = np.isposinf(df[c])
    neginf_mask = np.isneginf(df[c])
    if posinf_mask.any():
        df.loc[posinf_mask, c] = CAP_LOG10
    if neginf_mask.any():
        df.loc[neginf_mask, c] = 0.0

# Now coerce any remaining NaN -> 0 (treat missing estimator as "no strength estimate")
df[cols] = df[cols].fillna(0.0)

# 4) Build the conservative combined log10 guesses target:
#    use the MAX across estimators (defensive / conservative choice discussed earlier).
df['target_log10_guesses'] = df[cols].max(axis=1)

# 5) Create a continuous 1-10 strength score via linear scaling:
#    choose min and max bounds for mapping. Using e.g. MIN_LOG10=2, MAX_LOG10=CAP_LOG10.
MIN_LOG10 = 2.0   # 10^2 guesses -> very weak
MAX_LOG10 = CAP_LOG10  # matches cap above

def log10_to_score_continuous(log10v, min_log=MIN_LOG10, max_log=MAX_LOG10):
    # clamp
    v = float(log10v)
    if not np.isfinite(v):
        v = min_log
    v = max(min_log, min(max_log, v))
    # linear scale into 1..10
    score = 1.0 + 9.0 * (v - min_log) / (max_log - min_log)
    return score

df['target_score_continuous'] = df['target_log10_guesses'].apply(log10_to_score_continuous)


  df = pd.read_csv("rockyou_100k_PCFG_OMEN_no_target.csv", encoding="utf-8", keep_default_na=False)
  finite_mask = df[cols].applymap(np.isfinite)


Per-column finite counts:
 zxcvbn_log10_guesses    99994
omen_log10              99793
pcfg_neglog10_prob      99737
dtype: int64


In [9]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,zxcvbn_crack_time_seconds,zxcvbn_log10_guesses,PCFG_probability,OMEN_level,omen_level_norm,omen_log10,pcfg_log10_prob,pcfg_neglog10_prob,target_log10_guesses,target_score_continuous
0,ever973crow222,14,8,0,8,6,0,0,1,0,...,35410000.0,11.549126,5.74224e-15,24,0.55814,1.39794,-14.240919,14.240919,14.240919,8.869162
1,WARD23,6,4,4,0,2,0,1,1,0,...,2.28,4.357935,3.390362e-09,6,0.139535,0.845098,-8.469754,8.469754,8.469754,5.159128
2,allison012,10,7,0,7,3,0,0,1,0,...,3.25,4.511883,2.776556e-09,12,0.27907,1.113943,-8.556494,8.556494,8.556494,5.214889
3,fy-fy92,7,4,0,4,2,1,0,1,1,...,1000.0001,7.0,9.319015e-15,6,0.139535,0.845098,-14.03063,14.03063,14.03063,8.733976
4,cabot04,7,5,0,5,2,0,0,1,0,...,57.18,5.757244,2.807601e-08,9,0.209302,1.0,-7.551665,7.551665,7.551665,4.568927


In [10]:
for col in df.columns: print(col)

password
length
num_letters
num_upper
num_lower
num_digits
num_special_char
has_upper
has_num
has_special
first_is_upper
first_is_digit
first_is_special
last_is_upper
last_is_digit
last_is_special
ratio_letters
ratio_uppercase
ratio_lowercase
ratio_digits
ratio_symbols
shannon_entropy
length_adjusted_entropy
bigram_entropy
pattern_entropy
keyboard_entropy
combined_entropy_pca
combined_entropy_pca_norm
digit_spread
letter_spread
special_spread
digit_centered
letter_centered
max_consecutive_digit
num_consecutive_digit_runs
avg_consecutive_digit_run
max_consecutive_letter
num_consecutive_letter_runs
avg_consecutive_letter_run
max_consecutive_upper
num_consecutive_upper_runs
avg_consecutive_upper_run
max_consecutive_lower
num_consecutive_lower_runs
avg_consecutive_lower_run
max_consecutive_special
num_consecutive_special_runs
avg_consecutive_special_run
num_transitions
transition_frequency
letter_to_digit
digit_to_letter
to_special
from_special
alternating_pattern_score
transitions_to_leng

In [12]:
# Save updated dataset
df.to_csv("rockyou_100k_PCFG_OMEN_target.csv", index=False)