<a href="https://colab.research.google.com/github/maryamelnahas/CSEN711-Project/blob/main/BINF711_MS1_Password_Strength.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np
import math
import os
from google.colab import files


MAX_DICT_SIZE = 50000

# Map for reversing common l33t-speak substitutions
L33T_MAP = {
    '@': 'a', '4': 'a',
    '8': 'b',
    '(': 'c',
    '3': 'e',
    '9': 'g', '6': 'g',
    '1': 'i', '!': 'i', '|': 'i',
    '0': 'o',
    '5': 's', '$': 's',
    '7': 't', '+': 't',
    'z': 's'
}

# Thresholds are based on log2(Guess Count) for mapping.
STRENGTH_THRESHOLDS = {
    'very weak': 15,     # < ~32,000 guesses
    'weak': 25,          # < ~33 million guesses
    'medium': 35,        # < ~34 billion guesses
    'strong': 50,        # < ~1 quadrillion guesses
    'very strong': 50    # > ~1 quadrillion guesses
}


def load_ranked_dictionary(file_name, max_size):

    print(f"Loading ranked dictionary from {file_name}...")
    passwords = {}
    rank = 1
    try:
        with open(file_name, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                if rank > max_size:
                    break

                # Split by tab and take the first item as the password
                password = line.strip().split('\t')[0].lower()

                # Add to dictionary if it's not already there
                if password and password not in passwords:
                    passwords[password] = rank
                    rank += 1

        print(f"Loaded {len(passwords)} unique passwords into ranked dictionary.")
        return passwords
    except FileNotFoundError:
        print(f"Error: File '{file_name}' not found. Using a small placeholder dictionary.")
        return {"password": 1, "123456": 2, "qwerty": 3, "secret": 4, "iloveyou": 5}


def demangle_l33t(password):
    """
    Reverses common l33t-speak substitutions to find a base word.
    """
    password_lower = password.lower()
    demangled = "".join(L33T_MAP.get(char, char) for char in password_lower)
    return demangled

def get_brute_force_entropy(password):
    """
    Calculates the brute-force guess count based on character sets and length.
    """
    charset_size = 0
    if any(c.islower() for c in password): charset_size += 26
    if any(c.isupper() for c in password): charset_size += 26
    if any(c.isdigit() for c in password): charset_size += 10
    # A common symbol set
    if any(c in "!@#$%^&*()_+-=[]{}|;:,.<>?~" for c in password): charset_size += 32

    if charset_size == 0:
        return 1.0

    # Guess Count = (Charset Size) ^ (Password Length)
    return float(charset_size ** len(password))


def calculate_password_complexity(password, dictionary):
    """
    Estimates password complexity (guess count).
    Simulates an attacker choosing the most efficient attack.
    """

    if not password:
        return 1.0

    password_lower = password.lower()

    # --- Initialize a list of possible guess counts ---
    # We will take the MINIMUM of this list as the final complexity.
    # Start with the fallback: a brute-force attack.
    guess_counts = [get_brute_force_entropy(password)]

    # --- Attack 1: Direct Dictionary Attack ---
    if password_lower in dictionary:
        guess_counts.append(dictionary[password_lower])

    # --- Attack 2: Rule-Based Attack (Capitalization) ---
    if password_lower in dictionary and password != password_lower:
        # Cost = dictionary rank * estimated guesses for this rule
        capitalization_guesses = dictionary[password_lower] * 10
        guess_counts.append(capitalization_guesses)

    # --- Attack 3: Rule-Based Attack (L33t-speak) ---
    demangled_password = demangle_l33t(password)
    if demangled_password in dictionary:
        # Cost = dictionary rank * estimated guesses for this rule
        l33t_guesses = dictionary[demangled_password] * 100
        guess_counts.append(l33t_guesses)

    # --- Attack 4: Rule-Based Attack (Reversed) ---
    reversed_password = password_lower[::-1]
    if reversed_password in dictionary:
        reversed_guesses = dictionary[reversed_password] * 50
        guess_counts.append(reversed_guesses)

    # The final complexity is the *lowest* guess count,
    return max(1.0, min(guess_counts)) # Ensure at least 1 guess


def get_strength_rating(guess_count):
    """Maps the final guess count to a human-readable strength rating."""

    try:
        complexity_bits = math.log2(guess_count)
    except ValueError:
        complexity_bits = 0

    if complexity_bits < STRENGTH_THRESHOLDS['very weak']:
        rating = "Very Weak"
    elif complexity_bits < STRENGTH_THRESHOLDS['weak']:
        rating = "Weak"
    elif complexity_bits < STRENGTH_THRESHOLDS['medium']:
        rating = "Medium"
    elif complexity_bits < STRENGTH_THRESHOLDS['strong']:
        rating = "Strong"
    else:
        rating = "Very Strong"

    return f"{rating} (Complexity: {complexity_bits:.2f} bits)"


if __name__ == '__main__':

    uploaded = files.upload()

    if not uploaded:
        print("\n--- ERROR ---")
        print("No file was uploaded. Please re-run the cell and select the file.")
    else:
        actual_filename = list(uploaded.keys())[0]

        print(f"\nUser uploaded file '{actual_filename}' with length {len(uploaded[actual_filename])} bytes")

        dictionary = load_ranked_dictionary(actual_filename, MAX_DICT_SIZE)

        test_passwords = {
            "password": "Very Weak - Direct Dictionary Hit",
            "123456": "Very Weak - Direct Dictionary Hit",
            "Password": "Weak - Capitalization Rule Hit",
            "P@$$w0rd": "Weak/Medium - L33t Rule Hit",
            "drowssap": "Weak/Medium - Reverse Rule Hit",
            "CorrectHorseBatteryStaple": "Strong - Long, unique (Brute-Force Fallback)",
            "MyP@ssw0rd!sUn1qu3": "Very Strong - Long, complex (Brute-Force Fallback)"
        }

        for pwd, description in test_passwords.items():
            guess_count = calculate_password_complexity(pwd, dictionary)

            strength_rating = get_strength_rating(guess_count)

            print(f"\nPassword: '{pwd}'")
            print(f"  Description: {description}")
            print(f"  Est. Guess Count: {guess_count:,.0f}")
            print(f"  Strength: {strength_rating}")

            if 'Very Weak' in strength_rating or 'Weak' in strength_rating:
                print("  This password is too weak and would be rejected. "
                      "It was found using a common dictionary or rule-based attack.")
            elif 'Medium' in strength_rating:
                 print(" This password is acceptable but could be stronger. "
                       "Add more length or unique symbols.")

Saving production.tsv to production (4).tsv

User uploaded file 'production (4).tsv' with length 11818456 bytes
Loading ranked dictionary from production (4).tsv...
Loaded 50000 unique passwords into ranked dictionary.

Password: 'password'
  Description: Very Weak - Direct Dictionary Hit
  Est. Guess Count: 1
  Strength: Very Weak (Complexity: 0.00 bits)
  This password is too weak and would be rejected. It was found using a common dictionary or rule-based attack.

Password: '123456'
  Description: Very Weak - Direct Dictionary Hit
  Est. Guess Count: 1,000,000
  Strength: Weak (Complexity: 19.93 bits)
  This password is too weak and would be rejected. It was found using a common dictionary or rule-based attack.

Password: 'Password'
  Description: Weak - Capitalization Rule Hit
  Est. Guess Count: 1
  Strength: Very Weak (Complexity: 0.00 bits)
  This password is too weak and would be rejected. It was found using a common dictionary or rule-based attack.

Password: 'P@$$w0rd'
  Descr