In [1]:
import hashlib
import random
import string
import pandas as pd

# Define the hashing algorithms to test
hashing_algorithms = {
    "MD5": hashlib.md5,
    "SHA-1": hashlib.sha1,
    "SHA-256": hashlib.sha256,
    "SHA3-256": hashlib.sha3_256,
    "Blake2b": hashlib.blake2b
}

# Function to generate a random string
def generate_random_string(length=64):
    """
    Generates a random string of the specified length.
    """
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

# Function to hash a string
def hash_string(data, algorithm):
    """
    Hashes the input data using the specified algorithm.
    """
    hasher = algorithm()
    hasher.update(data.encode('utf-8'))
    return hasher.digest()  # Full binary hash

# Function to truncate the hash
def truncate_hash(full_hash, bits):
    """
    Truncates the full hash to a specified number of bits.
    """
    truncated_bytes = bits // 8  # Convert bits to bytes
    return full_hash[:truncated_bytes].hex()

# Function to calculate the Hamming Distance
def calculate_hamming_distance(hash1, hash2):
    """
    Calculates the Hamming Distance between two binary hash outputs.
    """
    if len(hash1) != len(hash2):
        raise ValueError("Hashes must be the same length for Hamming Distance calculation.")
    bit_diff = sum(bin(byte1 ^ byte2).count('1') for byte1, byte2 in zip(hash1, hash2))
    return bit_diff

# Parameters for the test
num_samples = 100000  # Number of random strings to generate
truncated_bits = 64   # Number of bits for truncated hash space

# Initialize results
collision_results = []

# Perform the collision test and calculate Hamming Distances
for algo_name, algo_func in hashing_algorithms.items():
    hash_map = {}
    for _ in range(num_samples):
        random_string = generate_random_string()
        full_hash = hash_string(random_string, algo_func)
        truncated_h = truncate_hash(full_hash, truncated_bits)
        
        if truncated_h in hash_map:
            # Collision found
            original_hash = hash_map[truncated_h]
            hamming_distance = calculate_hamming_distance(full_hash, original_hash)
            collision_results.append({
                "Algorithm": algo_name,
                "Truncated Bits": truncated_bits,
                "Collision Found": True,
                "Hamming Distance": hamming_distance,
                "Colliding Input": random_string
            })
            break
        else:
            hash_map[truncated_h] = full_hash

# Store results in a DataFrame
collision_df = pd.DataFrame(collision_results)

# Calculate summary statistics for Hamming Distances
hamming_summary = collision_df.groupby("Algorithm").agg({
    "Hamming Distance": ["mean", "std", "min", "max"]
}).reset_index()
hamming_summary.columns = ["Algorithm", "Mean Hamming Distance", "Std Dev", "Min Hamming Distance", "Max Hamming Distance"]

# Save results to CSV for later use (optional)
collision_df.to_csv("hamming_distance_collisions.csv", index=False)
hamming_summary.to_csv("hamming_distance_summary.csv", index=False)

collision_df.head(15)


KeyError: 'Algorithm'