In [107]:
# Automating the creation of test data paths
def generate_file_paths(base_path, file_name_pattern, start, end, extension):
    """
    Generates a list of file paths with a specified pattern.
    
    Args:
    - base_path (str): The base directory path.
    - file_name_pattern (str): The pattern of the file name with placeholders for numbering.
    - start (int): The starting number for file names.
    - end (int): The ending number for file names.
    - extension (str): The file extension.
    
    Returns:
    - List of file paths.
    """
    return [f"{base_path}/{file_name_pattern.format(str(i).zfill(4))}.{extension}" for i in range(start, end + 1)]

# Automating GIF files
sample_gif_files = [
    f"input_data/sample_gif/SampleGIFImage_{size}kbmb.gif"
    for size in [40, 135, 350]
]

# Automating JPG files
sample_jpg_files = [
    f"input_data/sample_jpg/SampleJPGImage_{size}mbmb.jpg"
    for size in [1, 2, 5, 10, 15, 20, 30]
] + [
    f"input_data/sample_jpg/SampleJPGImage_{size}kbmb.jpg"
    for size in [50, 100, 200, 500]
]

# Automating PDF files
sample_pdf_files = [f"input_data/sample_pdf/SamplePDFFile_{size}mb.pdf" for size in [5]]

# Automating PNG files
sample_png_files = [
    f"input_data/sample_png/SamplePNGImage_{size}mbmb.png"
    for size in [1, 3, 5, 10, 20, 30]
] + [
    f"input_data/sample_png/SamplePNGImage_{size}kbmb.png"
    for size in [100, 200, 500]
]

# Automating PPT files
sample_ppt_files = [
    f"input_data/sample_ppt/SamplePPTFile_{size}kb.ppt"
    for size in [500, 1000]
]

# Automating Text files
sample_text_files = [
    f"input_data/sample_text/text_{size}.txt"
    for size in [10, 100, 1000, 10000, 100000, 1000000, 10000000]
]

# Automating Video files
video_resolutions = ['360x240', '640x360', '720x480', '1280x720']
video_sizes = [1, 2, 5, 10, 20, 30]
sample_video_files = [
    f"input_data/sample_video/SampleVideo_{res}_{size}mb.mp4"
    for res in video_resolutions
    for size in video_sizes
]

# Automating ZIP files
sample_zip_files = [
    f"input_data/sample_zip/SampleZIPFile_{size}mbmb.zip"
    for size in [10, 20, 30, 50, 100]
]

# Automating Vectors
vectors = generate_file_paths(
    base_path="input_data/vectors",
    file_name_pattern="byte{}",
    start=0,
    end=195,
    extension="dat"
)

In [108]:
import hashlib
import os
import math
import pandas as pd

# Define the hashing algorithms to test
hashing_algorithms = {
    "MD5": hashlib.md5,
    "SHA-1": hashlib.sha1,
    "SHA-256": hashlib.sha256,
    "SHA3-256": hashlib.sha3_256,
    "Blake2b": hashlib.blake2b
}

# Function to calculate entropy of a hash output
def calculate_entropy(data):
    """
    Calculates the Shannon entropy of the given data.
    """
    if not data:
        return 0
    frequency = {}
    for byte in data:
        frequency[byte] = frequency.get(byte, 0) + 1
    total_bytes = len(data)
    entropy = -sum((freq / total_bytes) * math.log2(freq / total_bytes) for freq in frequency.values())
    return entropy

# Function to hash a file with the given algorithm
def hash_file(file_path, algorithm):
    """
    Hashes the contents of a file using the specified algorithm.
    """
    hasher = algorithm()
    with open(file_path, 'rb') as file:
        while chunk := file.read(8192):  # Read file in chunks
            hasher.update(chunk)
    return hasher.digest()

# Automate vector file generation
def generate_file_paths(base_path, file_name_pattern, start, end, extension):
    """
    Generates a list of file paths with a specified pattern.
    """
    return [f"{base_path}/{file_name_pattern.format(str(i).zfill(4))}.{extension}" for i in range(start, end + 1)]

# Generate file paths for the vectors
vectors = generate_file_paths(
    base_path="input_data/vectors",
    file_name_pattern="byte{}",
    start=0,
    end=195,
    extension="dat"
)

# Initialize a list to store the results
results = []

# Perform hashing and entropy tests
for file_path in vectors:
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    for algo_name, algo_func in hashing_algorithms.items():
        hash_output = hash_file(file_path, algo_func)
        entropy = calculate_entropy(hash_output)
        results.append({
            "File": os.path.basename(file_path),
            "Algorithm": algo_name,
            "Hash": hash_output.hex(),
            "Entropy": entropy
        })

# Store results in a DataFrame for analysis
df = pd.DataFrame(results)

# Group by algorithm and calculate the average entropy for each algorithm
grouped_df = df.groupby("Algorithm").mean(numeric_only=True)["Entropy"].reset_index()
grouped_df.rename(columns={"Entropy": "Average Entropy"}, inplace=True)

# Save to CSV for later use (optional)
output_csv = "hash_entropy_results.csv"
df.to_csv(output_csv, index=False)

# Save the average entropy DataFrame to another CSV (optional)
average_entropy_csv = "average_entropy_results.csv"
grouped_df.to_csv(average_entropy_csv, index=False)


In [109]:
df.head(10)

Unnamed: 0,File,Algorithm,Hash,Entropy
0,byte0000.dat,MD5,d41d8cd98f00b204e9800998ecf8427e,4.0
1,byte0000.dat,SHA-1,da39a3ee5e6b4b0d3255bfef95601890afd80709,4.321928
2,byte0000.dat,SHA-256,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b93...,4.9375
3,byte0000.dat,SHA3-256,a7ffc6f8bf1ed76651c14756a061d662f580ff4de43b49...,4.8125
4,byte0000.dat,Blake2b,786a02f742015903c6c6fd852552d272912f4740e15847...,5.8125
5,byte0001.dat,MD5,c3e97dd6e97fb5125688c97f36720cbe,3.75
6,byte0001.dat,SHA-1,3cdf2936da2fc556bfa533ab1eb59ce710ac80e5,4.321928
7,byte0001.dat,SHA-256,09fc96082d34c2dfc1295d92073b5ea1dc8ef8da95f14d...,4.9375
8,byte0001.dat,SHA3-256,5ecdbae446010644dd235353f132c03fa21a1e6020a86e...,4.875
9,byte0001.dat,Blake2b,388a507aa909e01f549b7fd8e6094b0438e8a1ecc4db0d...,5.738205


In [110]:
grouped_df.head(10)

Unnamed: 0,Algorithm,Average Entropy
0,Blake2b,5.76381
1,MD5,3.941964
2,SHA-1,4.238822
3,SHA-256,4.888266
4,SHA3-256,4.866534


In [111]:
import hashlib
import os
import pandas as pd
import numpy as np

# Define the hashing algorithms to test
hashing_algorithms = {
    "MD5": hashlib.md5,
    "SHA-1": hashlib.sha1,
    "SHA-256": hashlib.sha256,
    "SHA3-256": hashlib.sha3_256,
    "Blake2b": hashlib.blake2b
}

# Function to hash a file with the given algorithm
def hash_file(file_path, algorithm):
    """
    Hashes the contents of a file using the specified algorithm.
    """
    hasher = algorithm()
    with open(file_path, 'rb') as file:
        while chunk := file.read(8192):  # Read file in chunks
            hasher.update(chunk)
    return hasher.digest()

# Function to calculate byte frequencies
def calculate_byte_frequencies(hash_output):
    """
    Calculates the frequency of each byte (0-255) in the hash output.
    """
    frequencies = np.zeros(256, dtype=int)  # Initialize an array for byte frequencies
    for byte in hash_output:
        frequencies[byte] += 1
    return frequencies

# Combine all file paths (replace with your full file list if needed)
all_files = vectors  # Use the vector paths or other file groups

# Initialize a list to store results
uniform_results = []

# Perform uniform distribution tests
for file_path in all_files:
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    for algo_name, algo_func in hashing_algorithms.items():
        hash_output = hash_file(file_path, algo_func)
        byte_frequencies = calculate_byte_frequencies(hash_output)
        # Measure uniformity: standard deviation of frequencies (lower is better)
        std_dev = np.std(byte_frequencies)
        uniform_results.append({
            "File": os.path.basename(file_path),
            "Algorithm": algo_name,
            "Standard Deviation": std_dev
        })

# Store results in a DataFrame for analysis
uniform_df = pd.DataFrame(uniform_results)

# Group by algorithm to calculate average standard deviation for each algorithm
uniform_summary = uniform_df.groupby("Algorithm").mean(numeric_only=True)["Standard Deviation"].reset_index()
uniform_summary.rename(columns={"Standard Deviation": "Average Standard Deviation"}, inplace=True)

# Save results to CSV for later analysis (optional)
uniform_df.to_csv("hash_uniformity_results.csv", index=False)
uniform_summary.to_csv("average_uniformity_results.csv", index=False)


In [112]:
uniform_df.head(10)

Unnamed: 0,File,Algorithm,Standard Deviation
0,byte0000.dat,MD5,0.242061
1,byte0000.dat,SHA-1,0.268368
2,byte0000.dat,SHA-256,0.342327
3,byte0000.dat,SHA3-256,0.364434
4,byte0000.dat,Blake2b,0.484123
5,byte0001.dat,MD5,0.272431
6,byte0001.dat,SHA-1,0.268368
7,byte0001.dat,SHA-256,0.342327
8,byte0001.dat,SHA3-256,0.353553
9,byte0001.dat,Blake2b,0.507752


In [113]:
uniform_summary.head(10)

Unnamed: 0,Algorithm,Average Standard Deviation
0,Blake2b,0.49934
1,MD5,0.249218
2,SHA-1,0.280106
3,SHA-256,0.351211
4,SHA3-256,0.355463


In [114]:
import hashlib
import os
import pandas as pd

# Define the hashing algorithms to test
hashing_algorithms = {
    "MD5": hashlib.md5,
    "SHA-1": hashlib.sha1,
    "SHA-256": hashlib.sha256,
    "SHA3-256": hashlib.sha3_256,
    "Blake2b": hashlib.blake2b
}

# Function to hash data with the given algorithm
def hash_data(data, algorithm):
    """
    Hashes the given data using the specified algorithm.
    """
    hasher = algorithm()
    hasher.update(data)
    return hasher.digest()  # Return as raw bytes for bit comparison

# Function to flip a specific bit in a byte sequence
def flip_bit(data, bit_index):
    """
    Flips a single bit in a byte sequence at the specified bit index.
    """
    if len(data) == 0:
        raise ValueError("Cannot flip a bit in an empty input.")
    
    byte_index = bit_index // 8
    if byte_index >= len(data):
        raise ValueError(f"bit_index {bit_index} is out of range for the input data of length {len(data) * 8} bits.")
    
    bit_in_byte = bit_index % 8
    modified_data = bytearray(data)
    modified_data[byte_index] ^= (1 << bit_in_byte)
    return bytes(modified_data)

# Function to calculate the percentage of bits that changed
def calculate_bit_difference(hash1, hash2):
    """
    Calculates the percentage of bits that differ between two hash outputs.
    """
    if len(hash1) != len(hash2):
        raise ValueError("Hashes must be the same length for comparison.")
    bit_diff = sum(bin(byte1 ^ byte2).count('1') for byte1, byte2 in zip(hash1, hash2))
    total_bits = len(hash1) * 8
    return (bit_diff / total_bits) * 100

# Combine all file paths (use vectors or other file groups as needed)
all_files = vectors  # Example: using the previously defined vector file paths

# Initialize a list to store results
avalanche_results = []

# Perform avalanche effect test
for file_path in all_files:
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    with open(file_path, 'rb') as file:
        original_data = file.read()
    
    # Skip empty files
    if len(original_data) == 0:
        print(f"Skipping empty file: {file_path}")
        continue
    
    # Test each algorithm
    for algo_name, algo_func in hashing_algorithms.items():
        # Hash the original data
        original_hash = hash_data(original_data, algo_func)
        
        # Flip a single bit in the input data
        try:
            flipped_data = flip_bit(original_data, 0)  # Flip the first bit (index 0)
            flipped_hash = hash_data(flipped_data, algo_func)
            
            # Calculate bit difference
            bit_diff_percentage = calculate_bit_difference(original_hash, flipped_hash)
            
            # Store results
            avalanche_results.append({
                "File": os.path.basename(file_path),
                "Algorithm": algo_name,
                "Bit Difference (%)": bit_diff_percentage
            })
        except ValueError as e:
            print(f"Error processing file {file_path} with algorithm {algo_name}: {e}")

# Store results in a DataFrame
avalanche_df = pd.DataFrame(avalanche_results)

# Calculate average bit difference for each algorithm
avalanche_summary = avalanche_df.groupby("Algorithm").mean(numeric_only=True).reset_index()
avalanche_summary.rename(columns={"Bit Difference (%)": "Average Bit Difference (%)"}, inplace=True)

# Save results to CSV for later use (optional)
avalanche_df.to_csv("avalanche_effect_results.csv", index=False)
avalanche_summary.to_csv("avalanche_effect_summary.csv", index=False)

Skipping empty file: input_data/vectors/byte0000.dat


In [115]:
avalanche_df.head(10)

Unnamed: 0,File,Algorithm,Bit Difference (%)
0,byte0001.dat,MD5,46.09375
1,byte0001.dat,SHA-1,58.125
2,byte0001.dat,SHA-256,55.078125
3,byte0001.dat,SHA3-256,46.875
4,byte0001.dat,Blake2b,50.0
5,byte0002.dat,MD5,48.4375
6,byte0002.dat,SHA-1,51.25
7,byte0002.dat,SHA-256,48.828125
8,byte0002.dat,SHA3-256,46.09375
9,byte0002.dat,Blake2b,49.804688


In [116]:
avalanche_summary.head(10)

Unnamed: 0,Algorithm,Average Bit Difference (%)
0,Blake2b,49.942909
1,MD5,49.627404
2,SHA-1,50.016026
3,SHA-256,50.036058
4,SHA3-256,50.05008


In [117]:
import hashlib
import random
import string
import pandas as pd

# Define the hashing algorithms to test
hashing_algorithms = {
    "MD5": hashlib.md5,
    "SHA-1": hashlib.sha1,
    "SHA-256": hashlib.sha256,
    "SHA3-256": hashlib.sha3_256,
    "Blake2b": hashlib.blake2b
}

# Function to generate a random string
def generate_random_string(length=64):
    """
    Generates a random string of the specified length.
    """
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

# Function to flip a specific bit in a string
def flip_bit_in_string(data, bit_index):
    """
    Flips a single bit in a string's binary representation.
    """
    byte_index = bit_index // 8
    bit_in_byte = bit_index % 8
    modified_data = bytearray(data.encode('utf-8'))
    if byte_index < len(modified_data):
        modified_data[byte_index] ^= (1 << bit_in_byte)
    return modified_data.decode('utf-8', errors='ignore')

# Function to calculate the percentage of bits that changed
def calculate_bit_difference(hash1, hash2):
    """
    Calculates the percentage of bits that differ between two hash outputs.
    """
    if len(hash1) != len(hash2):
        raise ValueError("Hashes must be the same length for comparison.")
    bit_diff = sum(bin(byte1 ^ byte2).count('1') for byte1, byte2 in zip(hash1, hash2))
    total_bits = len(hash1) * 8
    return (bit_diff / total_bits) * 100

# Function to hash data using a specified algorithm
def hash_data(data, algorithm):
    """
    Hashes the given data using the specified algorithm.
    """
    hasher = algorithm()
    hasher.update(data.encode('utf-8'))
    return hasher.digest()

# Parameters for the test
num_samples = 10000  # Number of random strings to generate
string_length = 128  # Length of each random string

# Initialize a list to store results
avalanche_results = []

# Perform the avalanche effect test
for _ in range(num_samples):
    original_string = generate_random_string(string_length)
    
    # Test each algorithm
    for algo_name, algo_func in hashing_algorithms.items():
        # Hash the original string
        original_hash = hash_data(original_string, algo_func)
        
        # Flip a single bit in the string
        flipped_string = flip_bit_in_string(original_string, 0)  # Flip the first bit
        flipped_hash = hash_data(flipped_string, algo_func)
        
        # Calculate bit difference
        bit_diff_percentage = calculate_bit_difference(original_hash, flipped_hash)
        
        # Store results
        avalanche_results.append({
            "Original String": original_string,
            "Algorithm": algo_name,
            "Bit Difference (%)": bit_diff_percentage
        })

# Store results in a DataFrame
avalanche_df = pd.DataFrame(avalanche_results)

# Calculate average bit difference for each algorithm
avalanche_summary = avalanche_df.groupby("Algorithm").mean(numeric_only=True).reset_index()
avalanche_summary.rename(columns={"Bit Difference (%)": "Average Bit Difference (%)"}, inplace=True)

# Save results to CSV for later use (optional)
avalanche_df.to_csv("random_strings_avalanche_effect_results.csv", index=False)
avalanche_summary.to_csv("random_strings_avalanche_effect_summary.csv", index=False)

avalanche_df.head(10)
avalanche_summary.head(10)

Unnamed: 0,Algorithm,Average Bit Difference (%)
0,Blake2b,49.991621
1,MD5,49.958594
2,SHA-1,49.999187
3,SHA-256,50.021563
4,SHA3-256,49.994531


In [118]:
import hashlib
import random
import string
import pandas as pd

# Define the hashing algorithms to test
hashing_algorithms = {
    "MD5": hashlib.md5,
    "SHA-1": hashlib.sha1,
    "SHA-256": hashlib.sha256,
    "SHA3-256": hashlib.sha3_256,
    "Blake2b": hashlib.blake2b
}

# Function to generate a random string
def generate_random_string(length=64):
    """
    Generates a random string of the specified length.
    """
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

# Function to hash a string and truncate the hash
def truncated_hash(data, algorithm, bits=16):
    """
    Hashes the input data using the specified algorithm and truncates the output.
    Args:
        data (str): The input data to hash.
        algorithm (function): The hash function from hashlib.
        bits (int): The number of bits to keep from the hash output.
    Returns:
        str: The truncated hash as a hexadecimal string.
    """
    hasher = algorithm()
    hasher.update(data.encode('utf-8'))
    full_hash = hasher.digest()
    truncated_bytes = bits // 8  # Convert bits to bytes
    return full_hash[:truncated_bytes].hex()

# Parameters for the test
num_samples = 100000  # Number of strings to generate
truncated_bits = 32   # Number of bits to keep in the truncated hash space

# Initialize results dictionary for collisions
collision_results = []

# Perform the collision detection test
for algo_name, algo_func in hashing_algorithms.items():
    hash_set = set()
    collisions = 0
    
    for _ in range(num_samples):
        random_string = generate_random_string()
        truncated_h = truncated_hash(random_string, algo_func, truncated_bits)
        
        if truncated_h in hash_set:
            collisions += 1
        else:
            hash_set.add(truncated_h)
    
    collision_results.append({
        "Algorithm": algo_name,
        "Truncated Bits": truncated_bits,
        "Number of Samples": num_samples,
        "Collisions": collisions,
        "Collision Probability (%)": (collisions / num_samples) * 100
    })

# Store results in a DataFrame
collision_df = pd.DataFrame(collision_results)

# Save results to CSV for later use (optional)
collision_df.to_csv("collision_detection_results.csv", index=False)

# Display the results
collision_df.head(15)

Unnamed: 0,Algorithm,Truncated Bits,Number of Samples,Collisions,Collision Probability (%)
0,MD5,32,100000,3,0.003
1,SHA-1,32,100000,0,0.0
2,SHA-256,32,100000,1,0.001
3,SHA3-256,32,100000,0,0.0
4,Blake2b,32,100000,3,0.003


In [119]:
import hashlib
import random
import string
import pandas as pd

# Define the hashing algorithms to test
hashing_algorithms = {
    "MD5": hashlib.md5,
    "SHA-1": hashlib.sha1,
    "SHA-256": hashlib.sha256,
    "SHA3-256": hashlib.sha3_256,
    "Blake2b": hashlib.blake2b
}

# Function to generate a random string
def generate_random_string(length=64):
    """
    Generates a random string of the specified length.
    """
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

# Function to hash a string and truncate the hash
def truncated_hash(data, algorithm, bits=16):
    """
    Hashes the input data using the specified algorithm and truncates the output.
    Args:
        data (str): The input data to hash.
        algorithm (function): The hash function from hashlib.
        bits (int): The number of bits to keep from the hash output.
    Returns:
        str: The truncated hash as a hexadecimal string.
    """
    hasher = algorithm()
    hasher.update(data.encode('utf-8'))
    full_hash = hasher.digest()
    truncated_bytes = bits // 8  # Convert bits to bytes
    return full_hash[:truncated_bytes].hex()

# Parameters for the test
num_samples = 100000  # Number of attempts to find the preimage
truncated_bits = 16   # Number of bits to keep in the truncated hash space
target_hash = "abcd"  # Predefined target hash to match (in truncated space)

# Initialize results
preimage_results = []

# Perform the preimage test
for algo_name, algo_func in hashing_algorithms.items():
    found_preimage = None
    for _ in range(num_samples):
        random_string = generate_random_string()
        truncated_h = truncated_hash(random_string, algo_func, truncated_bits)
        
        if truncated_h == target_hash:
            found_preimage = random_string
            break
    
    preimage_results.append({
        "Algorithm": algo_name,
        "Truncated Bits": truncated_bits,
        "Target Hash": target_hash,
        "Preimage Found": found_preimage is not None,
        "Found Preimage": found_preimage if found_preimage else "None",
        "Attempts": _ + 1
    })

# Store results in a DataFrame
preimage_df = pd.DataFrame(preimage_results)

# Save results to CSV for later use (optional)
preimage_df.to_csv("preimage_test_results.csv", index=False)

preimage_df.head(15)


Unnamed: 0,Algorithm,Truncated Bits,Target Hash,Preimage Found,Found Preimage,Attempts
0,MD5,16,abcd,False,,100000
1,SHA-1,16,abcd,True,JlQikxZjloOmurJfYVjRLikDbeAwWwwe0pCNKqYzJMOGuB...,26265
2,SHA-256,16,abcd,False,,100000
3,SHA3-256,16,abcd,False,,100000
4,Blake2b,16,abcd,True,jyp0vjfinXZUkMp98nTBBHTR2uYCdO4qNcdp1YEc7dE3FX...,27460


In [125]:
import hashlib
import random
import string
import pandas as pd

# Define the hashing algorithms to test
hashing_algorithms = {
    "MD5": hashlib.md5,
    "SHA-1": hashlib.sha1,
    "SHA-256": hashlib.sha256,
    "SHA3-256": hashlib.sha3_256,
    "Blake2b": hashlib.blake2b
}

# Function to generate a random string
def generate_random_string(length=64):
    """
    Generates a random string of the specified length.
    """
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

# Function to hash a string
def hash_string(data, algorithm):
    """
    Hashes the input data using the specified algorithm.
    """
    hasher = algorithm()
    hasher.update(data.encode('utf-8'))
    return hasher.digest()  # Full binary hash

# Function to truncate the hash
def truncate_hash(full_hash, bits):
    """
    Truncates the full hash to a specified number of bits.
    """
    truncated_bytes = bits // 8  # Convert bits to bytes
    return full_hash[:truncated_bytes].hex()

# Function to calculate the Hamming Distance
def calculate_hamming_distance(hash1, hash2):
    """
    Calculates the Hamming Distance between two binary hash outputs.
    """
    if len(hash1) != len(hash2):
        raise ValueError("Hashes must be the same length for Hamming Distance calculation.")
    bit_diff = sum(bin(byte1 ^ byte2).count('1') for byte1, byte2 in zip(hash1, hash2))
    return bit_diff

# Parameters for the test
num_samples = 100000  # Number of random strings to generate
truncated_bits = 16   # Number of bits for truncated hash space

# Initialize results
collision_results = []

# Perform the collision test and calculate Hamming Distances
for algo_name, algo_func in hashing_algorithms.items():
    hash_map = {}
    for _ in range(num_samples):
        random_string = generate_random_string()
        full_hash = hash_string(random_string, algo_func)
        truncated_h = truncate_hash(full_hash, truncated_bits)
        
        if truncated_h in hash_map:
            # Collision found
            original_hash = hash_map[truncated_h]
            hamming_distance = calculate_hamming_distance(full_hash, original_hash)
            collision_results.append({
                "Algorithm": algo_name,
                "Truncated Bits": truncated_bits,
                "Collision Found": True,
                "Hamming Distance": hamming_distance,
                "Colliding Input": random_string
            })
            break
        else:
            hash_map[truncated_h] = full_hash

# Store results in a DataFrame
collision_df = pd.DataFrame(collision_results)

# Calculate summary statistics for Hamming Distances
hamming_summary = collision_df.groupby("Algorithm").agg({
    "Hamming Distance": ["mean", "std", "min", "max"]
}).reset_index()
hamming_summary.columns = ["Algorithm", "Mean Hamming Distance", "Std Dev", "Min Hamming Distance", "Max Hamming Distance"]

# Save results to CSV for later use (optional)
collision_df.to_csv("hamming_distance_collisions.csv", index=False)
hamming_summary.to_csv("hamming_distance_summary.csv", index=False)

collision_df.head(15)


Unnamed: 0,Algorithm,Truncated Bits,Collision Found,Hamming Distance,Colliding Input
0,MD5,16,True,56,6QTDNONH0SalL54KqcVWTLMABd4OopPlY0CjQQ0k0gxLUe...
1,SHA-1,16,True,79,o56BIm9MS0Hpn8Zn1VHatf4mph2iPOg8ru7A1zzDJ6iOMm...
2,SHA-256,16,True,126,DY82xKpGWR8m061LVUmakPsPEgwHjPM2njIzCieGB3J5pB...
3,SHA3-256,16,True,121,yOKDDc9vsFk6OS4zG22h61z01ekEmPZoCHIdOxWv3SZH6z...
4,Blake2b,16,True,246,Y9fNcvM5LFfpImhx2rirHhGL9ggfZVOX6LeBFhm7rWt4Jj...


In [126]:
hamming_summary.head(15)

Unnamed: 0,Algorithm,Mean Hamming Distance,Std Dev,Min Hamming Distance,Max Hamming Distance
0,Blake2b,246.0,,246,246
1,MD5,56.0,,56,56
2,SHA-1,79.0,,79,79
3,SHA-256,126.0,,126,126
4,SHA3-256,121.0,,121,121
