In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.stats import gaussian_kde
from pprint import pprint
from scipy.integrate import simpson
from collections import Counter
import itertools
import seaborn as sns
from tqdm.notebook import tqdm
from random import sample, seed
import random
import logging
logging.basicConfig(level=logging.DEBUG)


In [2]:

from get_magic_numbers import get_magic_numbers_main

from string_magic_numbers import magic_strings_detection as string_values_process
from sign_violation_magic_numbers import sign_violation_magic_numbers as opposite_sign_process
from distance_based_magic_numbers import delta_distributed_magic_numbers 
from identical_magic_numbers import identical_column_magic_numbers as all_values_are_same
from magic_dictionaries import magic_dictionary, add_to_master_dict, safe_concatenate
from magic_dictionaries import clean_magic_results
from density_plot import plot_data_density


In [3]:
def synthetic_data(
        mean,
        sigma,
        num_samples=1000,
        random_seed=None,
        magic_values=[-999, 999],
        quantities=[100, 50],
        col_names="synthetic_col"):
    """Build a one-column DataFrame of clipped normal data plus injected magic values.

    ``num_samples`` values are drawn from N(mean, sigma) and clipped to
    ``mean +/- 3 * sigma``; draws outside that range are set to the bound, not
    redrawn. Each ``magic_values[i]`` is then appended ``quantities[i]`` times
    and the combined array is shuffled, so the result has
    ``num_samples + sum(quantities)`` rows.

    Parameters
    ----------
    mean, sigma : float
        Location and scale of the normal distribution.
    num_samples : int
        Number of normal draws (magic values are added on top of these).
    random_seed : int or None
        When given, NumPy's *global* random state is re-seeded with it.
    magic_values : sequence
        Values to inject.
    quantities : sequence of int
        How many copies of each magic value to inject; same length as
        ``magic_values``.
    col_names : str
        Name of the single output column.

    Returns
    -------
    pandas.DataFrame
        One column named ``col_names``. When ``magic_values`` is empty the
        clipped draws are returned in generation order (no shuffle).

    Raises
    ------
    ValueError
        If ``magic_values`` and ``quantities`` differ in length.
    """
    if random_seed is not None:
        np.random.seed(random_seed)

    # The clipping window is fixed at three standard deviations either side.
    clip_sigmas = 3.0
    low_edge = mean - clip_sigmas * sigma
    high_edge = mean + clip_sigmas * sigma

    # The draw happens before the argument check, so the global random state
    # advances even when the check below raises.
    normal_part = np.clip(np.random.normal(mean, sigma, num_samples), low_edge, high_edge)

    if len(magic_values) != len(quantities):
        raise ValueError("Length of magic_values must match length of quantities")

    if len(magic_values) > 0:
        # One constant-valued chunk per magic value, all appended after the normal draws.
        magic_chunks = [np.full(count, value) for value, count in zip(magic_values, quantities)]
        column = np.concatenate([normal_part, *magic_chunks])
        np.random.shuffle(column)
    else:
        # Nothing to inject: keep the draws as generated.
        column = normal_part

    return pd.DataFrame({col_names: column})

# Note: If you use the parameters quantities=[100, 50] and num_samples=1000,
# the final dataset will have 1000 + 100 + 50 = 1150 samples.

In [4]:
def enhanced_validation(master_dict, expected_magic_numbers, tol=1e-6, col_name='synthetic_col'):
    """Score the magic numbers detected for one column against the expected ones.

    Detected values are the de-duplicated union of the
    ``'magic_distanced_numbers'`` and ``'magic_opp_sign_numbers'`` entries
    stored under ``master_dict[col_name]``. A detected value and an expected
    value match when they differ by less than ``tol``.

    Parameters
    ----------
    master_dict : dict
        Maps column name -> dict of detection results. A missing column, a
        missing key, or a ``None`` entry is treated as "nothing detected".
    expected_magic_numbers : sequence of numbers
        The values that should have been detected.
    tol : float
        Absolute tolerance used for matching.
    col_name : str
        Column whose results are scored (defaults to ``'synthetic_col'``).

    Returns
    -------
    dict
        ``true_positives`` (expected values with a match), ``false_positives``
        (detected values with no match), ``false_negatives`` (expected values
        with no match), ``precision``, ``recall``, ``f1_score`` (each 0 when
        its denominator is 0), ``detected_count`` and ``expected_count``.
    """
    def _flat(values):
        # None would become a 0-d array that np.concatenate rejects.
        if values is None:
            values = []
        return np.asarray(values, dtype=float).ravel()

    column_results = master_dict.get(col_name) or {}
    detected = np.unique(np.concatenate([
        _flat(column_results.get('magic_distanced_numbers')),
        _flat(column_results.get('magic_opp_sign_numbers')),
    ]))
    # NOTE(review): detected values are de-duplicated but expected values are not,
    # so a value repeated in expected_magic_numbers is counted once per repeat.
    # Confirm this is intended.
    expected = _flat(expected_magic_numbers)

    # close[i, j] is True when expected[i] matches detected[j]; an empty side
    # produces an empty axis, which any() reduces to "no match".
    close = np.abs(expected[:, None] - detected[None, :]) < tol

    true_positives = int(close.any(axis=1).sum())
    false_negatives = int(expected.size) - true_positives
    false_positives = int(detected.size) - int(close.any(axis=0).sum())

    # Each ratio falls back to 0 when its denominator is 0.
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        'true_positives': true_positives,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'detected_count': int(detected.size),
        'expected_count': int(expected.size)
    }

In [None]:
# ---------------- PARAMETER GRIDS ---------------- #
# Candidate values for each swept parameter; every run draws one value per grid.
sigmas = np.linspace(0.0, 50.0, 1000)               # std-dev handed to synthetic_data
gauss_thresholds = np.linspace(0.0, 1.0, 1000)      # passed on as gauss_threshold
overlap_thresholds = np.linspace(0.0, 100.0, 1000)  # passed on as overlap_threshold
center_distances = np.linspace(0.0, 100.0, 1000)    # first injected magic value (D)
pair_gaps = np.linspace(0.0, 50.0, 1000)            # second magic value is D + gap
num_samples = np.linspace(1, 1000, 1000)
num_samples = num_samples.astype(int)               # whole-number sample counts 1..1000

# Copies of each of the two magic values injected per run.
quantities = [50, 25]

# Number of random parameter combinations to evaluate.
rd_samples = 10000

# Seed both generators: `random` drives the parameter sampling below, while
# synthetic_data draws from NumPy's global generator when it is called without
# a random_seed. Seeding only `random` left the generated data unreproducible.
seed(42)
np.random.seed(42)


# ---------------- PARAMETER SAMPLING ---------------- #
# Each combination is (sigma, gauss_threshold, overlap_threshold,
# center_distance, pair_gap, num_samples), drawn independently per grid.
param_combos = [
    (
        random.choice(sigmas),
        random.choice(gauss_thresholds),
        random.choice(overlap_thresholds),
        random.choice(center_distances),
        random.choice(pair_gaps),
        random.choice(num_samples)
    )
    for _ in range(rd_samples)
]

def generate_filename(sigmas, gauss_thresholds, overlap_thresholds, center_distances, num_samples, pair_gaps, rd_samples):
    """Build the base file name (no extension) for the two-magic-value sweep.

    The name encodes the min-max span of every parameter grid, the largest
    sample count and the number of random combinations. All spans are rounded
    to whole numbers except the pair-gap span, which uses the values' default
    string form.
    """
    def rounded_span(grid):
        # "<min>-<max>" with both ends rounded to zero decimals.
        return f"{grid.min():.0f}-{grid.max():.0f}"

    pieces = [
        "2m",
        f"s{rounded_span(sigmas)}",
        f"G{rounded_span(gauss_thresholds)}",
        f"o{rounded_span(overlap_thresholds)}",
        f"d{rounded_span(center_distances)}",
        # The gap span is deliberately left unrounded (e.g. "0.0-50.0").
        f"g{pair_gaps.min()}-{pair_gaps.max()}",
        f"n{num_samples.max()}",
        f"rd{rd_samples}",
    ]
    return "_".join(pieces)


# ---------------- MAIN LOOP ---------------- #
# For every sampled parameter combination: build a synthetic column with two
# injected magic values (D and D + g), run the detector on it, score the
# detections and collect the metrics. Results are checkpointed to CSV.
results = []

# The output name depends only on the grids, so it is built once up front.
# This also guarantees `full_filename` is defined for the final save and print,
# even when fewer than 100 combinations are run.
filename = generate_filename(sigmas, gauss_thresholds, overlap_thresholds, center_distances, num_samples, pair_gaps, rd_samples)
full_filename = f"{filename}.csv"

pbar = tqdm(param_combos, desc="Running parameter sweep", total=len(param_combos))

for sigma, gth, oth, D, g, num in pbar:
    magic_values = [D, D + g]

    synthetic_df = synthetic_data(mean=0, sigma=sigma, num_samples=num,
                                  magic_values=magic_values, quantities=quantities, col_names="synthetic_col")

    # Only the master dictionary is scored below; the cleaned one is unused here.
    # NOTE(review): the single-magic sweep in this notebook passes a 4-tuple
    # (0, 0, 'synthetic_col', 'F18') as extended_col_info; confirm which shape
    # get_magic_numbers_main expects.
    magic_master_dict, magic_cleaned_dict = get_magic_numbers_main(
        df=synthetic_df,
        extended_col_info=[('synthetic_col', 'N')],
        sign_violation_theshold=3,
        gauss_threshold=gth,
        overlap_threshold=oth,
        plot_graphs=False
    )

    metrics = enhanced_validation(magic_master_dict, magic_values)

    # Show the current parameter combination next to the progress bar.
    pbar.set_postfix({
        "Sigma": f"{sigma:.2f}",
        "G_thres": f"{gth:.2f}",
        "O_thres": f"{oth:.2f}",
        "D": f"{D:.2f}",
        "gap": f"{g:.2f}",
        "Number_samples":f"{num:.2f}"
    })

    results.append({
        'sigma': sigma,
        'gauss_threshold': gth,
        'overlap_threshold': oth,
        'center_distance': D,
        'pair_gap': g,
        'number_samples': num,
        'recall': metrics['recall'],
        'precision': metrics['precision'],
        'f1_score': metrics['f1_score']
    })

    # Checkpoint every 100 runs so an interrupted sweep keeps most of its results.
    if len(results) % 100 == 0:
        pd.DataFrame(results).to_csv(full_filename, index=False)

# Final save: without it, the runs after the last full batch of 100 were never written.
if len(results) % 100 != 0:
    pd.DataFrame(results).to_csv(full_filename, index=False)

print(full_filename)


Running parameter sweep:   0%|          | 0/10000 [00:00<?, ?it/s]

2m_s0-50_G0-1_o0-100_d0-100_g0.0-50.0_n1000_rd10000.csv


In [6]:
# ---------------- PARAMETER GRIDS ---------------- #
# Candidate values for each swept parameter; every run draws one value per grid.
sigmas = np.linspace(0.0, 10.0, 1000)               # std-dev handed to synthetic_data
gauss_thresholds = np.linspace(0.0, 1.0, 1000)      # passed on as gauss_threshold
overlap_thresholds = np.linspace(0.0, 100.0, 1000)  # passed on as overlap_threshold
center_distances = np.linspace(0.0, 30.0, 1000)     # the single injected magic value (D)
num_samples = np.linspace(1, 1000, 1000)
num_samples = num_samples.astype(int)               # whole-number sample counts 1..1000
# Copies of the single magic value injected per run.
quantities = [50]

# Number of random parameter combinations to evaluate.
rd_samples = 10000

# Seed both generators: `random` drives the parameter sampling below, while
# synthetic_data draws from NumPy's global generator when it is called without
# a random_seed. Seeding only `random` left the generated data unreproducible.
seed(42)
np.random.seed(42)

# ---------------- PARAMETER SAMPLING ---------------- #
# Each combination is (sigma, gauss_threshold, overlap_threshold,
# center_distance, num_samples), drawn independently per grid.
param_combos = [
    (
        random.choice(sigmas),
        random.choice(gauss_thresholds),
        random.choice(overlap_thresholds),
        random.choice(center_distances),
        random.choice(num_samples)
    )
    for _ in range(rd_samples)
]

def generate_filename(sigmas, gauss_thresholds, overlap_thresholds, center_distances, num_samples, rd_samples):
    """Build the base file name (no extension) for the single-magic-value sweep.

    The name encodes the min-max span of every parameter grid (rounded to
    whole numbers), the largest sample count and the number of random
    combinations.
    """
    def rounded_span(grid):
        # "<min>-<max>" with both ends rounded to zero decimals.
        return f"{grid.min():.0f}-{grid.max():.0f}"

    pieces = [
        "1m",
        f"s{rounded_span(sigmas)}",
        f"g{rounded_span(gauss_thresholds)}",
        f"o{rounded_span(overlap_thresholds)}",
        f"D{rounded_span(center_distances)}",
        f"n{num_samples.max()}",
        f"rd{rd_samples}",
    ]
    return "_".join(pieces)


# ---------------- MAIN LOOP ---------------- #
# For every sampled parameter combination: build a synthetic column with one
# injected magic value (D), run the detector on it, score the detections and
# collect the metrics. Results are checkpointed to CSV.
results = []

# The output name depends only on the grids, so it is built once up front.
# This also guarantees `full_filename` is defined for the final save and print,
# even when fewer than 100 combinations are run.
filename = generate_filename(sigmas, gauss_thresholds, overlap_thresholds, center_distances, num_samples, rd_samples)
full_filename = f"{filename}.csv"

pbar = tqdm(param_combos, desc="Running parameter sweep", total=len(param_combos))

for sigma, gth, oth, D, num in pbar:
    magic_values = [D]

    synthetic_df = synthetic_data(mean=0, sigma=sigma, num_samples=num,
                                  magic_values=magic_values, quantities=quantities, col_names="synthetic_col")

    # Only the master dictionary is scored below; the cleaned one is unused here.
    # NOTE(review): the two-magic sweep in this notebook passes a 2-tuple
    # ('synthetic_col', 'N') as extended_col_info; confirm which shape
    # get_magic_numbers_main expects.
    magic_master_dict, magic_cleaned_dict = get_magic_numbers_main(
        df=synthetic_df,
        extended_col_info=[(0, 0, 'synthetic_col', 'F18')],
        sign_violation_theshold=3,
        gauss_threshold=gth,
        overlap_threshold=oth,
        plot_graphs=False
    )

    metrics = enhanced_validation(magic_master_dict, magic_values)

    # Show the current parameter combination next to the progress bar.
    pbar.set_postfix({
        "Sigma": f"{sigma:.2f}",
        "G_thres": f"{gth:.2f}",
        "O_thres": f"{oth:.2f}",
        "D": f"{D:.2f}",
        "Number_samples":f"{num:.2f}"
    })

    results.append({
        'sigma': sigma,
        'gauss_threshold': gth,
        'overlap_threshold': oth,
        'center_distance': D,
        'number_samples': num,
        'recall': metrics['recall'],
        'precision': metrics['precision'],
        'f1_score': metrics['f1_score']
    })

    # Checkpoint every 100 runs so an interrupted sweep keeps most of its results.
    if len(results) % 100 == 0:
        pd.DataFrame(results).to_csv(full_filename, index=False)

# Final save: without it, the runs after the last full batch of 100 were never written.
if len(results) % 100 != 0:
    pd.DataFrame(results).to_csv(full_filename, index=False)

print(full_filename)




Running parameter sweep:   0%|          | 0/10000 [00:00<?, ?it/s]

1m_s0-10_g0-1_o0-100_D0-30_n1000_rd10000.csv


In [7]:
import time

# winsound ships with Python on Windows only. Guard the import so this cell
# does not raise (and break "Run All") on other platforms.
try:
    import winsound
except ImportError:
    winsound = None


def mario_one_up():
    """Play a short seven-note jingle to signal that the notebook has finished.

    Does nothing when ``winsound`` is unavailable (non-Windows platforms).
    """
    if winsound is None:
        return
    notes = [1319, 1568, 2637, 2093, 2349, 3136, 2637]   # beep frequencies in Hz
    durations = [100, 100, 100, 100, 100, 100, 300]      # beep lengths in milliseconds
    for n, d in zip(notes, durations):
        winsound.Beep(n, d)
        time.sleep(0.02)  # brief pause so consecutive notes stay distinct

mario_one_up()
