In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.stats import gaussian_kde
from pprint import pprint
from scipy.integrate import simpson
from collections import Counter
import logging
logging.basicConfig(level=logging.DEBUG)

from get_magic_numbers import get_magic_numbers_main

from string_magic_numbers import magic_strings_detection as string_values_process
from sign_violation_magic_numbers import sign_violation_magic_numbers as opposite_sign_process
from distance_based_magic_numbers import delta_distributed_magic_numbers 
from identical_magic_numbers import identical_column_magic_numbers as all_values_are_same
from magic_dictionaries import magic_dictionary, add_to_master_dict, safe_concatenate
from magic_dictionaries import clean_magic_results
from density_plot import plot_data_density


In [2]:
def synthetic_data(
        mean,
        sigma,
        num_samples=1000,
        random_seed=None,
        magic_values=(-999, 999),
        quantities=(100, 50),
        col_names="synthetic_col"):
    """Build a one-column DataFrame of clipped normal samples plus magic values.

    ``num_samples`` values are drawn from N(mean, sigma) and clamped to
    ``mean +/- 3 * sigma`` with ``np.clip`` (out-of-range draws land exactly on
    the bound). Each ``magic_values[i]`` is then appended ``quantities[i]``
    times and the column is shuffled, so the result has
    ``num_samples + sum(quantities)`` rows.

    Parameters
    ----------
    mean, sigma : float
        Location and scale of the normal distribution.
    num_samples : int
        Number of normal samples drawn (before magic values are appended).
    random_seed : int or None
        When given, a private ``RandomState`` seeded with this value is used,
        which yields the same numbers as seeding the global NumPy RNG would,
        but leaves the global RNG state untouched. When ``None``, the global
        NumPy RNG is used.
    magic_values : sequence
        Sentinel values to inject.
    quantities : sequence of int
        How many copies of each magic value to inject; must be the same
        length as ``magic_values``.
    col_names : hashable
        Label of the single output column.

    Returns
    -------
    pandas.DataFrame
        One column labelled ``col_names``.

    Raises
    ------
    ValueError
        If ``magic_values`` and ``quantities`` differ in length.
    """
    # Validate before drawing anything so that bad arguments cost no RNG state.
    if len(magic_values) != len(quantities):
        raise ValueError("Length of magic_values must match length of quantities")

    # A seeded call gets its own generator instead of reseeding the global one;
    # RandomState(seed) produces the same stream as np.random.seed(seed).
    rng = np.random if random_seed is None else np.random.RandomState(random_seed)

    # Clipping bounds: hard-coded 3 sigma (the 99.7% range) for test consistency.
    SIGMA_LIMIT = 3.0
    lower_bound = mean - SIGMA_LIMIT * sigma
    upper_bound = mean + SIGMA_LIMIT * sigma

    # Draw the normal samples and clamp them into [lower_bound, upper_bound].
    synthetic_col = rng.normal(mean, sigma, num_samples)
    synthetic_col = np.clip(synthetic_col, a_min=lower_bound, a_max=upper_bound)

    # Without magic values the (unshuffled) normal column is returned as is.
    if len(magic_values) == 0:
        return pd.DataFrame({col_names: synthetic_col})

    # Append every block of magic values in a single concatenation
    # (total size grows by sum(quantities)).
    magic_blocks = [
        np.full(quantity, magic_value)
        for magic_value, quantity in zip(magic_values, quantities)
    ]
    synthetic_col = np.concatenate([synthetic_col, *magic_blocks])

    # Shuffle so the magic values are not grouped at the end of the column.
    rng.shuffle(synthetic_col)

    return pd.DataFrame({col_names: synthetic_col})

# Note: If you use the parameters quantities=[100, 50] and num_samples=1000,
# the final dataset will have 1000 + 100 + 50 = 1150 samples.

In [3]:
def generate_randomized_df(n_columns, num_samples=1000):
    """Assemble a DataFrame of ``n_columns`` synthetic columns.

    Every column comes from ``synthetic_data`` with its default magic values,
    using a mean drawn uniformly from [10, 500), a sigma drawn uniformly from
    [1, 50) and a seed drawn from [0, 100000). All three draws use the global
    NumPy RNG. Columns are labelled 0, 1, ..., n_columns - 1.

    Parameters
    ----------
    n_columns : int
        Number of columns to generate.
    num_samples : int
        Number of normal samples per column, forwarded to ``synthetic_data``.

    Returns
    -------
    pandas.DataFrame
        The generated columns concatenated side by side.
    """

    def _build_column(label):
        # Draw order (mean, sigma, seed) is kept fixed so that a given global
        # RNG state always leads to the same frame.
        mu = np.random.uniform(10, 500)
        spread = np.random.uniform(1, 50)
        column_seed = np.random.randint(0, 100000)
        return synthetic_data(
            mean=mu,
            sigma=spread,
            num_samples=num_samples,
            col_names=label,
            random_seed=column_seed,
        )

    frames = [_build_column(label) for label in range(n_columns)]

    # Place the single-column frames next to each other.
    return pd.concat(frames, axis=1)

# Seed the global NumPy RNG first: generate_randomized_df draws every column's
# mean, sigma and seed from it, so without this the frame (and every output
# below) changes on each Restart & Run All.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

df_final = generate_randomized_df(n_columns=5, num_samples=1000)

# Append one constant column: every row holds p, and the column is labelled
# with the next free integer (the current number of columns).
p = 42
df_final[len(df_final.columns)] = p
print(df_final.describe())

                 0            1            2            3            4       5
count  1150.000000  1150.000000  1150.000000  1150.000000  1150.000000  1150.0
mean    212.712348   107.213005   -17.007430    89.099661    66.167788    42.0
std     400.650874   381.402858   364.920606   378.162291   376.560621     0.0
min    -999.000000  -999.000000  -999.000000  -999.000000  -999.000000    42.0
25%     293.662082   151.000163   -15.455019   132.361674    83.294341    42.0
50%     294.547536   172.033902    25.198971   151.205220   122.195495    42.0
75%     295.321254   191.985363    69.925897   168.838007   159.575899    42.0
max     999.000000   999.000000   999.000000   999.000000   999.000000    42.0


In [4]:
# Build one [flag, column label, row count] record per column of df_final,
# in the layout passed to get_magic_numbers_main below.
# NOTE(review): the meaning of the "F" flag is not visible here - confirm
# against get_magic_numbers_main.
rows, columns = df_final.shape

extended_col_info = []
for i in range(columns):
    column_info = ["F", i, rows]  # flag, column name/index, number of rows
    extended_col_info.append(column_info)

In [5]:
# Run the magic-number detection on the synthetic frame; plotting is disabled.
# NOTE(review): `sign_violation_theshold` is kept exactly as spelled -
# presumably it matches the parameter name in get_magic_numbers_main; verify.
magic_master_dict, magic_cleaned_dict = get_magic_numbers_main(
    df_final,
    extended_col_info,
    sign_violation_theshold=3,
    gauss_threshold=0.01,
    overlap_threshold=5.0,
    plot_graphs=False,
)

In [6]:
# Per-column detection report as returned by get_magic_numbers_main.
print("Detected Magic Numbers:")
pprint(magic_master_dict)

Detected Magic Numbers:
{0: {'all_magic_numbers': False,
     'magic_distanced_numbers': array([ 999., -999.]),
     'magic_sign_violation': [-999.0],
     'magic_strings': []},
 1: {'all_magic_numbers': False,
     'magic_distanced_numbers': array([ 999., -999.]),
     'magic_sign_violation': [-999.0],
     'magic_strings': []},
 2: {'all_magic_numbers': False,
     'magic_distanced_numbers': array([ 999., -999.]),
     'magic_sign_violation': [],
     'magic_strings': []},
 3: {'all_magic_numbers': False,
     'magic_distanced_numbers': array([ 999., -999.]),
     'magic_sign_violation': [-999.0],
     'magic_strings': []},
 4: {'all_magic_numbers': False,
     'magic_distanced_numbers': array([ 999., -999.]),
     'magic_sign_violation': [],
     'magic_strings': []},
 5: {'all_magic_numbers': 42,
     'magic_distanced_numbers': [],
     'magic_sign_violation': [],
     'magic_strings': []}}


In [7]:
# Cleaned per-column result as returned by get_magic_numbers_main.
# The heading uses print, not pprint: pprint shows a string's repr, which
# wraps the heading in quotes.
print("Cleaned Magic Numbers:")
pprint(magic_cleaned_dict)

'Cleaned Magic Numbers:'
{0: [-999.0, 999.0],
 1: [-999.0, 999.0],
 2: [-999.0, 999.0],
 3: [-999.0, 999.0],
 4: [-999.0, 999.0],
 5: 42}
