In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.stats import gaussian_kde
from pprint import pprint
from scipy.integrate import simpson
from collections import Counter
import logging
logging.basicConfig(level=logging.DEBUG)

from get_magic_numbers import get_magic_numbers_main

from string_magic_numbers import magic_strings_detection as string_values_process
from sign_violation_magic_numbers import sign_violation_magic_numbers as opposite_sign_process
from distance_based_magic_numbers import delta_distributed_magic_numbers 
from identical_magic_numbers import identical_column_magic_numbers as all_values_are_same
from magic_dictionaries import magic_dictionary, add_to_master_dict, safe_concatenate
from magic_dictionaries import clean_magic_results
from density_plot import plot_data_density


In [28]:
def synthetic_data(
        mean, 
        sigma, 
        num_samples=1000, 
        random_seed=None, 
        magic_values=[-999, 999],
        quantities=[100, 50], 
        col_names="synthetic_col"):

    if random_seed is not None:
        np.random.seed(random_seed)

    # Calculate Clipping Bounds (HARDCODED 3-SIGMA) ---
    # Hardcode sigma_limit = 3.0 to maintain the 99.7% for testing consistency
    SIGMA_LIMIT = 3.0  
    lower_bound = mean - SIGMA_LIMIT * sigma
    upper_bound = mean + SIGMA_LIMIT * sigma
    
    # Generate and Clip the Main Normal Distribution Samples 
    
    # Generate the initial samples
    synthetic_col = np.random.normal(mean, sigma, num_samples)
    
    # Apply 3-sigma clipping to ensure the normal data is within the 99.7% range
    synthetic_col = np.clip(synthetic_col, a_min=lower_bound, a_max=upper_bound)

    # --- 3. Add Magic Values (Original behavior: total size increases) ---

    if len(magic_values) != len(quantities):
        raise ValueError("Length of magic_values must match length of quantities")
    
    # If no magic values returns the normal distribution
    if len(magic_values) == 0:
        df = pd.DataFrame({col_names: synthetic_col})
        return df
    
    # Add magic values to the dataset
    for magic_value, quantity in zip(magic_values, quantities):
        magic_samples = np.full(quantity, magic_value)
        synthetic_col = np.concatenate((synthetic_col, magic_samples))
    
    # Shuffle the final array
    np.random.shuffle(synthetic_col)
    
    df = pd.DataFrame({col_names: synthetic_col})
    return df

# Note: If you use the parameters quantities=[100, 50] and num_samples=1000,
# the final dataset will have 1000 + 100 + 50 = 1150 samples.

In [29]:
def generate_randomized_df(n_columns, num_samples=1000):

    column_list = []
    
    for i in range(n_columns):
        random_mean = np.random.uniform(10, 500)    # Mean between 10 and 500
        random_sigma = np.random.uniform(1, 50)     # Sigma between 1 and 50
        
        df_col = synthetic_data(
            mean=random_mean, 
            sigma=random_sigma, 
            num_samples=num_samples, 
            col_names=i, # Sets the header to 0, 1,... n-1
            random_seed=np.random.randint(0, 100000) # Different seed for each column
        )
        column_list.append(df_col)
    
    # 3. Combine all columns into one DataFrame
    final_df = pd.concat(column_list, axis=1)
    
    return final_df

df_final = generate_randomized_df(n_columns=5, num_samples=1000)

# p is the value you want, k is naturally the number of rows in the df
p = 42 
df_final[len(df_final.columns)] = p

for times in range(10):
    random_row_index = np.random.randint(0, len(df_final))
    df_final.loc[random_row_index, 4] = "Flag"
print(df_final.describe()) 

                 0            1            2            3       5
count  1150.000000  1150.000000  1150.000000  1150.000000  1150.0
mean     67.119461   166.132761     7.950269   110.578537    42.0
std     374.550272   392.797655   365.063525   381.276495     0.0
min    -999.000000  -999.000000  -999.000000  -999.000000    42.0
25%     105.873815   210.319373    56.416462   173.035912    42.0
50%     125.034432   239.361442    58.796786   176.951804    42.0
75%     144.065812   266.554845    61.367808   180.105466    42.0
max     999.000000   999.000000   999.000000   999.000000    42.0


  df_final.loc[random_row_index, 4] = "Flag"


In [30]:
rows, columns = df_final.shape

extended_col_info =[]
for i in range(columns):
    column_info = []

    column_info.append(i)  # Column name/index
    column_info.append("N") # Indicate Numerical column
    column_info.append(rows)  # Number of rows
    extended_col_info.append(column_info)

In [31]:
magic_master_dict, magic_cleaned_dict = get_magic_numbers_main(df_final, extended_col_info, sign_violation_theshold = 3,gauss_threshold = 0.01, overlap_threshold=5.0, plot_graphs = False)

  Skipping Processes 2 and 3 due to identical numeric values.


In [32]:
print ("Detected Magic Numbers:")
pprint(magic_master_dict)

Detected Magic Numbers:
{0: {'all_magic_numbers': None,
     'magic_distanced_numbers': array([ 999., -999.]),
     'magic_sign_violation': [-999.0],
     'magic_strings': []},
 1: {'all_magic_numbers': None,
     'magic_distanced_numbers': array([ 999., -999.]),
     'magic_sign_violation': [-999.0],
     'magic_strings': []},
 2: {'all_magic_numbers': None,
     'magic_distanced_numbers': array([ 999., -999.]),
     'magic_sign_violation': [-999.0],
     'magic_strings': []},
 3: {'all_magic_numbers': None,
     'magic_distanced_numbers': array([ 999., -999.]),
     'magic_sign_violation': [-999.0],
     'magic_strings': []},
 4: {'all_magic_numbers': None,
     'magic_distanced_numbers': array([ 999., -999.]),
     'magic_sign_violation': [-999.0],
     'magic_strings': ['Flag']},
 5: {'all_magic_numbers': 42,
     'magic_distanced_numbers': [],
     'magic_sign_violation': [],
     'magic_strings': []}}


In [33]:
pprint ("Cleaned Magic Numbers:")
pprint(magic_cleaned_dict)

'Cleaned Magic Numbers:'
{0: [-999.0, 999.0],
 1: [-999.0, 999.0],
 2: [-999.0, 999.0],
 3: [-999.0, 999.0],
 4: ['-999.0', '999.0', 'Flag']}


In [34]:
from astropy.io import ascii

readme_url = "https://cdsarc.cds.unistra.fr/ftp/J/A+A/469/861/ReadMe"
data_url = "https://cdsarc.cds.unistra.fr/ftp/J/A+A/469/861/table1.dat"

table = ascii.read(data_url, readme=readme_url, format="cds")
df = table.to_pandas()
print(df[:9])


   [BBG2007b] n_[BBG2007b]              Name  RAh  RAm    RAs DE-  DEd  DEm   DEs   Bmag   Rmag     RV  e_RV SpClass
0           1          NaN  J005535.4+261752    0   55  35.44   +   26   17  52.8  21.38  19.15  58643   135     NaN
1           2          NaN  J005536.0+262711    0   55  36.06   +   26   27  11.7  20.39  18.62  57857   157     NaN
2           3          NaN  J005539.1+261457    0   55  39.16   +   26   14  57.2  20.67  18.75  59498    90       k
3           4          NaN  J005539.4+261531    0   55  39.45   +   26   15  31.8  21.99  19.91  58948   150     NaN
4           5          NaN  J005540.1+262056    0   55  40.13   +   26   20  56.0  19.54  17.58  59605    64     k+a
5           6          NaN  J005544.5+261318    0   55  44.52   +   26   13  18.2  20.46  18.35  55782    74       k
6           7          NaN  J005544.5+262755    0   55  44.54   +   26   27  55.2  21.37  19.00  57965    96     NaN
7           8            F  J005544.5+262819    0   55  44.56   

In [35]:
df = table.to_pandas()

rows, columns = df.shape

extended_col_info =[]
for i in range(columns):
    column_info = []

    column_header = df.columns[i] # Get the header of each column
    column_info.append(column_header) 

    column_type = df[column_header].dtype
    if pd.api.types.is_float_dtype(column_type) == True or pd.api.types.is_integer_dtype(column_type) == True:
        column_info.append("N")  # Numeric column
    elif pd.api.types.is_string_dtype(column_type) == True:
        column_info.append("S")  # String column
        
    column_info.append(i)  # Column name/index
    column_info.append(rows)  # Number of rows
    extended_col_info.append(column_info)

pprint(extended_col_info)

[['[BBG2007b]', 'N', 0, 115],
 ['n_[BBG2007b]', 'S', 1, 115],
 ['Name', 'S', 2, 115],
 ['RAh', 'N', 3, 115],
 ['RAm', 'N', 4, 115],
 ['RAs', 'N', 5, 115],
 ['DE-', 'S', 6, 115],
 ['DEd', 'N', 7, 115],
 ['DEm', 'N', 8, 115],
 ['DEs', 'N', 9, 115],
 ['Bmag', 'N', 10, 115],
 ['Rmag', 'N', 11, 115],
 ['RV', 'N', 12, 115],
 ['e_RV', 'N', 13, 115],
 ['SpClass', 'S', 14, 115]]


In [36]:
magic_master_dict, magic_cleaned_dict = get_magic_numbers_main(df, extended_col_info, sign_violation_theshold = 3,gauss_threshold = 0.01, overlap_threshold=5.0, plot_graphs = False)

  Skipping Processes 2 and 3 due to identical numeric values.
  Skipping Processes 2 and 3 due to identical numeric values.


In [37]:
pprint ("Cleaned Magic Numbers:")
pprint(magic_cleaned_dict)
# Note: that the non numeric columns are ignored in the magic number detection 
# and in the cleaned results are last with None indicating it.


'Cleaned Magic Numbers:'
{'Bmag': [],
 'DE-': [],
 'DEm': [],
 'DEs': [0.2],
 'Name': [],
 'RAm': [],
 'RAs': [],
 'RV': [],
 'Rmag': [],
 'SpClass': [],
 '[BBG2007b]': [],
 'e_RV': [],
 'n_[BBG2007b]': []}


In [38]:
from astropy.io import ascii

# Local data file
data_file = "gxfluxes.dat"

# ReadMe URL for the data
readme_url = "ReadMe_531.txt"

# Read the table using CDS format
table = ascii.read(data_file, readme=readme_url, format="cds")

# Convert to pandas DataFrame
df = table.to_pandas()

# 1. Force all rows to show
pd.set_option('display.max_rows', None)

# 2. Force all columns to show (since you have 31 columns)
pd.set_option('display.max_columns', None)

# 3. Prevent the text from "wrapping" to a new line
pd.set_option('display.width', None)

print(df)



       Name note  RAh  RAm  RAs DE-  DEd  DEm  DEs instr     AA   Bmag  type  RA_Xh  RA_Xm  RA_Xs DE_X-  DE_Xd  DE_Xm  DE_Xs  radius l_counts    counts  e_counts l_crate    crate l_FluxX    FluxX     dist l_logLx  logLx
0     I0342  NaN    3   41   57   +   67   56   24     I   7045   7.86     6      3     41     35     +     67     56     19     510      NaN    165.75      20.6     NaN    80.49     NaN    49.27    6.300     NaN  40.37
1     I0749  NaN   11   56    0   +   43    0   48     I   7760  12.44     5   <NA>   <NA>   <NA>   NaN   <NA>   <NA>   <NA>    <NA>        <     24.37       NaN       <     5.55       <     1.82   27.300       <  40.21
2     I0750  NaN   11   56   17   +   43    0    6     I   7760  12.09     3   <NA>   <NA>   <NA>   NaN   <NA>   <NA>   <NA>    <NA>        <     24.92       NaN       <     5.68       <     1.86   27.300       <  40.22
3     I0989  NaN   14   12   18   +    3   21    0     I   4550  14.00    -5   <NA>   <NA>   <NA>   NaN   <NA>   <NA>   

In [39]:
df = table.to_pandas()

rows, columns = df.shape

extended_col_info =[]
for i in range(columns):
    column_info = []

    column_header = df.columns[i] # Get the header of each column
    column_info.append(column_header) 

    column_type = df[column_header].dtype
    if pd.api.types.is_float_dtype(column_type) == True or pd.api.types.is_integer_dtype(column_type) == True:
        column_info.append("N")  # Numeric column
    elif pd.api.types.is_string_dtype(column_type) == True:
        column_info.append("S")  # String column
        
    column_info.append(i)  # Column name/index
    column_info.append(rows)  # Number of rows
    extended_col_info.append(column_info)

pprint(extended_col_info)

[['Name', 'S', 0, 448],
 ['note', 'S', 1, 448],
 ['RAh', 'N', 2, 448],
 ['RAm', 'N', 3, 448],
 ['RAs', 'N', 4, 448],
 ['DE-', 'S', 5, 448],
 ['DEd', 'N', 6, 448],
 ['DEm', 'N', 7, 448],
 ['DEs', 'N', 8, 448],
 ['instr', 'S', 9, 448],
 ['AA', 'N', 10, 448],
 ['Bmag', 'N', 11, 448],
 ['type', 'N', 12, 448],
 ['RA_Xh', 'N', 13, 448],
 ['RA_Xm', 'N', 14, 448],
 ['RA_Xs', 'N', 15, 448],
 ['DE_X-', 'S', 16, 448],
 ['DE_Xd', 'N', 17, 448],
 ['DE_Xm', 'N', 18, 448],
 ['DE_Xs', 'N', 19, 448],
 ['radius', 'N', 20, 448],
 ['l_counts', 'S', 21, 448],
 ['counts', 'N', 22, 448],
 ['e_counts', 'N', 23, 448],
 ['l_crate', 'S', 24, 448],
 ['crate', 'N', 25, 448],
 ['l_FluxX', 'S', 26, 448],
 ['FluxX', 'N', 27, 448],
 ['dist', 'N', 28, 448],
 ['l_logLx', 'S', 29, 448],
 ['logLx', 'N', 30, 448]]


In [40]:
magic_master_dict, magic_cleaned_dict = get_magic_numbers_main(df, extended_col_info, sign_violation_theshold = 3,gauss_threshold = 0.01, overlap_threshold=10.0, plot_graphs = False)

In [41]:
pprint ("Cleaned Magic Numbers:")
pprint(magic_cleaned_dict)
# Note: that the non numeric columns are ignored in the magic number detection 
# and in the cleaned results are last with None indicating it.
print ("Detected Magic Numbers:")
pprint(magic_master_dict)

'Cleaned Magic Numbers:'
{'AA': [99999.0],
 'Bmag': [],
 'DE-': [],
 'DE_X-': [],
 'DE_Xd': [],
 'DE_Xm': [],
 'DE_Xs': [],
 'DEd': [85.0],
 'DEm': [],
 'DEs': [],
 'FluxX': [],
 'Name': [],
 'RA_Xh': [],
 'RA_Xm': [],
 'RA_Xs': [],
 'RAh': [],
 'RAm': [],
 'RAs': [],
 'counts': [],
 'crate': [],
 'dist': [],
 'e_counts': [],
 'instr': [],
 'l_FluxX': [],
 'l_counts': [],
 'l_crate': [],
 'l_logLx': [],
 'logLx': [],
 'note': [],
 'radius': [],
 'type': []}
Detected Magic Numbers:
{'AA': {'all_magic_numbers': None,
        'magic_distanced_numbers': array([99999.]),
        'magic_sign_violation': [],
        'magic_strings': []},
 'Bmag': {'all_magic_numbers': None,
          'magic_distanced_numbers': array([], dtype=float64),
          'magic_sign_violation': [],
          'magic_strings': []},
 'DE-': {'all_magic_numbers': None,
         'magic_distanced_numbers': [],
         'magic_sign_violation': [],
         'magic_strings': []},
 'DE_X-': {'all_magic_numbers': None,
         