In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.data.generate import generate_population_data
from src.discretization.quantile import quantile_discretize
from src.identifiability.evaluate import identify_accuracy

# Seed NumPy's legacy global RNG so repeated runs of this cell are repeatable.
# NOTE(review): this only has an effect if generate_population_data draws from
# the global np.random state -- confirm against its implementation.
np.random.seed(42)

# Simulation parameters
N_total = 1_000_000  # total population size (rows to generate)
N_train = 800_000  # leading rows that form the training cohort
N_test = N_total - N_train  # size of the remaining (test) cohort; unused in this cell
D = 6  # number of variables
# Discretization levels swept by the experiment.
# NOTE(review): 36 breaks the otherwise power-of-two progression -- confirm it
# is intentional and not a typo for 32.
bins_list = [4, 8, 16, 36, 64, 128]

# Generate the full population; any covariance structure is decided inside
# generate_population_data (not visible from this cell).
population_data = generate_population_data(N_total, D)

# Positional split: the first N_train rows are training, the rest are test.
# NOTE(review): train_data / test_data are not referenced again in this cell.
train_data, test_data = population_data[:N_train], population_data[N_train:]

# Sweep the discretization levels and record the identification accuracy
# obtained at each one as a (bins, accuracy) pair.
# NOTE(review): discretization is applied to the full population *before* the
# train/test split; if quantile_discretize derives its bin edges from the data
# it is given, those edges are based on training and test rows together --
# confirm this is intended.
results = []
for bins in bins_list:
    print(f"\n=== Evaluating discretization with {bins} bins ===")

    # Discretize everything at once, then split positionally at N_train.
    discretized_data = quantile_discretize(population_data, bins)
    train_disc, test_disc = discretized_data[:N_train], discretized_data[N_train:]

    # The returned score is printed as a percentage below.
    accuracy = identify_accuracy(train_disc, test_disc, bins)
    print(f"Identifiability (Top-1 Accuracy): {accuracy * 100:.2f}%")

    results.append((bins, accuracy))

# Tabulate the (bins, accuracy) pairs so the columns can be plotted by name.
results_df = pd.DataFrame(results, columns=["Discretization_bins", "Identifiability"])

# Draw accuracy against bin count on an explicit Figure/Axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(results_df["Discretization_bins"], results_df["Identifiability"], marker='o')
ax.set_title('Identifiability vs Discretization Bins')
ax.set_xlabel('Number of Discretization Bins')
ax.set_ylabel('Identifiability (Top-1 Accuracy)')
# Place one tick at each evaluated bin count instead of the automatic ticks.
ax.set_xticks(results_df["Discretization_bins"])
ax.grid()
plt.show()

ModuleNotFoundError: No module named 'src'