<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/LBN_Sparse_AIC_3_Nodes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [3]:
pip install pgmpy



In [4]:
import numpy as np
import pandas as pd
from pgmpy.estimators import HillClimbSearch, AICScore, MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import BayesianEstimator
from sklearn.model_selection import train_test_split
from scipy.stats import entropy
from tabulate import tabulate

# Bayesian Network Data Generation 50, ..., 20000 Samples (sparse)

In [5]:
# Draw one fresh, random set of probability tables for the three nodes
def generate_cpds_with_noise_and_variability():
    """Draw random probability tables for IR, EI (given IR) and SP.

    Every call consumes the global NumPy random state and yields a new,
    independent set of tables. Each table is normalised along axis 0, so
    axis 0 indexes the state of the node the table belongs to.

    Returns:
        tuple: ``(ir_probs, ei_given_ir_probs, sp_probs)`` where
            - ``ir_probs`` has shape (3,) and sums to 1,
            - ``ei_given_ir_probs`` has shape (3, 3); every column sums to 1,
            - ``sp_probs`` has shape (3, 3, 3); sums to 1 along axis 0.
    """
    n_states = 3

    # Marginal distribution over the three IR states
    ir_weights = np.random.rand(n_states)
    ir_probs = ir_weights / ir_weights.sum()

    # Raw weights are squeezed into [0.25, 0.75) before normalising, which keeps
    # the columns of the EI table from becoming strongly peaked
    ei_weights = np.random.rand(n_states, n_states) * 0.5 + 0.25
    ei_given_ir_probs = ei_weights / ei_weights.sum(axis=0, keepdims=True)

    # Generic 3x3x3 table for SP; returned as a placeholder with a fixed shape
    sp_weights = np.random.rand(n_states, n_states, n_states)
    sp_probs = sp_weights / sp_weights.sum(axis=0, keepdims=True)

    return ir_probs, ei_given_ir_probs, sp_probs

# Function to generate samples with varying dependencies and include probabilities
def generate_and_save_samples_with_varying_dependencies(sample_size, filename, seed=None):
    """Generate ``sample_size`` rows of (IR, EI, SP) states and write them to a CSV.

    For every row a fresh set of probability tables is drawn, IR is sampled
    from its marginal, EI is sampled given IR, and SP is sampled given a
    randomly chosen subset (one or both) of {IR, EI}. Alongside each sampled
    state the probability it was drawn with is stored.

    Args:
        sample_size (int): Number of rows to generate.
        filename (str): Path of the CSV file to write (no index column).
        seed (int, optional): If given, the global NumPy random state is
            re-seeded with this value before sampling so the output is
            reproducible. ``None`` (default) leaves the random state untouched.

    Side effects:
        Writes ``filename``, prints a confirmation line and advances (or, when
        ``seed`` is given, resets) the global NumPy random state.
    """
    if seed is not None:
        np.random.seed(seed)

    # Loop-invariant lookups, defined once instead of once per row
    ir_labels = ['low', 'medium', 'high']
    ei_labels = ['poor', 'average', 'good']
    sp_labels = ['decrease', 'stable', 'increase']

    # List of all nodes potentially influencing SP
    potential_influencers = ['IR', 'EI']

    # Explicit column order so that even an empty dataset is written with a header
    columns = ['IR_State', 'IR_Probability',
               'EI_State', 'EI_Probability',
               'SP_State', 'SP_Probability']

    output_data = []
    for _ in range(sample_size):
        # Fresh tables for this row; the helper's generic SP table is not used
        # because SP's table is rebuilt below for the selected influencers
        ir_probs, ei_probs, _ = generate_cpds_with_noise_and_variability()

        # Select 1-2 random nodes to influence SP for this entry
        num_influencers = int(np.random.choice([1, 2]))
        selected_influencers = np.random.choice(potential_influencers, num_influencers, replace=False)

        # SP's table: axis 0 is SP's own state, the remaining axes are the
        # selected influencers (in selection order). Normalised over axis 0.
        sp_probs = np.random.rand(*([3] * (num_influencers + 1)))
        sp_probs /= sp_probs.sum(axis=0, keepdims=True)

        # Sample each node's state individually and store probabilities
        ir_state_idx = np.random.choice(3, p=ir_probs)
        ei_probs_given_ir = ei_probs[:, ir_state_idx]
        ei_state_idx = np.random.choice(3, p=ei_probs_given_ir)

        # Condition SP on the sampled states of the selected influencers. The
        # leading slice keeps axis 0 (SP's state), the axis the table was
        # normalised over, so the slice already sums to 1 - the same layout
        # used for the EI table above.
        state_idx_by_node = {'IR': ir_state_idx, 'EI': ei_state_idx}
        parent_states = tuple(state_idx_by_node[node] for node in selected_influencers)
        sp_probs_selected = sp_probs[(slice(None),) + parent_states]

        # Sample SP's state from its conditional distribution
        sp_state_idx = np.random.choice(3, p=sp_probs_selected)

        # Append data for this entry, including probabilities
        output_data.append({
            'IR_State': ir_labels[ir_state_idx],
            'IR_Probability': ir_probs[ir_state_idx],
            'EI_State': ei_labels[ei_state_idx],
            'EI_Probability': ei_probs_given_ir[ei_state_idx],
            'SP_State': sp_labels[sp_state_idx],
            'SP_Probability': sp_probs_selected[sp_state_idx]
        })

    # Save the generated dataset
    output_df = pd.DataFrame(output_data, columns=columns)
    output_df.to_csv(filename, index=False)
    print(f"Sample size {sample_size} generated and saved to {filename}")

# Generate data for different sample sizes with varying dependencies.
# The global NumPy random state is seeded first so that re-running the notebook
# from a fresh kernel regenerates exactly the same CSV files.
DATA_SEED = 42
np.random.seed(DATA_SEED)

# 50..1000 in steps of 50, then 2000..20000 in steps of 1000
sample_sizes = list(range(50, 1001, 50)) + list(range(2000, 20001, 1000))
for size in sample_sizes:
    generate_and_save_samples_with_varying_dependencies(size, f'varying_dependencies_data_{size}.csv')

Sample size 50 generated and saved to varying_dependencies_data_50.csv
Sample size 100 generated and saved to varying_dependencies_data_100.csv
Sample size 150 generated and saved to varying_dependencies_data_150.csv
Sample size 200 generated and saved to varying_dependencies_data_200.csv
Sample size 250 generated and saved to varying_dependencies_data_250.csv
Sample size 300 generated and saved to varying_dependencies_data_300.csv
Sample size 350 generated and saved to varying_dependencies_data_350.csv
Sample size 400 generated and saved to varying_dependencies_data_400.csv
Sample size 450 generated and saved to varying_dependencies_data_450.csv
Sample size 500 generated and saved to varying_dependencies_data_500.csv
Sample size 550 generated and saved to varying_dependencies_data_550.csv
Sample size 600 generated and saved to varying_dependencies_data_600.csv
Sample size 650 generated and saved to varying_dependencies_data_650.csv
Sample size 700 generated and saved to varying_depend

# LBN Sparse AIC & K-L Divergence (Relative Entropy)

In [6]:
# Sample sizes to loop through: 50..1000 in steps of 50, then 2000..20000 in
# steps of 1000 (one CSV per size is read below)
sample_sizes = list(range(50, 1001, 50)) + list(range(2000, 20001, 1000))

# Initialize list to store K-L divergence and standard deviation values for each sample size
results = []

# Small smoothing value to avoid zero probabilities
epsilon = 1e-10

# Integer encodings for the three nodes (IR, EI, SP); loop-invariant
ir_map = {'low': 0, 'medium': 1, 'high': 2}
ei_map = {'poor': 0, 'average': 1, 'good': 2}
sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

# Edges handed to the structure search as `fixed_edges`: both IR and EI point
# into SP in every learned structure; only the remaining edges are searched
priors = [
    ('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')
]


def _label_distribution_divergence(true_labels, predicted_labels, smoothing):
    """Compare the empirical class distributions of two label Series.

    Both Series are reduced to relative class frequencies, aligned on the
    union of the classes that occur in either one (missing classes receive
    ``smoothing`` as their mass), and then compared.

    NOTE(review): a class that occurs in ``true_labels`` but is never
    predicted gets mass ``smoothing`` on the predicted side, so its
    p * log(p / smoothing) term dominates the K-L value. With 1e-10 this is
    a likely source of the occasional very large divergences - confirm.

    Args:
        true_labels (pd.Series): Observed labels.
        predicted_labels (pd.Series): Predicted labels.
        smoothing (float): Mass assigned to classes absent from one side.

    Returns:
        tuple: ``(kl_divergence, std_dev)`` - ``scipy.stats.entropy`` of the
        true vs. predicted distribution, and ``np.std`` of the per-class
        differences (predicted minus true).
    """
    ground_truth = true_labels.value_counts(normalize=True).sort_index()
    predicted = predicted_labels.value_counts(normalize=True).sort_index()

    # Reindex both distributions onto the same classes and add smoothing
    all_categories = sorted(set(ground_truth.index).union(set(predicted.index)))
    ground_truth = ground_truth.reindex(all_categories, fill_value=smoothing)
    predicted = predicted.reindex(all_categories, fill_value=smoothing)

    kl = entropy(pk=ground_truth, qk=predicted)
    std = np.std(predicted - ground_truth)
    return kl, std


# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size}")

    # Load the dataset for the current sample size
    data_file = f'varying_dependencies_data_{sample_size}.csv'
    df = pd.read_csv(data_file)

    # Encode each node in the DataFrame
    df['IR_encoded'] = df['IR_State'].map(ir_map)
    df['EI_encoded'] = df['EI_State'].map(ei_map)
    df['SP_encoded'] = df['SP_State'].map(sp_map)

    # Split the data into training and test sets
    X = df[['IR_encoded', 'EI_encoded']]
    y = df['SP_encoded']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

    # Concatenate X_train and y_train to form the training set for learning the BN structure
    df_train = pd.concat([X_train, y_train], axis=1)

    # Perform Hill Climb Search (AIC score) on the training data with the fixed edges
    hc = HillClimbSearch(df_train)
    best_dag = hc.estimate(scoring_method=AICScore(df_train), fixed_edges=priors)

    # Initialize BayesianNetwork and print edges
    best_model = BayesianNetwork(best_dag.edges())
    print("Learned structure (edges):", best_model.edges())

    # Defensive guard: a structure without edges cannot be evaluated
    if len(best_model.edges()) == 0:
        print("No edges learned. Skipping to next sample size.")
        continue

    # Fit parameters using BayesianEstimator on the training data
    try:
        best_model.fit(df_train, estimator=BayesianEstimator, prior_type="BDeu", equivalent_sample_size=8)
        print("Model fitted successfully.")
    except Exception as e:
        print("Error during fitting:", str(e))
        continue

    # Check model validity
    if not best_model.check_model():
        print("Model check failed.")
        continue

    # Filter X_test to include only nodes in the learned structure, excluding SP_encoded
    learned_nodes = [node for node in best_model.nodes() if node != 'SP_encoded']
    X_test_filtered = X_test[learned_nodes]

    # Predict on the filtered test data and calculate K-L divergence
    try:
        # Predicted SP labels for the test rows, based on the learned model
        inference = best_model.predict(X_test_filtered)
        predicted_labels = inference['SP_encoded']

        kl_divergence, std_dev = _label_distribution_divergence(y_test, predicted_labels, epsilon)

        results.append({
            'Sample_Size': sample_size,
            'K-L_Divergence': kl_divergence,
            'Standard_Deviation': std_dev
        })

        print(f"K-L Divergence: {kl_divergence:.4f}")
        print(f"Standard Deviation: {std_dev:.4f}")

    except Exception as e:
        # Best effort: report the failure and move on; this sample size is
        # then absent from the saved results
        print("Error during prediction:", str(e))

# Save the K-L divergence results
results_df = pd.DataFrame(results)
results_df.to_csv('kl_std_aic_sparse_dynamic.csv', index=False)

print("\nAll results have been saved.")


Processing sample size: 50


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/7 [00:00<?, ?it/s]

K-L Divergence: 0.4953
Standard Deviation: 0.3311

Processing sample size: 100


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/8 [00:00<?, ?it/s]

K-L Divergence: 0.0845
Standard Deviation: 0.1440

Processing sample size: 150


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 4.1174
Standard Deviation: 0.1550

Processing sample size: 200


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.1700
Standard Deviation: 0.1841

Processing sample size: 250


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0336
Standard Deviation: 0.0891

Processing sample size: 300


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0508
Standard Deviation: 0.1100

Processing sample size: 350


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.1346
Standard Deviation: 0.1629

Processing sample size: 400


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.2479
Standard Deviation: 0.2366

Processing sample size: 450


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0808
Standard Deviation: 0.1395

Processing sample size: 500


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded'), ('EI_encoded', 'IR_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0235
Standard Deviation: 0.0708

Processing sample size: 550


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 9.2377
Standard Deviation: 0.3223

Processing sample size: 600


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0353
Standard Deviation: 0.0847

Processing sample size: 650


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0714
Standard Deviation: 0.1208

Processing sample size: 700


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0710
Standard Deviation: 0.1171

Processing sample size: 750


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.1193
Standard Deviation: 0.1682

Processing sample size: 800


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0347
Standard Deviation: 0.0915

Processing sample size: 850


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0367
Standard Deviation: 0.0881

Processing sample size: 900


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0155
Standard Deviation: 0.0552

Processing sample size: 950


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 5.9611
Standard Deviation: 0.2215

Processing sample size: 1000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.3034
Standard Deviation: 0.2027

Processing sample size: 2000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.1046
Standard Deviation: 0.1590

Processing sample size: 3000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0534
Standard Deviation: 0.1081

Processing sample size: 4000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.2532
Standard Deviation: 0.2290

Processing sample size: 5000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0464
Standard Deviation: 0.0983

Processing sample size: 6000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.2101
Standard Deviation: 0.1924

Processing sample size: 7000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 7.2927
Standard Deviation: 0.2790

Processing sample size: 8000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0406
Standard Deviation: 0.0948

Processing sample size: 9000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0989
Standard Deviation: 0.1561

Processing sample size: 10000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded'), ('EI_encoded', 'IR_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.2546
Standard Deviation: 0.2325

Processing sample size: 11000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0374
Standard Deviation: 0.0866

Processing sample size: 12000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0357
Standard Deviation: 0.0871

Processing sample size: 13000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0445
Standard Deviation: 0.0980

Processing sample size: 14000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0446
Standard Deviation: 0.0959

Processing sample size: 15000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0345
Standard Deviation: 0.0838

Processing sample size: 16000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0442
Standard Deviation: 0.0953

Processing sample size: 17000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.1736
Standard Deviation: 0.1580

Processing sample size: 18000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 7.2405
Standard Deviation: 0.2453

Processing sample size: 19000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.1889
Standard Deviation: 0.1789

Processing sample size: 20000


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]

K-L Divergence: 0.0429
Standard Deviation: 0.0954

All results have been saved.
