<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/LBN_Sparse_BIC_3_Nodes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
pip install pgmpy



In [2]:
import numpy as np
import pandas as pd
from pgmpy.estimators import HillClimbSearch, BicScore, MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import BayesianEstimator
from sklearn.model_selection import train_test_split
from scipy.stats import entropy
from tabulate import tabulate

# Bayesian Network Data Generation 50, ..., 20000 Samples (sparse)

In [3]:
def generate_cpds_sparse_3_total_nodes():
    """Draw random probability tables for the three-node network (IR, EI, SP).

    Each table is filled with uniform random numbers and then normalised along
    axis 0, so axis 0 always indexes the states of the variable being described
    and any remaining axes index the states it is conditioned on.

    Returns:
        tuple: ``(ir_probs, ei_given_ir_probs, sp_probs)`` where

        * ``ir_probs`` has shape ``(3,)`` and sums to 1;
        * ``ei_given_ir_probs`` has shape ``(3, 3)`` and every column sums to 1
          (the sampler reads column ``i`` when IR is in state ``i``);
        * ``sp_probs`` has shape ``(3, 3, 3)`` and ``sp_probs[:, i, j]`` sums
          to 1 (the sampler reads it when IR is in state ``i`` and EI in
          state ``j``).
    """
    def _normalise_over_states(table):
        # Rescale so the entries along axis 0 (the variable's own states) sum to 1.
        return table / table.sum(axis=0, keepdims=True)

    # The three draws happen in a fixed order (IR, EI, SP) so that a seeded
    # global NumPy RNG always yields the same tables.
    ir_probs = _normalise_over_states(np.random.rand(3))
    ei_given_ir_probs = _normalise_over_states(np.random.rand(3, 3))
    sp_probs = _normalise_over_states(np.random.rand(3, 3, 3))

    return ir_probs, ei_given_ir_probs, sp_probs

def generate_and_save_samples_sparse_3_total_nodes(ir_probs, ei_probs, sp_probs, sample_size, filename):
    """Sample rows from the three-node network, write them to CSV and preview them.

    Each row is drawn by ancestral sampling with the global NumPy RNG: first IR
    from ``ir_probs``, then EI from the column of ``ei_probs`` selected by the
    IR state, then SP from ``sp_probs[:, ir, ei]``.

    Args:
        ir_probs: Length-3 probability vector for IR.
        ei_probs: Array indexed as ``[ei_state, ir_state]``.
        sp_probs: Array indexed as ``[sp_state, ir_state, ei_state]``.
        sample_size: Number of rows to generate.
        filename: Destination path of the CSV file (written without the index).

    Side effects:
        Writes ``filename`` and prints the first rows of the generated table.
    """
    # State labels, in the order of the integer indices returned by the sampler.
    ir_labels = ('low', 'medium', 'high')
    ei_labels = ('poor', 'average', 'good')
    sp_labels = ('decrease', 'stable', 'increase')
    sp_probabilities_column = 'SP_Probabilities (decrease, stable, increase)'

    records = []
    for _ in range(sample_size):
        # Parents are sampled before children; the draw order (IR, EI, SP) is
        # kept fixed so a seeded run reproduces the same rows.
        ir_idx = np.random.choice(3, p=ir_probs)
        ei_idx = np.random.choice(3, p=ei_probs[:, ir_idx])

        # Distribution of SP for this particular (IR, EI) combination.
        sp_distribution = sp_probs[:, ir_idx, ei_idx]
        sp_idx = np.random.choice(3, p=sp_distribution)

        # Store the labels together with the SP distribution that produced the draw.
        records.append({
            'IR_State': ir_labels[ir_idx],
            'EI_State': ei_labels[ei_idx],
            sp_probabilities_column: ', '.join(f'{prob:.4f}' for prob in sp_distribution),
            'Chosen_SP_State': sp_labels[sp_idx]
        })

    samples_df = pd.DataFrame(records)
    samples_df.to_csv(filename, index=False)

    # Show the head of the table as a quick visual sanity check.
    print(f"\nSample size: {sample_size} - First few rows of generated samples:\n")
    print(tabulate(samples_df.head(), headers='keys', tablefmt='grid'))

# Seed the global NumPy RNG so the generated CSV files are identical on every
# Restart & Run All (both generator functions above draw from np.random).
SEED = 42
np.random.seed(SEED)

# Sample sizes to generate. This list is a superset of the sizes analysed in
# the next cell, so a single pass over it produces every CSV that cell reads.
sample_sizes = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000]

# NOTE(review): a fresh set of CPDs is drawn for every sample size, so files of
# different sizes are sampled from different ground-truth distributions.
# Confirm this is intended before comparing metrics across sample sizes.
for size in sample_sizes:
    ir_probs, ei_probs, sp_probs = generate_cpds_sparse_3_total_nodes()
    generate_and_save_samples_sparse_3_total_nodes(ir_probs, ei_probs, sp_probs, size, f'combined_probabilities_{size}.csv')

print("\nGeneration and saving of individual samples complete for all sample sizes!")


Sample size: 50 - First few rows of generated samples:

+----+------------+------------+-------------------------------------------------+-------------------+
|    | IR_State   | EI_State   | SP_Probabilities (decrease, stable, increase)   | Chosen_SP_State   |
|  0 | medium     | average    | 0.3270, 0.1196, 0.5534                          | increase          |
+----+------------+------------+-------------------------------------------------+-------------------+
|  1 | high       | average    | 0.3060, 0.4416, 0.2524                          | stable            |
+----+------------+------------+-------------------------------------------------+-------------------+
|  2 | medium     | average    | 0.3270, 0.1196, 0.5534                          | decrease          |
+----+------------+------------+-------------------------------------------------+-------------------+
|  3 | medium     | good       | 0.3693, 0.1825, 0.4482                          | decrease          |
+----+----------

# LBN Sparse BIC & Entropy

In [4]:
# Sample sizes to loop through
sample_sizes = [50, 100, 150, 200, 500, 1000, 5000, 10000, 15000, 20000]

# Initialize list to store K-L divergence and standard deviation values for each sample size
results = []

# Small smoothing value to avoid zero probabilities
epsilon = 1e-10

# Integer encodings for the state labels written by the data-generation cell.
# They do not depend on the sample size, so they are built once, outside the loop.
ir_map = {'low': 0, 'medium': 1, 'high': 2}
ei_map = {'poor': 0, 'average': 1, 'good': 2}
sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

# Edges that are passed as `fixed_edges` to the hill-climb search, i.e. both IR
# and EI are forced to be parents of SP in every learned structure.
priors = [
    ('IR_encoded', 'SP_encoded'),
    ('EI_encoded', 'SP_encoded')
]

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size}")

    # Load the dataset generated for the current sample size.
    # (The `dense` in these variable names is historical: the file read here is
    # the one written by the sparse-structure generator earlier in the notebook.)
    dense_data_file = f'combined_probabilities_{sample_size}.csv'
    df_dense = pd.read_csv(dense_data_file)

    # Encode the categorical IR, EI and SP labels as integers
    df_dense['IR_encoded'] = df_dense['IR_State'].map(ir_map)
    df_dense['EI_encoded'] = df_dense['EI_State'].map(ei_map)
    df_dense['SP_encoded'] = df_dense['Chosen_SP_State'].map(sp_map)

    # Split the data into training, validation, and test sets
    X = df_dense[['IR_encoded', 'EI_encoded']]
    y = df_dense['SP_encoded']

    # Split into training (70%) and temp (30%), then halve temp into validation and test.
    # The validation part (X_val, y_val) is produced but not used further in this cell.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, shuffle=True)

    # Concatenate X and y to form the training set for learning the BN structure
    df_train = pd.concat([X_train, y_train], axis=1)
    print("Training data:")
    print(df_train.head())

    # Perform Hill Climb Search scored by BIC, with the fixed edges defined above
    hc = HillClimbSearch(df_train)
    best_dag = hc.estimate(scoring_method=BicScore(df_train), fixed_edges=priors)

    # Initialize BayesianNetwork from the learned edges and print them
    best_model = BayesianNetwork(best_dag.edges())
    print("Learned structure (edges):", best_model.edges())

    if len(best_model.edges()) == 0:
        print("No edges learned. Skipping to next sample size.")
        continue

    # Fit the CPDs with a BDeu prior; a failure is reported and this sample size is skipped
    try:
        best_model.fit(df_train, estimator=BayesianEstimator, prior_type="BDeu", equivalent_sample_size=5)
        print("Model fitted successfully.")
    except Exception as e:
        print("Error during fitting:", str(e))
        continue

    # Check model validity
    if not best_model.check_model():
        print("Model check failed.")
        continue

    # --- K-L Divergence and Standard Deviation Calculation Block ---
    # Predict SP for the test rows
    try:
        inference = best_model.predict(X_test)
        predicted_probabilities = inference['SP_encoded']

        # Both distributions are class frequencies over the test rows:
        # the observed SP labels versus the labels returned by `predict`.
        ground_truth_probabilities = y_test.value_counts(normalize=True).sort_index()
        predicted_probabilities = predicted_probabilities.value_counts(normalize=True).sort_index()

        # Reindex both distributions to the same set of categories. A category
        # missing on one side receives `epsilon` instead of 0, so a class that
        # occurs in the test labels but is never predicted yields a very large
        # K-L value rather than an infinite one.
        all_categories = sorted(set(ground_truth_probabilities.index).union(set(predicted_probabilities.index)))
        ground_truth_probabilities = ground_truth_probabilities.reindex(all_categories, fill_value=epsilon)
        predicted_probabilities = predicted_probabilities.reindex(all_categories, fill_value=epsilon)

        # With `qk` given, scipy's `entropy` returns the K-L divergence D(pk || qk)
        kl_divergence = entropy(pk=ground_truth_probabilities, qk=predicted_probabilities)

        # Standard deviation of the per-class differences between predicted and observed frequencies
        std_dev = np.std(predicted_probabilities - ground_truth_probabilities)

        # Append results for this sample size
        results.append({
            'Sample_Size': sample_size,
            'K-L_Divergence': kl_divergence,
            'Standard_Deviation': std_dev
        })

        # Print the K-L divergence and standard deviation for this sample size
        print(f"\nResults for sample size {sample_size}:")
        print(f"K-L Divergence: {kl_divergence:.4f}")
        print(f"Standard Deviation: {std_dev:.4f}")

    except Exception as e:
        print("Error during prediction:", str(e))
        continue

# After processing all sample sizes, save results to CSV and display them
results_df = pd.DataFrame(results)
results_df.to_csv('kl_std_bic_results_nodes.csv', index=False)

# Print all results after saving to CSV
print("\nAll results have been saved to 'kl_std_bic_results_nodes.csv'.")
print(results_df)


Processing sample size: 50
Training data:
    IR_encoded  EI_encoded  SP_encoded
6            0           1           0
41           0           1           1
46           0           2           1
47           0           2           0
15           1           0           2


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/5 [00:00<?, ?it/s]


Results for sample size 50:
K-L Divergence: 0.2092
Standard Deviation: 0.2041

Processing sample size: 100
Training data:
    IR_encoded  EI_encoded  SP_encoded
11           1           2           1
47           0           2           0
85           1           0           1
28           1           0           1
93           1           2           2


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/5 [00:00<?, ?it/s]


Results for sample size 100:
K-L Divergence: 0.0767
Standard Deviation: 0.1089

Processing sample size: 150
Training data:
     IR_encoded  EI_encoded  SP_encoded
81            0           1           1
133           1           0           1
137           1           1           0
75            1           2           2
109           2           2           1


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/7 [00:00<?, ?it/s]


Results for sample size 150:
K-L Divergence: 0.3869
Standard Deviation: 0.2485

Processing sample size: 200
Training data:
     IR_encoded  EI_encoded  SP_encoded
169           1           1           2
97            2           1           0
31            0           1           1
12            1           1           0
35            2           1           1


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]


Results for sample size 200:
K-L Divergence: 0.8168
Standard Deviation: 0.3031

Processing sample size: 500
Training data:
     IR_encoded  EI_encoded  SP_encoded
5             2           2           1
116           0           0           1
45            1           1           2
16            2           2           1
462           2           2           1


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]


Results for sample size 500:
K-L Divergence: 0.1950
Standard Deviation: 0.2074

Processing sample size: 1000
Training data:
     IR_encoded  EI_encoded  SP_encoded
541           2           0           1
440           2           0           0
482           1           2           1
422           0           1           2
778           1           2           1


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded'), ('EI_encoded', 'IR_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]


Results for sample size 1000:
K-L Divergence: 0.3326
Standard Deviation: 0.1720

Processing sample size: 5000
Training data:
      IR_encoded  EI_encoded  SP_encoded
1840           1           0           1
2115           0           0           1
4437           1           0           0
1146           1           2           1
2486           1           0           2


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded'), ('EI_encoded', 'IR_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]


Results for sample size 5000:
K-L Divergence: 6.4991
Standard Deviation: 0.2721

Processing sample size: 10000
Training data:
      IR_encoded  EI_encoded  SP_encoded
9069           0           2           2
2603           1           1           0
7738           0           1           1
1579           0           2           0
5058           1           2           0


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]


Results for sample size 10000:
K-L Divergence: 0.2098
Standard Deviation: 0.1919

Processing sample size: 15000
Training data:
       IR_encoded  EI_encoded  SP_encoded
11797           0           2           0
5899            1           2           0
9513            0           2           0
1572            1           2           2
12995           2           0           2


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]


Results for sample size 15000:
K-L Divergence: 0.4355
Standard Deviation: 0.2306

Processing sample size: 20000
Training data:
       IR_encoded  EI_encoded  SP_encoded
17218           2           1           1
15188           2           0           1
11295           2           2           1
19772           2           0           1
13072           2           1           0


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure (edges): [('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]
Model fitted successfully.


  0%|          | 0/9 [00:00<?, ?it/s]


Results for sample size 20000:
K-L Divergence: 0.4390
Standard Deviation: 0.1831

All results have been saved to 'kl_std_bic_results_nodes.csv'.
   Sample_Size  K-L_Divergence  Standard_Deviation
0           50        0.209247            0.204124
1          100        0.076715            0.108866
2          150        0.386850            0.248499
3          200        0.816845            0.303071
4          500        0.195022            0.207418
5         1000        0.332644            0.172047
6         5000        6.499099            0.272113
7        10000        0.209827            0.191908
8        15000        0.435518            0.230646
9        20000        0.438950            0.183108
