<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/LBN_Dense_BIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [16]:
pip install pgmpy



In [17]:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD
from pgmpy.sampling import BayesianModelSampling
from sklearn.preprocessing import LabelEncoder
from pgmpy.estimators import HillClimbSearch, BicScore, MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
from sklearn.metrics import accuracy_score
from scipy.stats import entropy
import os
from tabulate import tabulate
from sklearn.model_selection import train_test_split

# Bayesian Network Data Generation 1000, 2000, ..., 10000 Samples (dense)

In [21]:
# Function to generate CPDs
def generate_cpds():
    # Generate random probabilities for IR
    ir_probs = np.random.rand(3)
    ir_probs /= ir_probs.sum()  # Normalize to make it a valid probability distribution

    # Generate random probabilities for EI given IR
    ei_given_ir_probs = np.random.rand(3, 3)
    ei_given_ir_probs /= ei_given_ir_probs.sum(axis=0, keepdims=True)

    # Generate random probabilities for SP given IR and EI
    sp_probs = np.random.rand(3, 3, 3)
    sp_probs /= sp_probs.sum(axis=0, keepdims=True)

    return ir_probs, ei_given_ir_probs, sp_probs

# Function to generate and save samples
def generate_and_save_samples(ir_probs, ei_probs, sp_probs, sample_size, filename):
    output_data = []

    # Generate `sample_size` random samples
    for _ in range(sample_size):
        # Sample `IR` state based on `IR` probabilities
        ir_state_idx = np.random.choice(3, p=ir_probs)
        ir_state = ['low', 'medium', 'high'][ir_state_idx]
        ir_prob = ir_probs[ir_state_idx]

        # Sample `EI` state based on `EI` probabilities given `IR`
        ei_probs_given_ir = ei_probs[:, ir_state_idx]
        ei_state_idx = np.random.choice(3, p=ei_probs_given_ir)
        ei_state = ['poor', 'average', 'good'][ei_state_idx]
        ei_prob = ei_probs_given_ir[ei_state_idx]

        # Sample `SP` state based on `SP` probabilities given `IR` and `EI`
        sp_probs_given_ir_ei = sp_probs[:, ir_state_idx, ei_state_idx]
        sp_state_idx = np.random.choice(3, p=sp_probs_given_ir_ei)
        sp_state = ['decrease', 'stable', 'increase'][sp_state_idx]
        sp_prob = sp_probs_given_ir_ei[sp_state_idx]

        # Append sample data to output list
        output_data.append({
            'IR_State': ir_state,
            'IR_Prob': f'{ir_prob:.4f}',
            'EI_State': ei_state,
            'EI_Prob': f'{ei_prob:.4f}',
            'SP_Probabilities (decrease, stable, increase)': ', '.join([f'{prob:.4f}' for prob in sp_probs_given_ir_ei]),
            'Chosen_SP_State': sp_state,
            'Chosen_SP_Probability': f'{sp_prob:.4f}'
        })

    # Create a DataFrame from the output data
    output_df = pd.DataFrame(output_data)

    # Save the output DataFrame to a CSV file
    output_df.to_csv(filename, index=False)

    # Print the first few rows for visual confirmation
    print(f"\nSample size: {sample_size} - First few rows of generated samples:\n")
    print(tabulate(output_df.head(), headers='keys', tablefmt='grid'))

# Generate and save samples for sample sizes from 1000 to 10000 every 1000
sample_sizes = range(1000, 11000, 1000)

for size in sample_sizes:
    # Generate the CPDs
    ir_probs, ei_given_ir_probs, sp_probs = generate_cpds()

    # Generate and save individual samples for the given sample size
    generate_and_save_samples(ir_probs, ei_given_ir_probs, sp_probs, size, f'combined_probabilities_{size}.csv')

# Notify the user that the process is done
print("\nGeneration and saving of individual samples complete for all sample sizes!")


Sample size: 1000 - First few rows of generated samples:

+----+------------+-----------+------------+-----------+-------------------------------------------------+-------------------+-------------------------+
|    | IR_State   |   IR_Prob | EI_State   |   EI_Prob | SP_Probabilities (decrease, stable, increase)   | Chosen_SP_State   |   Chosen_SP_Probability |
|  0 | low        |    0.304  | average    |    0.4925 | 0.5121, 0.1305, 0.3574                          | decrease          |                  0.5121 |
+----+------------+-----------+------------+-----------+-------------------------------------------------+-------------------+-------------------------+
|  1 | high       |    0.4588 | average    |    0.436  | 0.3202, 0.2415, 0.4383                          | decrease          |                  0.3202 |
+----+------------+-----------+------------+-----------+-------------------------------------------------+-------------------+-------------------------+
|  2 | high       |    

# LBN Dense BIC

In [22]:
# Sample sizes to loop through
sample_sizes = range(1000, 11000, 1000)

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size}")

    # Load the dense dataset for the current sample size
    dense_data_file = f'combined_probabilities_{sample_size}.csv'
    df_dense = pd.read_csv(dense_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df_dense['IR_encoded'] = df_dense['IR_State'].map(ir_map)
    df_dense['EI_encoded'] = df_dense['EI_State'].map(ei_map)
    df_dense['SP_encoded'] = df_dense['Chosen_SP_State'].map(sp_map)

    # Split the data into training, validation, and test sets
    X = df_dense[['IR_encoded', 'EI_encoded']]
    y = df_dense['SP_encoded']

    # Split into training (70%) and temp (30%) for validation and test
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
    # Split temp into validation (50%) and test (50%)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, shuffle=True)

    # Concatenate X and y to form the training set for learning the BN structure
    df_train = pd.concat([X_train, y_train], axis=1)

    # Define the Hill-Climb structure learning algorithm
    hc = HillClimbSearch(df_train)
    scoring_method = BicScore(df_train)

    # Estimate the best structure
    best_dag = hc.estimate(scoring_method=scoring_method)
    best_model = BayesianNetwork(best_dag.edges())

    # Display the learned structure (edges of the Bayesian Network)
    print(f"\nLearned Structure (Edges) for {sample_size} samples:")
    print(best_model.edges())

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model.fit(df_train, estimator=MaximumLikelihoodEstimator)

    # Check if the model is valid after learning the parameters
    assert best_model.check_model()

    # Print the learned CPDs (Conditional Probability Distributions)
    for cpd in best_model.get_cpds():
        print("\nCPD of", cpd.variable)
        print(cpd)

    # Save the learned Bayesian Network model
    # Optional: Save the learned model to disk if needed
    # best_model.save(f"learned_bn_model_{sample_size}.json")

print("\nProcessing complete for all sample sizes.")


Processing sample size: 1000


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1000 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.28517110266159695 | 0.2761904761904762  | 0.29515418502202645 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.22053231939163498 | 0.46190476190476193 | 0.03524229074889868 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.49429657794676807 | 0.2619047619047619  | 0.6696035242290749  |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------+--------------------+---------------------+
| IR_encoded    | IR_en

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2000 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.7590361445783133  | 0.6309523809523809  | 0.45843828715365237 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.12771084337349398 | 0.07312925170068027 | 0.11838790931989925 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.11325301204819277 | 0.29591836734693877 | 0.42317380352644834 |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------------+-----+---------------------+---

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3000 samples:
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.06017699115044248 | 0.08377896613190731 | 0.04004106776180698 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.2938053097345133  | 0.6345811051693404  | 0.608829568788501   |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.6460176991150443  | 0.2816399286987522  | 0.351129363449692   |
+---------------+---------------------+---------------------+---------------------+

CPD of IR_encoded
+---------------+----------------------+-----+---------------------+--

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4000 samples:
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.208571 |
+---------------+----------+
| IR_encoded(1) | 0.378214 |
+---------------+----------+
| IR_encoded(2) | 0.413214 |
+---------------+----------+

CPD of EI_encoded
+---------------+----------------------+-----+---------------------+----------------------+
| IR_encoded    | IR_encoded(0)        | ... | IR_encoded(2)       | IR_encoded(2)        |
+---------------+----------------------+-----+---------------------+----------------------+
| SP_encoded    | SP_encoded(0)        | ... | SP_encoded(1)       | SP_encoded(2)        |
+---------------+----------------------+-----+---------------------+----------------------+
| EI_encoded(0) | 0.011834319526627219 | ... | 0.15315315315315314 | 0.5176151761517616   |
+---------------+----------------------+-----+---------------------+---

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5000 samples:
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | EI_encoded(1)       | EI_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.3897893030794165  | 0.3051305130513051  | 0.37056277056277054 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.1507293354943274  | 0.46354635463546356 | 0.3800865800865801  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.45948136142625606 | 0.23132313231323132 | 0.24935064935064935 |
+---------------+---------------------+---------------------+---------------------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------------+---

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6000 samples:
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.231667 |
+---------------+----------+
| IR_encoded(1) | 0.52     |
+---------------+----------+
| IR_encoded(2) | 0.248333 |
+---------------+----------+

CPD of EI_encoded
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)        | ... | IR_encoded(2)       | IR_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)        | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| EI_encoded(0) | 0.85                 | ... | 0.4326923076923077  | 0.5120481927710844  |
+---------------+----------------------+-----+---------------------+---------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7000 samples:
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded'), ('EI_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+----------------------+----------------------+---------------------+
| EI_encoded    | EI_encoded(0)        | EI_encoded(1)        | EI_encoded(2)       |
+---------------+----------------------+----------------------+---------------------+
| IR_encoded(0) | 0.817773339990015    | 0.8318425760286225   | 0.6066565809379728  |
+---------------+----------------------+----------------------+---------------------+
| IR_encoded(1) | 0.17523714428357465  | 0.16323792486583183  | 0.3767019667170953  |
+---------------+----------------------+----------------------+---------------------+
| IR_encoded(2) | 0.006989515726410384 | 0.004919499105545617 | 0.01664145234493192 |
+---------------+----------------------+----------------------+---------------------+

CPD of SP_encoded
+---------------+----------------------+-----+------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8000 samples:
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded'), ('EI_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------------+----------------------+
| EI_encoded    | EI_encoded(0)       | EI_encoded(1)       | EI_encoded(2)        |
+---------------+---------------------+---------------------+----------------------+
| IR_encoded(0) | 0.481437125748503   | 0.5359082679541339  | 0.5807303123625165   |
+---------------+---------------------+---------------------+----------------------+
| IR_encoded(1) | 0.4868263473053892  | 0.37356668678334337 | 0.36867575890893095  |
+---------------+---------------------+---------------------+----------------------+
| IR_encoded(2) | 0.03173652694610778 | 0.09052504526252263 | 0.050593928728552576 |
+---------------+---------------------+---------------------+----------------------+

CPD of SP_encoded
+---------------+---------------------+-----+----------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9000 samples:
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.510159 |
+---------------+----------+
| EI_encoded(1) | 0.25746  |
+---------------+----------+
| EI_encoded(2) | 0.232381 |
+---------------+----------+

CPD of IR_encoded
+---------------+---------------------+-----+----------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)        | EI_encoded(2)       |
+---------------+---------------------+-----+----------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)        | SP_encoded(2)       |
+---------------+---------------------+-----+----------------------+---------------------+
| IR_encoded(0) | 0.01702127659574468 | ... | 0.45081967213114754  | 0.0794392523364486  |
+---------------+---------------------+-----+----------------------+---------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 10000 samples:
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.5898908673176335  | 0.13037752414398596 | 0.21066756122106675 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.12004595060310166 | 0.38191395961369623 | 0.3458570949345857  |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.2900631820792648  | 0.4877085162423178  | 0.44347534384434756 |
+---------------+---------------------+---------------------+---------------------+

CPD of IR_encoded
+---------------+----------------------+-----+---------------------+-

# Entropy

In [23]:
# Sample sizes to loop through
sample_sizes = range(1000, 11000, 1000)

# Prepare a list to store K-L divergence results
kl_divergence_results = []

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing K-L Divergence for sample size: {sample_size}")

    # Load the dense dataset used in the LBN part
    dense_data_file = f'combined_probabilities_{sample_size}.csv'
    df_dense = pd.read_csv(dense_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df_dense['IR_encoded'] = df_dense['IR_State'].map(ir_map)
    df_dense['EI_encoded'] = df_dense['EI_State'].map(ei_map)
    df_dense['SP_encoded'] = df_dense['Chosen_SP_State'].map(sp_map)

    # Use the test data split obtained from the LBN part
    X_test = df_dense[['IR_encoded', 'EI_encoded']]
    y_test = df_dense['SP_encoded']

    # Placeholder to store K-L divergence values
    kl_divergences = []

    # Loop through each row in the test data to make predictions
    for index, row in X_test.iterrows():
        sample_input = {'IR_encoded': int(row['IR_encoded']), 'EI_encoded': int(row['EI_encoded'])}

        # Perform inference using the learned Bayesian model
        predicted_sp_distribution = inference.query(variables=['SP_encoded'], evidence=sample_input)
        predicted_probs = predicted_sp_distribution.values

        # Extract the ground truth probabilities for SP from `y_test`
        ground_truth_probabilities_str = df_dense['SP_Probabilities (decrease, stable, increase)'].iloc[index]
        ground_truth_probs = np.array(list(map(float, ground_truth_probabilities_str.strip('[]').split(','))))

        # Ensure the probabilities are non-zero to avoid division by zero
        epsilon = 1e-10
        ground_truth_probs = np.clip(ground_truth_probs, epsilon, 1)
        predicted_probs = np.clip(predicted_probs, epsilon, 1)

        # Normalize both probability distributions
        ground_truth_probs /= ground_truth_probs.sum()
        predicted_probs /= predicted_probs.sum()

        # Calculate the K-L divergence (Learned BN vs Ground Truth)
        kl_div = entropy(ground_truth_probs, predicted_probs)
        kl_divergences.append(kl_div)

    # Calculate the average K-L divergence and standard deviation over all test samples
    average_kl_divergence = np.mean(kl_divergences)
    std_kl_divergence = np.std(kl_divergences)

    # Append the results to the list for saving later
    kl_divergence_results.append({
        'Sample_Size': sample_size,
        'Average_KL_Divergence': average_kl_divergence,
        'Std_Dev': std_kl_divergence
    })

    # Print confirmation and result for this sample size
    print(f"\nAverage K-L Divergence for {sample_size} samples: {average_kl_divergence:.4f}, Std Dev: {std_kl_divergence:.4f}")

# Save the K-L divergence results to a CSV file
kl_divergence_df = pd.DataFrame(kl_divergence_results)
kl_divergence_df.to_csv('kl_div_LBN_dense_bic.csv', index=False)

print("\nK-L divergence calculations complete and results saved to 'kl_div_LBN_dense_bic.csv'.")


Processing K-L Divergence for sample size: 1000

Average K-L Divergence for 1000 samples: 0.3989, Std Dev: 0.3344

Processing K-L Divergence for sample size: 2000

Average K-L Divergence for 2000 samples: 0.4036, Std Dev: 0.3871

Processing K-L Divergence for sample size: 3000

Average K-L Divergence for 3000 samples: 0.7935, Std Dev: 0.7134

Processing K-L Divergence for sample size: 4000

Average K-L Divergence for 4000 samples: 0.3706, Std Dev: 0.3567

Processing K-L Divergence for sample size: 5000

Average K-L Divergence for 5000 samples: 0.3407, Std Dev: 0.3625

Processing K-L Divergence for sample size: 6000

Average K-L Divergence for 6000 samples: 0.3131, Std Dev: 0.4184

Processing K-L Divergence for sample size: 7000

Average K-L Divergence for 7000 samples: 0.6265, Std Dev: 0.3514

Processing K-L Divergence for sample size: 8000

Average K-L Divergence for 8000 samples: 0.4563, Std Dev: 0.2709

Processing K-L Divergence for sample size: 9000

Average K-L Divergence for 900