<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/LBN_Sparse_BIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [44]:
pip install pgmpy



In [45]:
import numpy as np
import pandas as pd
from pgmpy.estimators import MmhcEstimator, BicScore, MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork
from sklearn.model_selection import train_test_split
from scipy.stats import entropy
from tabulate import tabulate

# Bayesian Network Data Generation 1000, 2000, ..., 10000 Samples (sparse)

In [66]:
# Function to safely normalize arrays to avoid NaN values
def safe_normalize(arr, axis=0):
    with np.errstate(divide='ignore', invalid='ignore'):
        norm_arr = arr / arr.sum(axis=axis, keepdims=True)
        norm_arr = np.nan_to_num(norm_arr)  # Replace NaNs with 0s
    return norm_arr

# Function to replace all-zero slices with uniform distribution
def replace_zeros_with_uniform(arr, axis=0):
    sum_along_axis = arr.sum(axis=axis, keepdims=True)
    mask = (sum_along_axis == 0)  # Mask for all-zero slices

    # Create a uniform distribution where the sum is zero
    uniform_distribution = np.ones_like(arr) / arr.shape[axis]

    # Where the mask is True, replace with the uniform distribution
    arr = np.where(mask, uniform_distribution, arr)

    # Normalize the resulting array to ensure it's a valid probability distribution
    arr = safe_normalize(arr, axis=axis)
    return arr

# Function to generate sparse CPDs
def generate_sparse_cpds():
    # Generate random probabilities for IR
    ir_probs = np.random.rand(3)
    ir_probs /= ir_probs.sum()  # Normalize to make it a valid probability distribution

    # Generate sparse probabilities for EI given IR (some probabilities set to zero)
    ei_given_ir_probs = np.random.rand(3, 3)
    ei_given_ir_probs[np.random.rand(3, 3) < 0.5] = 0  # Introduce sparsity by setting 50% of values to 0
    ei_given_ir_probs = replace_zeros_with_uniform(ei_given_ir_probs, axis=0)

    # Generate sparse probabilities for SP given IR and EI (some probabilities set to zero)
    sp_probs = np.random.rand(3, 3, 3)
    sp_probs[np.random.rand(3, 3, 3) < 0.5] = 0  # Introduce sparsity by setting 50% of values to 0
    sp_probs = replace_zeros_with_uniform(sp_probs, axis=0)

    return ir_probs, ei_given_ir_probs, sp_probs

# Function to generate and save samples (Sparse Version)
def generate_and_save_sparse_samples(ir_probs, ei_probs, sp_probs, sample_size, filename):
    output_data = []

    # Generate `sample_size` random samples
    for _ in range(sample_size):
        # Sample `IR` state based on `IR` probabilities
        ir_state_idx = np.random.choice(3, p=ir_probs)
        ir_state = ['low', 'medium', 'high'][ir_state_idx]
        ir_prob = ir_probs[ir_state_idx]

        # Sample `EI` state based on `EI` probabilities given `IR`
        ei_probs_given_ir = ei_probs[:, ir_state_idx]
        ei_state_idx = np.random.choice(3, p=ei_probs_given_ir)
        ei_state = ['poor', 'average', 'good'][ei_state_idx]
        ei_prob = ei_probs_given_ir[ei_state_idx]

        # Sample `SP` state based on `SP` probabilities given `IR` and `EI`
        sp_probs_given_ir_ei = sp_probs[:, ir_state_idx, ei_state_idx]
        sp_state_idx = np.random.choice(3, p=sp_probs_given_ir_ei)
        sp_state = ['decrease', 'stable', 'increase'][sp_state_idx]
        sp_prob = sp_probs_given_ir_ei[sp_state_idx]

        # Append sample data to output list
        output_data.append({
            'IR_State': ir_state,
            'IR_Prob': f'{ir_prob:.4f}',
            'EI_State': ei_state,
            'EI_Prob': f'{ei_prob:.4f}',
            'SP_Probabilities (decrease, stable, increase)': ', '.join([f'{prob:.4f}' for prob in sp_probs_given_ir_ei]),
            'Chosen_SP_State': sp_state,
            'Chosen_SP_Probability': f'{sp_prob:.4f}'
        })

    # Create a DataFrame from the output data
    output_df = pd.DataFrame(output_data)

    # Save the output DataFrame to a CSV file
    output_df.to_csv(filename, index=False)

    # Print the first few rows for visual confirmation
    print(f"\nSample size: {sample_size} - First few rows of generated sparse samples:\n")
    print(tabulate(output_df.head(), headers='keys', tablefmt='grid'))

# Generate and save sparse samples for sample sizes from 1000 to 10000 every 1000
sample_sizes = range(1000, 11000, 1000)

for size in sample_sizes:
    # Generate the sparse CPDs
    ir_probs, ei_given_ir_probs, sp_probs = generate_sparse_cpds()

    # Generate and save individual sparse samples for the given sample size
    generate_and_save_sparse_samples(ir_probs, ei_given_ir_probs, sp_probs, size, f'combined_probabilities_sparse_{size}.csv')

# Notify the user that the process is done
print("\nGeneration and saving of individual sparse samples complete for all sample sizes!")


Sample size: 1000 - First few rows of generated sparse samples:

+----+------------+-----------+------------+-----------+-------------------------------------------------+-------------------+-------------------------+
|    | IR_State   |   IR_Prob | EI_State   |   EI_Prob | SP_Probabilities (decrease, stable, increase)   | Chosen_SP_State   |   Chosen_SP_Probability |
|  0 | medium     |    0.3999 | average    |    1      | 0.4774, 0.0000, 0.5226                          | increase          |                  0.5226 |
+----+------------+-----------+------------+-----------+-------------------------------------------------+-------------------+-------------------------+
|  1 | high       |    0.5251 | good       |    0.3333 | 0.3704, 0.3003, 0.3292                          | decrease          |                  0.3704 |
+----+------------+-----------+------------+-----------+-------------------------------------------------+-------------------+-------------------------+
|  2 | medium   

# LBN Sparse BIC & Entropy

In [67]:
# Sample sizes to loop through
sample_sizes = range(1000, 11000, 1000)

# Initialize list to store K-L divergence and standard deviation values for each sample size
results = []

# Small smoothing value to avoid zero probabilities
epsilon = 1e-10

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size}")

    # Load the sparse dataset for the current sample size
    sparse_data_file = f'combined_probabilities_sparse_{sample_size}.csv'
    df_sparse = pd.read_csv(sparse_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df_sparse['IR_encoded'] = df_sparse['IR_State'].map(ir_map)
    df_sparse['EI_encoded'] = df_sparse['EI_State'].map(ei_map)
    df_sparse['SP_encoded'] = df_sparse['Chosen_SP_State'].map(sp_map)

    # Split the data into training and test sets (70% train, 30% test)
    X = df_sparse[['IR_encoded', 'EI_encoded']]
    y = df_sparse['SP_encoded']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

    # Concatenate X and y to form the training set for learning the BN structure
    df_train = pd.concat([X_train, y_train], axis=1)

    # Define the MMHC structure learning algorithm for sparse BIC
    mmhc = MmhcEstimator(df_train)
    scoring_method = BicScore(df_train)

    # Estimate the best structure using MMHC and sparse BIC score
    best_dag = mmhc.estimate(scoring_method=scoring_method)
    best_model = BayesianNetwork(best_dag.edges())

    # Display the learned structure (edges of the Bayesian Network)
    print(f"\nLearned Structure (Edges) for {sample_size} samples:")
    print(best_model.edges())

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model.fit(df_train, estimator=MaximumLikelihoodEstimator)

    # Check if the model is valid after learning the parameters
    assert best_model.check_model()

    # --- K-L Divergence Calculation ---
    # Calculate the probabilities using the learned BN and the test set
    inference = best_model.predict(X_test)
    predicted_probabilities = inference['SP_encoded']

    # Calculate K-L divergence between the ground truth and learned BN
    ground_truth_probabilities = y_test.value_counts(normalize=True).sort_index()
    predicted_probabilities = predicted_probabilities.value_counts(normalize=True).sort_index()

    # Reindex both distributions to have the same set of categories and add smoothing
    all_categories = sorted(set(ground_truth_probabilities.index).union(set(predicted_probabilities.index)))
    ground_truth_probabilities = ground_truth_probabilities.reindex(all_categories, fill_value=epsilon)
    predicted_probabilities = predicted_probabilities.reindex(all_categories, fill_value=epsilon)

    # Calculate K-L divergence with smoothing
    kl_divergence = entropy(pk=ground_truth_probabilities, qk=predicted_probabilities)

    # Standard deviation between predicted probabilities and actual probabilities
    std_dev = np.std(predicted_probabilities - ground_truth_probabilities)

    # Append results for this sample size
    results.append({
        'Sample_Size': sample_size,
        'K-L_Divergence': kl_divergence,
        'Standard_Deviation': std_dev
    })

    # Print the K-L divergence and standard deviation for this sample size
    print(f"\nResults for sample size {sample_size}:")
    print(f"K-L Divergence: {kl_divergence:.4f}")
    print(f"Standard Deviation: {std_dev:.4f}")

# After processing all sample sizes, save results to CSV and display them
results_df = pd.DataFrame(results)
results_df.to_csv('kl_std_results_sparse_mmhc.csv', index=False)

# Print all results after saving to CSV
print("\nAll results have been saved to 'kl_std_results_sparse_mmhc.csv'.")
print(results_df)


Processing sample size: 1000


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1000 samples:
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]


  0%|          | 0/5 [00:00<?, ?it/s]


Results for sample size 1000:
K-L Divergence: 2.0320
Standard Deviation: 0.1509

Processing sample size: 2000


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2000 samples:
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]


  0%|          | 0/6 [00:00<?, ?it/s]


Results for sample size 2000:
K-L Divergence: 0.1218
Standard Deviation: 0.1728

Processing sample size: 3000


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3000 samples:
[('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded')]


  0%|          | 0/5 [00:00<?, ?it/s]


Results for sample size 3000:
K-L Divergence: 1.7977
Standard Deviation: 0.0820

Processing sample size: 4000


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4000 samples:
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded')]


  0%|          | 0/7 [00:00<?, ?it/s]


Results for sample size 4000:
K-L Divergence: 0.0601
Standard Deviation: 0.0881

Processing sample size: 5000


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5000 samples:
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]


  0%|          | 0/7 [00:00<?, ?it/s]


Results for sample size 5000:
K-L Divergence: 1.5502
Standard Deviation: 0.0645

Processing sample size: 6000


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6000 samples:
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]


  0%|          | 0/5 [00:00<?, ?it/s]


Results for sample size 6000:
K-L Divergence: 0.1157
Standard Deviation: 0.1037

Processing sample size: 7000


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7000 samples:
[('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]


  0%|          | 0/4 [00:00<?, ?it/s]


Results for sample size 7000:
K-L Divergence: 1.4778
Standard Deviation: 0.1780

Processing sample size: 8000


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8000 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]


  0%|          | 0/4 [00:00<?, ?it/s]


Results for sample size 8000:
K-L Divergence: 3.9275
Standard Deviation: 0.2490

Processing sample size: 9000


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9000 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]


  0%|          | 0/4 [00:00<?, ?it/s]


Results for sample size 9000:
K-L Divergence: 0.0577
Standard Deviation: 0.0662

Processing sample size: 10000


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 10000 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]


  0%|          | 0/3 [00:00<?, ?it/s]


Results for sample size 10000:
K-L Divergence: 0.8536
Standard Deviation: 0.0340

All results have been saved to 'kl_std_results_sparse_mmhc.csv'.
   Sample_Size  K-L_Divergence  Standard_Deviation
0         1000        2.032001            0.150948
1         2000        0.121839            0.172841
2         3000        1.797676            0.082047
3         4000        0.060096            0.088052
4         5000        1.550193            0.064505
5         6000        0.115746            0.103725
6         7000        1.477777            0.177999
7         8000        3.927548            0.249016
8         9000        0.057745            0.066184
9        10000        0.853592            0.033951
