<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/Learned_BNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
pip install pgmpy

Collecting pgmpy
  Downloading pgmpy-0.1.26-py3-none-any.whl.metadata (9.1 kB)
Downloading pgmpy-0.1.26-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pgmpy
Successfully installed pgmpy-0.1.26


In [None]:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD
from pgmpy.sampling import BayesianModelSampling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import layers, models
from pgmpy.estimators import HillClimbSearch, BicScore, AICScore, MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
from sklearn.metrics import accuracy_score
from scipy.stats import entropy
import os
import matplotlib.pyplot as plt

# ------------------------------------------------------------------------------------------------------------

# Bayesian Network Data Generation 500, 1000, 1500, ..., 10000 Samples (dense)

In [None]:
#np.random.seed(1)

# Integer state index -> readable label for each of the three variables.
ir_map = dict(enumerate(('low', 'medium', 'high')))
ei_map = dict(enumerate(('poor', 'average', 'good')))
sp_map = dict(enumerate(('decrease', 'stable', 'increase')))

# Dense structure: IR -> EI, and both IR and EI point into SP.
dense_model = BayesianNetwork([
    ('IR', 'EI'),
    ('EI', 'SP'),
    ('IR', 'SP'),
])

# Function to generate CPDs
def generate_cpds():
    """Draw a random, normalised set of probability tables for the dense network.

    Returns:
        A 3-tuple ``(ir_probs, ei_given_ir_probs, sp_probs_reshaped)``:
        a length-3 vector summing to 1, a 3x3 table whose columns each sum
        to 1, and a 3x9 table whose columns each sum to 1.
    """
    def _unit_columns(table):
        # Rescale so that every slice along axis 0 sums to one.
        return table / table.sum(axis=0, keepdims=True)

    # The draw order (IR, then EI|IR, then SP|IR,EI) determines which random
    # numbers land in which table, so it must not be reordered.
    prior_ir = np.random.rand(3)
    prior_ir = prior_ir / prior_ir.sum()

    ei_table = _unit_columns(np.random.rand(3, 3))
    sp_cube = _unit_columns(np.random.rand(3, 3, 3))

    # Flatten the two trailing axes into nine columns.
    return prior_ir, ei_table, sp_cube.reshape(3, -1)

# Save probabilities in a single CSV file
def save_probabilities(ir_probs, ei_probs, sp_probs, filename):
    """Write the dense model's probability tables side by side into one CSV.

    Args:
        ir_probs: Three IR probabilities, labelled low, medium, high.
        ei_probs: 3x3 table; rows are labelled poor/average/good and the
            columns are labelled by the IR state (low, medium, high).
        sp_probs: 3x9 table; rows are labelled decrease/stable/increase and
            the columns are labelled by (IR, EI) pairs, EI varying fastest.
        filename: Destination CSV path.
    """
    ir_states = ['low', 'medium', 'high']
    ei_states = ['poor', 'average', 'good']

    ir_part = pd.DataFrame({'IR_State': ir_states, 'IR_Prob': ir_probs})

    # The state-label column is appended after the probability columns.
    ei_part = pd.DataFrame(
        ei_probs,
        columns=[f'EI_given_IR_{ir}' for ir in ir_states],
    ).assign(EI_State=ei_states)

    sp_part = pd.DataFrame(
        sp_probs,
        columns=[
            f'SP_given_IR_{ir}_EI_{ei}'
            for ir in ir_states
            for ei in ei_states
        ],
    ).assign(SP_State=['decrease', 'stable', 'increase'])

    # All three pieces have three rows, so they line up column-wise.
    pd.concat([ir_part, ei_part, sp_part], axis=1).to_csv(filename, index=False)

# Save outcomes in a CSV file
def save_outcomes(data_dense, filename):
    """Relabel the IR, EI and SP columns via the module-level maps and save as CSV.

    The three columns of ``data_dense`` are overwritten in place, so the
    caller's frame holds the mapped values afterwards.
    """
    for column, labels in (('IR', ir_map), ('EI', ei_map), ('SP', sp_map)):
        data_dense[column] = data_dense[column].map(labels)
    data_dense.to_csv(filename, index=False)

# Generate datasets for different sample sizes for the dense model.
# range(500, 10500, 500) yields 500, 1000, ..., 10000 (the stop value is exclusive).
sample_sizes = range(500, 10500, 500)
for size in sample_sizes:
    # Generate the CPDs: a fresh random set is drawn on every iteration, so
    # each sample size gets its own probability tables.
    ir_probs, ei_given_ir_probs, sp_probs_reshaped = generate_cpds()

    # Define CPDs for the dense model
    # IR has no evidence, so its values are passed as a 3x1 column.
    cpd_ir = TabularCPD(variable='IR', variable_card=3, values=[[ir_probs[0]], [ir_probs[1]], [ir_probs[2]]])
    cpd_ei_dense = TabularCPD(variable='EI', variable_card=3,
                              values=ei_given_ir_probs,
                              evidence=['IR'], evidence_card=[3])
    cpd_sp_dense = TabularCPD(variable='SP', variable_card=3,
                              values=sp_probs_reshaped,
                              evidence=['IR', 'EI'], evidence_card=[3, 3])

    # dense_model is the single module-level network, so from the second
    # iteration on it already carries CPDs for these variables.
    # NOTE(review): presumably pgmpy replaces an existing CPD for the same
    # variable here -- confirm against the installed version.
    dense_model.add_cpds(cpd_ir, cpd_ei_dense, cpd_sp_dense)

    # Check if the model is valid
    assert dense_model.check_model()

    # Generate samples (one row per sample, `size` rows in total)
    sampler_dense = BayesianModelSampling(dense_model)
    data_dense = sampler_dense.forward_sample(size=size)

    # Save probabilities in one file
    save_probabilities(ir_probs, ei_given_ir_probs, sp_probs_reshaped, f'probabilities_dense_{size}.csv')

    # Save outcomes (low, medium, high) in another file.
    # save_outcomes relabels data_dense in place before writing it.
    save_outcomes(data_dense, f'outcomes_dense_{size}.csv')

# Notify the user that the process is done
print("Data generation and saving complete for the dense model!")

# Bayesian Network Data Generation 500, 1000, 1500, ..., 10000 Samples (sparse)

In [None]:
#np.random.seed(187)

# Integer state index -> readable label for each of the three variables.
ir_map = dict(enumerate(('low', 'medium', 'high')))
ei_map = dict(enumerate(('poor', 'average', 'good')))
sp_map = dict(enumerate(('decrease', 'stable', 'increase')))

# Sparse structure: IR and EI each point into SP, with no IR -> EI edge.
sparse_model = BayesianNetwork([
    ('IR', 'SP'),
    ('EI', 'SP'),
])

# Function to generate CPDs for the sparse model
def generate_cpds_sparse():
    """Draw a random, normalised set of probability tables for the sparse network.

    Returns:
        A 3-tuple ``(ir_probs, ei_probs, sp_probs_reshaped)``: two length-3
        vectors that each sum to 1 (IR and EI are both unconditional here),
        and a 3x9 table whose columns each sum to 1.
    """
    def _random_distribution():
        # Three uniform draws rescaled to sum to one.
        weights = np.random.rand(3)
        return weights / weights.sum()

    # Draw order (IR, EI, then SP) fixes which random numbers go where.
    prior_ir = _random_distribution()
    prior_ei = _random_distribution()

    sp_cube = np.random.rand(3, 3, 3)
    sp_cube = sp_cube / sp_cube.sum(axis=0, keepdims=True)

    # Flatten the two trailing axes into nine columns.
    return prior_ir, prior_ei, sp_cube.reshape(3, -1)

# Save probabilities in a single CSV file
def save_probabilities_sparse(ir_probs, ei_probs, sp_probs, filename):
    """Write the sparse model's probability tables side by side into one CSV.

    Args:
        ir_probs: Three IR probabilities, labelled low, medium, high.
        ei_probs: Three EI probabilities, labelled poor, average, good.
        sp_probs: 3x9 table; rows are labelled decrease/stable/increase and
            the columns are labelled by (IR, EI) pairs, EI varying fastest.
        filename: Destination CSV path.
    """
    ir_states = ['low', 'medium', 'high']
    ei_states = ['poor', 'average', 'good']

    ir_part = pd.DataFrame({'IR_State': ir_states, 'IR_Prob': ir_probs})
    ei_part = pd.DataFrame({'EI_State': ei_states, 'EI_Prob': ei_probs})

    # The state-label column is appended after the nine probability columns.
    sp_part = pd.DataFrame(
        sp_probs,
        columns=[
            f'SP_given_IR_{ir}_EI_{ei}'
            for ir in ir_states
            for ei in ei_states
        ],
    ).assign(SP_State=['decrease', 'stable', 'increase'])

    # All three pieces have three rows, so they line up column-wise.
    pd.concat([ir_part, ei_part, sp_part], axis=1).to_csv(filename, index=False)

# Save outcomes in a CSV file
def save_outcomes_sparse(data_sparse, filename):
    """Relabel the IR, EI and SP columns via the module-level maps and save as CSV.

    The three columns of ``data_sparse`` are overwritten in place, so the
    caller's frame holds the mapped values afterwards.
    """
    for column, labels in (('IR', ir_map), ('EI', ei_map), ('SP', sp_map)):
        data_sparse[column] = data_sparse[column].map(labels)
    data_sparse.to_csv(filename, index=False)

# Generate datasets for different sample sizes for the sparse model.
# range(500, 10500, 500) yields 500, 1000, ..., 10000 (the stop value is exclusive).
sample_sizes = range(500, 10500, 500)
for size in sample_sizes:
    # Generate the CPDs: a fresh random set is drawn on every iteration, so
    # each sample size gets its own probability tables.
    ir_probs, ei_probs, sp_probs_reshaped = generate_cpds_sparse()

    # Define CPDs for the sparse model
    # IR and EI have no evidence here, so each is passed as a 3x1 column.
    cpd_ir = TabularCPD(variable='IR', variable_card=3, values=[[ir_probs[0]], [ir_probs[1]], [ir_probs[2]]])
    cpd_ei_sparse = TabularCPD(variable='EI', variable_card=3, values=[[ei_probs[0]], [ei_probs[1]], [ei_probs[2]]])
    cpd_sp_sparse = TabularCPD(variable='SP', variable_card=3,
                               values=sp_probs_reshaped,
                               evidence=['IR', 'EI'], evidence_card=[3, 3])

    # sparse_model is the single module-level network, so from the second
    # iteration on it already carries CPDs for these variables.
    # NOTE(review): presumably pgmpy replaces an existing CPD for the same
    # variable here -- confirm against the installed version.
    sparse_model.add_cpds(cpd_ir, cpd_ei_sparse, cpd_sp_sparse)

    # Check if the model is valid
    assert sparse_model.check_model()

    # Generate samples (one row per sample, `size` rows in total)
    sampler_sparse = BayesianModelSampling(sparse_model)
    data_sparse = sampler_sparse.forward_sample(size=size)

    # Save probabilities in one file
    save_probabilities_sparse(ir_probs, ei_probs, sp_probs_reshaped, f'probabilities_sparse_{size}.csv')

    # Save outcomes (low, medium, high) in another file.
    # save_outcomes_sparse relabels data_sparse in place before writing it.
    save_outcomes_sparse(data_sparse, f'outcomes_sparse_{size}.csv')

# Notify the user that the process is done
print("Data generation and saving complete for the sparse model!")

# ------------------------------------------------------------------------------------------------------------

# Control Experiment (dense)

## BIC

In [None]:
# Control experiment on the dense data: learn a structure with hill climbing
# scored by BIC, fit CPDs by maximum likelihood, then predict SP from IR and
# EI on the same data and report the accuracy.

# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Label -> integer code for each variable. These do not change between
# iterations, so they are defined once instead of inside the loop.
ir_map = {'low': 0, 'medium': 1, 'high': 2}
ei_map = {'poor': 0, 'average': 1, 'good': 2}
sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}
sp_reverse_map = {0: 'decrease', 1: 'stable', 2: 'increase'}

# Columns the structure is learned over and that the model must contain.
encoded_columns = ['IR_encoded', 'EI_encoded', 'SP_encoded']

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size}")

    # Load the dense dataset for the current sample size
    dense_data_file = f'outcomes_dense_{sample_size}.csv'
    df_dense = pd.read_csv(dense_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    df_dense['IR_encoded'] = df_dense['IR'].map(ir_map)
    df_dense['EI_encoded'] = df_dense['EI'].map(ei_map)
    df_dense['SP_encoded'] = df_dense['SP'].map(sp_map)
    encoded_data = df_dense[encoded_columns]

    # Define the Hill-Climb structure learning algorithm
    hc = HillClimbSearch(encoded_data)
    scoring_method = BicScore(encoded_data)

    # Estimate the best structure
    best_dag = hc.estimate(scoring_method=scoring_method)

    # Build the network from explicit nodes plus the learned edges. Building
    # it from the edge list alone silently drops any variable that ended up
    # without an edge, which would later break fitting and inference. (The
    # earlier "dummy variable" fallback re-learned the structure over every
    # column of df_dense, including the raw string columns, and still built
    # the model from edges only, so it could not restore a missing node.)
    best_model = BayesianNetwork()
    best_model.add_nodes_from(encoded_columns)
    best_model.add_edges_from(best_dag.edges())

    # Display the learned structure (edges of the Bayesian Network)
    print(f"\nLearned Structure (Edges) for {sample_size} samples:")
    print(best_model.edges())

    # Calculate and display the BIC score
    bic_score = scoring_method.score(best_model)
    print(f"\nBIC Score for {sample_size} samples: {bic_score:.4f}")

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model.fit(encoded_data, estimator=MaximumLikelihoodEstimator)

    # Check if the model is valid after learning the parameters
    assert best_model.check_model()

    # Print the learned CPDs (Conditional Probability Distributions)
    for cpd in best_model.get_cpds():
        print("\nCPD of", cpd.variable)
        print(cpd)

    # Create an inference object for the best model
    inference = VariableElimination(best_model)

    # The prediction depends only on the (IR, EI) evidence pair, so each
    # distinct pair is queried once and the answer reused for every row.
    label_by_evidence = {}
    predicted_sp_labels = []
    for ir_code, ei_code in zip(df_dense['IR_encoded'], df_dense['EI_encoded']):
        evidence_key = (int(ir_code), int(ei_code))
        if evidence_key not in label_by_evidence:
            sample_input = {'IR_encoded': evidence_key[0], 'EI_encoded': evidence_key[1]}

            # Perform inference to predict the distribution for SP_encoded (Stock Price)
            predicted_sp_distribution = inference.query(variables=['SP_encoded'], evidence=sample_input)

            # argmax is a position in the factor's state list, not the state
            # code itself; translate it so a state absent from the data
            # cannot shift the labels.
            best_index = predicted_sp_distribution.values.argmax()
            predicted_sp_class = int(predicted_sp_distribution.state_names['SP_encoded'][best_index])
            label_by_evidence[evidence_key] = sp_reverse_map[predicted_sp_class]

        predicted_sp_labels.append(label_by_evidence[evidence_key])

    # Convert the list of predicted labels into a DataFrame for easier comparison
    predicted_results_df = pd.DataFrame({
        'IR': df_dense['IR'],  # Original IR column
        'EI': df_dense['EI'],  # Original EI column
        'Predicted_SP': predicted_sp_labels  # Predicted SP column
    })

    # Add the actual SP values for comparison
    predicted_results_df['Actual_SP'] = df_dense['SP']

    # Calculate accuracy of predictions
    accuracy = accuracy_score(predicted_results_df['Actual_SP'], predicted_results_df['Predicted_SP'])
    print(f"\nPrediction Accuracy for {sample_size} samples: {accuracy:.4f}")

    # Display the first few rows of predictions
    print(f"\nPredicted Results for Dense Data (First 10 rows) for {sample_size} samples:")
    print(predicted_results_df.head(10))

# Notify the user that the process is done
print("\nProcessing complete for all sample sizes.")

## AIC

In [None]:
# Control experiment on the dense data: learn a structure with hill climbing
# scored by AIC, fit CPDs by maximum likelihood, then predict SP from IR and
# EI on the same data, report the accuracy and save the predictions.

# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Label -> integer code for each variable. These do not change between
# iterations, so they are defined once instead of inside the loop.
ir_map = {'low': 0, 'medium': 1, 'high': 2}
ei_map = {'poor': 0, 'average': 1, 'good': 2}
sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}
sp_reverse_map = {0: 'decrease', 1: 'stable', 2: 'increase'}

# Columns the structure is learned over and that the model must contain.
encoded_columns = ['IR_encoded', 'EI_encoded', 'SP_encoded']

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size}")

    # Load the dense dataset for the current sample size
    dense_data_file = f'outcomes_dense_{sample_size}.csv'
    df_dense = pd.read_csv(dense_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    df_dense['IR_encoded'] = df_dense['IR'].map(ir_map)
    df_dense['EI_encoded'] = df_dense['EI'].map(ei_map)
    df_dense['SP_encoded'] = df_dense['SP'].map(sp_map)
    encoded_data = df_dense[encoded_columns]

    # Define the Hill-Climb structure learning algorithm
    hc = HillClimbSearch(encoded_data)
    scoring_method = AICScore(encoded_data)  # Use AICScore instead of BicScore

    # Estimate the best structure
    best_dag = hc.estimate(scoring_method=scoring_method)

    # Build the network from explicit nodes plus the learned edges. Building
    # it from the edge list alone silently drops any variable that ended up
    # without an edge, after which the SP_encoded query (or its evidence)
    # would refer to a variable the model does not contain.
    best_model = BayesianNetwork()
    best_model.add_nodes_from(encoded_columns)
    best_model.add_edges_from(best_dag.edges())

    # Display the learned structure (edges of the Bayesian Network)
    print(f"\nLearned Structure (Edges) for {sample_size} samples (AIC):")
    print(best_model.edges())

    # Calculate and display the AIC score
    aic_score = scoring_method.score(best_model)
    print(f"\nAIC Score for {sample_size} samples: {aic_score:.4f}")

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model.fit(encoded_data, estimator=MaximumLikelihoodEstimator)

    # Check if the model is valid after learning the parameters
    assert best_model.check_model()

    # Print the learned CPDs (Conditional Probability Distributions)
    for cpd in best_model.get_cpds():
        print("\nCPD of", cpd.variable)
        print(cpd)

    # Create an inference object for the best model
    inference = VariableElimination(best_model)

    # The prediction depends only on the (IR, EI) evidence pair, so each
    # distinct pair is queried once and the answer reused for every row.
    label_by_evidence = {}
    predicted_sp_labels = []
    for ir_code, ei_code in zip(df_dense['IR_encoded'], df_dense['EI_encoded']):
        evidence_key = (int(ir_code), int(ei_code))
        if evidence_key not in label_by_evidence:
            sample_input = {'IR_encoded': evidence_key[0], 'EI_encoded': evidence_key[1]}

            # Perform inference to predict the distribution for SP_encoded (Stock Price)
            predicted_sp_distribution = inference.query(variables=['SP_encoded'], evidence=sample_input)

            # argmax is a position in the factor's state list, not the state
            # code itself; translate it so a state absent from the data
            # cannot shift the labels.
            best_index = predicted_sp_distribution.values.argmax()
            predicted_sp_class = int(predicted_sp_distribution.state_names['SP_encoded'][best_index])
            label_by_evidence[evidence_key] = sp_reverse_map[predicted_sp_class]

        predicted_sp_labels.append(label_by_evidence[evidence_key])

    # Convert the list of predicted labels into a DataFrame for easier comparison
    predicted_results_df = pd.DataFrame({
        'IR': df_dense['IR'],  # Original IR column
        'EI': df_dense['EI'],  # Original EI column
        'Predicted_SP': predicted_sp_labels  # Predicted SP column
    })

    # Add the actual SP values for comparison
    predicted_results_df['Actual_SP'] = df_dense['SP']

    # Calculate accuracy of predictions
    accuracy = accuracy_score(predicted_results_df['Actual_SP'], predicted_results_df['Predicted_SP'])
    print(f"\nPrediction Accuracy for {sample_size} samples (AIC): {accuracy:.4f}")

    # Display the first few rows of predictions
    print(f"\nPredicted Results for Dense Data (First 10 rows) for {sample_size} samples (AIC):")
    print(predicted_results_df.head(10))

    # Save the results if needed
    results_filename = f'predicted_results_aic_{sample_size}.csv'
    predicted_results_df.to_csv(results_filename, index=False)
    print(f"\nResults saved to {results_filename}")

# Notify the user that the process is done
print("\nProcessing complete for all sample sizes using AIC.")

# Control Experiment (sparse)

## BIC

In [None]:
# Control experiment on the sparse data: learn a structure with hill climbing
# scored by BIC, fit CPDs by maximum likelihood, then predict SP from IR and
# EI on the same data and report the accuracy and the BIC score.

# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Label -> integer code for each variable. These do not change between
# iterations, so they are defined once instead of inside the loop.
ir_map = {'low': 0, 'medium': 1, 'high': 2}
ei_map = {'poor': 0, 'average': 1, 'good': 2}
sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}
sp_reverse_map = {0: 'decrease', 1: 'stable', 2: 'increase'}

# Columns the structure is learned over and that the model must contain.
encoded_columns = ['IR_encoded', 'EI_encoded', 'SP_encoded']

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size} (Sparse Data with BIC)")

    # Load the sparse dataset for the current sample size
    sparse_data_file = f'outcomes_sparse_{sample_size}.csv'
    df_sparse = pd.read_csv(sparse_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    df_sparse['IR_encoded'] = df_sparse['IR'].map(ir_map)
    df_sparse['EI_encoded'] = df_sparse['EI'].map(ei_map)
    df_sparse['SP_encoded'] = df_sparse['SP'].map(sp_map)
    encoded_data = df_sparse[encoded_columns]

    # Define the Hill-Climb structure learning algorithm
    hc_bic = HillClimbSearch(encoded_data)
    scoring_method_bic = BicScore(encoded_data)

    # Estimate the best structure using BIC
    best_dag_bic = hc_bic.estimate(scoring_method=scoring_method_bic)

    # Ensure all required nodes are present in the model, even if not
    # connected. Because the nodes are added explicitly, no further
    # "missing node" fallback is needed: the previous dummy-node branch could
    # never run, and had it run it would have rebuilt the model from edges
    # only and then fitted it on data that lacks the dummy column.
    best_model_bic = BayesianNetwork()
    best_model_bic.add_nodes_from(encoded_columns)
    best_model_bic.add_edges_from(best_dag_bic.edges())

    # Display the learned structure (edges of the Bayesian Network)
    print(f"\nLearned Structure (Edges) for {sample_size} samples (BIC):")
    print(best_model_bic.edges())

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model_bic.fit(encoded_data, estimator=MaximumLikelihoodEstimator)

    # Check if the model is valid after learning the parameters
    assert best_model_bic.check_model()

    # Print the learned CPDs (Conditional Probability Distributions)
    for cpd in best_model_bic.get_cpds():
        print("\nCPD of", cpd.variable)
        print(cpd)

    # Create an inference object for the BIC model
    inference_bic = VariableElimination(best_model_bic)

    # The prediction depends only on the (IR, EI) evidence pair, so each
    # distinct pair is queried once and the answer reused for every row.
    label_by_evidence = {}
    predicted_sp_labels_bic = []
    for ir_code, ei_code in zip(df_sparse['IR_encoded'], df_sparse['EI_encoded']):
        evidence_key = (int(ir_code), int(ei_code))
        if evidence_key not in label_by_evidence:
            sample_input = {'IR_encoded': evidence_key[0], 'EI_encoded': evidence_key[1]}

            # Perform inference to predict the distribution for SP_encoded (Stock Price)
            predicted_sp_distribution_bic = inference_bic.query(variables=['SP_encoded'], evidence=sample_input)

            # argmax is a position in the factor's state list, not the state
            # code itself; translate it so a state absent from the data
            # cannot shift the labels.
            best_index = predicted_sp_distribution_bic.values.argmax()
            predicted_sp_class_bic = int(predicted_sp_distribution_bic.state_names['SP_encoded'][best_index])
            label_by_evidence[evidence_key] = sp_reverse_map[predicted_sp_class_bic]

        predicted_sp_labels_bic.append(label_by_evidence[evidence_key])

    # Convert the list of predicted labels into a DataFrame for easier comparison
    predicted_results_df_bic = pd.DataFrame({
        'IR': df_sparse['IR'],  # Original IR column
        'EI': df_sparse['EI'],  # Original EI column
        'Predicted_SP': predicted_sp_labels_bic  # Predicted SP column
    })

    # Add the actual SP values for comparison
    predicted_results_df_bic['Actual_SP'] = df_sparse['SP']

    # Calculate accuracy of predictions for BIC
    accuracy_bic = accuracy_score(predicted_results_df_bic['Actual_SP'], predicted_results_df_bic['Predicted_SP'])
    print(f"\nPrediction Accuracy for {sample_size} samples (BIC): {accuracy_bic:.4f}")

    # Display the first few rows of predictions for BIC
    print(f"\nPredicted Results for Sparse Data (First 10 rows) for {sample_size} samples (BIC):")
    print(predicted_results_df_bic.head(10))

    # Calculate the BIC score for the Bayesian Network model
    bic_score_value = scoring_method_bic.score(best_model_bic)

    # Print the BIC score
    print(f"\nBIC Score for {sample_size} samples: {bic_score_value:.4f}")

# Notify the user that the process is done
print("\nProcessing complete for all sample sizes using BIC (Sparse Data).")

## AIC

In [None]:
# Control experiment on the sparse data: learn a structure with hill climbing
# scored by AIC, fit CPDs by maximum likelihood, then predict SP from IR and
# EI on the same data and report the accuracy and the AIC score.

# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Label -> integer code for each variable. These do not change between
# iterations, so they are defined once instead of inside the loop.
ir_map = {'low': 0, 'medium': 1, 'high': 2}
ei_map = {'poor': 0, 'average': 1, 'good': 2}
sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}
sp_reverse_map = {0: 'decrease', 1: 'stable', 2: 'increase'}

# Columns the structure is learned over and that the model must contain.
encoded_columns = ['IR_encoded', 'EI_encoded', 'SP_encoded']

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size} (Sparse Data with AIC)")

    # Load the sparse dataset for the current sample size
    sparse_data_file = f'outcomes_sparse_{sample_size}.csv'
    df_sparse = pd.read_csv(sparse_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    df_sparse['IR_encoded'] = df_sparse['IR'].map(ir_map)
    df_sparse['EI_encoded'] = df_sparse['EI'].map(ei_map)
    df_sparse['SP_encoded'] = df_sparse['SP'].map(sp_map)
    encoded_data = df_sparse[encoded_columns]

    # Define the Hill-Climb structure learning algorithm
    hc_aic = HillClimbSearch(encoded_data)
    scoring_method_aic = AICScore(encoded_data)

    # Estimate the best structure using AIC
    best_dag_aic = hc_aic.estimate(scoring_method=scoring_method_aic)

    # Ensure all required nodes are present in the model, even if not
    # connected. Because the nodes are added explicitly, no further
    # "missing node" fallback is needed: the previous dummy-node branch could
    # never run, and had it run it would have rebuilt the model from edges
    # only and then fitted it on data that lacks the dummy column.
    best_model_aic = BayesianNetwork()
    best_model_aic.add_nodes_from(encoded_columns)
    best_model_aic.add_edges_from(best_dag_aic.edges())

    # Display the learned structure (edges of the Bayesian Network)
    print(f"\nLearned Structure (Edges) for {sample_size} samples (AIC):")
    print(best_model_aic.edges())

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model_aic.fit(encoded_data, estimator=MaximumLikelihoodEstimator)

    # Check if the model is valid after learning the parameters
    assert best_model_aic.check_model()

    # Print the learned CPDs (Conditional Probability Distributions)
    for cpd in best_model_aic.get_cpds():
        print("\nCPD of", cpd.variable)
        print(cpd)

    # Create an inference object for the AIC model
    inference_aic = VariableElimination(best_model_aic)

    # The prediction depends only on the (IR, EI) evidence pair, so each
    # distinct pair is queried once and the answer reused for every row.
    label_by_evidence = {}
    predicted_sp_labels_aic = []
    for ir_code, ei_code in zip(df_sparse['IR_encoded'], df_sparse['EI_encoded']):
        evidence_key = (int(ir_code), int(ei_code))
        if evidence_key not in label_by_evidence:
            sample_input = {'IR_encoded': evidence_key[0], 'EI_encoded': evidence_key[1]}

            # Perform inference to predict the distribution for SP_encoded (Stock Price)
            predicted_sp_distribution_aic = inference_aic.query(variables=['SP_encoded'], evidence=sample_input)

            # argmax is a position in the factor's state list, not the state
            # code itself; translate it so a state absent from the data
            # cannot shift the labels.
            best_index = predicted_sp_distribution_aic.values.argmax()
            predicted_sp_class_aic = int(predicted_sp_distribution_aic.state_names['SP_encoded'][best_index])
            label_by_evidence[evidence_key] = sp_reverse_map[predicted_sp_class_aic]

        predicted_sp_labels_aic.append(label_by_evidence[evidence_key])

    # Convert the list of predicted labels into a DataFrame for easier comparison
    predicted_results_df_aic = pd.DataFrame({
        'IR': df_sparse['IR'],  # Original IR column
        'EI': df_sparse['EI'],  # Original EI column
        'Predicted_SP': predicted_sp_labels_aic  # Predicted SP column
    })

    # Add the actual SP values for comparison
    predicted_results_df_aic['Actual_SP'] = df_sparse['SP']

    # Calculate accuracy of predictions for AIC
    accuracy_aic = accuracy_score(predicted_results_df_aic['Actual_SP'], predicted_results_df_aic['Predicted_SP'])
    print(f"\nPrediction Accuracy for {sample_size} samples (AIC): {accuracy_aic:.4f}")

    # Display the first few rows of predictions for AIC
    print(f"\nPredicted Results for Sparse Data (First 10 rows) for {sample_size} samples (AIC):")
    print(predicted_results_df_aic.head(10))

    # Calculate the AIC score for the Bayesian Network model
    aic_score_value = scoring_method_aic.score(best_model_aic)

    # Print the AIC score
    print(f"\nAIC Score for {sample_size} samples: {aic_score_value:.4f}")

# Notify the user that the process is done
print("\nProcessing complete for all sample sizes using AIC (Sparse Data).")

# ------------------------------------------------------------------------------------------------------------

# K-L Divergence LBN Dense Data

## BIC

In [None]:
# Define the function to save K-L divergence to a file
def save_kl_divergence(sample_size, kl_div_value, clear_file=False):
    """Append one ``size,value`` row to 'kl_div_LBN_dense_bic.csv'.

    Args:
        sample_size: Value written in the first column.
        kl_div_value: Value written in the second column, to four decimals.
        clear_file: When true, any existing file is deleted first so the
            output starts again from the header.
    """
    target = 'kl_div_LBN_dense_bic.csv'

    already_there = os.path.exists(target)
    if clear_file and already_there:
        os.remove(target)
        already_there = False

    # A file that does not exist yet gets the header before its first row.
    pending = []
    if not already_there:
        pending.append('Size,LBN_Dense_BIC_Entropy\n')
    pending.append(f"{sample_size},{kl_div_value:.4f}\n")

    # Append mode creates the file when it is missing.
    with open(target, 'a') as out:
        out.writelines(pending)

# For every sample size: learn a BIC-scored network from the dense data, then
# average, over all rows, the K-L divergence between the learned P(SP | IR, EI)
# and the stored ground-truth column for the same (IR, EI) combination.

# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Start fresh and clear the file on the first run
clear_file = True

# Label -> integer code for each variable. These do not change between
# iterations, so they are defined once instead of inside the loop.
ir_map = {'low': 0, 'medium': 1, 'high': 2}
ei_map = {'poor': 0, 'average': 1, 'good': 2}
sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

# Columns the structure is learned over and that the model must contain.
encoded_columns = ['IR_encoded', 'EI_encoded', 'SP_encoded']

# Loop through each sample size
for sample_size in sample_sizes:
    # Load the dense dataset and ground truth probabilities for the current sample size
    dense_data_file = f'outcomes_dense_{sample_size}.csv'
    ground_truth_probs_file = f'probabilities_dense_{sample_size}.csv'
    df_dense = pd.read_csv(dense_data_file)
    df_gt_probs = pd.read_csv(ground_truth_probs_file)

    # Manually encode categorical variables for IR, EI, and SP
    df_dense['IR_encoded'] = df_dense['IR'].map(ir_map)
    df_dense['EI_encoded'] = df_dense['EI'].map(ei_map)
    df_dense['SP_encoded'] = df_dense['SP'].map(sp_map)
    encoded_data = df_dense[encoded_columns]

    # Define the Hill-Climb structure learning algorithm
    hc = HillClimbSearch(encoded_data)
    scoring_method = BicScore(encoded_data)

    # Estimate the best structure
    best_dag = hc.estimate(scoring_method=scoring_method)

    # Build the network from explicit nodes plus the learned edges. A model
    # built from the edge list alone lacks any variable without an edge;
    # adding only SP_encoded afterwards still leaves the evidence variables
    # IR_encoded / EI_encoded unprotected, so all three are added here.
    best_model = BayesianNetwork()
    best_model.add_nodes_from(encoded_columns)
    best_model.add_edges_from(best_dag.edges())

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model.fit(encoded_data, estimator=MaximumLikelihoodEstimator)

    # Create an inference object for the best model
    inference = VariableElimination(best_model)

    # Lower bound applied to the ground truth to avoid division by zero
    epsilon = 1e-10

    # The divergence depends only on the (IR, EI) pair of a row, so it is
    # computed once per distinct pair and reused. One value is still stored
    # per row, which keeps the average weighted by how often each pair occurs.
    kl_by_evidence = {}
    kl_divergences = []
    for ir_value, ei_value in zip(df_dense['IR'], df_dense['EI']):
        evidence_key = (ir_value, ei_value)
        if evidence_key not in kl_by_evidence:
            sample_input = {'IR_encoded': ir_map[ir_value], 'EI_encoded': ei_map[ei_value]}
            predicted_sp_distribution = inference.query(variables=['SP_encoded'], evidence=sample_input)

            # Place each predicted probability at the index of its state
            # code, so the vector lines up with the ground-truth rows even
            # if an SP state never occurred in the data.
            predicted_probs = np.zeros(len(sp_map))
            sp_states = predicted_sp_distribution.state_names['SP_encoded']
            for state, probability in zip(sp_states, predicted_sp_distribution.values):
                predicted_probs[int(state)] = probability

            # Exact column lookup (one column per (IR, EI) combination)
            col_prefix = f'SP_given_IR_{ir_value}_EI_{ei_value}'
            ground_truth_probs = df_gt_probs[col_prefix].values

            # Ensure the probabilities are non-zero to avoid division by zero
            ground_truth_probs = np.clip(ground_truth_probs, epsilon, 1)

            # Compute K-L divergence (Learned BN vs Ground Truth)
            kl_by_evidence[evidence_key] = entropy(predicted_probs, ground_truth_probs)

        kl_divergences.append(kl_by_evidence[evidence_key])

    # Calculate the average K-L divergence over all samples
    average_kl_divergence = np.mean(kl_divergences)

    # Save the K-L divergence value to the CSV file
    save_kl_divergence(sample_size, average_kl_divergence, clear_file=clear_file)

    # Set clear_file to False after the first iteration
    clear_file = False

    # Print confirmation
    print(f"Average K-L Divergence for {sample_size} samples (BIC): {average_kl_divergence:.4f}")

# Notify the user that the process is done
print("\nK-L divergence calculations complete and saved to 'kl_div_LBN_dense_bic.csv'.")

## AIC

In [None]:
# Define the function to save K-L divergence to a file
def save_kl_divergence(sample_size, kl_div_value, clear_file=False):
    """Append one ``Size,value`` row to the dense/AIC K-L divergence CSV.

    Args:
        sample_size: Value written in the ``Size`` column.
        kl_div_value: Number written in the second column, formatted to
            four decimal places.
        clear_file: When true, an existing results file is deleted first so
            the file starts again from just the header.
    """
    file_name = 'kl_div_LBN_dense_aic.csv'  # AIC-specific results file
    file_present = os.path.exists(file_name)

    # A requested reset discards whatever an earlier run left behind.
    if file_present and clear_file:
        os.remove(file_name)
        file_present = False

    # A missing (or just-deleted) file is recreated with its header line.
    if not file_present:
        with open(file_name, 'w') as handle:
            handle.write('Size,LBN_Dense_AIC_Entropy\n')

    # The measurement itself is always appended.
    with open(file_name, 'a') as handle:
        handle.write(f"{sample_size},{kl_div_value:.4f}\n")

# Dense data, AIC score: for each sample size, learn a Bayesian network
# structure with hill climbing, fit its CPDs by maximum likelihood, and record
# the average K-L divergence between the learned P(SP | IR, EI) and the
# ground-truth distribution read from the matching probabilities file.

# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Start fresh and clear the file on the first run
clear_file = True

# Label -> integer code tables. They are the same for every sample size, so
# they are built once instead of on every loop iteration.
ir_map = {'low': 0, 'medium': 1, 'high': 2}
ei_map = {'poor': 0, 'average': 1, 'good': 2}
sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

# Columns handed to structure learning, parameter fitting and inference.
encoded_cols = ['IR_encoded', 'EI_encoded', 'SP_encoded']

# Lower bound applied to ground-truth probabilities to avoid division by zero
epsilon = 1e-10

# Loop through each sample size
for sample_size in sample_sizes:
    # Load the dense dataset and ground truth probabilities for the current sample size
    dense_data_file = f'outcomes_dense_{sample_size}.csv'
    ground_truth_probs_file = f'probabilities_dense_{sample_size}.csv'
    df_dense = pd.read_csv(dense_data_file)
    df_gt_probs = pd.read_csv(ground_truth_probs_file)

    # Manually encode categorical variables for IR, EI, and SP
    df_dense['IR_encoded'] = df_dense['IR'].map(ir_map)
    df_dense['EI_encoded'] = df_dense['EI'].map(ei_map)
    df_dense['SP_encoded'] = df_dense['SP'].map(sp_map)

    # Define the Hill-Climb structure learning algorithm (scored with AIC)
    hc = HillClimbSearch(df_dense[encoded_cols])
    scoring_method = AICScore(df_dense[encoded_cols])

    # Estimate the best structure
    best_dag = hc.estimate(scoring_method=scoring_method)

    # A variable that ends up with no edges does not appear in
    # best_dag.edges(). All three variables are registered explicitly so the
    # evidence variables (IR_encoded, EI_encoded) and the query variable
    # (SP_encoded) always exist in the model. BayesianNetwork replaces the
    # deprecated BayesianModel alias.
    best_model = BayesianNetwork()
    best_model.add_nodes_from(encoded_cols)
    best_model.add_edges_from(best_dag.edges())

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model.fit(df_dense[encoded_cols], estimator=MaximumLikelihoodEstimator)

    # Create an inference object for the best model
    inference = VariableElimination(best_model)

    # The divergence depends only on the (IR, EI) pair, so it is computed once
    # per distinct pair and reused; every row still contributes one value, so
    # the average remains weighted by how often each pair occurs in the data.
    kl_by_state = {}
    kl_divergences = []

    for ir_value, ei_value, ir_code, ei_code in zip(
        df_dense['IR'], df_dense['EI'], df_dense['IR_encoded'], df_dense['EI_encoded']
    ):
        state = (ir_value, ei_value)
        if state not in kl_by_state:
            sample_input = {'IR_encoded': int(ir_code), 'EI_encoded': int(ei_code)}
            predicted_sp_distribution = inference.query(variables=['SP_encoded'], evidence=sample_input)
            predicted_probs = predicted_sp_distribution.values

            # NOTE(review): assumes the ground-truth columns matching this
            # prefix are ordered like SP_encoded states 0/1/2 and that every
            # SP state occurs in the data - confirm against the CSV files.
            col_prefix = f'SP_given_IR_{ir_value}_EI_{ei_value}'
            ground_truth_probs = df_gt_probs.filter(like=col_prefix).values.flatten()

            # Ensure the probabilities are non-zero to avoid division by zero
            ground_truth_probs = np.clip(ground_truth_probs, epsilon, 1)

            # Compute K-L divergence (Learned BN vs Ground Truth)
            kl_by_state[state] = entropy(predicted_probs, ground_truth_probs)

        kl_divergences.append(kl_by_state[state])

    # Calculate the average K-L divergence over all samples
    average_kl_divergence = np.mean(kl_divergences)

    # Save the K-L divergence value to the CSV file
    save_kl_divergence(sample_size, average_kl_divergence, clear_file=clear_file)

    # Set clear_file to False after the first iteration
    clear_file = False

    # Print confirmation
    print(f"Average K-L Divergence for {sample_size} samples (AIC): {average_kl_divergence:.4f}")

# Notify the user that the process is done
print("\nK-L divergence calculations complete and saved to 'kl_div_LBN_dense_aic.csv'.")

# K-L Divergence LBN Sparse Data

## BIC

In [None]:
# Define the function to save K-L divergence to a file
def save_kl_divergence(sample_size, kl_div_value, file_name, clear_file=False):
    """Append one ``Size,value`` row to a sparse/BIC K-L divergence CSV.

    Args:
        sample_size: Value written in the ``Size`` column.
        kl_div_value: Number written in the second column, formatted to
            four decimal places.
        file_name: Path of the CSV file to create or extend.
        clear_file: When true, an existing file at ``file_name`` is deleted
            first so the file starts again from just the header.
    """
    file_present = os.path.exists(file_name)

    # A requested reset discards whatever an earlier run left behind.
    if file_present and clear_file:
        os.remove(file_name)
        file_present = False

    # A missing (or just-deleted) file is recreated with its header line.
    if not file_present:
        with open(file_name, 'w') as handle:
            handle.write('Size,LBN_Sparse_BIC_Entropy\n')

    # The measurement itself is always appended.
    with open(file_name, 'a') as handle:
        handle.write(f"{sample_size},{kl_div_value:.4f}\n")

# Sparse data, BIC score: for each sample size, learn a Bayesian network
# structure with hill climbing, fit its CPDs by maximum likelihood, and record
# the average K-L divergence between the learned P(SP | IR, EI) and the
# ground-truth distribution read from the matching probabilities file.

# Sample sizes to loop through
sample_sizes = range(100, 1100, 100)

# File name for BIC K-L divergence results
kl_div_bic_file = 'kl_div_LBN_sparse_bic.csv'

# Start fresh and clear the file on the first run
clear_file = True

# Label -> integer code tables. They are the same for every sample size, so
# they are built once instead of on every loop iteration.
ir_map = {'low': 0, 'medium': 1, 'high': 2}
ei_map = {'poor': 0, 'average': 1, 'good': 2}
sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

# Columns handed to structure learning, parameter fitting and inference.
encoded_cols = ['IR_encoded', 'EI_encoded', 'SP_encoded']

# Lower bound applied to ground-truth probabilities to avoid division by zero
epsilon = 1e-10

# Loop through each sample size for BIC
for sample_size in sample_sizes:
    print(f"\nCalculating K-L Divergence for BIC with sample size: {sample_size}")

    # Load the sparse dataset and ground truth probabilities for the current sample size
    sparse_data_file = f'outcomes_sparse_{sample_size}.csv'
    ground_truth_probs_file = f'probabilities_sparse_{sample_size}.csv'
    df_sparse = pd.read_csv(sparse_data_file)
    df_gt_probs = pd.read_csv(ground_truth_probs_file)

    # Manually encode categorical variables for IR, EI, and SP
    df_sparse['IR_encoded'] = df_sparse['IR'].map(ir_map)
    df_sparse['EI_encoded'] = df_sparse['EI'].map(ei_map)
    df_sparse['SP_encoded'] = df_sparse['SP'].map(sp_map)

    # Define the Hill-Climb structure learning algorithm
    hc = HillClimbSearch(df_sparse[encoded_cols])
    scoring_method = BicScore(df_sparse[encoded_cols])

    # Estimate the best structure
    best_dag = hc.estimate(scoring_method=scoring_method)

    # Register all three variables explicitly, then add the learned edges, so
    # a variable left without edges is still part of the model. This makes a
    # separate "dummy node" fallback unnecessary: a check that the required
    # nodes are present can never fail right after adding them.
    best_model = BayesianNetwork()
    best_model.add_nodes_from(encoded_cols)
    best_model.add_edges_from(best_dag.edges())

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model.fit(df_sparse[encoded_cols], estimator=MaximumLikelihoodEstimator)

    # Create an inference object for the best model
    inference = VariableElimination(best_model)

    # The divergence depends only on the (IR, EI) pair, so it is computed once
    # per distinct pair and reused; every row still contributes one value, so
    # the average remains weighted by how often each pair occurs in the data.
    kl_by_state = {}
    kl_divergences = []

    for ir_value, ei_value, ir_code, ei_code in zip(
        df_sparse['IR'], df_sparse['EI'], df_sparse['IR_encoded'], df_sparse['EI_encoded']
    ):
        state = (ir_value, ei_value)
        if state not in kl_by_state:
            sample_input = {'IR_encoded': int(ir_code), 'EI_encoded': int(ei_code)}
            predicted_sp_distribution = inference.query(variables=['SP_encoded'], evidence=sample_input)
            predicted_probs = predicted_sp_distribution.values

            # NOTE(review): assumes the ground-truth columns matching this
            # prefix are ordered like SP_encoded states 0/1/2 and that every
            # SP state occurs in the data - confirm against the CSV files.
            col_prefix = f'SP_given_IR_{ir_value}_EI_{ei_value}'
            ground_truth_probs = df_gt_probs.filter(like=col_prefix).values.flatten()

            # Ensure the probabilities are non-zero to avoid division by zero
            ground_truth_probs = np.clip(ground_truth_probs, epsilon, 1)

            # Compute K-L divergence (Learned BN vs Ground Truth)
            kl_by_state[state] = entropy(predicted_probs, ground_truth_probs)

        kl_divergences.append(kl_by_state[state])

    # Calculate the average K-L divergence over all samples
    average_kl_divergence = np.mean(kl_divergences)

    # Save the K-L divergence value to the CSV file
    save_kl_divergence(sample_size, average_kl_divergence, kl_div_bic_file, clear_file=clear_file)

    # Set clear_file to False after the first iteration
    clear_file = False

    # Print confirmation
    print(f"Average K-L Divergence for {sample_size} samples (BIC): {average_kl_divergence:.4f}")

# Notify the user that the process is done; the name is taken from the
# variable so the message always matches the file that was actually written.
print(f"\nK-L divergence calculations for BIC complete and saved to '{kl_div_bic_file}'.")

## AIC

In [None]:
# Define the function to save K-L divergence to a file
def save_kl_divergence(sample_size, kl_div_value, file_name, clear_file=False):
    """Append one ``Size,value`` row to a sparse/AIC K-L divergence CSV.

    Args:
        sample_size: Value written in the ``Size`` column.
        kl_div_value: Number written in the second column, formatted to
            four decimal places.
        file_name: Path of the CSV file to create or extend.
        clear_file: When true, an existing file at ``file_name`` is deleted
            first so the file starts again from just the header.
    """
    file_present = os.path.exists(file_name)

    # A requested reset discards whatever an earlier run left behind.
    if file_present and clear_file:
        os.remove(file_name)
        file_present = False

    # A missing (or just-deleted) file is recreated with its header line.
    if not file_present:
        with open(file_name, 'w') as handle:
            handle.write('Size,LBN_Sparse_AIC_Entropy\n')

    # The measurement itself is always appended.
    with open(file_name, 'a') as handle:
        handle.write(f"{sample_size},{kl_div_value:.4f}\n")

# Sparse data, AIC score: for each sample size, learn a Bayesian network
# structure with hill climbing, fit its CPDs by maximum likelihood, and record
# the average K-L divergence between the learned P(SP | IR, EI) and the
# ground-truth distribution read from the matching probabilities file.

# Sample sizes to loop through
sample_sizes = range(100, 1100, 100)

# File name for AIC K-L divergence results
kl_div_aic_file = 'kl_div_LBN_sparse_aic.csv'

# Start fresh and clear the file on the first run
clear_file = True

# Label -> integer code tables. They are the same for every sample size, so
# they are built once instead of on every loop iteration.
ir_map = {'low': 0, 'medium': 1, 'high': 2}
ei_map = {'poor': 0, 'average': 1, 'good': 2}
sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

# Columns handed to structure learning, parameter fitting and inference.
encoded_cols = ['IR_encoded', 'EI_encoded', 'SP_encoded']

# Lower bound applied to ground-truth probabilities to avoid division by zero
epsilon = 1e-10

# Loop through each sample size for AIC
for sample_size in sample_sizes:
    print(f"\nCalculating K-L Divergence for AIC with sample size: {sample_size}")

    # Load the sparse dataset and ground truth probabilities for the current sample size
    sparse_data_file = f'outcomes_sparse_{sample_size}.csv'
    ground_truth_probs_file = f'probabilities_sparse_{sample_size}.csv'
    df_sparse = pd.read_csv(sparse_data_file)
    df_gt_probs = pd.read_csv(ground_truth_probs_file)

    # Manually encode categorical variables for IR, EI, and SP
    df_sparse['IR_encoded'] = df_sparse['IR'].map(ir_map)
    df_sparse['EI_encoded'] = df_sparse['EI'].map(ei_map)
    df_sparse['SP_encoded'] = df_sparse['SP'].map(sp_map)

    # Define the Hill-Climb structure learning algorithm (scored with AIC)
    hc = HillClimbSearch(df_sparse[encoded_cols])
    scoring_method = AICScore(df_sparse[encoded_cols])

    # Estimate the best structure
    best_dag = hc.estimate(scoring_method=scoring_method)

    # Register all three variables explicitly, then add the learned edges, so
    # a variable left without edges is still part of the model. This makes a
    # separate "dummy node" fallback unnecessary: a check that the required
    # nodes are present can never fail right after adding them.
    best_model = BayesianNetwork()
    best_model.add_nodes_from(encoded_cols)
    best_model.add_edges_from(best_dag.edges())

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model.fit(df_sparse[encoded_cols], estimator=MaximumLikelihoodEstimator)

    # Create an inference object for the best model
    inference = VariableElimination(best_model)

    # The divergence depends only on the (IR, EI) pair, so it is computed once
    # per distinct pair and reused; every row still contributes one value, so
    # the average remains weighted by how often each pair occurs in the data.
    kl_by_state = {}
    kl_divergences = []

    for ir_value, ei_value, ir_code, ei_code in zip(
        df_sparse['IR'], df_sparse['EI'], df_sparse['IR_encoded'], df_sparse['EI_encoded']
    ):
        state = (ir_value, ei_value)
        if state not in kl_by_state:
            sample_input = {'IR_encoded': int(ir_code), 'EI_encoded': int(ei_code)}
            predicted_sp_distribution = inference.query(variables=['SP_encoded'], evidence=sample_input)
            predicted_probs = predicted_sp_distribution.values

            # NOTE(review): assumes the ground-truth columns matching this
            # prefix are ordered like SP_encoded states 0/1/2 and that every
            # SP state occurs in the data - confirm against the CSV files.
            col_prefix = f'SP_given_IR_{ir_value}_EI_{ei_value}'
            ground_truth_probs = df_gt_probs.filter(like=col_prefix).values.flatten()

            # Ensure the probabilities are non-zero to avoid division by zero
            ground_truth_probs = np.clip(ground_truth_probs, epsilon, 1)

            # Compute K-L divergence (Learned BN vs Ground Truth)
            kl_by_state[state] = entropy(predicted_probs, ground_truth_probs)

        kl_divergences.append(kl_by_state[state])

    # Calculate the average K-L divergence over all samples
    average_kl_divergence = np.mean(kl_divergences)

    # Save the K-L divergence value to the CSV file
    save_kl_divergence(sample_size, average_kl_divergence, kl_div_aic_file, clear_file=clear_file)

    # Set clear_file to False after the first iteration
    clear_file = False

    # Print confirmation
    print(f"Average K-L Divergence for {sample_size} samples (AIC): {average_kl_divergence:.4f}")

# Notify the user that the process is done; the name is taken from the
# variable so the message always matches the file that was actually written.
print(f"\nK-L divergence calculations for AIC complete and saved to '{kl_div_aic_file}'.")

# ------------------------------------------------------------------------------------------------------------