<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/NN_1_10_Relu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
pip install pgmpy

Collecting pgmpy
  Downloading pgmpy-0.1.26-py3-none-any.whl.metadata (9.1 kB)
Downloading pgmpy-0.1.26-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pgmpy
Successfully installed pgmpy-0.1.26


In [2]:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD
from pgmpy.sampling import BayesianModelSampling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import layers, models
from pgmpy.estimators import HillClimbSearch, BicScore, AICScore, MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
from sklearn.metrics import accuracy_score
from scipy.stats import entropy
import os
import matplotlib.pyplot as plt

# ------------------------------------------------------------------------------------------------------------

# Bayesian Network Data Generation 500, 1000, 1500, ..., 10000 Samples (dense)

In [3]:
#np.random.seed(1)

# Define the mappings for IR, EI, SP
ir_map = {0: 'low', 1: 'medium', 2: 'high'}
ei_map = {0: 'poor', 1: 'average', 2: 'good'}
sp_map = {0: 'decrease', 1: 'stable', 2: 'increase'}

# Define the dense Bayesian Network
dense_model = BayesianNetwork([('IR', 'EI'), ('EI', 'SP'), ('IR', 'SP')])

# Function to generate CPDs
def generate_cpds():
    ir_probs = np.random.rand(3)
    ir_probs /= ir_probs.sum()  # Normalize to make it a valid probability distribution

    ei_given_ir_probs = np.random.rand(3, 3)
    ei_given_ir_probs /= ei_given_ir_probs.sum(axis=0, keepdims=True)

    sp_probs = np.random.rand(3, 3, 3)
    sp_probs /= sp_probs.sum(axis=0, keepdims=True)

    sp_probs_reshaped = sp_probs.reshape(3, -1)

    return ir_probs, ei_given_ir_probs, sp_probs_reshaped

# Save probabilities in a single CSV file
def save_probabilities(ir_probs, ei_probs, sp_probs, filename):
    # Create a DataFrame for IR probabilities
    ir_df = pd.DataFrame({
        'IR_State': ['low', 'medium', 'high'],
        'IR_Prob': ir_probs
    })

    # Create a DataFrame for EI given IR probabilities
    ei_df = pd.DataFrame(ei_probs, columns=['EI_given_IR_low', 'EI_given_IR_medium', 'EI_given_IR_high'])
    ei_df['EI_State'] = ['poor', 'average', 'good']

    # Create a DataFrame for SP given IR and EI probabilities
    sp_df = pd.DataFrame(sp_probs, columns=[
        'SP_given_IR_low_EI_poor', 'SP_given_IR_low_EI_average', 'SP_given_IR_low_EI_good',
        'SP_given_IR_medium_EI_poor', 'SP_given_IR_medium_EI_average', 'SP_given_IR_medium_EI_good',
        'SP_given_IR_high_EI_poor', 'SP_given_IR_high_EI_average', 'SP_given_IR_high_EI_good'
    ])
    sp_df['SP_State'] = ['decrease', 'stable', 'increase']

    # Combine all data into a single DataFrame
    combined_df = pd.concat([ir_df, ei_df, sp_df], axis=1)

    # Save the combined DataFrame as a single CSV file
    combined_df.to_csv(filename, index=False)

# Save outcomes in a CSV file
def save_outcomes(data_dense, filename):
    data_dense['IR'] = data_dense['IR'].map(ir_map)
    data_dense['EI'] = data_dense['EI'].map(ei_map)
    data_dense['SP'] = data_dense['SP'].map(sp_map)
    data_dense.to_csv(filename, index=False)

# Generate datasets for different sample sizes for the dense model
sample_sizes = range(500, 10500, 500)
for size in sample_sizes:
    # Generate the CPDs
    ir_probs, ei_given_ir_probs, sp_probs_reshaped = generate_cpds()

    # Define CPDs for the dense model
    cpd_ir = TabularCPD(variable='IR', variable_card=3, values=[[ir_probs[0]], [ir_probs[1]], [ir_probs[2]]])
    cpd_ei_dense = TabularCPD(variable='EI', variable_card=3,
                              values=ei_given_ir_probs,
                              evidence=['IR'], evidence_card=[3])
    cpd_sp_dense = TabularCPD(variable='SP', variable_card=3,
                              values=sp_probs_reshaped,
                              evidence=['IR', 'EI'], evidence_card=[3, 3])

    dense_model.add_cpds(cpd_ir, cpd_ei_dense, cpd_sp_dense)

    # Check if the model is valid
    assert dense_model.check_model()

    # Generate samples
    sampler_dense = BayesianModelSampling(dense_model)
    data_dense = sampler_dense.forward_sample(size=size)

    # Save probabilities in one file
    save_probabilities(ir_probs, ei_given_ir_probs, sp_probs_reshaped, f'probabilities_dense_{size}.csv')

    # Save outcomes (low, medium, high) in another file
    save_outcomes(data_dense, f'outcomes_dense_{size}.csv')

# Notify the user that the process is done
print("Data generation and saving complete for the dense model!")

  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



Data generation and saving complete for the dense model!


# Bayesian Network Data Generation 500, 1000, 1500, ..., 10000 Samples (sparse)

In [4]:
#np.random.seed(187)

# Define the mappings for IR, EI, SP
ir_map = {0: 'low', 1: 'medium', 2: 'high'}
ei_map = {0: 'poor', 1: 'average', 2: 'good'}
sp_map = {0: 'decrease', 1: 'stable', 2: 'increase'}

# Define the sparse Bayesian Network
sparse_model = BayesianNetwork([('IR', 'SP'), ('EI', 'SP')])

# Function to generate CPDs for the sparse model
def generate_cpds_sparse():
    # Generate probabilities for IR (unconditional)
    ir_probs = np.random.rand(3)
    ir_probs /= ir_probs.sum()  # Normalize to make it a valid probability distribution

    # Generate unconditional probabilities for EI (no dependency on IR)
    ei_probs = np.random.rand(3)
    ei_probs /= ei_probs.sum()

    # Generate conditional probabilities for SP given IR and EI
    sp_probs = np.random.rand(3, 3, 3)
    sp_probs /= sp_probs.sum(axis=0, keepdims=True)

    sp_probs_reshaped = sp_probs.reshape(3, -1)

    return ir_probs, ei_probs, sp_probs_reshaped

# Save probabilities in a single CSV file
def save_probabilities_sparse(ir_probs, ei_probs, sp_probs, filename):
    # Create a DataFrame for IR probabilities
    ir_df = pd.DataFrame({
        'IR_State': ['low', 'medium', 'high'],
        'IR_Prob': ir_probs
    })

    # Create a DataFrame for EI probabilities (since it's not conditional on IR)
    ei_df = pd.DataFrame({
        'EI_State': ['poor', 'average', 'good'],
        'EI_Prob': ei_probs
    })

    # Create a DataFrame for SP given IR and EI probabilities
    sp_df = pd.DataFrame(sp_probs, columns=[
        'SP_given_IR_low_EI_poor', 'SP_given_IR_low_EI_average', 'SP_given_IR_low_EI_good',
        'SP_given_IR_medium_EI_poor', 'SP_given_IR_medium_EI_average', 'SP_given_IR_medium_EI_good',
        'SP_given_IR_high_EI_poor', 'SP_given_IR_high_EI_average', 'SP_given_IR_high_EI_good'
    ])
    sp_df['SP_State'] = ['decrease', 'stable', 'increase']

    # Combine all data into a single DataFrame
    combined_df = pd.concat([ir_df, ei_df, sp_df], axis=1)

    # Save the combined DataFrame as a single CSV file
    combined_df.to_csv(filename, index=False)

# Save outcomes in a CSV file
def save_outcomes_sparse(data_sparse, filename):
    data_sparse['IR'] = data_sparse['IR'].map(ir_map)
    data_sparse['EI'] = data_sparse['EI'].map(ei_map)
    data_sparse['SP'] = data_sparse['SP'].map(sp_map)
    data_sparse.to_csv(filename, index=False)

# Generate datasets for different sample sizes for the sparse model
sample_sizes = range(500, 10500, 500)
for size in sample_sizes:
    # Generate the CPDs
    ir_probs, ei_probs, sp_probs_reshaped = generate_cpds_sparse()

    # Define CPDs for the sparse model
    cpd_ir = TabularCPD(variable='IR', variable_card=3, values=[[ir_probs[0]], [ir_probs[1]], [ir_probs[2]]])
    cpd_ei_sparse = TabularCPD(variable='EI', variable_card=3, values=[[ei_probs[0]], [ei_probs[1]], [ei_probs[2]]])
    cpd_sp_sparse = TabularCPD(variable='SP', variable_card=3,
                               values=sp_probs_reshaped,
                               evidence=['IR', 'EI'], evidence_card=[3, 3])

    sparse_model.add_cpds(cpd_ir, cpd_ei_sparse, cpd_sp_sparse)

    # Check if the model is valid
    assert sparse_model.check_model()

    # Generate samples
    sampler_sparse = BayesianModelSampling(sparse_model)
    data_sparse = sampler_sparse.forward_sample(size=size)

    # Save probabilities in one file
    save_probabilities_sparse(ir_probs, ei_probs, sp_probs_reshaped, f'probabilities_sparse_{size}.csv')

    # Save outcomes (low, medium, high) in another file
    save_outcomes_sparse(data_sparse, f'outcomes_sparse_{size}.csv')

# Notify the user that the process is done
print("Data generation and saving complete for the sparse model!")

  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Data generation and saving complete for the sparse model!


In [None]:
# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size}")

    # Load the dense dataset for the current sample size
    dense_data_file = f'outcomes_dense_{sample_size}.csv'
    df_dense = pd.read_csv(dense_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df_dense['IR_encoded'] = df_dense['IR'].map(ir_map)
    df_dense['EI_encoded'] = df_dense['EI'].map(ei_map)
    df_dense['SP_encoded'] = df_dense['SP'].map(sp_map)

    # Define the Hill-Climb structure learning algorithm
    hc = HillClimbSearch(df_dense[['IR_encoded', 'EI_encoded', 'SP_encoded']])
    scoring_method = BicScore(df_dense[['IR_encoded', 'EI_encoded', 'SP_encoded']])

    # Estimate the best structure
    best_dag = hc.estimate(scoring_method=scoring_method)
    best_model = BayesianNetwork(best_dag.edges())

    # Check if all nodes are included in the learned structure
    nodes_in_structure = set(best_model.nodes())
    required_nodes = {'IR_encoded', 'EI_encoded', 'SP_encoded'}

    if not required_nodes.issubset(nodes_in_structure):
        print("\nNot all nodes are connected. Adding a dummy variable.")
        # Add a dummy variable to the dataset
        df_dense['dummy'] = 0

        # Re-estimate the structure with the dummy variable
        hc = HillClimbSearch(df_dense)
        scoring_method = BicScore(df_dense)
        best_dag = hc.estimate(scoring_method=scoring_method)
        best_model = BayesianNetwork(best_dag.edges())

    # Display the learned structure (edges of the Bayesian Network)
    print(f"\nLearned Structure (Edges) for {sample_size} samples:")
    print(best_model.edges())

    # Calculate and display the BIC score
    bic_score = scoring_method.score(best_model)
    print(f"\nBIC Score for {sample_size} samples: {bic_score:.4f}")

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model.fit(df_dense, estimator=MaximumLikelihoodEstimator)

    # Check if the model is valid after learning the parameters
    assert best_model.check_model()

    # Print the learned CPDs (Conditional Probability Distributions)
    for cpd in best_model.get_cpds():
        print("\nCPD of", cpd.variable)
        print(cpd)

    # Create an inference object for the best model
    inference = VariableElimination(best_model)

    # Placeholder to store predictions
    predicted_sp_labels = []

    # Loop through each row in the dense dataset to make predictions
    for index, row in df_dense.iterrows():
        # Prepare the evidence from the dataset (IR_encoded and EI_encoded)
        sample_input = {'IR_encoded': int(row['IR_encoded']), 'EI_encoded': int(row['EI_encoded'])}

        # Perform inference to predict the distribution for SP_encoded (Stock Price)
        predicted_sp_distribution = inference.query(variables=['SP_encoded'], evidence=sample_input)

        # Extract the most likely SP_encoded class
        predicted_sp_class = predicted_sp_distribution.values.argmax()
        sp_reverse_map = {0: 'decrease', 1: 'stable', 2: 'increase'}
        predicted_sp_label = sp_reverse_map[predicted_sp_class]

        # Store the predicted label
        predicted_sp_labels.append(predicted_sp_label)

    # Convert the list of predicted labels into a DataFrame for easier comparison
    predicted_results_df = pd.DataFrame({
        'IR': df_dense['IR'],  # Original IR column
        'EI': df_dense['EI'],  # Original EI column
        'Predicted_SP': predicted_sp_labels  # Predicted SP column
    })

    # Add the actual SP values for comparison
    predicted_results_df['Actual_SP'] = df_dense['SP']

    # Calculate accuracy of predictions
    accuracy = accuracy_score(predicted_results_df['Actual_SP'], predicted_results_df['Predicted_SP'])
    print(f"\nPrediction Accuracy for {sample_size} samples: {accuracy:.4f}")

    # Display the first few rows of predictions
    print(f"\nPredicted Results for Dense Data (First 10 rows) for {sample_size} samples:")
    print(predicted_results_df.head(10))

# Notify the user that the process is done
print("\nProcessing complete for all sample sizes.")


Processing sample size: 500


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 500 samples:
[('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]

BIC Score for 500 samples: -1319.6222

CPD of IR_encoded
+---------------+-------+
| IR_encoded(0) | 0.352 |
+---------------+-------+
| IR_encoded(1) | 0.648 |
+---------------+-------+

CPD of SP_encoded
+---------------+---------------------+-----+----------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)        | EI_encoded(2)       |
+---------------+---------------------+-----+----------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(0)        | IR_encoded(1)       |
+---------------+---------------------+-----+----------------------+---------------------+
| SP_encoded(0) | 0.12727272727272726 | ... | 0.023809523809523808 | 0.6122448979591837  |
+---------------+---------------------+-----+----------------------+---------------------+
| SP_encoded(1) | 0.8

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1000 samples:
[('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]

BIC Score for 1000 samples: -2711.4516

CPD of IR_encoded
+---------------+-------+
| IR_encoded(0) | 0.525 |
+---------------+-------+
| IR_encoded(1) | 0.057 |
+---------------+-------+
| IR_encoded(2) | 0.418 |
+---------------+-------+

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | IR_encoded(1)       | IR_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.11238095238095239 | 0.03508771929824561 | 0.34688995215311    |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.8552380952380952  | 0.5087719298245614  | 0.17703349282296652 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.03238095238095238 | 0.4561

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1500 samples:
[('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]

BIC Score for 1500 samples: -4609.2884

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.458    |
+---------------+----------+
| IR_encoded(1) | 0.129333 |
+---------------+----------+
| IR_encoded(2) | 0.412667 |
+---------------+----------+

CPD of SP_encoded
+---------------+---------------------+-----+--------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)      | EI_encoded(2)       |
+---------------+---------------------+-----+--------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)      | IR_encoded(2)       |
+---------------+---------------------+-----+--------------------+---------------------+
| SP_encoded(0) | 0.4791666666666667  | ... | 0.3333333333333333 | 0.3622448979591837  |
+---------------+---------------------+-----+----

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2000 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

BIC Score for 2000 samples: -6037.1192

CPD of IR_encoded
+---------------+---------------------+--------------------+--------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)      | SP_encoded(2)      |
+---------------+---------------------+--------------------+--------------------+
| IR_encoded(0) | 0.3587662337662338  | 0.4549019607843137 | 0.3473344103392569 |
+---------------+---------------------+--------------------+--------------------+
| IR_encoded(1) | 0.10714285714285714 | 0.2758169934640523 | 0.5266558966074314 |
+---------------+---------------------+--------------------+--------------------+
| IR_encoded(2) | 0.5340909090909091  | 0.269281045751634  | 0.1260096930533118 |
+---------------+---------------------+--------------------+--------------------+

CPD of EI_encoded
+---------------+---------------------+-----+---

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2500 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

BIC Score for 2500 samples: -7367.3181

CPD of IR_encoded
+---------------+--------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)      | SP_encoded(1)       | SP_encoded(2)       |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(0) | 0.3423076923076923 | 0.06787330316742081 | 0.37440191387559807 |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(1) | 0.4576923076923077 | 0.3212669683257919  | 0.18301435406698566 |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(2) | 0.2                | 0.6108597285067874  | 0.44258373205741625 |
+---------------+--------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------------+

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3000 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

BIC Score for 3000 samples: -9073.2248

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.6323809523809524  | 0.20218120805369127 | 0.4287598944591029  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.24                | 0.5956375838926175  | 0.43007915567282323 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.12761904761904763 | 0.20218120805369127 | 0.14116094986807387 |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+-------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3500 samples:
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 3500 samples: -10222.5304

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.179429 |
+---------------+----------+
| EI_encoded(1) | 0.531429 |
+---------------+----------+
| EI_encoded(2) | 0.289143 |
+---------------+----------+

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+----------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)        |
+---------------+---------------------+-----+---------------------+----------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)        |
+---------------+---------------------+-----+---------------------+----------------------+
| IR_encoded(0) | 0.1206896551724138  | ... | 0.14641744548286603 | 0.033268101761252444 |
+---------------+-------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4000 samples:
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

BIC Score for 4000 samples: -11406.3537

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.3629310344827586  | 0.09164086687306501 | 0.10857142857142857 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.29310344827586204 | 0.5578947368421052  | 0.6555102040816326  |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.3439655172413793  | 0.3504643962848297  | 0.23591836734693877 |
+---------------+---------------------+---------------------+---------------------+

CPD of IR_encoded
+---------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4500 samples:
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 4500 samples: -11623.1219

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.36135812449474536 | 0.15428571428571428 | 0.13002944062806673 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.52303961196443    | 0.43020408163265306 | 0.6609421000981355  |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.11560226354082458 | 0.41551020408163264 | 0.20902845927379785 |
+---------------+---------------------+---------------------+---------------------+

CPD of IR_encoded
+---------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5000 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 5000 samples: -14862.5625

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.4466950959488273  | 0.350104821802935   | 0.34292763157894735 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.43976545842217485 | 0.5707547169811321  | 0.5263157894736842  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.11353944562899787 | 0.07914046121593292 | 0.13075657894736842 |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5500 samples:
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded'), ('EI_encoded', 'IR_encoded')]

BIC Score for 5500 samples: -15253.4326

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | EI_encoded(1)       | EI_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.6530032467532467  | 0.7236580516898609  | 0.6180422264875239  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.12702922077922077 | 0.21829025844930416 | 0.272552783109405   |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.21996753246753248 | 0.05805168986083499 | 0.10940499040307101 |
+---------------+---------------------+---------------------+---------------------+

CPD of SP_encoded
+---------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6000 samples:
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 6000 samples: -18159.7919

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.521167 |
+---------------+----------+
| EI_encoded(1) | 0.222833 |
+---------------+----------+
| EI_encoded(2) | 0.256    |
+---------------+----------+

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.3792808219178082  | ... | 0.3549382716049383  | 0.5752688172043011  |
+---------------+---------------------+---

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6500 samples:
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]

BIC Score for 6500 samples: -18795.5857

CPD of IR_encoded
+---------------+-----------+
| IR_encoded(0) | 0.383692  |
+---------------+-----------+
| IR_encoded(1) | 0.0355385 |
+---------------+-----------+
| IR_encoded(2) | 0.580769  |
+---------------+-----------+

CPD of EI_encoded
+---------------+--------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)      | ... | IR_encoded(2)       | IR_encoded(2)       |
+---------------+--------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)      | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+--------------------+-----+---------------------+---------------------+
| EI_encoded(0) | 0.1720226843100189 | ... | 0.4531568228105906  | 0.25096525096525096 |
+---------------+--------------------+---

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7000 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 7000 samples: -19224.3055

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.33005531653349723 | 0.37037037037037035 | 0.5574795574795575  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.4732636754763368  | 0.5728597449908925  | 0.36892736892736894 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.19668100799016594 | 0.0567698846387371  | 0.0735930735930736  |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7500 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 7500 samples: -22791.1369

CPD of IR_encoded
+---------------+---------------------+---------------------+--------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)      |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(0) | 0.31312690798081116 | 0.3797930614729154  | 0.2932098765432099 |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(1) | 0.43959877889228083 | 0.33414485696895924 | 0.351010101010101  |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(2) | 0.24727431312690798 | 0.2860620815581254  | 0.3557800224466891 |
+---------------+---------------------+---------------------+--------------------+

CPD of EI_encoded
+---------------+---------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8000 samples:
[('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]

BIC Score for 8000 samples: -23173.5393

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.594125 |
+---------------+----------+
| IR_encoded(1) | 0.29025  |
+---------------+----------+
| IR_encoded(2) | 0.115625 |
+---------------+----------+

CPD of SP_encoded
+---------------+----------------------+-----+--------------------+---------------------+
| EI_encoded    | EI_encoded(0)        | ... | EI_encoded(2)      | EI_encoded(2)       |
+---------------+----------------------+-----+--------------------+---------------------+
| IR_encoded    | IR_encoded(0)        | ... | IR_encoded(1)      | IR_encoded(2)       |
+---------------+----------------------+-----+--------------------+---------------------+
| SP_encoded(0) | 0.003740648379052369 | ... | 0.3229018492176387 | 0.4699248120300752  |
+---------------+----------------------+--

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8500 samples:
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded'), ('EI_encoded', 'IR_encoded')]

BIC Score for 8500 samples: -26673.3248

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | EI_encoded(1)       | EI_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.34951456310679613 | 0.4577853203224438  | 0.405116002379536   |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.2937792161093132  | 0.18201103097157403 | 0.22873289708506842 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.3567062207838907  | 0.3602036487059822  | 0.3661511005353956  |
+---------------+---------------------+---------------------+---------------------+

CPD of SP_encoded
+---------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9000 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 9000 samples: -25678.4204

CPD of IR_encoded
+---------------+----------------------+----------------------+---------------------+
| SP_encoded    | SP_encoded(0)        | SP_encoded(1)        | SP_encoded(2)       |
+---------------+----------------------+----------------------+---------------------+
| IR_encoded(0) | 0.307740130556419    | 0.5086916742909423   | 0.45726837060702874 |
+---------------+----------------------+----------------------+---------------------+
| IR_encoded(1) | 0.6673919801056886   | 0.46996035376639217  | 0.4972044728434505  |
+---------------+----------------------+----------------------+---------------------+
| IR_encoded(2) | 0.024867889337892447 | 0.021347971942665446 | 0.04552715654952077 |
+---------------+----------------------+----------------------+---------------------+

CPD of EI_encoded
+----------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9500 samples:
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]

BIC Score for 9500 samples: -28290.2908

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.196737 |
+---------------+----------+
| IR_encoded(1) | 0.319684 |
+---------------+----------+
| IR_encoded(2) | 0.483579 |
+---------------+----------+

CPD of EI_encoded
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)        | ... | IR_encoded(2)       | IR_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)        | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| EI_encoded(0) | 0.2945054945054945   | ... | 0.3809782608695652  | 0.42289239204934886 |
+---------------+-------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 10000 samples:
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]

BIC Score for 10000 samples: -29479.1864

CPD of IR_encoded
+---------------+--------+
| IR_encoded(0) | 0.5033 |
+---------------+--------+
| IR_encoded(1) | 0.2318 |
+---------------+--------+
| IR_encoded(2) | 0.2649 |
+---------------+--------+

CPD of EI_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(2)       | IR_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded(0) | 0.03593339176161262 | ... | 0.12480252764612954 | 0.4421052631578947  |
+---------------+---------------------+-----+---------

## AIC

In [None]:
# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size}")

    # Load the dense dataset for the current sample size
    dense_data_file = f'outcomes_dense_{sample_size}.csv'
    df_dense = pd.read_csv(dense_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df_dense['IR_encoded'] = df_dense['IR'].map(ir_map)
    df_dense['EI_encoded'] = df_dense['EI'].map(ei_map)
    df_dense['SP_encoded'] = df_dense['SP'].map(sp_map)

    # Define the Hill-Climb structure learning algorithm
    hc = HillClimbSearch(df_dense[['IR_encoded', 'EI_encoded', 'SP_encoded']])
    scoring_method = AICScore(df_dense[['IR_encoded', 'EI_encoded', 'SP_encoded']])  # Use AICScore instead of BicScore

    # Estimate the best structure
    best_dag = hc.estimate(scoring_method=scoring_method)
    best_model = BayesianNetwork(best_dag.edges())

    # Display the learned structure (edges of the Bayesian Network)
    print(f"\nLearned Structure (Edges) for {sample_size} samples (AIC):")
    print(best_model.edges())

    # Calculate and display the AIC score
    aic_score = scoring_method.score(best_model)
    print(f"\nAIC Score for {sample_size} samples: {aic_score:.4f}")

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model.fit(df_dense[['IR_encoded', 'EI_encoded', 'SP_encoded']], estimator=MaximumLikelihoodEstimator)

    # Check if the model is valid after learning the parameters
    assert best_model.check_model()

    # Print the learned CPDs (Conditional Probability Distributions)
    for cpd in best_model.get_cpds():
        print("\nCPD of", cpd.variable)
        print(cpd)

    # Create an inference object for the best model
    inference = VariableElimination(best_model)

    # Placeholder to store predictions
    predicted_sp_labels = []

    # Loop through each row in the dense dataset to make predictions
    for index, row in df_dense.iterrows():
        # Prepare the evidence from the dataset (IR_encoded and EI_encoded)
        sample_input = {'IR_encoded': int(row['IR_encoded']), 'EI_encoded': int(row['EI_encoded'])}

        # Perform inference to predict the distribution for SP_encoded (Stock Price)
        predicted_sp_distribution = inference.query(variables=['SP_encoded'], evidence=sample_input)

        # Extract the most likely SP_encoded class
        predicted_sp_class = predicted_sp_distribution.values.argmax()
        sp_reverse_map = {0: 'decrease', 1: 'stable', 2: 'increase'}
        predicted_sp_label = sp_reverse_map[predicted_sp_class]

        # Store the predicted label
        predicted_sp_labels.append(predicted_sp_label)

    # Convert the list of predicted labels into a DataFrame for easier comparison
    predicted_results_df = pd.DataFrame({
        'IR': df_dense['IR'],  # Original IR column
        'EI': df_dense['EI'],  # Original EI column
        'Predicted_SP': predicted_sp_labels  # Predicted SP column
    })

    # Add the actual SP values for comparison
    predicted_results_df['Actual_SP'] = df_dense['SP']

    # Calculate accuracy of predictions
    accuracy = accuracy_score(predicted_results_df['Actual_SP'], predicted_results_df['Predicted_SP'])
    print(f"\nPrediction Accuracy for {sample_size} samples (AIC): {accuracy:.4f}")

    # Display the first few rows of predictions
    print(f"\nPredicted Results for Dense Data (First 10 rows) for {sample_size} samples (AIC):")
    print(predicted_results_df.head(10))

    # Save the results if needed
    results_filename = f'predicted_results_aic_{sample_size}.csv'
    predicted_results_df.to_csv(results_filename, index=False)
    print(f"\nResults saved to {results_filename}")

# Notify the user that the process is done
print("\nProcessing complete for all sample sizes using AIC.")


Processing sample size: 500


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 500 samples: -1283.7981

CPD of IR_encoded
+---------------+---------------------+--------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)      | SP_encoded(2)       |
+---------------+---------------------+--------------------+---------------------+
| IR_encoded(0) | 0.17532467532467533 | 0.4550561797752809 | 0.40476190476190477 |
+---------------+---------------------+--------------------+---------------------+
| IR_encoded(1) | 0.8246753246753247  | 0.5449438202247191 | 0.5952380952380952  |
+---------------+---------------------+--------------------+---------------------+

CPD of EI_encoded
+---------------+---------------------+-----+---------------------+---------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)       | IR_encoded(1) |
+---------------+----------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]

AIC Score for 1000 samples: -2669.6189

CPD of IR_encoded
+---------------+-------+
| IR_encoded(0) | 0.525 |
+---------------+-------+
| IR_encoded(1) | 0.057 |
+---------------+-------+
| IR_encoded(2) | 0.418 |
+---------------+-------+

CPD of EI_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(2)       | IR_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded(0) | 0.1986754966887417  | ... | 0.5                 | 0.3484848484848485  |
+---------------+---------------------+-----+-------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1500 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]

AIC Score for 1500 samples: -4540.2165

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.458    |
+---------------+----------+
| IR_encoded(1) | 0.129333 |
+---------------+----------+
| IR_encoded(2) | 0.412667 |
+---------------+----------+

CPD of SP_encoded
+---------------+---------------------+-----+--------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)      | EI_encoded(2)       |
+---------------+---------------------+-----+--------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)      | IR_encoded(2)       |
+---------------+---------------------+-----+--------------------+---------------------+
| SP_encoded(0) | 0.4791666666666667  | ... | 0.3333333333333333 | 0.3622448979591837  |
+---------------+---------------------+----

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

AIC Score for 2000 samples: -5964.3075

CPD of IR_encoded
+---------------+---------------------+--------------------+--------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)      | SP_encoded(2)      |
+---------------+---------------------+--------------------+--------------------+
| IR_encoded(0) | 0.3587662337662338  | 0.4549019607843137 | 0.3473344103392569 |
+---------------+---------------------+--------------------+--------------------+
| IR_encoded(1) | 0.10714285714285714 | 0.2758169934640523 | 0.5266558966074314 |
+---------------+---------------------+--------------------+--------------------+
| IR_encoded(2) | 0.5340909090909091  | 0.269281045751634  | 0.1260096930533118 |
+---------------+---------------------+--------------------+--------------------+

CPD of EI_encoded
+---------------+---------------------+---

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

AIC Score for 2500 samples: -7291.6055

CPD of IR_encoded
+---------------+--------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)      | SP_encoded(1)       | SP_encoded(2)       |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(0) | 0.3423076923076923 | 0.06787330316742081 | 0.37440191387559807 |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(1) | 0.4576923076923077 | 0.3212669683257919  | 0.18301435406698566 |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(2) | 0.2                | 0.6108597285067874  | 0.44258373205741625 |
+---------------+--------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+----------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

AIC Score for 3000 samples: -8995.1420

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.6323809523809524  | 0.20218120805369127 | 0.4287598944591029  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.24                | 0.5956375838926175  | 0.43007915567282323 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.12761904761904763 | 0.20218120805369127 | 0.14116094986807387 |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+-------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3500 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 3500 samples: -10142.4437

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.179429 |
+---------------+----------+
| EI_encoded(1) | 0.531429 |
+---------------+----------+
| EI_encoded(2) | 0.289143 |
+---------------+----------+

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+----------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)        |
+---------------+---------------------+-----+---------------------+----------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)        |
+---------------+---------------------+-----+---------------------+----------------------+
| IR_encoded(0) | 0.1206896551724138  | ... | 0.14641744548286603 | 0.033268101761252444 |
+---------------+-------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 4000 samples: -11324.5311

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.1870689655172414  | 0.3157894736842105  | 0.2473469387755102  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.48017241379310344 | 0.17708978328173375 | 0.23265306122448978 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.33275862068965517 | 0.5071207430340557  | 0.52                |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4500 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

AIC Score for 4500 samples: -11539.7680

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.36135812449474536 | 0.15428571428571428 | 0.13002944062806673 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.52303961196443    | 0.43020408163265306 | 0.6609421000981355  |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.11560226354082458 | 0.41551020408163264 | 0.20902845927379785 |
+---------------+---------------------+---------------------+---------------------+

CPD of IR_encoded
+---------------+------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 5000 samples: -14777.8390

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.4466950959488273  | 0.350104821802935   | 0.34292763157894735 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.43976545842217485 | 0.5707547169811321  | 0.5263157894736842  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.11353944562899787 | 0.07914046121593292 | 0.13075657894736842 |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5500 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded'), ('EI_encoded', 'IR_encoded')]

AIC Score for 5500 samples: -15167.4701

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | EI_encoded(1)       | EI_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.6530032467532467  | 0.7236580516898609  | 0.6180422264875239  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.12702922077922077 | 0.21829025844930416 | 0.272552783109405   |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.21996753246753248 | 0.05805168986083499 | 0.10940499040307101 |
+---------------+---------------------+---------------------+---------------------+

CPD of SP_encoded
+---------------+------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6000 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 6000 samples: -18072.6983

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.521167 |
+---------------+----------+
| EI_encoded(1) | 0.222833 |
+---------------+----------+
| EI_encoded(2) | 0.256    |
+---------------+----------+

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.3792808219178082  | ... | 0.3549382716049383  | 0.5752688172043011  |
+---------------+-------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]

AIC Score for 6500 samples: -18707.4514

CPD of IR_encoded
+---------------+-----------+
| IR_encoded(0) | 0.383692  |
+---------------+-----------+
| IR_encoded(1) | 0.0355385 |
+---------------+-----------+
| IR_encoded(2) | 0.580769  |
+---------------+-----------+

CPD of EI_encoded
+---------------+--------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)      | ... | IR_encoded(2)       | IR_encoded(2)       |
+---------------+--------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)      | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+--------------------+-----+---------------------+---------------------+
| EI_encoded(0) | 0.1720226843100189 | ... | 0.4531568228105906  | 0.25096525096525096 |
+---------------+------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 7000 samples: -19135.2079

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.33005531653349723 | 0.37037037037037035 | 0.5574795574795575  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.4732636754763368  | 0.5728597449908925  | 0.36892736892736894 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.19668100799016594 | 0.0567698846387371  | 0.0735930735930736  |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 7500 samples: -22701.1424

CPD of IR_encoded
+---------------+---------------------+---------------------+--------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)      |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(0) | 0.31312690798081116 | 0.3797930614729154  | 0.2932098765432099 |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(1) | 0.43959877889228083 | 0.33414485696895924 | 0.351010101010101  |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(2) | 0.24727431312690798 | 0.2860620815581254  | 0.3557800224466891 |
+---------------+---------------------+---------------------+--------------------+

CPD of EI_encoded
+---------------+---------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8000 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]

AIC Score for 8000 samples: -23082.7057

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.594125 |
+---------------+----------+
| IR_encoded(1) | 0.29025  |
+---------------+----------+
| IR_encoded(2) | 0.115625 |
+---------------+----------+

CPD of SP_encoded
+---------------+----------------------+-----+--------------------+---------------------+
| EI_encoded    | EI_encoded(0)        | ... | EI_encoded(2)      | EI_encoded(2)       |
+---------------+----------------------+-----+--------------------+---------------------+
| IR_encoded    | IR_encoded(0)        | ... | IR_encoded(1)      | IR_encoded(2)       |
+---------------+----------------------+-----+--------------------+---------------------+
| SP_encoded(0) | 0.003740648379052369 | ... | 0.3229018492176387 | 0.4699248120300752  |
+---------------+-------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8500 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded'), ('EI_encoded', 'IR_encoded')]

AIC Score for 8500 samples: -26581.7031

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | EI_encoded(1)       | EI_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.34951456310679613 | 0.4577853203224438  | 0.405116002379536   |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.2937792161093132  | 0.18201103097157403 | 0.22873289708506842 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.3567062207838907  | 0.3602036487059822  | 0.3661511005353956  |
+---------------+---------------------+---------------------+---------------------+

CPD of SP_encoded
+---------------+------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 9000 samples: -25586.0556

CPD of IR_encoded
+---------------+----------------------+----------------------+---------------------+
| SP_encoded    | SP_encoded(0)        | SP_encoded(1)        | SP_encoded(2)       |
+---------------+----------------------+----------------------+---------------------+
| IR_encoded(0) | 0.307740130556419    | 0.5086916742909423   | 0.45726837060702874 |
+---------------+----------------------+----------------------+---------------------+
| IR_encoded(1) | 0.6673919801056886   | 0.46996035376639217  | 0.4972044728434505  |
+---------------+----------------------+----------------------+---------------------+
| IR_encoded(2) | 0.024867889337892447 | 0.021347971942665446 | 0.04552715654952077 |
+---------------+----------------------+----------------------+---------------------+

CPD of EI_encoded
+----

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]

AIC Score for 9500 samples: -28197.2232

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.196737 |
+---------------+----------+
| IR_encoded(1) | 0.319684 |
+---------------+----------+
| IR_encoded(2) | 0.483579 |
+---------------+----------+

CPD of EI_encoded
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)        | ... | IR_encoded(2)       | IR_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)        | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| EI_encoded(0) | 0.2945054945054945   | ... | 0.3809782608695652  | 0.42289239204934886 |
+---------------+-------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 10000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]

AIC Score for 10000 samples: -29385.4519

CPD of IR_encoded
+---------------+--------+
| IR_encoded(0) | 0.5033 |
+---------------+--------+
| IR_encoded(1) | 0.2318 |
+---------------+--------+
| IR_encoded(2) | 0.2649 |
+---------------+--------+

CPD of EI_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(2)       | IR_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded(0) | 0.03593339176161262 | ... | 0.12480252764612954 | 0.4421052631578947  |
+---------------+---------------------+-----+---

## BIC

In [None]:
# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size} (Sparse Data with BIC)")

    # Load the sparse dataset for the current sample size
    sparse_data_file = f'outcomes_sparse_{sample_size}.csv'
    df_sparse = pd.read_csv(sparse_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df_sparse['IR_encoded'] = df_sparse['IR'].map(ir_map)
    df_sparse['EI_encoded'] = df_sparse['EI'].map(ei_map)
    df_sparse['SP_encoded'] = df_sparse['SP'].map(sp_map)

    # Define the Hill-Climb structure learning algorithm
    hc_bic = HillClimbSearch(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded']])
    scoring_method_bic = BicScore(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded']])

    # Estimate the best structure using BIC
    best_dag_bic = hc_bic.estimate(scoring_method=scoring_method_bic)

    # Ensure all required nodes are present in the model, even if not connected
    best_model_bic = BayesianNetwork()
    best_model_bic.add_nodes_from(['IR_encoded', 'EI_encoded', 'SP_encoded'])  # Add all nodes
    best_model_bic.add_edges_from(best_dag_bic.edges())  # Add edges from the learned structure

    # Check if all nodes are included in the learned structure
    nodes_in_structure_bic = set(best_model_bic.nodes())
    required_nodes = {'IR_encoded', 'EI_encoded', 'SP_encoded'}

    if not required_nodes.issubset(nodes_in_structure_bic):
        print("\nNot all nodes are connected. Adding a dummy variable and ensuring all required nodes are present.")
        # Add a dummy variable to the dataset
        df_sparse['Dummy_Node'] = 1  # Constant dummy node

        # Ensure all required nodes are in the model by adding edges with the dummy node
        for node in required_nodes:
            if node not in nodes_in_structure_bic:
                best_model_bic.add_edge('Dummy_Node', node)

        # Re-estimate the structure with the dummy variable
        hc_bic = HillClimbSearch(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded', 'Dummy_Node']])
        scoring_method_bic = BicScore(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded', 'Dummy_Node']])
        best_dag_bic = hc_bic.estimate(scoring_method=scoring_method_bic)
        best_model_bic = BayesianNetwork(best_dag_bic.edges())

    # Display the learned structure (edges of the Bayesian Network)
    print(f"\nLearned Structure (Edges) for {sample_size} samples (BIC):")
    print(best_model_bic.edges())

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model_bic.fit(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded']], estimator=MaximumLikelihoodEstimator)

    # Check if the model is valid after learning the parameters
    assert best_model_bic.check_model()

    # Print the learned CPDs (Conditional Probability Distributions)
    for cpd in best_model_bic.get_cpds():
        print("\nCPD of", cpd.variable)
        print(cpd)

    # Create an inference object for the BIC model
    inference_bic = VariableElimination(best_model_bic)

    # Placeholder to store predictions
    predicted_sp_labels_bic = []

    # Loop through each row in the dataset to make predictions using BIC
    for index, row in df_sparse.iterrows():
        # Prepare the evidence from the dataset (IR_encoded and EI_encoded)
        sample_input = {'IR_encoded': int(row['IR_encoded']), 'EI_encoded': int(row['EI_encoded'])}

        # Perform inference to predict the distribution for SP_encoded (Stock Price)
        predicted_sp_distribution_bic = inference_bic.query(variables=['SP_encoded'], evidence=sample_input)

        # Extract the most likely SP_encoded class
        predicted_sp_class_bic = predicted_sp_distribution_bic.values.argmax()
        sp_reverse_map = {0: 'decrease', 1: 'stable', 2: 'increase'}
        predicted_sp_label_bic = sp_reverse_map[predicted_sp_class_bic]

        # Store the predicted label
        predicted_sp_labels_bic.append(predicted_sp_label_bic)

    # Convert the list of predicted labels into a DataFrame for easier comparison
    predicted_results_df_bic = pd.DataFrame({
        'IR': df_sparse['IR'],  # Original IR column
        'EI': df_sparse['EI'],  # Original EI column
        'Predicted_SP': predicted_sp_labels_bic  # Predicted SP column
    })

    # Add the actual SP values for comparison
    predicted_results_df_bic['Actual_SP'] = df_sparse['SP']

    # Calculate accuracy of predictions for BIC
    accuracy_bic = accuracy_score(predicted_results_df_bic['Actual_SP'], predicted_results_df_bic['Predicted_SP'])
    print(f"\nPrediction Accuracy for {sample_size} samples (BIC): {accuracy_bic:.4f}")

    # Display the first few rows of predictions for BIC
    print(f"\nPredicted Results for Sparse Data (First 10 rows) for {sample_size} samples (BIC):")
    print(predicted_results_df_bic.head(10))

    # Calculate the BIC score for the Bayesian Network model
    bic_score_value = scoring_method_bic.score(best_model_bic)

    # Print the BIC score
    print(f"\nBIC Score for {sample_size} samples: {bic_score_value:.4f}")

# Notify the user that the process is done
print("\nProcessing complete for all sample sizes using BIC (Sparse Data).")


Processing sample size: 500 (Sparse Data with BIC)


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 500 samples (BIC):
[('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+------+
| IR_encoded(0) | 0.44 |
+---------------+------+
| IR_encoded(1) | 0.49 |
+---------------+------+
| IR_encoded(2) | 0.07 |
+---------------+------+

CPD of EI_encoded
+---------------+---------------------+--------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)      | SP_encoded(2)       |
+---------------+---------------------+--------------------+---------------------+
| EI_encoded(0) | 0.3248407643312102  | 0.2222222222222222 | 0.3768844221105528  |
+---------------+---------------------+--------------------+---------------------+
| EI_encoded(1) | 0.27388535031847133 | 0.2013888888888889 | 0.4623115577889447  |
+---------------+---------------------+--------------------+---------------------+
| EI_encoded(2) | 0.4012738853503185  | 0.5763888888888888 | 0.16080402010050251 |
+---------------+---------------------+---

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1000 samples (BIC):
[('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+----------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)        | SP_encoded(1)       | SP_encoded(2)       |
+---------------+----------------------+---------------------+---------------------+
| IR_encoded(0) | 0.037037037037037035 | 0.14285714285714285 | 0.13649851632047477 |
+---------------+----------------------+---------------------+---------------------+
| IR_encoded(1) | 0.2152777777777778   | 0.46320346320346323 | 0.2818991097922849  |
+---------------+----------------------+---------------------+---------------------+
| IR_encoded(2) | 0.7476851851851852   | 0.3939393939393939  | 0.5816023738872403  |
+---------------+----------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+-------+
| EI_encoded(0) | 0.395 |
+---------------+-------+
| EI_en

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1500 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+-----------+
| IR_encoded(0) | 0.0213333 |
+---------------+-----------+
| IR_encoded(1) | 0.571333  |
+---------------+-----------+
| IR_encoded(2) | 0.407333  |
+---------------+-----------+

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.304667 |
+---------------+----------+
| EI_encoded(1) | 0.473333 |
+---------------+----------+
| EI_encoded(2) | 0.222    |
+---------------+----------+

CPD of SP_encoded
+---------------+--------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)      | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+--------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)      | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+--------------------+-----+------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2000 samples (BIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+--------+
| IR_encoded(0) | 0.0265 |
+---------------+--------+
| IR_encoded(1) | 0.321  |
+---------------+--------+
| IR_encoded(2) | 0.6525 |
+---------------+--------+

CPD of EI_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(2)       | IR_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded(0) | 0.1111111111111111  | ... | 0.06823529411764706 | 0.17333333333333334 |
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded(1) | 0.48148148148

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2500 samples (BIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.09759188846641319 | 0.12959183673469388 | 0.18878248974008208 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.17490494296577946 | 0.20714285714285716 | 0.17099863201094392 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.7275031685678074  | 0.6632653061224489  | 0.640218878248974   |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------------+-----+-------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3000 samples (BIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.17989417989417988 | ... | 0.19472361809045227 | 0.11610486891385768 |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.0                 | ... | 0.03768844221105527 | 0.0686641697877653  |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.820105820

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3500 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+-----------+
| IR_encoded(0) | 0.460571  |
+---------------+-----------+
| IR_encoded(1) | 0.0465714 |
+---------------+-----------+
| IR_encoded(2) | 0.492857  |
+---------------+-----------+

CPD of EI_encoded
+---------------+-----------+
| EI_encoded(0) | 0.0574286 |
+---------------+-----------+
| EI_encoded(1) | 0.108286  |
+---------------+-----------+
| EI_encoded(2) | 0.834286  |
+---------------+-----------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+---------------------+-----+------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4000 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+---------+
| IR_encoded(0) | 0.32125 |
+---------------+---------+
| IR_encoded(1) | 0.4025  |
+---------------+---------+
| IR_encoded(2) | 0.27625 |
+---------------+---------+

CPD of EI_encoded
+---------------+---------+
| EI_encoded(0) | 0.42525 |
+---------------+---------+
| EI_encoded(1) | 0.153   |
+---------------+---------+
| EI_encoded(2) | 0.42175 |
+---------------+---------+

CPD of SP_encoded
+---------------+---------------------+-----+--------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)      | EI_encoded(2)       |
+---------------+---------------------+-----+--------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)      | IR_encoded(2)       |
+---------------+---------------------+-----+--------------------+-----------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4500 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.368444 |
+---------------+----------+
| IR_encoded(1) | 0.228444 |
+---------------+----------+
| IR_encoded(2) | 0.403111 |
+---------------+----------+

CPD of EI_encoded
+---------------+------------+
| EI_encoded(0) | 0.322444   |
+---------------+------------+
| EI_encoded(1) | 0.674      |
+---------------+------------+
| EI_encoded(2) | 0.00355556 |
+---------------+------------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------+---------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2) | EI_encoded(2) |
+---------------+---------------------+-----+---------------+---------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1) | IR_encoded(2) |
+---------------+---------------------+-----+---------------+---------------+
| SP_encoded(0) | 0.1

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5000 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+--------+
| IR_encoded(0) | 0.6572 |
+---------------+--------+
| IR_encoded(1) | 0.003  |
+---------------+--------+
| IR_encoded(2) | 0.3398 |
+---------------+--------+

CPD of EI_encoded
+---------------+--------+
| EI_encoded(0) | 0.1476 |
+---------------+--------+
| EI_encoded(1) | 0.7236 |
+---------------+--------+
| EI_encoded(2) | 0.1288 |
+---------------+--------+

CPD of SP_encoded
+---------------+----------------------+-----+--------------------+---------------------+
| EI_encoded    | EI_encoded(0)        | ... | EI_encoded(2)      | EI_encoded(2)       |
+---------------+----------------------+-----+--------------------+---------------------+
| IR_encoded    | IR_encoded(0)        | ... | IR_encoded(1)      | IR_encoded(2)       |
+---------------+----------------------+-----+--------------------+---------------------+
| S

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5500 samples (BIC):
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+----------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)        | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)        | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.4731182795698925   | ... | 0.3879598662207358  | 0.1623246492985972  |
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.010752688172043012 | ... | 0.23411371237458195 | 0.30060120240480964 |
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6000 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.3595   |
+---------------+----------+
| IR_encoded(1) | 0.376833 |
+---------------+----------+
| IR_encoded(2) | 0.263667 |
+---------------+----------+

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.244667 |
+---------------+----------+
| EI_encoded(1) | 0.545667 |
+---------------+----------+
| EI_encoded(2) | 0.209667 |
+---------------+----------+

CPD of SP_encoded
+---------------+--------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)      | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+--------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)      | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+--------------------+-----+---------------------+---

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6500 samples (BIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0) | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0) | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.248         | ... | 0.38713826366559484 | 0.31143667296786387 |
+---------------+---------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.532         | ... | 0.4643086816720257  | 0.2556710775047259  |
+---------------+---------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.22          | ... | 0.14855305466237942 | 0.43289224952741023 |

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7000 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.385286 |
+---------------+----------+
| IR_encoded(1) | 0.359571 |
+---------------+----------+
| IR_encoded(2) | 0.255143 |
+---------------+----------+

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.581571 |
+---------------+----------+
| EI_encoded(1) | 0.226714 |
+---------------+----------+
| EI_encoded(2) | 0.191714 |
+---------------+----------+

CPD of SP_encoded
+---------------+----------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)        | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)        | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+----------------------+-----+---------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7500 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.471867 |
+---------------+----------+
| IR_encoded(1) | 0.187867 |
+---------------+----------+
| IR_encoded(2) | 0.340267 |
+---------------+----------+

CPD of EI_encoded
+---------------+--------+
| EI_encoded(0) | 0.1924 |
+---------------+--------+
| EI_encoded(1) | 0.428  |
+---------------+--------+
| EI_encoded(2) | 0.3796 |
+---------------+--------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------------+----------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)        |
+---------------+---------------------+-----+---------------------+----------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)       | IR_encoded(2)        |
+---------------+---------------------+-----+---------------------+--------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8000 samples (BIC):
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.22704081632653061 | ... | 0.6472184531886025  | 0.5614973262032086  |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.25255102040816324 | ... | 0.06784260515603799 | 0.08823529411764706 |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.520408163

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8500 samples (BIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+----------------------+----------------------+----------------------+
| SP_encoded    | SP_encoded(0)        | SP_encoded(1)        | SP_encoded(2)        |
+---------------+----------------------+----------------------+----------------------+
| IR_encoded(0) | 0.015982364287682557 | 0.013721413721413722 | 0.023114355231143552 |
+---------------+----------------------+----------------------+----------------------+
| IR_encoded(1) | 0.09479195370625516  | 0.2415800415800416   | 0.2615571776155718   |
+---------------+----------------------+----------------------+----------------------+
| IR_encoded(2) | 0.8892256820060622   | 0.7446985446985447   | 0.7153284671532847   |
+---------------+----------------------+----------------------+----------------------+

CPD of EI_encoded
+---------------+--------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9000 samples (BIC):
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.177889 |
+---------------+----------+
| IR_encoded(1) | 0.125    |
+---------------+----------+
| IR_encoded(2) | 0.697111 |
+---------------+----------+

CPD of EI_encoded
+---------------+--------------------+-----+---------------------+--------------------+
| IR_encoded    | IR_encoded(0)      | ... | IR_encoded(2)       | IR_encoded(2)      |
+---------------+--------------------+-----+---------------------+--------------------+
| SP_encoded    | SP_encoded(0)      | ... | SP_encoded(1)       | SP_encoded(2)      |
+---------------+--------------------+-----+---------------------+--------------------+
| EI_encoded(0) | 0.0860655737704918 | ... | 0.4263653483992467  | 0.466710182767624  |
+---------------+--------------------+-----+---------------------+--------------------+
|

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9500 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.362947 |
+---------------+----------+
| IR_encoded(1) | 0.288632 |
+---------------+----------+
| IR_encoded(2) | 0.348421 |
+---------------+----------+

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.288421 |
+---------------+----------+
| EI_encoded(1) | 0.313684 |
+---------------+----------+
| EI_encoded(2) | 0.397895 |
+---------------+----------+

CPD of SP_encoded
+---------------+----------------------+-----+---------------------+--------------------+
| EI_encoded    | EI_encoded(0)        | ... | EI_encoded(2)       | EI_encoded(2)      |
+---------------+----------------------+-----+---------------------+--------------------+
| IR_encoded    | IR_encoded(0)        | ... | IR_encoded(1)       | IR_encoded(2)      |
+---------------+----------------------+-----+-------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 10000 samples (BIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.22929936305732485 | ... | 0.37530864197530867 | 0.41580756013745707 |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.554140127388535   | ... | 0.4765432098765432  | 0.5189003436426117  |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.21656050

## AIC

In [None]:
# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size} (Sparse Data with AIC)")

    # Load the sparse dataset for the current sample size
    sparse_data_file = f'outcomes_sparse_{sample_size}.csv'
    df_sparse = pd.read_csv(sparse_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df_sparse['IR_encoded'] = df_sparse['IR'].map(ir_map)
    df_sparse['EI_encoded'] = df_sparse['EI'].map(ei_map)
    df_sparse['SP_encoded'] = df_sparse['SP'].map(sp_map)

    # Define the Hill-Climb structure learning algorithm
    hc_aic = HillClimbSearch(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded']])
    scoring_method_aic = AICScore(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded']])

    # Estimate the best structure using AIC
    best_dag_aic = hc_aic.estimate(scoring_method=scoring_method_aic)

    # Ensure all required nodes are present in the model, even if not connected
    best_model_aic = BayesianNetwork()
    best_model_aic.add_nodes_from(['IR_encoded', 'EI_encoded', 'SP_encoded'])  # Add all nodes
    best_model_aic.add_edges_from(best_dag_aic.edges())  # Add edges from the learned structure

    # Check if all nodes are included in the learned structure
    nodes_in_structure_aic = set(best_model_aic.nodes())
    required_nodes = {'IR_encoded', 'EI_encoded', 'SP_encoded'}

    if not required_nodes.issubset(nodes_in_structure_aic):
        print("\nNot all nodes are connected. Adding a dummy variable and ensuring all required nodes are present.")
        # Add a dummy variable to the dataset
        df_sparse['Dummy_Node'] = 1  # Constant dummy node

        # Ensure all required nodes are in the model by adding edges with the dummy node
        for node in required_nodes:
            if node not in nodes_in_structure_aic:
                best_model_aic.add_edge('Dummy_Node', node)

        # Re-estimate the structure with the dummy variable
        hc_aic = HillClimbSearch(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded', 'Dummy_Node']])
        scoring_method_aic = AICScore(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded', 'Dummy_Node']])
        best_dag_aic = hc_aic.estimate(scoring_method=scoring_method_aic)
        best_model_aic = BayesianNetwork(best_dag_aic.edges())

    # Display the learned structure (edges of the Bayesian Network)
    print(f"\nLearned Structure (Edges) for {sample_size} samples (AIC):")
    print(best_model_aic.edges())

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model_aic.fit(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded']], estimator=MaximumLikelihoodEstimator)

    # Check if the model is valid after learning the parameters
    assert best_model_aic.check_model()

    # Print the learned CPDs (Conditional Probability Distributions)
    for cpd in best_model_aic.get_cpds():
        print("\nCPD of", cpd.variable)
        print(cpd)

    # Create an inference object for the AIC model
    inference_aic = VariableElimination(best_model_aic)

    # Placeholder to store predictions
    predicted_sp_labels_aic = []

    # Loop through each row in the dataset to make predictions using AIC
    for index, row in df_sparse.iterrows():
        # Prepare the evidence from the dataset (IR_encoded and EI_encoded)
        sample_input = {'IR_encoded': int(row['IR_encoded']), 'EI_encoded': int(row['EI_encoded'])}

        # Perform inference to predict the distribution for SP_encoded (Stock Price)
        predicted_sp_distribution_aic = inference_aic.query(variables=['SP_encoded'], evidence=sample_input)

        # Extract the most likely SP_encoded class
        predicted_sp_class_aic = predicted_sp_distribution_aic.values.argmax()
        sp_reverse_map = {0: 'decrease', 1: 'stable', 2: 'increase'}
        predicted_sp_label_aic = sp_reverse_map[predicted_sp_class_aic]

        # Store the predicted label
        predicted_sp_labels_aic.append(predicted_sp_label_aic)

    # Convert the list of predicted labels into a DataFrame for easier comparison
    predicted_results_df_aic = pd.DataFrame({
        'IR': df_sparse['IR'],  # Original IR column
        'EI': df_sparse['EI'],  # Original EI column
        'Predicted_SP': predicted_sp_labels_aic  # Predicted SP column
    })

    # Add the actual SP values for comparison
    predicted_results_df_aic['Actual_SP'] = df_sparse['SP']

    # Calculate accuracy of predictions for AIC
    accuracy_aic = accuracy_score(predicted_results_df_aic['Actual_SP'], predicted_results_df_aic['Predicted_SP'])
    print(f"\nPrediction Accuracy for {sample_size} samples (AIC): {accuracy_aic:.4f}")

    # Display the first few rows of predictions for AIC
    print(f"\nPredicted Results for Sparse Data (First 10 rows) for {sample_size} samples (AIC):")
    print(predicted_results_df_aic.head(10))

    # Calculate the AIC score for the Bayesian Network model
    aic_score_value = scoring_method_aic.score(best_model_aic)

    # Print the AIC score
    print(f"\nAIC Score for {sample_size} samples: {aic_score_value:.4f}")

# Notify the user that the process is done
print("\nProcessing complete for all sample sizes using AIC (Sparse Data).")


Processing sample size: 500 (Sparse Data with AIC)


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.34394904458598724 | 0.375               | 0.5628140703517588  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.5923566878980892  | 0.5347222222222222  | 0.3768844221105528  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.06369426751592357 | 0.09027777777777778 | 0.06030150753768844 |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------------+-----+--------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1000 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+-------+
| IR_encoded(0) | 0.095 |
+---------------+-------+
| IR_encoded(1) | 0.295 |
+---------------+-------+
| IR_encoded(2) | 0.61  |
+---------------+-------+

CPD of EI_encoded
+---------------+-------+
| EI_encoded(0) | 0.395 |
+---------------+-------+
| EI_encoded(1) | 0.311 |
+---------------+-------+
| EI_encoded(2) | 0.294 |
+---------------+-------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded(0) |

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1500 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+-----------+
| IR_encoded(0) | 0.0213333 |
+---------------+-----------+
| IR_encoded(1) | 0.571333  |
+---------------+-----------+
| IR_encoded(2) | 0.407333  |
+---------------+-----------+

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.304667 |
+---------------+----------+
| EI_encoded(1) | 0.473333 |
+---------------+----------+
| EI_encoded(2) | 0.222    |
+---------------+----------+

CPD of SP_encoded
+---------------+--------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)      | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+--------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)      | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+--------------------+-----+------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+--------+
| IR_encoded(0) | 0.0265 |
+---------------+--------+
| IR_encoded(1) | 0.321  |
+---------------+--------+
| IR_encoded(2) | 0.6525 |
+---------------+--------+

CPD of EI_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(2)       | IR_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded(0) | 0.1111111111111111  | ... | 0.06823529411764706 | 0.17333333333333334 |
+---------------+---------------------+-----+---------------------+---------------------+
|

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.09759188846641319 | 0.12959183673469388 | 0.18878248974008208 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.17490494296577946 | 0.20714285714285716 | 0.17099863201094392 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.7275031685678074  | 0.6632653061224489  | 0.640218878248974   |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------------+-----+-------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3000 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.17989417989417988 | ... | 0.19472361809045227 | 0.11610486891385768 |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.0                 | ... | 0.03768844221105527 | 0.0686641697877653  |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.820105820

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3500 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded'), ('EI_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+----------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | EI_encoded(1)        | EI_encoded(2)       |
+---------------+---------------------+----------------------+---------------------+
| IR_encoded(0) | 0.5124378109452736  | 0.503957783641161    | 0.45136986301369864 |
+---------------+---------------------+----------------------+---------------------+
| IR_encoded(1) | 0.05970149253731343 | 0.052770448548812667 | 0.04486301369863014 |
+---------------+---------------------+----------------------+---------------------+
| IR_encoded(2) | 0.42786069651741293 | 0.44327176781002636  | 0.5037671232876713  |
+---------------+---------------------+----------------------+---------------------+

CPD of EI_encoded
+---------------+-----------+
| EI_encoded(0) | 0.05742

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4000 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+---------+
| IR_encoded(0) | 0.32125 |
+---------------+---------+
| IR_encoded(1) | 0.4025  |
+---------------+---------+
| IR_encoded(2) | 0.27625 |
+---------------+---------+

CPD of EI_encoded
+---------------+---------+
| EI_encoded(0) | 0.42525 |
+---------------+---------+
| EI_encoded(1) | 0.153   |
+---------------+---------+
| EI_encoded(2) | 0.42175 |
+---------------+---------+

CPD of SP_encoded
+---------------+---------------------+-----+--------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)      | EI_encoded(2)       |
+---------------+---------------------+-----+--------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)      | IR_encoded(2)       |
+---------------+---------------------+-----+--------------------+-----------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4500 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.368444 |
+---------------+----------+
| IR_encoded(1) | 0.228444 |
+---------------+----------+
| IR_encoded(2) | 0.403111 |
+---------------+----------+

CPD of EI_encoded
+---------------+------------+
| EI_encoded(0) | 0.322444   |
+---------------+------------+
| EI_encoded(1) | 0.674      |
+---------------+------------+
| EI_encoded(2) | 0.00355556 |
+---------------+------------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------+---------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2) | EI_encoded(2) |
+---------------+---------------------+-----+---------------+---------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1) | IR_encoded(2) |
+---------------+---------------------+-----+---------------+---------------+
| SP_encoded(0) | 0.1

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5000 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+--------+
| IR_encoded(0) | 0.6572 |
+---------------+--------+
| IR_encoded(1) | 0.003  |
+---------------+--------+
| IR_encoded(2) | 0.3398 |
+---------------+--------+

CPD of EI_encoded
+---------------+--------+
| EI_encoded(0) | 0.1476 |
+---------------+--------+
| EI_encoded(1) | 0.7236 |
+---------------+--------+
| EI_encoded(2) | 0.1288 |
+---------------+--------+

CPD of SP_encoded
+---------------+----------------------+-----+--------------------+---------------------+
| EI_encoded    | EI_encoded(0)        | ... | EI_encoded(2)      | EI_encoded(2)       |
+---------------+----------------------+-----+--------------------+---------------------+
| IR_encoded    | IR_encoded(0)        | ... | IR_encoded(1)      | IR_encoded(2)       |
+---------------+----------------------+-----+--------------------+---------------------+
| S

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5500 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+----------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)        | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)        | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.4731182795698925   | ... | 0.3879598662207358  | 0.1623246492985972  |
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.010752688172043012 | ... | 0.23411371237458195 | 0.30060120240480964 |
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6000 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.3595   |
+---------------+----------+
| IR_encoded(1) | 0.376833 |
+---------------+----------+
| IR_encoded(2) | 0.263667 |
+---------------+----------+

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.244667 |
+---------------+----------+
| EI_encoded(1) | 0.545667 |
+---------------+----------+
| EI_encoded(2) | 0.209667 |
+---------------+----------+

CPD of SP_encoded
+---------------+--------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)      | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+--------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)      | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+--------------------+-----+---------------------+---

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6500 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0) | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0) | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.248         | ... | 0.38713826366559484 | 0.31143667296786387 |
+---------------+---------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.532         | ... | 0.4643086816720257  | 0.2556710775047259  |
+---------------+---------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.22          | ... | 0.14855305466237942 | 0.43289224952741023 |

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7000 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.385286 |
+---------------+----------+
| IR_encoded(1) | 0.359571 |
+---------------+----------+
| IR_encoded(2) | 0.255143 |
+---------------+----------+

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.581571 |
+---------------+----------+
| EI_encoded(1) | 0.226714 |
+---------------+----------+
| EI_encoded(2) | 0.191714 |
+---------------+----------+

CPD of SP_encoded
+---------------+----------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)        | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)        | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+----------------------+-----+---------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7500 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.471867 |
+---------------+----------+
| IR_encoded(1) | 0.187867 |
+---------------+----------+
| IR_encoded(2) | 0.340267 |
+---------------+----------+

CPD of EI_encoded
+---------------+--------+
| EI_encoded(0) | 0.1924 |
+---------------+--------+
| EI_encoded(1) | 0.428  |
+---------------+--------+
| EI_encoded(2) | 0.3796 |
+---------------+--------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------------+----------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)        |
+---------------+---------------------+-----+---------------------+----------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)       | IR_encoded(2)        |
+---------------+---------------------+-----+---------------------+--------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8000 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.22704081632653061 | ... | 0.6472184531886025  | 0.5614973262032086  |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.25255102040816324 | ... | 0.06784260515603799 | 0.08823529411764706 |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.520408163

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+----------------------+----------------------+----------------------+
| SP_encoded    | SP_encoded(0)        | SP_encoded(1)        | SP_encoded(2)        |
+---------------+----------------------+----------------------+----------------------+
| IR_encoded(0) | 0.015982364287682557 | 0.013721413721413722 | 0.023114355231143552 |
+---------------+----------------------+----------------------+----------------------+
| IR_encoded(1) | 0.09479195370625516  | 0.2415800415800416   | 0.2615571776155718   |
+---------------+----------------------+----------------------+----------------------+
| IR_encoded(2) | 0.8892256820060622   | 0.7446985446985447   | 0.7153284671532847   |
+---------------+----------------------+----------------------+----------------------+

CPD of EI_encoded
+---------------+--------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.177889 |
+---------------+----------+
| IR_encoded(1) | 0.125    |
+---------------+----------+
| IR_encoded(2) | 0.697111 |
+---------------+----------+

CPD of EI_encoded
+---------------+--------------------+-----+---------------------+--------------------+
| IR_encoded    | IR_encoded(0)      | ... | IR_encoded(2)       | IR_encoded(2)      |
+---------------+--------------------+-----+---------------------+--------------------+
| SP_encoded    | SP_encoded(0)      | ... | SP_encoded(1)       | SP_encoded(2)      |
+---------------+--------------------+-----+---------------------+--------------------+
| EI_encoded(0) | 0.0860655737704918 | ... | 0.4263653483992467  | 0.466710182767624  |
+---------------+--------------------+-----+---------------------+--------------------+
|

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9500 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.362947 |
+---------------+----------+
| IR_encoded(1) | 0.288632 |
+---------------+----------+
| IR_encoded(2) | 0.348421 |
+---------------+----------+

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | IR_encoded(1)       | IR_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.2827726218097448  | 0.29285193289569655 | 0.29063444108761327 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.31322505800464034 | 0.2975929978118162  | 0.3274924471299094  |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.404002320

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 10000 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.22929936305732485 | ... | 0.37530864197530867 | 0.41580756013745707 |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.554140127388535   | ... | 0.4765432098765432  | 0.5189003436426117  |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.21656050

# ------------------------------------------------------------------------------------------------------------

# Hypothesis Model 500, 1000, 1500, ..., 10000 Samples (dense) 1 hidden Layer, 10 Neurons

In [5]:
# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Define the Neural Network architecture
def create_nn_model(hidden_layers=1, nodes_per_layer=10):
    model = models.Sequential()

    # Input layer (2 input features: IR_encoded and EI_encoded)
    model.add(layers.InputLayer(input_shape=(2,)))

    # Hidden layers
    for layer_num in range(hidden_layers):
        model.add(layers.Dense(nodes_per_layer, activation='relu', name=f"hidden_layer_{layer_num + 1}"))

    # Output layer (3 classes: decrease, stable, increase)
    model.add(layers.Dense(3, activation='softmax', name="output_layer"))

    # Compile the model
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

# Loop through each sample size
for size in sample_sizes:
    # Load data for the current sample size (adjust the file paths if necessary)
    outcomes_file = f'outcomes_dense_{size}.csv'

    df = pd.read_csv(outcomes_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df['IR_encoded'] = df['IR'].map(ir_map)
    df['EI_encoded'] = df['EI'].map(ei_map)
    df['SP_encoded'] = df['SP'].map(sp_map)

    # Features (IR and EI) and labels (SP)
    X = df[['IR_encoded', 'EI_encoded']]
    y = df['SP_encoded']

    # Refresh the data split for each iteration
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=False, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False, random_state=42)

    # Show split confirmation
    print(f"\nSample size: {size}")
    print("Training Data:", X_train.shape, y_train.shape)
    print("Validation Data:", X_val.shape, y_val.shape)
    print("Test Data:", X_test.shape, y_test.shape)

    # Create the Neural Network model
    nn_model = create_nn_model(hidden_layers=1, nodes_per_layer=10)

    # Train the model
    history = nn_model.fit(X_train, y_train,
                           epochs=50,
                           batch_size=32,
                           validation_data=(X_val, y_val),
                           verbose=0)  # Set verbose=0 to avoid too much output

    # Evaluate on the validation set
    val_loss, val_accuracy = nn_model.evaluate(X_val, y_val, verbose=0)
    print(f"Validation Accuracy for {size} samples: {val_accuracy:.4f}")

    # Evaluate on the test set
    test_loss, test_accuracy = nn_model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy for {size} samples: {test_accuracy:.4f}")

    # Make predictions on the test set
    predictions = nn_model.predict(X_test)

    # Convert the predicted probabilities to class labels
    predicted_classes = predictions.argmax(axis=1)

    # Create a list to map integers back to the original SP labels
    sp_reverse_map = ['decrease', 'stable', 'increase']

    # Convert the predicted classes to the original labels
    predicted_labels = [sp_reverse_map[label] for label in predicted_classes]

    # Create a DataFrame for the predicted probabilities
    probs_df = pd.DataFrame(predictions, columns=['Prob_decrease', 'Prob_stable', 'Prob_increase'])

    # Output the IR, EI, predicted SP, and the NN probabilities
    result_df = pd.DataFrame({
        'IR': df['IR'][:len(predicted_labels)],  # IR column from the original dataframe
        'EI': df['EI'][:len(predicted_labels)],  # EI column from the original dataframe
        'Predicted_SP': predicted_labels         # Predicted SP labels
    })

    # Combine the result with the predicted probabilities
    combined_df = pd.concat([result_df, probs_df.reset_index(drop=True)], axis=1)

    # Show the first few rows of the results for this sample size
    print(f"\nPredicted Results and Probabilities for {size} samples (First 5 rows):")
    print(combined_df.head(15))

# After the loop is done, print this message
print("\nLooping through all sample sizes complete!")


Sample size: 500
Training Data: (350, 2) (350,)
Validation Data: (75, 2) (75,)
Test Data: (75, 2) (75,)




Validation Accuracy for 500 samples: 0.4400
Test Accuracy for 500 samples: 0.3600
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step

Predicted Results and Probabilities for 500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good       stable       0.220026     0.416640       0.363334
1      low  average     decrease       0.413598     0.217326       0.369076
2     high     good       stable       0.220026     0.416640       0.363334
3   medium  average     increase       0.284828     0.339500       0.375672
4      low     good     decrease       0.482076     0.230441       0.287483
5      low  average     decrease       0.413598     0.217326       0.369076
6      low  average     decrease       0.402544     0.247264       0.350192
7     high     poor       stable       0.220026     0.416640       0.363334
8      low     good     decrease       0.382125     0.254997       0.362877
9      low     go



Validation Accuracy for 1000 samples: 0.3800
Test Accuracy for 1000 samples: 0.3867
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 

Predicted Results and Probabilities for 1000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     poor     decrease       0.484780     0.186481       0.328738
1      low  average     increase       0.319682     0.272961       0.407357
2   medium     poor     decrease       0.484780     0.186481       0.328738
3     high     poor     increase       0.319682     0.272961       0.407357
4     high  average     decrease       0.458092     0.244180       0.297728
5   medium     good     increase       0.199729     0.298456       0.501814
6      low     poor     increase       0.270419     0.318913       0.410667
7     high     poor     decrease       0.458092     0.244180       0.297728
8   medium     good     increase       0.341192     0.244760       0.414048
9     high    



Validation Accuracy for 1500 samples: 0.4978
Test Accuracy for 1500 samples: 0.5333
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 





Predicted Results and Probabilities for 1500 samples (First 5 rows):
      IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0    low  average     increase       0.184528     0.338349       0.477123
1    low  average     increase       0.136586     0.155298       0.708116
2    low  average     increase       0.239780     0.305497       0.454723
3   high     poor     increase       0.126848     0.353257       0.519894
4    low  average     decrease       0.500415     0.256532       0.243053
5   high     poor     increase       0.136586     0.155298       0.708116
6    low  average     increase       0.184528     0.338349       0.477123
7    low  average     increase       0.184528     0.338349       0.477123
8    low     good     increase       0.299257     0.285129       0.415614
9    low  average     decrease       0.500415     0.256532       0.243053
10  high     good     increase       0.239780     0.305497       0.454723
11   low  average     increase       0.126



Validation Accuracy for 2000 samples: 0.5267
Test Accuracy for 2000 samples: 0.5733
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 





Predicted Results and Probabilities for 2000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good       stable       0.141561     0.551773       0.306666
1     high     good       stable       0.141561     0.551773       0.306666
2     high  average     increase       0.227766     0.340097       0.432136
3     high     good       stable       0.316619     0.350342       0.333039
4      low  average     increase       0.302768     0.264975       0.432257
5     high     good       stable       0.141561     0.551773       0.306666
6      low     good     increase       0.152233     0.257851       0.589915
7   medium  average     decrease       0.410983     0.183860       0.405157
8     high     good       stable       0.141561     0.551773       0.306666
9     high  average     increase       0.227766     0.340097       0.432136
10    high     good     decrease       0.512315     0.210332       0.277354
11  medium     goo



Validation Accuracy for 3000 samples: 0.6022
Test Accuracy for 3000 samples: 0.5889
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

Predicted Results and Probabilities for 3000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good     decrease       0.372033     0.344763       0.283204
1      low  average     decrease       0.372033     0.344763       0.283204
2      low     good     decrease       0.389574     0.389202       0.221224
3   medium     poor     increase       0.226774     0.297198       0.476028
4      low  average     decrease       0.372033     0.344763       0.283204
5   medium     good     increase       0.203187     0.334566       0.462247
6   medium     good       stable       0.211250     0.707464       0.081286
7      low  average     increase       0.203187     0.334566       0.462247
8     high     poor     increase       0.198818     0.324790       0.476392
9   medium  



Validation Accuracy for 3500 samples: 0.5067
Test Accuracy for 3500 samples: 0.4533
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

Predicted Results and Probabilities for 3500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     poor     decrease       0.658431     0.101459       0.240110
1     high  average     decrease       0.658431     0.101459       0.240110
2     high  average       stable       0.336198     0.503748       0.160053
3   medium     poor     decrease       0.515755     0.189382       0.294863
4   medium     good       stable       0.269507     0.387100       0.343393
5   medium     good     decrease       0.512233     0.201881       0.285887
6   medium     good       stable       0.269507     0.387100       0.343393
7      low     good       stable       0.352562     0.357596       0.289842
8   medium  average     increase       0.234485     0.316669       0.448845
9      low  



Validation Accuracy for 4000 samples: 0.5617
Test Accuracy for 4000 samples: 0.5433
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 4000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good     decrease       0.422423     0.156655       0.420922
1   medium  average     decrease       0.422423     0.156655       0.420922
2      low     good       stable       0.226956     0.620519       0.152525
3   medium  average       stable       0.226956     0.620519       0.152525
4      low     poor     decrease       0.425426     0.394353       0.180221
5     high     good     decrease       0.425426     0.394353       0.180221
6      low     poor     decrease       0.425426     0.394353       0.180221
7      low     poor     decrease       0.425426     0.394353       0.180221
8      low     good     decrease       0.546610     0.244059       0.209331
9      low   



Validation Accuracy for 4500 samples: 0.4563
Test Accuracy for 4500 samples: 0.4548
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 4500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good       stable       0.302282     0.516199       0.181519
1   medium  average     decrease       0.420597     0.231549       0.347854
2      low  average     decrease       0.420597     0.231549       0.347854
3      low  average       stable       0.302282     0.516199       0.181519
4     high  average       stable       0.302282     0.516199       0.181519
5     high     good     decrease       0.420597     0.231549       0.347854
6      low  average     decrease       0.420597     0.231549       0.347854
7   medium     good     increase       0.373278     0.151298       0.475424
8     high     poor     increase       0.373278     0.151298       0.475424
9      low   



Validation Accuracy for 5000 samples: 0.4640
Test Accuracy for 5000 samples: 0.4947
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 5000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium  average     increase       0.342324     0.235133       0.422543
1   medium  average     increase       0.280898     0.301132       0.417969
2   medium     good     decrease       0.658502     0.219144       0.122354
3     high     good     decrease       0.658502     0.219144       0.122354
4   medium  average     decrease       0.481579     0.213438       0.304984
5   medium  average     decrease       0.481579     0.213438       0.304984
6     high  average     increase       0.362189     0.165438       0.472372
7     high  average     increase       0.342324     0.235133       0.422543
8     high  average     increase       0.272412     0.217132       0.510456
9     high   



Validation Accuracy for 5500 samples: 0.4339
Test Accuracy for 5500 samples: 0.3794
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 5500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     poor     decrease       0.382080     0.324133       0.293787
1     high     poor     increase       0.406124     0.156902       0.436975
2      low     poor     decrease       0.382080     0.324133       0.293787
3     high     poor     decrease       0.382080     0.324133       0.293787
4      low  average     decrease       0.382080     0.324133       0.293787
5      low  average     increase       0.191774     0.381229       0.426997
6   medium  average     decrease       0.358427     0.305831       0.335742
7   medium     good     increase       0.191774     0.381229       0.426997
8      low     poor     increase       0.403509     0.191365       0.405126
9   medium   



Validation Accuracy for 6000 samples: 0.5222
Test Accuracy for 6000 samples: 0.5500
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 6000 samples (First 5 rows):
      IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   high  average     decrease       0.425486     0.292636       0.281879
1   high     good     decrease       0.425486     0.292636       0.281879
2    low  average     decrease       0.425486     0.292636       0.281879
3   high  average     decrease       0.425486     0.292636       0.281879
4   high     good     decrease       0.630958     0.110166       0.258875
5    low     good     decrease       0.484064     0.079314       0.436622
6   high     good     decrease       0.425486     0.292636       0.281879
7   high  average     decrease       0.484064     0.079314       0.436622
8   high  average     decrease       0.425486     0.292636       0.281879
9   high  average     decrease   



Validation Accuracy for 6500 samples: 0.4154
Test Accuracy for 6500 samples: 0.4236
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 6500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     good     decrease       0.389732     0.381095       0.229173
1     high  average     increase       0.337299     0.317998       0.344703
2   medium  average       stable       0.361393     0.443846       0.194761
3     high     good     increase       0.397546     0.139970       0.462484
4     high  average       stable       0.357613     0.424365       0.218022
5     high  average     decrease       0.389732     0.381095       0.229173
6     high  average     increase       0.397546     0.139970       0.462484
7   medium     poor     increase       0.397546     0.139970       0.462484
8   medium     poor       stable       0.361393     0.443846       0.194761
9     high  a



Validation Accuracy for 7000 samples: 0.4467
Test Accuracy for 7000 samples: 0.4400
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 7000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     poor       stable       0.262281     0.409936       0.327783
1      low     poor       stable       0.222252     0.389883       0.387865
2     high     good       stable       0.301509     0.406212       0.292279
3   medium     poor       stable       0.262281     0.409936       0.327783
4     high     good       stable       0.262281     0.409936       0.327783
5      low     good     increase       0.286756     0.202611       0.510633
6     high  average       stable       0.222252     0.389883       0.387865
7     high     good       stable       0.222252     0.389883       0.387865
8      low  average       stable       0.301509     0.406212       0.292279
9     high   



Validation Accuracy for 7500 samples: 0.4702
Test Accuracy for 7500 samples: 0.4880
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 7500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good       stable       0.322093     0.520681       0.157226
1     high     good       stable       0.212840     0.426129       0.361031
2     high  average       stable       0.215051     0.489741       0.295208
3     high  average       stable       0.267573     0.513394       0.219032
4      low  average       stable       0.322093     0.520681       0.157226
5     high     poor       stable       0.215051     0.489741       0.295208
6     high  average       stable       0.278255     0.410373       0.311372
7     high  average       stable       0.353794     0.558360       0.087846
8     high     poor       stable       0.267573     0.513394       0.219032
9     high   



Validation Accuracy for 8000 samples: 0.5542
Test Accuracy for 8000 samples: 0.5325
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 8000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good     decrease       0.390688     0.350244       0.259068
1      low  average     decrease       0.531097     0.327984       0.140920
2      low     poor     decrease       0.390688     0.350244       0.259068
3   medium     poor       stable       0.071524     0.627500       0.300977
4   medium     good     decrease       0.661701     0.186728       0.151571
5   medium  average       stable       0.071524     0.627500       0.300977
6     high     poor       stable       0.071524     0.627500       0.300977
7   medium  average       stable       0.059222     0.534138       0.406640
8     high  average     decrease       0.390688     0.350244       0.259068
9   medium   



Validation Accuracy for 8500 samples: 0.5129
Test Accuracy for 8500 samples: 0.4957
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 8500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good       stable       0.257724     0.572557       0.169719
1   medium     good     increase       0.317533     0.316392       0.366075
2     high  average     increase       0.250845     0.334492       0.414663
3   medium     good     increase       0.279480     0.178064       0.542456
4      low     poor       stable       0.257724     0.572557       0.169719
5   medium     good     increase       0.264354     0.277363       0.458283
6   medium  average     decrease       0.434111     0.148061       0.417828
7     high     poor       stable       0.257724     0.572557       0.169719
8     high     poor       stable       0.257724     0.572557       0.169719
9   medium   



Validation Accuracy for 9000 samples: 0.5785
Test Accuracy for 9000 samples: 0.5422
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 9000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     poor     increase       0.203553     0.130800       0.665647
1   medium  average     increase       0.203553     0.130800       0.665647
2     high     poor     increase       0.203553     0.130800       0.665647
3   medium     poor     increase       0.203553     0.130800       0.665647
4   medium     poor     increase       0.332343     0.109771       0.557886
5   medium     good     increase       0.332343     0.109771       0.557886
6   medium     good     increase       0.203553     0.130800       0.665647
7   medium  average     increase       0.203553     0.130800       0.665647
8   medium     poor     increase       0.323144     0.178940       0.497915
9   medium   



Validation Accuracy for 9500 samples: 0.5846
Test Accuracy for 9500 samples: 0.5432
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 9500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low  average       stable       0.205373     0.542198       0.252428
1      low     good       stable       0.205373     0.542198       0.252428
2     high  average     increase       0.039831     0.201106       0.759062
3     high  average     increase       0.039831     0.201106       0.759062
4   medium  average     increase       0.039831     0.201106       0.759062
5     high     poor       stable       0.361559     0.440328       0.198113
6     high     poor       stable       0.361559     0.440328       0.198113
7      low     poor       stable       0.361559     0.440328       0.198113
8      low     poor       stable       0.361559     0.440328       0.198113
9      low   



Validation Accuracy for 10000 samples: 0.5120
Test Accuracy for 10000 samples: 0.5407
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 10000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low  average     increase       0.359770     0.169999       0.470231
1   medium  average     decrease       0.526740     0.162752       0.310508
2   medium     good     increase       0.089461     0.451430       0.459109
3   medium     good     decrease       0.526740     0.162752       0.310508
4     high     poor     increase       0.309916     0.051491       0.638593
5     high     poor     increase       0.122031     0.430991       0.446978
6     high     good     increase       0.122031     0.430991       0.446978
7     high  average       stable       0.236517     0.452659       0.310824
8     high  average     increase       0.122031     0.430991       0.446978
9   medium

# Hypothesis Model 500, 1000, 1500, ..., 10000 Samples (sparse) 1 hidden Layer, 10 Neurons

In [6]:
# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Define the Neural Network architecture
def create_nn_model(hidden_layers=1, nodes_per_layer=10):
    model = models.Sequential()

    # Input layer (2 input features: IR_encoded and EI_encoded)
    model.add(layers.InputLayer(input_shape=(2,)))

    # Hidden layers
    for layer_num in range(hidden_layers):
        model.add(layers.Dense(nodes_per_layer, activation='relu', name=f"hidden_layer_{layer_num + 1}"))

    # Output layer (3 classes: decrease, stable, increase)
    model.add(layers.Dense(3, activation='softmax', name="output_layer"))

    # Compile the model
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

# Loop through each sample size
for size in sample_sizes:
    # Load data for the current sample size (adjust the file paths for sparse data)
    outcomes_file = f'outcomes_sparse_{size}.csv'

    df = pd.read_csv(outcomes_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df['IR_encoded'] = df['IR'].map(ir_map)
    df['EI_encoded'] = df['EI'].map(ei_map)
    df['SP_encoded'] = df['SP'].map(sp_map)

    # Features (IR and EI) and labels (SP)
    X = df[['IR_encoded', 'EI_encoded']]
    y = df['SP_encoded']

    # Refresh the data split for each iteration
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=False, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False, random_state=42)

    # Show split confirmation
    print(f"\nSample size: {size}")
    print("Training Data:", X_train.shape, y_train.shape)
    print("Validation Data:", X_val.shape, y_val.shape)
    print("Test Data:", X_test.shape, y_test.shape)

    # Create the Neural Network model
    nn_model = create_nn_model(hidden_layers=1, nodes_per_layer=10)

    # Train the model
    history = nn_model.fit(X_train, y_train,
                           epochs=50,
                           batch_size=32,
                           validation_data=(X_val, y_val),
                           verbose=0)  # Set verbose=0 to avoid too much output

    # Evaluate on the validation set
    val_loss, val_accuracy = nn_model.evaluate(X_val, y_val, verbose=0)
    print(f"Validation Accuracy for {size} samples: {val_accuracy:.4f}")

    # Evaluate on the test set
    test_loss, test_accuracy = nn_model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy for {size} samples: {test_accuracy:.4f}")

    # Make predictions on the test set
    predictions = nn_model.predict(X_test)

    # Convert the predicted probabilities to class labels
    predicted_classes = predictions.argmax(axis=1)

    # Create a list to map integers back to the original SP labels
    sp_reverse_map = ['decrease', 'stable', 'increase']

    # Convert the predicted classes to the original labels
    predicted_labels = [sp_reverse_map[label] for label in predicted_classes]

    # Create a DataFrame for the predicted probabilities
    probs_df = pd.DataFrame(predictions, columns=['Prob_decrease', 'Prob_stable', 'Prob_increase'])

    # Output the IR, EI, predicted SP, and the NN probabilities
    result_df = pd.DataFrame({
        'IR': df['IR'][:len(predicted_labels)],  # IR column from the original dataframe
        'EI': df['EI'][:len(predicted_labels)],  # EI column from the original dataframe
        'Predicted_SP': predicted_labels         # Predicted SP labels
    })

    # Combine the result with the predicted probabilities
    combined_df = pd.concat([result_df, probs_df.reset_index(drop=True)], axis=1)

    # Show the first few rows of the results for this sample size
    print(f"\nPredicted Results and Probabilities for {size} samples (First 5 rows):")
    print(combined_df.head(15))

# After the loop is done, print this message
print("\nLooping through all sparse sample sizes complete!")


Sample size: 500
Training Data: (350, 2) (350,)
Validation Data: (75, 2) (75,)
Test Data: (75, 2) (75,)




Validation Accuracy for 500 samples: 0.4533
Test Accuracy for 500 samples: 0.4000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step

Predicted Results and Probabilities for 500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     poor     increase       0.354493     0.236941       0.408565
1     high     good     increase       0.354493     0.236941       0.408565
2   medium  average       stable       0.387208     0.391846       0.220946
3   medium     good       stable       0.393601     0.469468       0.136931
4     high  average     decrease       0.400182     0.297971       0.301847
5     high  average       stable       0.393601     0.469468       0.136931
6      low     good       stable       0.393601     0.469468       0.136931
7     high  average     increase       0.307705     0.336684       0.355610
8      low  average     decrease       0.400182     0.297971       0.301847
9   medium  avera



Validation Accuracy for 1000 samples: 0.4600
Test Accuracy for 1000 samples: 0.5400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step

Predicted Results and Probabilities for 1000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good     increase       0.319657     0.316506       0.363837
1   medium     good     increase       0.397934     0.169565       0.432501
2     high  average     decrease       0.452440     0.250053       0.297507
3     high  average     decrease       0.382008     0.329299       0.288693
4      low     good     decrease       0.388275     0.361029       0.250696
5     high     good     increase       0.397934     0.169565       0.432501
6   medium     good     decrease       0.388275     0.361029       0.250696
7   medium  average     increase       0.397934     0.169565       0.432501
8     high     good     decrease       0.388275     0.361029       0.250696
9     high    



Validation Accuracy for 1500 samples: 0.5067
Test Accuracy for 1500 samples: 0.5600
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 

Predicted Results and Probabilities for 1500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average     decrease       0.445001     0.220133       0.334867
1     high  average     decrease       0.445001     0.220133       0.334867
2   medium  average     decrease       0.435284     0.275365       0.289351
3   medium  average     decrease       0.435284     0.275365       0.289351
4     high     good     decrease       0.435284     0.275365       0.289351
5     high     good     decrease       0.435284     0.275365       0.289351
6   medium  average     decrease       0.445001     0.220133       0.334867
7   medium  average     decrease       0.445001     0.220133       0.334867
8   medium  average     decrease       0.597078     0.087700       0.315222
9   medium  av



Validation Accuracy for 2000 samples: 0.5267
Test Accuracy for 2000 samples: 0.5167
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step

Predicted Results and Probabilities for 2000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average     decrease       0.493802     0.251348       0.254851
1      low     poor     increase       0.197589     0.199510       0.602901
2     high  average       stable       0.129052     0.588589       0.282358
3      low  average       stable       0.283330     0.500031       0.216639
4      low     good       stable       0.129052     0.588589       0.282358
5      low  average     decrease       0.493802     0.251348       0.254851
6      low     good     decrease       0.493802     0.251348       0.254851
7     high     good       stable       0.283330     0.500031       0.216639
8      low     good     decrease       0.541684     0.249602       0.208715
9      low  



Validation Accuracy for 2500 samples: 0.4773
Test Accuracy for 2500 samples: 0.4720
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 

Predicted Results and Probabilities for 2500 samples (First 5 rows):
        IR    EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium  good     increase       0.075571     0.363471       0.560957
1     high  good     increase       0.163566     0.348607       0.487827
2   medium  poor       stable       0.151193     0.493278       0.355529
3     high  poor     increase       0.075571     0.363471       0.560957
4   medium  poor     decrease       0.421684     0.222918       0.355398
5      low  poor       stable       0.151193     0.493278       0.355529
6      low  good       stable       0.151193     0.493278       0.355529
7     high  good       stable       0.151193     0.493278       0.355529
8      low  poor       stable       0.151193     0.493278       0.355529
9   medium  good     decrease       0.4216



Validation Accuracy for 3000 samples: 0.5222
Test Accuracy for 3000 samples: 0.5267
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step

Predicted Results and Probabilities for 3000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     poor     decrease       0.461366     0.136365       0.402269
1   medium     poor     increase       0.188658     0.341994       0.469347
2   medium     good     decrease       0.430358     0.158372       0.411270
3   medium     poor     increase       0.189077     0.261710       0.549213
4      low     poor     decrease       0.430358     0.158372       0.411270
5   medium  average     increase       0.189077     0.261710       0.549213
6   medium     poor     decrease       0.461366     0.136365       0.402269
7   medium     poor     increase       0.355060     0.240835       0.404105
8     high  average     decrease       0.430358     0.158372       0.411270
9   medium  



Validation Accuracy for 3500 samples: 0.4819
Test Accuracy for 3500 samples: 0.5295
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 3500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average       stable       0.207337     0.565515       0.227148
1   medium     good     increase       0.116349     0.369685       0.513967
2     high     good     increase       0.410938     0.173140       0.415921
3   medium     good     decrease       0.577712     0.204206       0.218081
4     high  average     increase       0.410938     0.173140       0.415921
5      low  average       stable       0.207337     0.565515       0.227148
6     high  average       stable       0.212417     0.435423       0.352160
7     high  average       stable       0.207337     0.565515       0.227148
8     high  average     increase       0.410938     0.173140       0.415921
9      low   



Validation Accuracy for 4000 samples: 0.4433
Test Accuracy for 4000 samples: 0.4983
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 4000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     good       stable       0.254511     0.492770       0.252719
1   medium     good     increase       0.389832     0.215419       0.394749
2     high     good       stable       0.254511     0.492770       0.252719
3     high  average     increase       0.171690     0.163120       0.665190
4      low  average     increase       0.294497     0.254475       0.451028
5     high  average     increase       0.389832     0.215419       0.394749
6     high     good       stable       0.254511     0.492770       0.252719
7     high     good       stable       0.192831     0.518968       0.288201
8      low     good     increase       0.382510     0.150774       0.466716
9     high   



Validation Accuracy for 4500 samples: 0.5156
Test Accuracy for 4500 samples: 0.4933
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 4500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     poor       stable       0.251856     0.390981       0.357163
1   medium     good     decrease       0.468314     0.144341       0.387346
2     high     good     decrease       0.468314     0.144341       0.387346
3     high     good       stable       0.251856     0.390981       0.357163
4   medium     poor       stable       0.295276     0.553644       0.151079
5   medium     poor     decrease       0.512393     0.342795       0.144812
6   medium     poor     increase       0.322540     0.168992       0.508468
7   medium  average     increase       0.164279     0.357822       0.477899
8     high     good     increase       0.322540     0.168992       0.508468
9   medium   



Validation Accuracy for 5000 samples: 0.5733
Test Accuracy for 5000 samples: 0.5427
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 5000 samples (First 5 rows):
        IR    EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  good     increase       0.337930     0.260354       0.401715
1   medium  good       stable       0.260907     0.383175       0.355918
2   medium  good     increase       0.337930     0.260354       0.401715
3   medium  poor     increase       0.293610     0.325581       0.380809
4     high  good       stable       0.180256     0.718109       0.101636
5     high  good       stable       0.119504     0.490541       0.389955
6      low  good       stable       0.260907     0.383175       0.355918
7   medium  good     increase       0.269120     0.314513       0.416367
8   medium  good       stable       0.180256     0.718109       0.101636
9   medium  good       stable       0.18025



Validation Accuracy for 5500 samples: 0.4861
Test Accuracy for 5500 samples: 0.4558
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Predicted Results and Probabilities for 5500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     poor       stable       0.365112     0.390440       0.244447
1   medium     poor       stable       0.171061     0.506894       0.322044
2   medium     poor     decrease       0.747861     0.112688       0.139451
3     high     poor     decrease       0.390716     0.319575       0.289709
4      low     poor       stable       0.318539     0.364485       0.316976
5      low  average     decrease       0.441624     0.308785       0.249591
6      low  average       stable       0.365112     0.390440       0.244447
7   medium  average     decrease       0.747861     0.112688       0.139451
8   medium     poor       stable       0.365112     0.390440       0.244447
9   medium   



Validation Accuracy for 6000 samples: 0.4922
Test Accuracy for 6000 samples: 0.5044
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 6000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low  average       stable       0.252032     0.448769       0.299199
1     high     good     decrease       0.509017     0.427957       0.063026
2      low     good     increase       0.260772     0.359051       0.380177
3      low     good     decrease       0.409801     0.301729       0.288469
4     high     poor       stable       0.248779     0.607626       0.143595
5      low     poor       stable       0.252032     0.448769       0.299199
6      low     good       stable       0.241045     0.421634       0.337321
7      low  average     decrease       0.509017     0.427957       0.063026
8      low     good     increase       0.260772     0.359051       0.380177
9      low   



Validation Accuracy for 6500 samples: 0.4872
Test Accuracy for 6500 samples: 0.4964
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Predicted Results and Probabilities for 6500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average     decrease       0.483663     0.199155       0.317182
1   medium     poor     increase       0.370435     0.200202       0.429363
2   medium     poor     decrease       0.501747     0.182816       0.315438
3     high     poor     decrease       0.793543     0.104885       0.101572
4     high     poor       stable       0.344909     0.467221       0.187870
5     high  average       stable       0.344909     0.467221       0.187870
6     high     poor     decrease       0.793543     0.104885       0.101572
7     high     poor       stable       0.275859     0.402118       0.322023
8      low  average       stable       0.344909     0.467221       0.187870
9     high  a



Validation Accuracy for 7000 samples: 0.6076
Test Accuracy for 7000 samples: 0.5952
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 7000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good     increase       0.306141     0.224817       0.469042
1   medium  average       stable       0.104271     0.704121       0.191608
2   medium     poor     increase       0.306141     0.224817       0.469042
3   medium  average     increase       0.200250     0.151993       0.647757
4   medium  average     increase       0.059529     0.386666       0.553805
5   medium     poor       stable       0.143715     0.515616       0.340669
6      low  average     increase       0.059529     0.386666       0.553805
7   medium     poor       stable       0.104271     0.704121       0.191608
8   medium     good     decrease       0.507635     0.167616       0.324750
9      low  a



Validation Accuracy for 7500 samples: 0.6329
Test Accuracy for 7500 samples: 0.6116
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 7500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good     decrease       0.810678     0.066415       0.122907
1   medium     poor       stable       0.235775     0.461874       0.302351
2      low     good     decrease       0.810678     0.066415       0.122907
3      low  average     decrease       0.810678     0.066415       0.122907
4   medium     poor     decrease       0.810678     0.066415       0.122907
5      low     good     increase       0.094251     0.365833       0.539916
6   medium     poor     increase       0.379044     0.193670       0.427286
7      low  average     increase       0.094251     0.365833       0.539916
8   medium  average     decrease       0.810678     0.066415       0.122907
9   medium   



Validation Accuracy for 8000 samples: 0.4933
Test Accuracy for 8000 samples: 0.4858
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 8000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good       stable       0.230618     0.445016       0.324366
1      low     good       stable       0.230618     0.445016       0.324366
2   medium     good     decrease       0.352735     0.344845       0.302419
3   medium     good     decrease       0.459060     0.435542       0.105399
4     high     good       stable       0.230618     0.445016       0.324366
5   medium     good     increase       0.237352     0.378371       0.384277
6      low     poor     decrease       0.459060     0.435542       0.105399
7     high     good     decrease       0.427071     0.346035       0.226894
8   medium     good     decrease       0.459060     0.435542       0.105399
9     high   



Validation Accuracy for 8500 samples: 0.4596
Test Accuracy for 8500 samples: 0.4769
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 8500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good       stable       0.059953     0.544065       0.395981
1      low     good       stable       0.361269     0.399675       0.239057
2   medium     poor       stable       0.343895     0.359589       0.296516
3   medium     poor       stable       0.361269     0.399675       0.239057
4      low     good       stable       0.285321     0.448077       0.266602
5   medium     poor       stable       0.059953     0.544065       0.395981
6   medium  average       stable       0.059953     0.544065       0.395981
7     high     poor       stable       0.343895     0.359589       0.296516
8      low  average       stable       0.361269     0.399675       0.239057
9      low   



Validation Accuracy for 9000 samples: 0.4422
Test Accuracy for 9000 samples: 0.4519
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 9000 samples (First 5 rows):
      IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   high     good     decrease       0.390754     0.376969       0.232277
1   high  average     decrease       0.390754     0.376969       0.232277
2   high     poor       stable       0.421453     0.503074       0.075473
3   high  average     decrease       0.390754     0.376969       0.232277
4   high  average     decrease       0.390754     0.376969       0.232277
5   high     good       stable       0.421453     0.503074       0.075473
6   high     good       stable       0.421453     0.503074       0.075473
7   high     good     decrease       0.383619     0.278086       0.338295
8   high     good     decrease       0.390754     0.376969       0.232277
9   high     poor     decrease   



Validation Accuracy for 9500 samples: 0.5018
Test Accuracy for 9500 samples: 0.4912
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 9500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     poor       stable       0.094702     0.587856       0.317441
1     high     good       stable       0.322751     0.381096       0.296153
2   medium     poor     increase       0.298169     0.251321       0.450510
3   medium     poor     increase       0.216407     0.314835       0.468758
4   medium     poor     increase       0.133787     0.201980       0.664233
5      low     poor     increase       0.216407     0.314835       0.468758
6     high     poor     decrease       0.398109     0.331615       0.270276
7   medium     poor     increase       0.216407     0.314835       0.468758
8   medium     poor     increase       0.133787     0.201980       0.664233
9     high   



Validation Accuracy for 10000 samples: 0.4827
Test Accuracy for 10000 samples: 0.4720
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 10000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good     decrease       0.495838     0.097371       0.406792
1      low  average       stable       0.314306     0.462680       0.223013
2     high     good       stable       0.314306     0.462680       0.223013
3      low     good     decrease       0.495838     0.097371       0.406792
4     high     good     decrease       0.495838     0.097371       0.406792
5     high  average     decrease       0.404043     0.309526       0.286431
6     high  average       stable       0.211935     0.691541       0.096524
7      low  average     decrease       0.495838     0.097371       0.406792
8      low     good       stable       0.314306     0.462680       0.223013
9   medium

# ------------------------------------------------------------------------------------------------------------

# K-L Divergence NN Dense Data

## This is the entropy for dense data distribution with a sample size of 500 - 10000 and a NN with 1 hidden layer and 10 Neurons.

In [12]:
# Define the function to save K-L divergence to a file at the end
def save_kl_divergences_to_file(kl_divergence_data):
    file_name = 'K-L Divergence dense NN 1_10.csv'

    # Save the K-L divergences to a CSV file
    with open(file_name, 'w') as f:
        f.write('Size,NN_Dense_1_10_Entropy\n')  # Write the headers
        for sample_size, kl_div_value in kl_divergence_data.items():
            f.write(f"{sample_size},{kl_div_value:.4f}\n")

# Placeholder to store K-L divergence values for all sample sizes
kl_divergence_results = {}

# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Loop through each sample size
for sample_size in sample_sizes:
    # Load the ground truth probabilities for the current sample size
    ground_truth_probs_file = f'probabilities_dense_{sample_size}.csv'
    df_gt_probs = pd.read_csv(ground_truth_probs_file)

    # Load outcomes for the current sample size
    outcomes_file = f'outcomes_dense_{sample_size}.csv'
    df = pd.read_csv(outcomes_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}

    df['IR_encoded'] = df['IR'].map(ir_map)
    df['EI_encoded'] = df['EI'].map(ei_map)

    # Placeholder to store K-L divergence values
    kl_divergences = []

    # Loop through the test set predictions
    for i in range(len(df)):
        # Neural Network predicted probabilities for SP (decrease, stable, increase)
        predicted_probs = predictions[i]  # Assuming the NN predictions are already available in memory for this sample size

        # Get the IR and EI values for the current sample
        ir_value = df.iloc[i]['IR_encoded']
        ei_value = df.iloc[i]['EI_encoded']

        # Map encoded values back to original labels
        ir_value = {0: 'low', 1: 'medium', 2: 'high'}[ir_value]
        ei_value = {0: 'poor', 1: 'average', 2: 'good'}[ei_value]

        # Get the corresponding ground truth probabilities for SP given IR and EI
        col_prefix = f'SP_given_IR_{ir_value}_EI_{ei_value}'
        ground_truth_probs = df_gt_probs.filter(like=col_prefix).values.flatten()

        # Ensure the probabilities are non-zero to avoid division by zero
        epsilon = 1e-10
        ground_truth_probs = np.clip(ground_truth_probs, epsilon, 1)

        # Compute K-L divergence (Neural Network vs Ground Truth)
        kl_div = entropy(predicted_probs, ground_truth_probs)
        kl_divergences.append(kl_div)

    # Calculate the average K-L divergence over all samples for the current sample size
    average_kl_divergence = np.mean(kl_divergences)

    # Store the result for this sample size
    kl_divergence_results[sample_size] = average_kl_divergence

    # Print confirmation for each sample size
    print(f"Average K-L Divergence for {sample_size} dense samples: {average_kl_divergence:.4f}")

# Once all sample sizes are processed, save the results to a CSV file
save_kl_divergences_to_file(kl_divergence_results)

# Print completion message
print("\nK-L divergence calculations complete and saved to 'K-L Divergence dense NN 1_10.csv'.")

Average K-L Divergence for 500 dense samples: 0.3219
Average K-L Divergence for 1000 dense samples: 0.4108
Average K-L Divergence for 1500 dense samples: 0.4250
Average K-L Divergence for 2000 dense samples: 0.3801
Average K-L Divergence for 2500 dense samples: 0.3694
Average K-L Divergence for 3000 dense samples: 0.4624
Average K-L Divergence for 3500 dense samples: 0.4288
Average K-L Divergence for 4000 dense samples: 0.4070
Average K-L Divergence for 4500 dense samples: 0.2583
Average K-L Divergence for 5000 dense samples: 0.3882
Average K-L Divergence for 5500 dense samples: 0.2103
Average K-L Divergence for 6000 dense samples: 0.6866
Average K-L Divergence for 6500 dense samples: 0.2862
Average K-L Divergence for 7000 dense samples: 0.2105
Average K-L Divergence for 7500 dense samples: 0.2742
Average K-L Divergence for 8000 dense samples: 0.6441
Average K-L Divergence for 8500 dense samples: 0.2623
Average K-L Divergence for 9000 dense samples: 0.3623
Average K-L Divergence for 95

# K-L Divergence NN Sparse Data

## This is the entropy for sparse data distribution with a sample size of 500 - 10000 and a NN with 1 hidden layer and 10 Neurons.

In [13]:
# Define the function to save K-L divergence to a file at the end
def save_kl_divergences_to_file(kl_divergence_data):
    file_name = 'K-L Divergence sparse NN 1_10.csv'  # Changed file name to indicate sparse data

    # Save the K-L divergences to a CSV file
    with open(file_name, 'w') as f:
        f.write('Size,NN_Sparse_1_10_Entropy\n')  # Updated the header to indicate sparse data
        for sample_size, kl_div_value in kl_divergence_data.items():
            f.write(f"{sample_size},{kl_div_value:.4f}\n")

# Placeholder to store K-L divergence values for all sample sizes
kl_divergence_results = {}

# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Loop through each sample size
for sample_size in sample_sizes:
    # Load the ground truth probabilities for the current sample size (sparse data)
    ground_truth_probs_file = f'probabilities_sparse_{sample_size}.csv'  # Changed to sparse data file
    df_gt_probs = pd.read_csv(ground_truth_probs_file)

    # Load outcomes for the current sample size (sparse data)
    outcomes_file = f'outcomes_sparse_{sample_size}.csv'  # Changed to sparse data file
    df = pd.read_csv(outcomes_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}

    df['IR_encoded'] = df['IR'].map(ir_map)
    df['EI_encoded'] = df['EI'].map(ei_map)

    # Placeholder to store K-L divergence values
    kl_divergences = []

    # Loop through the test set predictions
    for i in range(len(df)):
        # Neural Network predicted probabilities for SP (decrease, stable, increase)
        predicted_probs = predictions[i]  # Assuming the NN predictions are already available in memory for this sample size

        # Get the IR and EI values for the current sample
        ir_value = df.iloc[i]['IR_encoded']
        ei_value = df.iloc[i]['EI_encoded']

        # Map encoded values back to original labels
        ir_value = {0: 'low', 1: 'medium', 2: 'high'}[ir_value]
        ei_value = {0: 'poor', 1: 'average', 2: 'good'}[ei_value]

        # Get the corresponding ground truth probabilities for SP given IR and EI
        col_prefix = f'SP_given_IR_{ir_value}_EI_{ei_value}'
        ground_truth_probs = df_gt_probs.filter(like=col_prefix).values.flatten()

        # Ensure the probabilities are non-zero to avoid division by zero
        epsilon = 1e-10
        ground_truth_probs = np.clip(ground_truth_probs, epsilon, 1)

        # Compute K-L divergence (Neural Network vs Ground Truth)
        kl_div = entropy(predicted_probs, ground_truth_probs)
        kl_divergences.append(kl_div)

    # Calculate the average K-L divergence over all samples for the current sample size
    average_kl_divergence = np.mean(kl_divergences)

    # Store the result for this sample size
    kl_divergence_results[sample_size] = average_kl_divergence

    # Print confirmation for each sample size
    print(f"Average K-L Divergence for {sample_size} sparse samples: {average_kl_divergence:.4f}")

# Once all sample sizes are processed, save the results to a CSV file
save_kl_divergences_to_file(kl_divergence_results)

# Print completion message
print("\nK-L divergence calculations complete and saved to 'K-L Divergence sparse NN 1_10.csv'.")

Average K-L Divergence for 500 sparse samples: 0.2561
Average K-L Divergence for 1000 sparse samples: 0.3704
Average K-L Divergence for 1500 sparse samples: 0.3235
Average K-L Divergence for 2000 sparse samples: 0.4161
Average K-L Divergence for 2500 sparse samples: 0.3134
Average K-L Divergence for 3000 sparse samples: 0.3122
Average K-L Divergence for 3500 sparse samples: 0.3196
Average K-L Divergence for 4000 sparse samples: 0.3261
Average K-L Divergence for 4500 sparse samples: 0.4116
Average K-L Divergence for 5000 sparse samples: 0.3940
Average K-L Divergence for 5500 sparse samples: 0.3374
Average K-L Divergence for 6000 sparse samples: 0.3627
Average K-L Divergence for 6500 sparse samples: 0.3223
Average K-L Divergence for 7000 sparse samples: 0.4758
Average K-L Divergence for 7500 sparse samples: 0.5426
Average K-L Divergence for 8000 sparse samples: 0.3905
Average K-L Divergence for 8500 sparse samples: 0.3577
Average K-L Divergence for 9000 sparse samples: 0.2985
Average K-L

# ------------------------------------------------------------------------------------------------------------