<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/NN_1_10_Relu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
pip install pgmpy

Collecting pgmpy
  Downloading pgmpy-0.1.26-py3-none-any.whl.metadata (9.1 kB)
Downloading pgmpy-0.1.26-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pgmpy
Successfully installed pgmpy-0.1.26


In [2]:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD
from pgmpy.sampling import BayesianModelSampling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import layers, models
from pgmpy.estimators import HillClimbSearch, BicScore, AICScore, MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
from sklearn.metrics import accuracy_score
from scipy.stats import entropy
import os
import matplotlib.pyplot as plt

# ------------------------------------------------------------------------------------------------------------

# Bayesian Network Data Generation 500, 1000, 1500, ..., 10000 Samples (dense)

In [43]:
#np.random.seed(1)

# Define the mappings for IR, EI, SP
ir_map = {0: 'low', 1: 'medium', 2: 'high'}
ei_map = {0: 'poor', 1: 'average', 2: 'good'}
sp_map = {0: 'decrease', 1: 'stable', 2: 'increase'}

# Define the dense Bayesian Network
dense_model = BayesianNetwork([('IR', 'EI'), ('EI', 'SP'), ('IR', 'SP')])

# Function to generate CPDs
def generate_cpds():
    ir_probs = np.random.rand(3)
    ir_probs /= ir_probs.sum()  # Normalize to make it a valid probability distribution

    ei_given_ir_probs = np.random.rand(3, 3)
    ei_given_ir_probs /= ei_given_ir_probs.sum(axis=0, keepdims=True)

    sp_probs = np.random.rand(3, 3, 3)
    sp_probs /= sp_probs.sum(axis=0, keepdims=True)

    sp_probs_reshaped = sp_probs.reshape(3, -1)

    return ir_probs, ei_given_ir_probs, sp_probs_reshaped

# Save probabilities in a single CSV file
def save_probabilities(ir_probs, ei_probs, sp_probs, filename):
    # Create a DataFrame for IR probabilities
    ir_df = pd.DataFrame({
        'IR_State': ['low', 'medium', 'high'],
        'IR_Prob': ir_probs
    })

    # Create a DataFrame for EI given IR probabilities
    ei_df = pd.DataFrame(ei_probs, columns=['EI_given_IR_low', 'EI_given_IR_medium', 'EI_given_IR_high'])
    ei_df['EI_State'] = ['poor', 'average', 'good']

    # Create a DataFrame for SP given IR and EI probabilities
    sp_df = pd.DataFrame(sp_probs, columns=[
        'SP_given_IR_low_EI_poor', 'SP_given_IR_low_EI_average', 'SP_given_IR_low_EI_good',
        'SP_given_IR_medium_EI_poor', 'SP_given_IR_medium_EI_average', 'SP_given_IR_medium_EI_good',
        'SP_given_IR_high_EI_poor', 'SP_given_IR_high_EI_average', 'SP_given_IR_high_EI_good'
    ])
    sp_df['SP_State'] = ['decrease', 'stable', 'increase']

    # Combine all data into a single DataFrame
    combined_df = pd.concat([ir_df, ei_df, sp_df], axis=1)

    # Save the combined DataFrame as a single CSV file
    combined_df.to_csv(filename, index=False)

# Save outcomes in a CSV file
def save_outcomes(data_dense, filename):
    data_dense['IR'] = data_dense['IR'].map(ir_map)
    data_dense['EI'] = data_dense['EI'].map(ei_map)
    data_dense['SP'] = data_dense['SP'].map(sp_map)
    data_dense.to_csv(filename, index=False)

# Generate datasets for different sample sizes for the dense model
sample_sizes = range(500, 10500, 500)
for size in sample_sizes:
    # Generate the CPDs
    ir_probs, ei_given_ir_probs, sp_probs_reshaped = generate_cpds()

    # Define CPDs for the dense model
    cpd_ir = TabularCPD(variable='IR', variable_card=3, values=[[ir_probs[0]], [ir_probs[1]], [ir_probs[2]]])
    cpd_ei_dense = TabularCPD(variable='EI', variable_card=3,
                              values=ei_given_ir_probs,
                              evidence=['IR'], evidence_card=[3])
    cpd_sp_dense = TabularCPD(variable='SP', variable_card=3,
                              values=sp_probs_reshaped,
                              evidence=['IR', 'EI'], evidence_card=[3, 3])

    dense_model.add_cpds(cpd_ir, cpd_ei_dense, cpd_sp_dense)

    # Check if the model is valid
    assert dense_model.check_model()

    # Generate samples
    sampler_dense = BayesianModelSampling(dense_model)
    data_dense = sampler_dense.forward_sample(size=size)

    # Save probabilities in one file
    save_probabilities(ir_probs, ei_given_ir_probs, sp_probs_reshaped, f'probabilities_dense_{size}.csv')

    # Save outcomes (low, medium, high) in another file
    save_outcomes(data_dense, f'outcomes_dense_{size}.csv')

# Notify the user that the process is done
print("Data generation and saving complete for the dense model!")

  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Data generation and saving complete for the dense model!


# Bayesian Network Data Generation 500, 1000, 1500, ..., 10000 Samples (sparse)

In [44]:
#np.random.seed(187)

# Define the mappings for IR, EI, SP
ir_map = {0: 'low', 1: 'medium', 2: 'high'}
ei_map = {0: 'poor', 1: 'average', 2: 'good'}
sp_map = {0: 'decrease', 1: 'stable', 2: 'increase'}

# Define the sparse Bayesian Network
sparse_model = BayesianNetwork([('IR', 'SP'), ('EI', 'SP')])

# Function to generate CPDs for the sparse model
def generate_cpds_sparse():
    # Generate probabilities for IR (unconditional)
    ir_probs = np.random.rand(3)
    ir_probs /= ir_probs.sum()  # Normalize to make it a valid probability distribution

    # Generate unconditional probabilities for EI (no dependency on IR)
    ei_probs = np.random.rand(3)
    ei_probs /= ei_probs.sum()

    # Generate conditional probabilities for SP given IR and EI
    sp_probs = np.random.rand(3, 3, 3)
    sp_probs /= sp_probs.sum(axis=0, keepdims=True)

    sp_probs_reshaped = sp_probs.reshape(3, -1)

    return ir_probs, ei_probs, sp_probs_reshaped

# Save probabilities in a single CSV file
def save_probabilities_sparse(ir_probs, ei_probs, sp_probs, filename):
    # Create a DataFrame for IR probabilities
    ir_df = pd.DataFrame({
        'IR_State': ['low', 'medium', 'high'],
        'IR_Prob': ir_probs
    })

    # Create a DataFrame for EI probabilities (since it's not conditional on IR)
    ei_df = pd.DataFrame({
        'EI_State': ['poor', 'average', 'good'],
        'EI_Prob': ei_probs
    })

    # Create a DataFrame for SP given IR and EI probabilities
    sp_df = pd.DataFrame(sp_probs, columns=[
        'SP_given_IR_low_EI_poor', 'SP_given_IR_low_EI_average', 'SP_given_IR_low_EI_good',
        'SP_given_IR_medium_EI_poor', 'SP_given_IR_medium_EI_average', 'SP_given_IR_medium_EI_good',
        'SP_given_IR_high_EI_poor', 'SP_given_IR_high_EI_average', 'SP_given_IR_high_EI_good'
    ])
    sp_df['SP_State'] = ['decrease', 'stable', 'increase']

    # Combine all data into a single DataFrame
    combined_df = pd.concat([ir_df, ei_df, sp_df], axis=1)

    # Save the combined DataFrame as a single CSV file
    combined_df.to_csv(filename, index=False)

# Save outcomes in a CSV file
def save_outcomes_sparse(data_sparse, filename):
    data_sparse['IR'] = data_sparse['IR'].map(ir_map)
    data_sparse['EI'] = data_sparse['EI'].map(ei_map)
    data_sparse['SP'] = data_sparse['SP'].map(sp_map)
    data_sparse.to_csv(filename, index=False)

# Generate datasets for different sample sizes for the sparse model
sample_sizes = range(500, 10500, 500)
for size in sample_sizes:
    # Generate the CPDs
    ir_probs, ei_probs, sp_probs_reshaped = generate_cpds_sparse()

    # Define CPDs for the sparse model
    cpd_ir = TabularCPD(variable='IR', variable_card=3, values=[[ir_probs[0]], [ir_probs[1]], [ir_probs[2]]])
    cpd_ei_sparse = TabularCPD(variable='EI', variable_card=3, values=[[ei_probs[0]], [ei_probs[1]], [ei_probs[2]]])
    cpd_sp_sparse = TabularCPD(variable='SP', variable_card=3,
                               values=sp_probs_reshaped,
                               evidence=['IR', 'EI'], evidence_card=[3, 3])

    sparse_model.add_cpds(cpd_ir, cpd_ei_sparse, cpd_sp_sparse)

    # Check if the model is valid
    assert sparse_model.check_model()

    # Generate samples
    sampler_sparse = BayesianModelSampling(sparse_model)
    data_sparse = sampler_sparse.forward_sample(size=size)

    # Save probabilities in one file
    save_probabilities_sparse(ir_probs, ei_probs, sp_probs_reshaped, f'probabilities_sparse_{size}.csv')

    # Save outcomes (low, medium, high) in another file
    save_outcomes_sparse(data_sparse, f'outcomes_sparse_{size}.csv')

# Notify the user that the process is done
print("Data generation and saving complete for the sparse model!")

  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



Data generation and saving complete for the sparse model!


In [45]:
# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size}")

    # Load the dense dataset for the current sample size
    dense_data_file = f'outcomes_dense_{sample_size}.csv'
    df_dense = pd.read_csv(dense_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df_dense['IR_encoded'] = df_dense['IR'].map(ir_map)
    df_dense['EI_encoded'] = df_dense['EI'].map(ei_map)
    df_dense['SP_encoded'] = df_dense['SP'].map(sp_map)

    # Define the Hill-Climb structure learning algorithm
    hc = HillClimbSearch(df_dense[['IR_encoded', 'EI_encoded', 'SP_encoded']])
    scoring_method = BicScore(df_dense[['IR_encoded', 'EI_encoded', 'SP_encoded']])

    # Estimate the best structure
    best_dag = hc.estimate(scoring_method=scoring_method)
    best_model = BayesianNetwork(best_dag.edges())

    # Check if all nodes are included in the learned structure
    nodes_in_structure = set(best_model.nodes())
    required_nodes = {'IR_encoded', 'EI_encoded', 'SP_encoded'}

    if not required_nodes.issubset(nodes_in_structure):
        print("\nNot all nodes are connected. Adding a dummy variable.")
        # Add a dummy variable to the dataset
        df_dense['dummy'] = 0

        # Re-estimate the structure with the dummy variable
        hc = HillClimbSearch(df_dense)
        scoring_method = BicScore(df_dense)
        best_dag = hc.estimate(scoring_method=scoring_method)
        best_model = BayesianNetwork(best_dag.edges())

    # Display the learned structure (edges of the Bayesian Network)
    print(f"\nLearned Structure (Edges) for {sample_size} samples:")
    print(best_model.edges())

    # Calculate and display the BIC score
    bic_score = scoring_method.score(best_model)
    print(f"\nBIC Score for {sample_size} samples: {bic_score:.4f}")

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model.fit(df_dense, estimator=MaximumLikelihoodEstimator)

    # Check if the model is valid after learning the parameters
    assert best_model.check_model()

    # Print the learned CPDs (Conditional Probability Distributions)
    for cpd in best_model.get_cpds():
        print("\nCPD of", cpd.variable)
        print(cpd)

    # Create an inference object for the best model
    inference = VariableElimination(best_model)

    # Placeholder to store predictions
    predicted_sp_labels = []

    # Loop through each row in the dense dataset to make predictions
    for index, row in df_dense.iterrows():
        # Prepare the evidence from the dataset (IR_encoded and EI_encoded)
        sample_input = {'IR_encoded': int(row['IR_encoded']), 'EI_encoded': int(row['EI_encoded'])}

        # Perform inference to predict the distribution for SP_encoded (Stock Price)
        predicted_sp_distribution = inference.query(variables=['SP_encoded'], evidence=sample_input)

        # Extract the most likely SP_encoded class
        predicted_sp_class = predicted_sp_distribution.values.argmax()
        sp_reverse_map = {0: 'decrease', 1: 'stable', 2: 'increase'}
        predicted_sp_label = sp_reverse_map[predicted_sp_class]

        # Store the predicted label
        predicted_sp_labels.append(predicted_sp_label)

    # Convert the list of predicted labels into a DataFrame for easier comparison
    predicted_results_df = pd.DataFrame({
        'IR': df_dense['IR'],  # Original IR column
        'EI': df_dense['EI'],  # Original EI column
        'Predicted_SP': predicted_sp_labels  # Predicted SP column
    })

    # Add the actual SP values for comparison
    predicted_results_df['Actual_SP'] = df_dense['SP']

    # Calculate accuracy of predictions
    accuracy = accuracy_score(predicted_results_df['Actual_SP'], predicted_results_df['Predicted_SP'])
    print(f"\nPrediction Accuracy for {sample_size} samples: {accuracy:.4f}")

    # Display the first few rows of predictions
    print(f"\nPredicted Results for Dense Data (First 10 rows) for {sample_size} samples:")
    print(predicted_results_df.head(10))

# Notify the user that the process is done
print("\nProcessing complete for all sample sizes.")


Processing sample size: 500


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 500 samples:
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

BIC Score for 500 samples: -1530.3002

CPD of IR_encoded
+---------------+-------+
| IR_encoded(0) | 0.572 |
+---------------+-------+
| IR_encoded(1) | 0.234 |
+---------------+-------+
| IR_encoded(2) | 0.194 |
+---------------+-------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2) | EI_encoded(2)       |
+---------------+---------------------+-----+---------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1) | IR_encoded(2)       |
+---------------+---------------------+-----+---------------+---------------------+
| SP_encoded(0) | 0.4444444444444444  | ... | 0.525         | 0.14285714285714285 |
+---------------+---------------------+-----+---------------+---------------------+
| SP_encoded(1) | 0.3939393939393939  | ... | 0.

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1000 samples:
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 1000 samples: -2818.7869

CPD of EI_encoded
+---------------+--------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)      | SP_encoded(1)       | SP_encoded(2)       |
+---------------+--------------------+---------------------+---------------------+
| EI_encoded(0) | 0.3473684210526316 | 0.2659846547314578  | 0.5061728395061729  |
+---------------+--------------------+---------------------+---------------------+
| EI_encoded(1) | 0.3894736842105263 | 0.19437340153452684 | 0.4012345679012346  |
+---------------+--------------------+---------------------+---------------------+
| EI_encoded(2) | 0.2631578947368421 | 0.5396419437340153  | 0.09259259259259259 |
+---------------+--------------------+---------------------+---------------------+

CPD of IR_encoded
+---------------+---------------------+

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1500 samples:
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 1500 samples: -4587.7732

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.338    |
+---------------+----------+
| EI_encoded(1) | 0.266667 |
+---------------+----------+
| EI_encoded(2) | 0.395333 |
+---------------+----------+

CPD of IR_encoded
+---------------+---------------------+-----+----------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)        | EI_encoded(2)       |
+---------------+---------------------+-----+----------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)        | SP_encoded(2)       |
+---------------+---------------------+-----+----------------------+---------------------+
| IR_encoded(0) | 0.13736263736263737 | ... | 0.037037037037037035 | 0.2661290322580645  |
+---------------+--------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2000 samples:
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 2000 samples: -5125.1237

CPD of EI_encoded
+---------------+--------+
| EI_encoded(0) | 0.2145 |
+---------------+--------+
| EI_encoded(1) | 0.275  |
+---------------+--------+
| EI_encoded(2) | 0.5105 |
+---------------+--------+

CPD of IR_encoded
+---------------+----------------------+-----+---------------------+----------------------+
| EI_encoded    | EI_encoded(0)        | ... | EI_encoded(2)       | EI_encoded(2)        |
+---------------+----------------------+-----+---------------------+----------------------+
| SP_encoded    | SP_encoded(0)        | ... | SP_encoded(1)       | SP_encoded(2)        |
+---------------+----------------------+-----+---------------------+----------------------+
| IR_encoded(0) | 0.014084507042253521 | ... | 0.02631578947368421 | 0.002976190476190476 |
+---------------+----------------------+-----

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2500 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 2500 samples: -7507.6621

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.23673469387755103 | 0.3266832917705736  | 0.40083073727933544 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.4816326530612245  | 0.4089775561097257  | 0.2949117341640706  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.2816326530612245  | 0.26433915211970077 | 0.304257528556594   |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+-------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3000 samples:
[('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]

BIC Score for 3000 samples: -9450.9350

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.419333 |
+---------------+----------+
| IR_encoded(1) | 0.309    |
+---------------+----------+
| IR_encoded(2) | 0.271667 |
+---------------+----------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded(0) | 0.4989106753812636  | ... | 0.20307692307692307 | 0.34408602150537637 |
+---------------+---------------------+----

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3500 samples:
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

BIC Score for 3500 samples: -11061.9437

CPD of EI_encoded
+---------------+---------------------+--------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)      | SP_encoded(2)       |
+---------------+---------------------+--------------------+---------------------+
| EI_encoded(0) | 0.4411764705882353  | 0.1695205479452055 | 0.26035911602209943 |
+---------------+---------------------+--------------------+---------------------+
| EI_encoded(1) | 0.22850678733031674 | 0.3981164383561644 | 0.3846685082872928  |
+---------------+---------------------+--------------------+---------------------+
| EI_encoded(2) | 0.33031674208144796 | 0.4323630136986301 | 0.35497237569060774 |
+---------------+---------------------+--------------------+---------------------+

CPD of IR_encoded
+---------------+---------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4000 samples:
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 4000 samples: -10523.3081

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.29393627954779034 | 0.3428819444444444  | 0.11893333333333334 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.49229188078108943 | 0.19010416666666666 | 0.6346666666666667  |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.21377183967112023 | 0.4670138888888889  | 0.2464              |
+---------------+---------------------+---------------------+---------------------+

CPD of IR_encoded
+---------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4500 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded')]

BIC Score for 4500 samples: -13062.5369

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.186889 |
+---------------+----------+
| IR_encoded(1) | 0.559111 |
+---------------+----------+
| IR_encoded(2) | 0.254    |
+---------------+----------+

CPD of EI_encoded
+---------------+---------------------+-----+----------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(2)        |
+---------------+---------------------+-----+----------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(2)        |
+---------------+---------------------+-----+----------------------+
| EI_encoded(0) | 0.6952380952380952  | ... | 0.029411764705882353 |
+---------------+---------------------+-----+----------------------+
| EI_encoded(1) | 0.07936507936507936 | ... | 0.6339869281045751   |
+---------------+---------------------+-----+---------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5000 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 5000 samples: -14952.3502

CPD of IR_encoded
+---------------+--------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)      | SP_encoded(1)       | SP_encoded(2)       |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(0) | 0.1587378640776699 | 0.45575757575757575 | 0.3682170542635659  |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(1) | 0.5344660194174757 | 0.26666666666666666 | 0.22015503875968992 |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(2) | 0.3067961165048544 | 0.2775757575757576  | 0.4116279069767442  |
+---------------+--------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5500 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 5500 samples: -15854.8325

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.5908838243468594  | 0.5095541401273885  | 0.6560338201383551  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.3079488604780434  | 0.30118289353958144 | 0.24942352036894697 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.10116731517509728 | 0.18926296633303002 | 0.09454265949269793 |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6000 samples:
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 6000 samples: -14213.8112

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.506426735218509   | 0.5891980360065466  | 0.22778402699662542 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.11996572407883462 | 0.09819967266775777 | 0.09280089988751405 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.3736075407026564  | 0.31260229132569556 | 0.6794150731158605  |
+---------------+---------------------+---------------------+---------------------+

CPD of IR_encoded
+---------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6500 samples:
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

BIC Score for 6500 samples: -18999.5385

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.1984848484848485  | 0.19202104340201667 | 0.30460026797677536 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.5737373737373738  | 0.5918456817185445  | 0.43501563197856186 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.22777777777777777 | 0.21613327487943884 | 0.2603841000446628  |
+---------------+---------------------+---------------------+---------------------+

CPD of IR_encoded
+---------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7000 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 7000 samples: -21157.9297

CPD of IR_encoded
+---------------+---------------------+---------------------+--------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)      |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(0) | 0.3454391891891892  | 0.3333333333333333  | 0.312719298245614  |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(1) | 0.29011824324324326 | 0.32695578231292516 | 0.4004385964912281 |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(2) | 0.36444256756756754 | 0.33971088435374147 | 0.2868421052631579 |
+---------------+---------------------+---------------------+--------------------+

CPD of EI_encoded
+---------------+---------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7500 samples:
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 7500 samples: -21106.8960

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.400533 |
+---------------+----------+
| EI_encoded(1) | 0.33     |
+---------------+----------+
| EI_encoded(2) | 0.269467 |
+---------------+----------+

CPD of IR_encoded
+---------------+----------------------+-----+----------------------+---------------------+
| EI_encoded    | EI_encoded(0)        | ... | EI_encoded(2)        | EI_encoded(2)       |
+---------------+----------------------+-----+----------------------+---------------------+
| SP_encoded    | SP_encoded(0)        | ... | SP_encoded(1)        | SP_encoded(2)       |
+---------------+----------------------+-----+----------------------+---------------------+
| IR_encoded(0) | 0.014778325123152709 | ... | 0.06959152798789713  | 0.3615023474178404  |
+---------------+-------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8000 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 8000 samples: -22990.6379

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.08886054421768708 | 0.02760545905707196 | 0.03259075907590759 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.38732993197278914 | 0.5918114143920595  | 0.5655940594059405  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.5238095238095238  | 0.3805831265508685  | 0.4018151815181518  |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8500 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded')]

BIC Score for 8500 samples: -23885.8824

CPD of IR_encoded
+---------------+-----------+
| IR_encoded(0) | 0.556235  |
+---------------+-----------+
| IR_encoded(1) | 0.392235  |
+---------------+-----------+
| IR_encoded(2) | 0.0515294 |
+---------------+-----------+

CPD of EI_encoded
+---------------+---------------------+-----+----------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(2)        | IR_encoded(2)       |
+---------------+---------------------+-----+----------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)        | SP_encoded(2)       |
+---------------+---------------------+-----+----------------------+---------------------+
| EI_encoded(0) | 0.6564452015900056  | ... | 0.22674418604651161  | 0.2980769230769231  |
+---------------+---------------------+-----+--------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9000 samples:
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

BIC Score for 9000 samples: -27418.3816

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.39520958083832336 | 0.31519274376417233 | 0.34797738147020446 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.3394264103372203  | 0.27380952380952384 | 0.48325358851674644 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.2653640088244564  | 0.41099773242630383 | 0.16876903001304916 |
+---------------+---------------------+---------------------+---------------------+

CPD of IR_encoded
+---------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9500 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 9500 samples: -28594.8663

CPD of IR_encoded
+---------------+---------------------+---------------------+--------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)      |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(0) | 0.09686003193187866 | 0.31178608515057116 | 0.2767312284425577 |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(1) | 0.4912187333688132  | 0.3637071651090343  | 0.432740780047758  |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(2) | 0.41192123469930814 | 0.3245067497403946  | 0.2905279915096843 |
+---------------+---------------------+---------------------+--------------------+

CPD of EI_encoded
+---------------+---------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 10000 samples:
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

BIC Score for 10000 samples: -28120.0358

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.6909403669724771  | 0.6958066808813077  | 0.6957660410301179  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.27780963302752293 | 0.22032693674484718 | 0.24487123526844173 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.03125             | 0.08386638237384506 | 0.05936272370144042 |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+----------

## AIC

In [46]:
# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size}")

    # Load the dense dataset for the current sample size
    dense_data_file = f'outcomes_dense_{sample_size}.csv'
    df_dense = pd.read_csv(dense_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df_dense['IR_encoded'] = df_dense['IR'].map(ir_map)
    df_dense['EI_encoded'] = df_dense['EI'].map(ei_map)
    df_dense['SP_encoded'] = df_dense['SP'].map(sp_map)

    # Define the Hill-Climb structure learning algorithm
    hc = HillClimbSearch(df_dense[['IR_encoded', 'EI_encoded', 'SP_encoded']])
    scoring_method = AICScore(df_dense[['IR_encoded', 'EI_encoded', 'SP_encoded']])  # Use AICScore instead of BicScore

    # Estimate the best structure
    best_dag = hc.estimate(scoring_method=scoring_method)
    best_model = BayesianNetwork(best_dag.edges())

    # Display the learned structure (edges of the Bayesian Network)
    print(f"\nLearned Structure (Edges) for {sample_size} samples (AIC):")
    print(best_model.edges())

    # Calculate and display the AIC score
    aic_score = scoring_method.score(best_model)
    print(f"\nAIC Score for {sample_size} samples: {aic_score:.4f}")

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model.fit(df_dense[['IR_encoded', 'EI_encoded', 'SP_encoded']], estimator=MaximumLikelihoodEstimator)

    # Check if the model is valid after learning the parameters
    assert best_model.check_model()

    # Print the learned CPDs (Conditional Probability Distributions)
    for cpd in best_model.get_cpds():
        print("\nCPD of", cpd.variable)
        print(cpd)

    # Create an inference object for the best model
    inference = VariableElimination(best_model)

    # Placeholder to store predictions
    predicted_sp_labels = []

    # Loop through each row in the dense dataset to make predictions
    for index, row in df_dense.iterrows():
        # Prepare the evidence from the dataset (IR_encoded and EI_encoded)
        sample_input = {'IR_encoded': int(row['IR_encoded']), 'EI_encoded': int(row['EI_encoded'])}

        # Perform inference to predict the distribution for SP_encoded (Stock Price)
        predicted_sp_distribution = inference.query(variables=['SP_encoded'], evidence=sample_input)

        # Extract the most likely SP_encoded class
        predicted_sp_class = predicted_sp_distribution.values.argmax()
        sp_reverse_map = {0: 'decrease', 1: 'stable', 2: 'increase'}
        predicted_sp_label = sp_reverse_map[predicted_sp_class]

        # Store the predicted label
        predicted_sp_labels.append(predicted_sp_label)

    # Convert the list of predicted labels into a DataFrame for easier comparison
    predicted_results_df = pd.DataFrame({
        'IR': df_dense['IR'],  # Original IR column
        'EI': df_dense['EI'],  # Original EI column
        'Predicted_SP': predicted_sp_labels  # Predicted SP column
    })

    # Add the actual SP values for comparison
    predicted_results_df['Actual_SP'] = df_dense['SP']

    # Calculate accuracy of predictions
    accuracy = accuracy_score(predicted_results_df['Actual_SP'], predicted_results_df['Predicted_SP'])
    print(f"\nPrediction Accuracy for {sample_size} samples (AIC): {accuracy:.4f}")

    # Display the first few rows of predictions
    print(f"\nPredicted Results for Dense Data (First 10 rows) for {sample_size} samples (AIC):")
    print(predicted_results_df.head(10))

    # Save the results if needed
    results_filename = f'predicted_results_aic_{sample_size}.csv'
    predicted_results_df.to_csv(results_filename, index=False)
    print(f"\nResults saved to {results_filename}")

# Notify the user that the process is done
print("\nProcessing complete for all sample sizes using AIC.")


Processing sample size: 500


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 500 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

AIC Score for 500 samples: -1483.9395

CPD of IR_encoded
+---------------+-------+
| IR_encoded(0) | 0.572 |
+---------------+-------+
| IR_encoded(1) | 0.234 |
+---------------+-------+
| IR_encoded(2) | 0.194 |
+---------------+-------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2) | EI_encoded(2)       |
+---------------+---------------------+-----+---------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1) | IR_encoded(2)       |
+---------------+---------------------+-----+---------------+---------------------+
| SP_encoded(0) | 0.4444444444444444  | ... | 0.525         | 0.14285714285714285 |
+---------------+---------------------+-----+---------------+---------------------+
| SP_encoded(1) | 0.3939393939393939  | ..

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 1000 samples: -2754.9861

CPD of IR_encoded
+---------------+----------------------+---------------------+----------------------+
| SP_encoded    | SP_encoded(0)        | SP_encoded(1)       | SP_encoded(2)        |
+---------------+----------------------+---------------------+----------------------+
| IR_encoded(0) | 0.21754385964912282  | 0.43478260869565216 | 0.23765432098765432  |
+---------------+----------------------+---------------------+----------------------+
| IR_encoded(1) | 0.042105263157894736 | 0.03836317135549872 | 0.027777777777777776 |
+---------------+----------------------+---------------------+----------------------+
| IR_encoded(2) | 0.7403508771929824   | 0.5268542199488491  | 0.7345679012345679   |
+---------------+----------------------+---------------------+----------------------+

CPD of EI_encoded
+-----

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1500 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 1500 samples: -4518.7013

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.338    |
+---------------+----------+
| EI_encoded(1) | 0.266667 |
+---------------+----------+
| EI_encoded(2) | 0.395333 |
+---------------+----------+

CPD of IR_encoded
+---------------+---------------------+-----+----------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)        | EI_encoded(2)       |
+---------------+---------------------+-----+----------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)        | SP_encoded(2)       |
+---------------+---------------------+-----+----------------------+---------------------+
| IR_encoded(0) | 0.13736263736263737 | ... | 0.037037037037037035 | 0.2661290322580645  |
+---------------+--------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2000 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 2000 samples: -5052.3120

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.20579710144927535 | 0.04617834394904458 | 0.32035053554040893 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.24057971014492754 | 0.16719745222929935 | 0.3524829600778968  |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.553623188405797   | 0.7866242038216561  | 0.32716650438169426 |
+---------------+---------------------+---------------------+---------------------+

CPD of IR_encoded
+---------------+-------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 2500 samples: -7431.9495

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.23673469387755103 | 0.3266832917705736  | 0.40083073727933544 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.4816326530612245  | 0.4089775561097257  | 0.2949117341640706  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.2816326530612245  | 0.26433915211970077 | 0.304257528556594   |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+-------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3000 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('IR_encoded', 'EI_encoded'), ('EI_encoded', 'SP_encoded')]

AIC Score for 3000 samples: -9372.8522

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.419333 |
+---------------+----------+
| IR_encoded(1) | 0.309    |
+---------------+----------+
| IR_encoded(2) | 0.271667 |
+---------------+----------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded(0) | 0.4989106753812636  | ... | 0.20307692307692307 | 0.34408602150537637 |
+---------------+--------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3500 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

AIC Score for 3500 samples: -10981.8570

CPD of EI_encoded
+---------------+---------------------+--------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)      | SP_encoded(2)       |
+---------------+---------------------+--------------------+---------------------+
| EI_encoded(0) | 0.4411764705882353  | 0.1695205479452055 | 0.26035911602209943 |
+---------------+---------------------+--------------------+---------------------+
| EI_encoded(1) | 0.22850678733031674 | 0.3981164383561644 | 0.3846685082872928  |
+---------------+---------------------+--------------------+---------------------+
| EI_encoded(2) | 0.33031674208144796 | 0.4323630136986301 | 0.35497237569060774 |
+---------------+---------------------+--------------------+---------------------+

CPD of IR_encoded
+---------------+---------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4000 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 4000 samples: -10441.4854

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.29393627954779034 | 0.3428819444444444  | 0.11893333333333334 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.49229188078108943 | 0.19010416666666666 | 0.6346666666666667  |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.21377183967112023 | 0.4670138888888889  | 0.2464              |
+---------------+---------------------+---------------------+---------------------+

CPD of IR_encoded
+---------------+------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]

AIC Score for 4500 samples: -12986.9057

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.186889 |
+---------------+----------+
| IR_encoded(1) | 0.559111 |
+---------------+----------+
| IR_encoded(2) | 0.254    |
+---------------+----------+

CPD of EI_encoded
+---------------+---------------------+-----+----------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(2)        |
+---------------+---------------------+-----+----------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(2)        |
+---------------+---------------------+-----+----------------------+
| EI_encoded(0) | 0.6952380952380952  | ... | 0.029411764705882353 |
+---------------+---------------------+-----+----------------------+
| EI_encoded(1) | 0.07936507936507936 | ... | 0.6339869281045751   |
+---------------+-------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 5000 samples: -14867.6267

CPD of IR_encoded
+---------------+--------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)      | SP_encoded(1)       | SP_encoded(2)       |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(0) | 0.1587378640776699 | 0.45575757575757575 | 0.3682170542635659  |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(1) | 0.5344660194174757 | 0.26666666666666666 | 0.22015503875968992 |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(2) | 0.3067961165048544 | 0.2775757575757576  | 0.4116279069767442  |
+---------------+--------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 5500 samples: -15768.8700

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.5908838243468594  | 0.5095541401273885  | 0.6560338201383551  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.3079488604780434  | 0.30118289353958144 | 0.24942352036894697 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.10116731517509728 | 0.18926296633303002 | 0.09454265949269793 |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 6000 samples: -14126.7175

CPD of IR_encoded
+---------------+---------------------+-----------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)         | SP_encoded(2)       |
+---------------+---------------------+-----------------------+---------------------+
| IR_encoded(0) | 0.04027420736932305 | 0.0032733224222585926 | 0.00843644544431946 |
+---------------+---------------------+-----------------------+---------------------+
| IR_encoded(1) | 0.2853470437017995  | 0.07659574468085106   | 0.28008998875140606 |
+---------------+---------------------+-----------------------+---------------------+
| IR_encoded(2) | 0.6743787489288775  | 0.9201309328968903    | 0.7114735658042745  |
+---------------+---------------------+-----------------------+---------------------+

CPD of EI_encoded
+----

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6500 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

AIC Score for 6500 samples: -18911.4043

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.1984848484848485  | 0.19202104340201667 | 0.30460026797677536 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.5737373737373738  | 0.5918456817185445  | 0.43501563197856186 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.22777777777777777 | 0.21613327487943884 | 0.2603841000446628  |
+---------------+---------------------+---------------------+---------------------+

CPD of IR_encoded
+---------------+------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 7000 samples: -21068.8320

CPD of IR_encoded
+---------------+---------------------+---------------------+--------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)      |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(0) | 0.3454391891891892  | 0.3333333333333333  | 0.312719298245614  |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(1) | 0.29011824324324326 | 0.32695578231292516 | 0.4004385964912281 |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(2) | 0.36444256756756754 | 0.33971088435374147 | 0.2868421052631579 |
+---------------+---------------------+---------------------+--------------------+

CPD of EI_encoded
+---------------+---------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7500 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 7500 samples: -21016.9015

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.400533 |
+---------------+----------+
| EI_encoded(1) | 0.33     |
+---------------+----------+
| EI_encoded(2) | 0.269467 |
+---------------+----------+

CPD of IR_encoded
+---------------+----------------------+-----+----------------------+---------------------+
| EI_encoded    | EI_encoded(0)        | ... | EI_encoded(2)        | EI_encoded(2)       |
+---------------+----------------------+-----+----------------------+---------------------+
| SP_encoded    | SP_encoded(0)        | ... | SP_encoded(1)        | SP_encoded(2)       |
+---------------+----------------------+-----+----------------------+---------------------+
| IR_encoded(0) | 0.014778325123152709 | ... | 0.06959152798789713  | 0.3615023474178404  |
+---------------+-------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 8000 samples: -22899.8043

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.08886054421768708 | 0.02760545905707196 | 0.03259075907590759 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.38732993197278914 | 0.5918114143920595  | 0.5655940594059405  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.5238095238095238  | 0.3805831265508685  | 0.4018151815181518  |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 8500 samples: -23806.2543

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.560828025477707   | 0.5684380032206119  | 0.5330376940133038  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.38757961783439493 | 0.3761674718196457  | 0.42084257206208425 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.05159235668789809 | 0.05539452495974235 | 0.04611973392461197 |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9000 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

AIC Score for 9000 samples: -27326.0168

CPD of EI_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(0) | 0.39520958083832336 | 0.31519274376417233 | 0.34797738147020446 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(1) | 0.3394264103372203  | 0.27380952380952384 | 0.48325358851674644 |
+---------------+---------------------+---------------------+---------------------+
| EI_encoded(2) | 0.2653640088244564  | 0.41099773242630383 | 0.16876903001304916 |
+---------------+---------------------+---------------------+---------------------+

CPD of IR_encoded
+---------------+------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 9500 samples: -28501.7986

CPD of IR_encoded
+---------------+---------------------+---------------------+--------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)      |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(0) | 0.09686003193187866 | 0.31178608515057116 | 0.2767312284425577 |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(1) | 0.4912187333688132  | 0.3637071651090343  | 0.432740780047758  |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(2) | 0.41192123469930814 | 0.3245067497403946  | 0.2905279915096843 |
+---------------+---------------------+---------------------+--------------------+

CPD of EI_encoded
+---------------+---------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 10000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

AIC Score for 10000 samples: -28026.3014

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.6909403669724771  | 0.6958066808813077  | 0.6957660410301179  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.27780963302752293 | 0.22032693674484718 | 0.24487123526844173 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.03125             | 0.08386638237384506 | 0.05936272370144042 |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+----

## BIC

In [47]:
# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size} (Sparse Data with BIC)")

    # Load the sparse dataset for the current sample size
    sparse_data_file = f'outcomes_sparse_{sample_size}.csv'
    df_sparse = pd.read_csv(sparse_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df_sparse['IR_encoded'] = df_sparse['IR'].map(ir_map)
    df_sparse['EI_encoded'] = df_sparse['EI'].map(ei_map)
    df_sparse['SP_encoded'] = df_sparse['SP'].map(sp_map)

    # Define the Hill-Climb structure learning algorithm
    hc_bic = HillClimbSearch(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded']])
    scoring_method_bic = BicScore(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded']])

    # Estimate the best structure using BIC
    best_dag_bic = hc_bic.estimate(scoring_method=scoring_method_bic)

    # Ensure all required nodes are present in the model, even if not connected
    best_model_bic = BayesianNetwork()
    best_model_bic.add_nodes_from(['IR_encoded', 'EI_encoded', 'SP_encoded'])  # Add all nodes
    best_model_bic.add_edges_from(best_dag_bic.edges())  # Add edges from the learned structure

    # Check if all nodes are included in the learned structure
    nodes_in_structure_bic = set(best_model_bic.nodes())
    required_nodes = {'IR_encoded', 'EI_encoded', 'SP_encoded'}

    if not required_nodes.issubset(nodes_in_structure_bic):
        print("\nNot all nodes are connected. Adding a dummy variable and ensuring all required nodes are present.")
        # Add a dummy variable to the dataset
        df_sparse['Dummy_Node'] = 1  # Constant dummy node

        # Ensure all required nodes are in the model by adding edges with the dummy node
        for node in required_nodes:
            if node not in nodes_in_structure_bic:
                best_model_bic.add_edge('Dummy_Node', node)

        # Re-estimate the structure with the dummy variable
        hc_bic = HillClimbSearch(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded', 'Dummy_Node']])
        scoring_method_bic = BicScore(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded', 'Dummy_Node']])
        best_dag_bic = hc_bic.estimate(scoring_method=scoring_method_bic)
        best_model_bic = BayesianNetwork(best_dag_bic.edges())

    # Display the learned structure (edges of the Bayesian Network)
    print(f"\nLearned Structure (Edges) for {sample_size} samples (BIC):")
    print(best_model_bic.edges())

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model_bic.fit(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded']], estimator=MaximumLikelihoodEstimator)

    # Check if the model is valid after learning the parameters
    assert best_model_bic.check_model()

    # Print the learned CPDs (Conditional Probability Distributions)
    for cpd in best_model_bic.get_cpds():
        print("\nCPD of", cpd.variable)
        print(cpd)

    # Create an inference object for the BIC model
    inference_bic = VariableElimination(best_model_bic)

    # Placeholder to store predictions
    predicted_sp_labels_bic = []

    # Loop through each row in the dataset to make predictions using BIC
    for index, row in df_sparse.iterrows():
        # Prepare the evidence from the dataset (IR_encoded and EI_encoded)
        sample_input = {'IR_encoded': int(row['IR_encoded']), 'EI_encoded': int(row['EI_encoded'])}

        # Perform inference to predict the distribution for SP_encoded (Stock Price)
        predicted_sp_distribution_bic = inference_bic.query(variables=['SP_encoded'], evidence=sample_input)

        # Extract the most likely SP_encoded class
        predicted_sp_class_bic = predicted_sp_distribution_bic.values.argmax()
        sp_reverse_map = {0: 'decrease', 1: 'stable', 2: 'increase'}
        predicted_sp_label_bic = sp_reverse_map[predicted_sp_class_bic]

        # Store the predicted label
        predicted_sp_labels_bic.append(predicted_sp_label_bic)

    # Convert the list of predicted labels into a DataFrame for easier comparison
    predicted_results_df_bic = pd.DataFrame({
        'IR': df_sparse['IR'],  # Original IR column
        'EI': df_sparse['EI'],  # Original EI column
        'Predicted_SP': predicted_sp_labels_bic  # Predicted SP column
    })

    # Add the actual SP values for comparison
    predicted_results_df_bic['Actual_SP'] = df_sparse['SP']

    # Calculate accuracy of predictions for BIC
    accuracy_bic = accuracy_score(predicted_results_df_bic['Actual_SP'], predicted_results_df_bic['Predicted_SP'])
    print(f"\nPrediction Accuracy for {sample_size} samples (BIC): {accuracy_bic:.4f}")

    # Display the first few rows of predictions for BIC
    print(f"\nPredicted Results for Sparse Data (First 10 rows) for {sample_size} samples (BIC):")
    print(predicted_results_df_bic.head(10))

    # Calculate the BIC score for the Bayesian Network model
    bic_score_value = scoring_method_bic.score(best_model_bic)

    # Print the BIC score
    print(f"\nBIC Score for {sample_size} samples: {bic_score_value:.4f}")

# Notify the user that the process is done
print("\nProcessing complete for all sample sizes using BIC (Sparse Data).")


Processing sample size: 500 (Sparse Data with BIC)


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 500 samples (BIC):
[('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------+----------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1) | SP_encoded(2)        |
+---------------+---------------------+---------------+----------------------+
| IR_encoded(0) | 0.6022727272727273  | 0.734375      | 0.8133802816901409   |
+---------------+---------------------+---------------+----------------------+
| IR_encoded(1) | 0.18181818181818182 | 0.0390625     | 0.017605633802816902 |
+---------------+---------------------+---------------+----------------------+
| IR_encoded(2) | 0.2159090909090909  | 0.2265625     | 0.16901408450704225  |
+---------------+---------------------+---------------+----------------------+

CPD of EI_encoded
+---------------+---------------------+---------------+----------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1) | SP_en

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1000 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+-------+
| IR_encoded(0) | 0.006 |
+---------------+-------+
| IR_encoded(1) | 0.482 |
+---------------+-------+
| IR_encoded(2) | 0.512 |
+---------------+-------+

CPD of EI_encoded
+---------------+-------+
| EI_encoded(0) | 0.404 |
+---------------+-------+
| EI_encoded(1) | 0.372 |
+---------------+-------+
| EI_encoded(2) | 0.224 |
+---------------+-------+

CPD of SP_encoded
+---------------+---------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0) | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0) | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+---------------+-----+---------------------+---------------------+
| SP_encoded(0) | 0.0           | ... | 0.85436

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1500 samples (BIC):
[('EI_encoded', 'SP_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+----------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)        | SP_encoded(2)       |
+---------------+---------------------+----------------------+---------------------+
| IR_encoded(0) | 0.23796033994334279 | 0.2079207920792079   | 0.13493530499075784 |
+---------------+---------------------+----------------------+---------------------+
| IR_encoded(1) | 0.6657223796033994  | 0.7656765676567657   | 0.7763401109057301  |
+---------------+---------------------+----------------------+---------------------+
| IR_encoded(2) | 0.09631728045325778 | 0.026402640264026403 | 0.08872458410351201 |
+---------------+---------------------+----------------------+---------------------+

CPD of EI_encoded
+---------------+-------+
| EI_encoded(0) | 0.362 |
+---------------+-------+
| EI_en

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2000 samples (BIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+----------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)        |
+---------------+---------------------+-----+---------------------+----------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)        |
+---------------+---------------------+-----+---------------------+----------------------+
| IR_encoded(0) | 0.0                 | ... | 0.4732510288065844  | 0.706766917293233    |
+---------------+---------------------+-----+---------------------+----------------------+
| IR_encoded(1) | 0.8967136150234741  | ... | 0.5102880658436214  | 0.24812030075187969  |
+---------------+---------------------+-----+---------------------+----------------------+
| IR_encoded(2) | 0.

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2500 samples (BIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+--------------------+---------------------+--------------------+
| SP_encoded    | SP_encoded(0)      | SP_encoded(1)       | SP_encoded(2)      |
+---------------+--------------------+---------------------+--------------------+
| IR_encoded(0) | 0.1388888888888889 | 0.13493064312736444 | 0.0671971706454465 |
+---------------+--------------------+---------------------+--------------------+
| IR_encoded(1) | 0.1840277777777778 | 0.1210592686002522  | 0.1317418213969938 |
+---------------+--------------------+---------------------+--------------------+
| IR_encoded(2) | 0.6770833333333334 | 0.7440100882723834  | 0.8010610079575596 |
+---------------+--------------------+---------------------+--------------------+

CPD of EI_encoded
+---------------+---------------+-----+---------------------+---------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3000 samples (BIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------------+--------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)      |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(0) | 0.43530419880034277 | 0.42706333973128596 | 0.638432364096081  |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(1) | 0.15509854327335046 | 0.2600767754318618  | 0.2402022756005057 |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(2) | 0.40959725792630675 | 0.31285988483685223 | 0.1213653603034134 |
+---------------+---------------------+---------------------+--------------------+

CPD of EI_encoded
+---------------+----------------------+-----+---------------------+-----

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3500 samples (BIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.3644859813084112  | ... | 0.6610169491525424  | 0.5523809523809524  |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.6121495327102804  | ... | 0.13559322033898305 | 0.37142857142857144 |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.023364485

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4000 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+---------+
| IR_encoded(0) | 0.20775 |
+---------------+---------+
| IR_encoded(1) | 0.565   |
+---------------+---------+
| IR_encoded(2) | 0.22725 |
+---------------+---------+

CPD of EI_encoded
+---------------+---------+
| EI_encoded(0) | 0.437   |
+---------------+---------+
| EI_encoded(1) | 0.30025 |
+---------------+---------+
| EI_encoded(2) | 0.26275 |
+---------------+---------+

CPD of SP_encoded
+---------------+---------------------+-----+----------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)        | EI_encoded(2)       |
+---------------+---------------------+-----+----------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)        | IR_encoded(2)       |
+---------------+---------------------+-----+----------------------+-------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4500 samples (BIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.0191044776119403  | 0.03262642740619902 | 0.01938711694809256 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.5504477611940298  | 0.42169657422512236 | 0.3233270794246404  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.43044776119402983 | 0.5456769983686787  | 0.6572858036272671  |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------+-----+---------------------+---

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5000 samples (BIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.25757575757575757 | ... | 0.43979504696840305 | 0.05555555555555555 |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.42045454545454547 | ... | 0.07429547395388557 | 0.32954545454545453 |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.321969696

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5500 samples (BIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+----------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)        | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)        | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.5820379965457686   | ... | 0.29222520107238603 | 0.37037037037037035 |
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.4075993091537133   | ... | 0.631367292225201   | 0.5608465608465608  |
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6000 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.406833 |
+---------------+----------+
| IR_encoded(1) | 0.314833 |
+---------------+----------+
| IR_encoded(2) | 0.278333 |
+---------------+----------+

CPD of EI_encoded
+---------------+-----------+
| EI_encoded(0) | 0.0301667 |
+---------------+-----------+
| EI_encoded(1) | 0.636167  |
+---------------+-----------+
| EI_encoded(2) | 0.333667  |
+---------------+-----------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------------+----------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)        |
+---------------+---------------------+-----+---------------------+----------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)       | IR_encoded(2)        |
+---------------+---------------------+-----+---------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6500 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.249385 |
+---------------+----------+
| IR_encoded(1) | 0.360615 |
+---------------+----------+
| IR_encoded(2) | 0.39     |
+---------------+----------+

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.367077 |
+---------------+----------+
| EI_encoded(1) | 0.538923 |
+---------------+----------+
| EI_encoded(2) | 0.094    |
+---------------+----------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------------+---------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2) |
+---------------+---------------------+-----+---------------------+---------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)       | IR_encoded(2) |
+---------------+---------------------+-----+---------------------+---------------+
| SP_

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7000 samples (BIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+--------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)      | SP_encoded(1)       | SP_encoded(2)       |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(0) | 0.3470562195661797 | 0.37731733914940024 | 0.31028551771585827 |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(1) | 0.4820717131474104 | 0.22846237731733915 | 0.45786033711730306 |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(2) | 0.1708720672864099 | 0.3942202835332606  | 0.23185414516683867 |
+---------------+--------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------------+-----+----------------------+
| IR

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7500 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.388    |
+---------------+----------+
| IR_encoded(1) | 0.270133 |
+---------------+----------+
| IR_encoded(2) | 0.341867 |
+---------------+----------+

CPD of EI_encoded
+---------------+-----------+
| EI_encoded(0) | 0.847867  |
+---------------+-----------+
| EI_encoded(1) | 0.066     |
+---------------+-----------+
| EI_encoded(2) | 0.0861333 |
+---------------+-----------+

CPD of SP_encoded
+---------------+--------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)      | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+--------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)      | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+--------------------+-----+------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8000 samples (BIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+--------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)      | SP_encoded(1)       | SP_encoded(2)       |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(0) | 0.4381909547738693 | 0.450143815915628   | 0.3562691131498471  |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(1) | 0.3331658291457286 | 0.3101629913710451  | 0.5239551478083588  |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(2) | 0.228643216080402  | 0.23969319271332695 | 0.11977573904179409 |
+---------------+--------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------------+-----+---------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8500 samples (BIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+-----------+
| IR_encoded(0) | 0.524824  |
+---------------+-----------+
| IR_encoded(1) | 0.0607059 |
+---------------+-----------+
| IR_encoded(2) | 0.414471  |
+---------------+-----------+

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.552    |
+---------------+----------+
| EI_encoded(1) | 0.135059 |
+---------------+----------+
| EI_encoded(2) | 0.312941 |
+---------------+----------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+---------------------+-----+-------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9000 samples (BIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.16732617297908423 | ... | 0.11569416498993963 | 0.082687338501292   |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.4652345958168457  | ... | 0.5503018108651911  | 0.31976744186046513 |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.367439231

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9500 samples (BIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+----------------------+----------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)        | SP_encoded(2)        |
+---------------+---------------------+----------------------+----------------------+
| IR_encoded(0) | 0.76309963099631    | 0.9118564742589703   | 0.8970657832465688   |
+---------------+---------------------+----------------------+----------------------+
| IR_encoded(1) | 0.16605166051660517 | 0.039781591263650544 | 0.0922858495030762   |
+---------------+---------------------+----------------------+----------------------+
| IR_encoded(2) | 0.07084870848708487 | 0.0483619344773791   | 0.010648367250354946 |
+---------------+---------------------+----------------------+----------------------+

CPD of EI_encoded
+---------------+---------------------+-----+-

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 10000 samples (BIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.2545334652159578  | 0.22393822393822393 | 0.30370562321845035 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.5858885591823277  | 0.46782496782496785 | 0.5172324436382483  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.15957797560171447 | 0.3082368082368082  | 0.17906193314330138 |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------------+-----+------------------

## AIC

In [48]:
# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Loop through each sample size
for sample_size in sample_sizes:
    print(f"\nProcessing sample size: {sample_size} (Sparse Data with AIC)")

    # Load the sparse dataset for the current sample size
    sparse_data_file = f'outcomes_sparse_{sample_size}.csv'
    df_sparse = pd.read_csv(sparse_data_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df_sparse['IR_encoded'] = df_sparse['IR'].map(ir_map)
    df_sparse['EI_encoded'] = df_sparse['EI'].map(ei_map)
    df_sparse['SP_encoded'] = df_sparse['SP'].map(sp_map)

    # Define the Hill-Climb structure learning algorithm
    hc_aic = HillClimbSearch(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded']])
    scoring_method_aic = AICScore(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded']])

    # Estimate the best structure using AIC
    best_dag_aic = hc_aic.estimate(scoring_method=scoring_method_aic)

    # Ensure all required nodes are present in the model, even if not connected
    best_model_aic = BayesianNetwork()
    best_model_aic.add_nodes_from(['IR_encoded', 'EI_encoded', 'SP_encoded'])  # Add all nodes
    best_model_aic.add_edges_from(best_dag_aic.edges())  # Add edges from the learned structure

    # Check if all nodes are included in the learned structure
    nodes_in_structure_aic = set(best_model_aic.nodes())
    required_nodes = {'IR_encoded', 'EI_encoded', 'SP_encoded'}

    if not required_nodes.issubset(nodes_in_structure_aic):
        print("\nNot all nodes are connected. Adding a dummy variable and ensuring all required nodes are present.")
        # Add a dummy variable to the dataset
        df_sparse['Dummy_Node'] = 1  # Constant dummy node

        # Ensure all required nodes are in the model by adding edges with the dummy node
        for node in required_nodes:
            if node not in nodes_in_structure_aic:
                best_model_aic.add_edge('Dummy_Node', node)

        # Re-estimate the structure with the dummy variable
        hc_aic = HillClimbSearch(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded', 'Dummy_Node']])
        scoring_method_aic = AICScore(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded', 'Dummy_Node']])
        best_dag_aic = hc_aic.estimate(scoring_method=scoring_method_aic)
        best_model_aic = BayesianNetwork(best_dag_aic.edges())

    # Display the learned structure (edges of the Bayesian Network)
    print(f"\nLearned Structure (Edges) for {sample_size} samples (AIC):")
    print(best_model_aic.edges())

    # Learn the CPDs using Maximum Likelihood Estimation (MLE)
    best_model_aic.fit(df_sparse[['IR_encoded', 'EI_encoded', 'SP_encoded']], estimator=MaximumLikelihoodEstimator)

    # Check if the model is valid after learning the parameters
    assert best_model_aic.check_model()

    # Print the learned CPDs (Conditional Probability Distributions)
    for cpd in best_model_aic.get_cpds():
        print("\nCPD of", cpd.variable)
        print(cpd)

    # Create an inference object for the AIC model
    inference_aic = VariableElimination(best_model_aic)

    # Placeholder to store predictions
    predicted_sp_labels_aic = []

    # Loop through each row in the dataset to make predictions using AIC
    for index, row in df_sparse.iterrows():
        # Prepare the evidence from the dataset (IR_encoded and EI_encoded)
        sample_input = {'IR_encoded': int(row['IR_encoded']), 'EI_encoded': int(row['EI_encoded'])}

        # Perform inference to predict the distribution for SP_encoded (Stock Price)
        predicted_sp_distribution_aic = inference_aic.query(variables=['SP_encoded'], evidence=sample_input)

        # Extract the most likely SP_encoded class
        predicted_sp_class_aic = predicted_sp_distribution_aic.values.argmax()
        sp_reverse_map = {0: 'decrease', 1: 'stable', 2: 'increase'}
        predicted_sp_label_aic = sp_reverse_map[predicted_sp_class_aic]

        # Store the predicted label
        predicted_sp_labels_aic.append(predicted_sp_label_aic)

    # Convert the list of predicted labels into a DataFrame for easier comparison
    predicted_results_df_aic = pd.DataFrame({
        'IR': df_sparse['IR'],  # Original IR column
        'EI': df_sparse['EI'],  # Original EI column
        'Predicted_SP': predicted_sp_labels_aic  # Predicted SP column
    })

    # Add the actual SP values for comparison
    predicted_results_df_aic['Actual_SP'] = df_sparse['SP']

    # Calculate accuracy of predictions for AIC
    accuracy_aic = accuracy_score(predicted_results_df_aic['Actual_SP'], predicted_results_df_aic['Predicted_SP'])
    print(f"\nPrediction Accuracy for {sample_size} samples (AIC): {accuracy_aic:.4f}")

    # Display the first few rows of predictions for AIC
    print(f"\nPredicted Results for Sparse Data (First 10 rows) for {sample_size} samples (AIC):")
    print(predicted_results_df_aic.head(10))

    # Calculate the AIC score for the Bayesian Network model
    aic_score_value = scoring_method_aic.score(best_model_aic)

    # Print the AIC score
    print(f"\nAIC Score for {sample_size} samples: {aic_score_value:.4f}")

# Notify the user that the process is done
print("\nProcessing complete for all sample sizes using AIC (Sparse Data).")


Processing sample size: 500 (Sparse Data with AIC)


  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------+----------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1) | SP_encoded(2)        |
+---------------+---------------------+---------------+----------------------+
| IR_encoded(0) | 0.6022727272727273  | 0.734375      | 0.8133802816901409   |
+---------------+---------------------+---------------+----------------------+
| IR_encoded(1) | 0.18181818181818182 | 0.0390625     | 0.017605633802816902 |
+---------------+---------------------+---------------+----------------------+
| IR_encoded(2) | 0.2159090909090909  | 0.2265625     | 0.16901408450704225  |
+---------------+---------------------+---------------+----------------------+

CPD of EI_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded    | IR

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+-----------------------+---------------------+----------------------+
| SP_encoded    | SP_encoded(0)         | SP_encoded(1)       | SP_encoded(2)        |
+---------------+-----------------------+---------------------+----------------------+
| IR_encoded(0) | 0.0026455026455026454 | 0.01020408163265306 | 0.006097560975609756 |
+---------------+-----------------------+---------------------+----------------------+
| IR_encoded(1) | 0.5185185185185185    | 0.272108843537415   | 0.6280487804878049   |
+---------------+-----------------------+---------------------+----------------------+
| IR_encoded(2) | 0.47883597883597884   | 0.717687074829932   | 0.36585365853658536  |
+---------------+-----------------------+---------------------+----------------------+

CPD of EI_encoded
+---------------+---------------+----

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 1500 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+-----------+
| IR_encoded(0) | 0.188667  |
+---------------+-----------+
| IR_encoded(1) | 0.746     |
+---------------+-----------+
| IR_encoded(2) | 0.0653333 |
+---------------+-----------+

CPD of EI_encoded
+---------------+-------+
| EI_encoded(0) | 0.362 |
+---------------+-------+
| EI_encoded(1) | 0.274 |
+---------------+-------+
| EI_encoded(2) | 0.364 |
+---------------+-------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+---------------------+-----+---------------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2000 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+----------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)        |
+---------------+---------------------+-----+---------------------+----------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)        |
+---------------+---------------------+-----+---------------------+----------------------+
| IR_encoded(0) | 0.0                 | ... | 0.4732510288065844  | 0.706766917293233    |
+---------------+---------------------+-----+---------------------+----------------------+
| IR_encoded(1) | 0.8967136150234741  | ... | 0.5102880658436214  | 0.24812030075187969  |
+---------------+---------------------+-----+---------------------+----------------------+
| IR_encoded(2) | 0.

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 2500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+--------------------+---------------------+--------------------+
| SP_encoded    | SP_encoded(0)      | SP_encoded(1)       | SP_encoded(2)      |
+---------------+--------------------+---------------------+--------------------+
| IR_encoded(0) | 0.1388888888888889 | 0.13493064312736444 | 0.0671971706454465 |
+---------------+--------------------+---------------------+--------------------+
| IR_encoded(1) | 0.1840277777777778 | 0.1210592686002522  | 0.1317418213969938 |
+---------------+--------------------+---------------------+--------------------+
| IR_encoded(2) | 0.6770833333333334 | 0.7440100882723834  | 0.8010610079575596 |
+---------------+--------------------+---------------------+--------------------+

CPD of EI_encoded
+---------------+---------------+-----+---------------------+---------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------------+--------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)      |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(0) | 0.43530419880034277 | 0.42706333973128596 | 0.638432364096081  |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(1) | 0.15509854327335046 | 0.2600767754318618  | 0.2402022756005057 |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(2) | 0.40959725792630675 | 0.31285988483685223 | 0.1213653603034134 |
+---------------+---------------------+---------------------+--------------------+

CPD of EI_encoded
+---------------+----------------------+-----+---------------------+-----

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 3500 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.3644859813084112  | ... | 0.6610169491525424  | 0.5523809523809524  |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.6121495327102804  | ... | 0.13559322033898305 | 0.37142857142857144 |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.023364485

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4000 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+---------+
| IR_encoded(0) | 0.20775 |
+---------------+---------+
| IR_encoded(1) | 0.565   |
+---------------+---------+
| IR_encoded(2) | 0.22725 |
+---------------+---------+

CPD of EI_encoded
+---------------+---------+
| EI_encoded(0) | 0.437   |
+---------------+---------+
| EI_encoded(1) | 0.30025 |
+---------------+---------+
| EI_encoded(2) | 0.26275 |
+---------------+---------+

CPD of SP_encoded
+---------------+---------------------+-----+----------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)        | EI_encoded(2)       |
+---------------+---------------------+-----+----------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)        | IR_encoded(2)       |
+---------------+---------------------+-----+----------------------+-------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 4500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.0191044776119403  | 0.03262642740619902 | 0.01938711694809256 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.5504477611940298  | 0.42169657422512236 | 0.3233270794246404  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.43044776119402983 | 0.5456769983686787  | 0.6572858036272671  |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------+-----+---------------------+---

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5000 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.25757575757575757 | ... | 0.43979504696840305 | 0.05555555555555555 |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.42045454545454547 | ... | 0.07429547395388557 | 0.32954545454545453 |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.321969696

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 5500 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+----------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)        | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)        | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.5820379965457686   | ... | 0.29222520107238603 | 0.37037037037037035 |
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.4075993091537133   | ... | 0.631367292225201   | 0.5608465608465608  |
+---------------+----------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6000 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded'), ('EI_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------------+--------------------+
| EI_encoded    | EI_encoded(0)       | EI_encoded(1)       | EI_encoded(2)      |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(0) | 0.3867403314917127  | 0.41341367566151427 | 0.3961038961038961 |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(1) | 0.24861878453038674 | 0.3112391930835735  | 0.3276723276723277 |
+---------------+---------------------+---------------------+--------------------+
| IR_encoded(2) | 0.36464088397790057 | 0.2753471312549122  | 0.2762237762237762 |
+---------------+---------------------+---------------------+--------------------+

CPD of EI_encoded
+---------------+-----------+
| EI_encoded(0) | 0.0301667 |
+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 6500 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.249385 |
+---------------+----------+
| IR_encoded(1) | 0.360615 |
+---------------+----------+
| IR_encoded(2) | 0.39     |
+---------------+----------+

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.367077 |
+---------------+----------+
| EI_encoded(1) | 0.538923 |
+---------------+----------+
| EI_encoded(2) | 0.094    |
+---------------+----------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------------+---------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2) |
+---------------+---------------------+-----+---------------------+---------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)       | IR_encoded(2) |
+---------------+---------------------+-----+---------------------+---------------+
| SP_

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+--------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)      | SP_encoded(1)       | SP_encoded(2)       |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(0) | 0.3470562195661797 | 0.37731733914940024 | 0.31028551771585827 |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(1) | 0.4820717131474104 | 0.22846237731733915 | 0.45786033711730306 |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(2) | 0.1708720672864099 | 0.3942202835332606  | 0.23185414516683867 |
+---------------+--------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------------+-----+----------------------+
| IR

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 7500 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+----------+
| IR_encoded(0) | 0.388    |
+---------------+----------+
| IR_encoded(1) | 0.270133 |
+---------------+----------+
| IR_encoded(2) | 0.341867 |
+---------------+----------+

CPD of EI_encoded
+---------------+-----------+
| EI_encoded(0) | 0.847867  |
+---------------+-----------+
| EI_encoded(1) | 0.066     |
+---------------+-----------+
| EI_encoded(2) | 0.0861333 |
+---------------+-----------+

CPD of SP_encoded
+---------------+--------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)      | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+--------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)      | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+--------------------+-----+------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+--------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)      | SP_encoded(1)       | SP_encoded(2)       |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(0) | 0.4381909547738693 | 0.450143815915628   | 0.3562691131498471  |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(1) | 0.3331658291457286 | 0.3101629913710451  | 0.5239551478083588  |
+---------------+--------------------+---------------------+---------------------+
| IR_encoded(2) | 0.228643216080402  | 0.23969319271332695 | 0.11977573904179409 |
+---------------+--------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------------+-----+---------------+------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 8500 samples (AIC):
[('IR_encoded', 'SP_encoded'), ('EI_encoded', 'SP_encoded')]

CPD of IR_encoded
+---------------+-----------+
| IR_encoded(0) | 0.524824  |
+---------------+-----------+
| IR_encoded(1) | 0.0607059 |
+---------------+-----------+
| IR_encoded(2) | 0.414471  |
+---------------+-----------+

CPD of EI_encoded
+---------------+----------+
| EI_encoded(0) | 0.552    |
+---------------+----------+
| EI_encoded(1) | 0.135059 |
+---------------+----------+
| EI_encoded(2) | 0.312941 |
+---------------+----------+

CPD of SP_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(1)       | IR_encoded(2)       |
+---------------+---------------------+-----+-------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9000 samples (AIC):
[('EI_encoded', 'IR_encoded'), ('SP_encoded', 'IR_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+---------------------+-----+---------------------+---------------------+
| EI_encoded    | EI_encoded(0)       | ... | EI_encoded(2)       | EI_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(0) | 0.16732617297908423 | ... | 0.11569416498993963 | 0.082687338501292   |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(1) | 0.4652345958168457  | ... | 0.5503018108651911  | 0.31976744186046513 |
+---------------+---------------------+-----+---------------------+---------------------+
| IR_encoded(2) | 0.367439231

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 9500 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('IR_encoded', 'SP_encoded'), ('SP_encoded', 'EI_encoded')]

CPD of IR_encoded
+---------------+-----------+
| IR_encoded(0) | 0.862842  |
+---------------+-----------+
| IR_encoded(1) | 0.0991579 |
+---------------+-----------+
| IR_encoded(2) | 0.038     |
+---------------+-----------+

CPD of EI_encoded
+---------------+---------------------+-----+----------------------+---------------------+
| IR_encoded    | IR_encoded(0)       | ... | IR_encoded(2)        | IR_encoded(2)       |
+---------------+---------------------+-----+----------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | ... | SP_encoded(1)        | SP_encoded(2)       |
+---------------+---------------------+-----+----------------------+---------------------+
| EI_encoded(0) | 0.7809477756286267  | ... | 0.45161290322580644  | 0.6444444444444445  |
+---------------+---------------------+-----+-------------------

  0%|          | 0/1000000 [00:00<?, ?it/s]


Learned Structure (Edges) for 10000 samples (AIC):
[('IR_encoded', 'EI_encoded'), ('SP_encoded', 'EI_encoded'), ('SP_encoded', 'IR_encoded')]

CPD of IR_encoded
+---------------+---------------------+---------------------+---------------------+
| SP_encoded    | SP_encoded(0)       | SP_encoded(1)       | SP_encoded(2)       |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(0) | 0.2545334652159578  | 0.22393822393822393 | 0.30370562321845035 |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(1) | 0.5858885591823277  | 0.46782496782496785 | 0.5172324436382483  |
+---------------+---------------------+---------------------+---------------------+
| IR_encoded(2) | 0.15957797560171447 | 0.3082368082368082  | 0.17906193314330138 |
+---------------+---------------------+---------------------+---------------------+

CPD of EI_encoded
+---------------+---------------------+-----+------------------

# ------------------------------------------------------------------------------------------------------------

# Hypothesis Model 500, 1000, 1500, ..., 10000 Samples (dense) 1 hidden Layer, 10 Neurons Relu

In [49]:
# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Define the Neural Network architecture
def create_nn_model(hidden_layers=1, nodes_per_layer=10):
    model = models.Sequential()

    # Input layer (2 input features: IR_encoded and EI_encoded)
    model.add(layers.InputLayer(input_shape=(2,)))

    # Hidden layers
    for layer_num in range(hidden_layers):
        model.add(layers.Dense(nodes_per_layer, activation='relu', name=f"hidden_layer_{layer_num + 1}"))

    # Output layer (3 classes: decrease, stable, increase)
    model.add(layers.Dense(3, activation='softmax', name="output_layer"))

    # Compile the model
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

# Loop through each sample size
for size in sample_sizes:
    # Load data for the current sample size (adjust the file paths if necessary)
    outcomes_file = f'outcomes_dense_{size}.csv'

    df = pd.read_csv(outcomes_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df['IR_encoded'] = df['IR'].map(ir_map)
    df['EI_encoded'] = df['EI'].map(ei_map)
    df['SP_encoded'] = df['SP'].map(sp_map)

    # Features (IR and EI) and labels (SP)
    X = df[['IR_encoded', 'EI_encoded']]
    y = df['SP_encoded']

    # Refresh the data split for each iteration
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=False, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False, random_state=42)

    # Show split confirmation
    print(f"\nSample size: {size}")
    print("Training Data:", X_train.shape, y_train.shape)
    print("Validation Data:", X_val.shape, y_val.shape)
    print("Test Data:", X_test.shape, y_test.shape)

    # Create the Neural Network model
    nn_model = create_nn_model(hidden_layers=1, nodes_per_layer=10)

    # Train the model
    history = nn_model.fit(X_train, y_train,
                           epochs=50,
                           batch_size=32,
                           validation_data=(X_val, y_val),
                           verbose=0)  # Set verbose=0 to avoid too much output

    # Evaluate on the validation set
    val_loss, val_accuracy = nn_model.evaluate(X_val, y_val, verbose=0)
    print(f"Validation Accuracy for {size} samples: {val_accuracy:.4f}")

    # Evaluate on the test set
    test_loss, test_accuracy = nn_model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy for {size} samples: {test_accuracy:.4f}")

    # Make predictions on the test set
    predictions = nn_model.predict(X_test)

    # Convert the predicted probabilities to class labels
    predicted_classes = predictions.argmax(axis=1)

    # Create a list to map integers back to the original SP labels
    sp_reverse_map = ['decrease', 'stable', 'increase']

    # Convert the predicted classes to the original labels
    predicted_labels = [sp_reverse_map[label] for label in predicted_classes]

    # Create a DataFrame for the predicted probabilities
    probs_df = pd.DataFrame(predictions, columns=['Prob_decrease', 'Prob_stable', 'Prob_increase'])

    # Output the IR, EI, predicted SP, and the NN probabilities
    result_df = pd.DataFrame({
        'IR': df['IR'][:len(predicted_labels)],  # IR column from the original dataframe
        'EI': df['EI'][:len(predicted_labels)],  # EI column from the original dataframe
        'Predicted_SP': predicted_labels         # Predicted SP labels
    })

    # Combine the result with the predicted probabilities
    combined_df = pd.concat([result_df, probs_df.reset_index(drop=True)], axis=1)

    # Show the first few rows of the results for this sample size
    print(f"\nPredicted Results and Probabilities for {size} samples (First 5 rows):")
    print(combined_df.head(15))

# After the loop is done, print this message
print("\nLooping through all sample sizes complete!")


Sample size: 500
Training Data: (350, 2) (350,)
Validation Data: (75, 2) (75,)
Test Data: (75, 2) (75,)




Validation Accuracy for 500 samples: 0.3867
Test Accuracy for 500 samples: 0.3067
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step

Predicted Results and Probabilities for 500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     good       stable       0.184599     0.413439       0.401963
1      low  average       stable       0.184599     0.413439       0.401963
2      low     good       stable       0.118898     0.441380       0.439722
3      low     poor       stable       0.184599     0.413439       0.401963
4      low  average       stable       0.184599     0.413439       0.401963
5     high     poor       stable       0.208342     0.421427       0.370231
6      low  average       stable       0.208342     0.421427       0.370231
7      low  average       stable       0.118898     0.441380       0.439722
8   medium     poor       stable       0.314706     0.360656       0.324638
9      low     po



Validation Accuracy for 1000 samples: 0.4933
Test Accuracy for 1000 samples: 0.4800
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 

Predicted Results and Probabilities for 1000 samples (First 5 rows):
      IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0    low     good     increase       0.259113     0.246560       0.494327
1   high  average     decrease       0.413746     0.250169       0.336086
2   high     poor       stable       0.387327     0.440073       0.172600
3   high     good       stable       0.136974     0.740133       0.122893
4    low     good     decrease       0.413746     0.250169       0.336086
5   high     poor       stable       0.387327     0.440073       0.172600
6   high  average       stable       0.136974     0.740133       0.122893
7    low     good       stable       0.223644     0.525338       0.251019
8   high  average     increase       0.259113     0.246560       0.494327
9   high  average     increase    



Validation Accuracy for 1500 samples: 0.4489
Test Accuracy for 1500 samples: 0.5244
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 

Predicted Results and Probabilities for 1500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average       stable       0.141251     0.502523       0.356226
1      low     poor       stable       0.141251     0.502523       0.356226
2     high  average       stable       0.141251     0.502523       0.356226
3      low     poor       stable       0.141251     0.502523       0.356226
4     high     good     increase       0.240788     0.332533       0.426679
5     high  average     increase       0.240788     0.332533       0.426679
6   medium     good     decrease       0.465507     0.357900       0.176593
7     high  average       stable       0.141251     0.502523       0.356226
8     high     good       stable       0.141251     0.502523       0.356226
9     high    



Validation Accuracy for 2000 samples: 0.6433
Test Accuracy for 2000 samples: 0.6533
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 

Predicted Results and Probabilities for 2000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     poor       stable       0.303581     0.355978       0.340440
1   medium     good       stable       0.177939     0.487759       0.334301
2     high     good     increase       0.089609     0.095338       0.815053
3     high  average       stable       0.177939     0.487759       0.334301
4   medium     good       stable       0.255425     0.444989       0.299586
5     high     good       stable       0.177939     0.487759       0.334301
6   medium  average     increase       0.089609     0.095338       0.815053
7     high     good       stable       0.255425     0.444989       0.299586
8   medium     good       stable       0.177939     0.487759       0.334301
9     high  



Validation Accuracy for 2500 samples: 0.4800
Test Accuracy for 2500 samples: 0.4640
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 

Predicted Results and Probabilities for 2500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low  average     decrease       0.478695     0.285944       0.235361
1     high  average     decrease       0.419239     0.332557       0.248204
2   medium     good     increase       0.188928     0.356344       0.454728
3     high     good     increase       0.087621     0.411028       0.501351
4      low  average     increase       0.292728     0.317339       0.389934
5   medium     good     increase       0.188928     0.356344       0.454728
6     high     poor     decrease       0.352009     0.319738       0.328253
7     high     good     decrease       0.419908     0.275655       0.304437
8   medium  average     increase       0.087621     0.411028       0.501351
9   medium  



Validation Accuracy for 3000 samples: 0.4756
Test Accuracy for 3000 samples: 0.4822
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step

Predicted Results and Probabilities for 3000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium  average       stable       0.174193     0.640813       0.184994
1   medium  average       stable       0.330716     0.355081       0.314203
2   medium     good     decrease       0.443642     0.322048       0.234311
3   medium     good     increase       0.279533     0.320211       0.400256
4      low  average     increase       0.222099     0.259137       0.518765
5   medium     good     increase       0.222099     0.259137       0.518765
6      low  average       stable       0.332049     0.396861       0.271090
7      low     good     increase       0.374356     0.217958       0.407686
8      low     good       stable       0.332049     0.396861       0.271090
9     high  a



Validation Accuracy for 3500 samples: 0.4038
Test Accuracy for 3500 samples: 0.4533
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Predicted Results and Probabilities for 3500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low  average       stable       0.218072     0.397343       0.384586
1   medium     good     increase       0.146729     0.417636       0.435635
2     high     good     decrease       0.403791     0.244463       0.351746
3   medium     poor       stable       0.152731     0.425283       0.421987
4     high  average     increase       0.146729     0.417636       0.435635
5     high     poor     increase       0.146729     0.417636       0.435635
6     high     poor     increase       0.245442     0.320002       0.434556
7   medium     good     decrease       0.403791     0.244463       0.351746
8   medium     good     increase       0.146729     0.417636       0.435635
9      low   



Validation Accuracy for 4000 samples: 0.5100
Test Accuracy for 4000 samples: 0.5000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 4000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     good     decrease       0.336555     0.326931       0.336514
1     high     poor     increase       0.294310     0.260369       0.445321
2     high     good       stable       0.385697     0.435477       0.178826
3     high     good     increase       0.160346     0.414523       0.425131
4     high  average     increase       0.160346     0.414523       0.425131
5     high     good       stable       0.385697     0.435477       0.178826
6      low     poor     increase       0.197766     0.123645       0.678588
7   medium  average     increase       0.197766     0.123645       0.678588
8     high  average     increase       0.197766     0.123645       0.678588
9     high  a



Validation Accuracy for 4500 samples: 0.3807
Test Accuracy for 4500 samples: 0.3674
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 4500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average     increase       0.311919     0.338470       0.349611
1   medium     good     decrease       0.437216     0.368341       0.194444
2     high  average     decrease       0.437216     0.368341       0.194444
3      low     poor     increase       0.303218     0.249961       0.446821
4     high  average     decrease       0.437216     0.368341       0.194444
5   medium     poor     decrease       0.440408     0.438466       0.121127
6   medium  average       stable       0.316601     0.487965       0.195433
7      low     good     decrease       0.440408     0.438466       0.121127
8   medium     good     decrease       0.401988     0.250458       0.347554
9      low   



Validation Accuracy for 5000 samples: 0.5933
Test Accuracy for 5000 samples: 0.5760
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 5000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium  average       stable       0.233842     0.559916       0.206242
1     high     good     decrease       0.657153     0.012733       0.330114
2      low     good       stable       0.082473     0.611749       0.305778
3   medium     good     decrease       0.510800     0.289077       0.200122
4   medium  average     decrease       0.657153     0.012733       0.330114
5     high     poor     decrease       0.510800     0.289077       0.200122
6      low     good     decrease       0.510800     0.289077       0.200122
7      low     good     decrease       0.460375     0.245870       0.293755
8      low  average       stable       0.247723     0.440271       0.312006
9      low  a



Validation Accuracy for 5500 samples: 0.4703
Test Accuracy for 5500 samples: 0.4788
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 5500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     poor     increase       0.336944     0.164698       0.498359
1   medium     poor     increase       0.336944     0.164698       0.498359
2      low     good     increase       0.361391     0.170057       0.468552
3      low  average     increase       0.418973     0.116435       0.464592
4      low  average     increase       0.336944     0.164698       0.498359
5     high  average     increase       0.335626     0.205951       0.458423
6      low     good     increase       0.336944     0.164698       0.498359
7   medium     poor     increase       0.361391     0.170057       0.468552
8      low  average     increase       0.361391     0.170057       0.468552
9      low   



Validation Accuracy for 6000 samples: 0.5778
Test Accuracy for 6000 samples: 0.5800
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 6000 samples (First 5 rows):
        IR    EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  poor     increase       0.240051     0.357938       0.402011
1     high  poor     increase       0.116181     0.411262       0.472557
2     high  good       stable       0.207292     0.727783       0.064925
3     high  good     increase       0.116181     0.411262       0.472557
4     high  good     increase       0.401033     0.133618       0.465350
5     high  poor       stable       0.207292     0.727783       0.064925
6   medium  poor       stable       0.207292     0.727783       0.064925
7     high  poor       stable       0.207292     0.727783       0.064925
8     high  poor       stable       0.207292     0.727783       0.064925
9     high  poor       stable       0.20729



Validation Accuracy for 6500 samples: 0.4308
Test Accuracy for 6500 samples: 0.4256
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 6500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium  average     increase       0.376865     0.224026       0.399109
1      low     poor     increase       0.240146     0.366020       0.393834
2     high     good     increase       0.351432     0.262875       0.385692
3     high  average     increase       0.240146     0.366020       0.393834
4     high  average     increase       0.198214     0.349142       0.452644
5      low  average     increase       0.376865     0.224026       0.399109
6      low  average     increase       0.376865     0.224026       0.399109
7   medium     good       stable       0.166054     0.769430       0.064516
8     high     poor     increase       0.351432     0.262875       0.385692
9     high   



Validation Accuracy for 7000 samples: 0.4105
Test Accuracy for 7000 samples: 0.4733
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 7000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     good     increase       0.265145     0.262653       0.472202
1      low     good     decrease       0.450940     0.362721       0.186339
2      low  average       stable       0.403232     0.451772       0.144996
3   medium     poor       stable       0.339201     0.359879       0.300920
4   medium     good     increase       0.265145     0.262653       0.472202
5      low     good       stable       0.403232     0.451772       0.144996
6      low     good     increase       0.211184     0.162561       0.626256
7      low  average     decrease       0.450940     0.362721       0.186339
8     high  average       stable       0.339201     0.359879       0.300920
9   medium   



Validation Accuracy for 7500 samples: 0.5840
Test Accuracy for 7500 samples: 0.5884
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 7500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low  average     increase       0.054684     0.424319       0.520997
1      low  average       stable       0.397712     0.432001       0.170288
2      low     good       stable       0.079163     0.463660       0.457177
3      low     poor       stable       0.021312     0.707951       0.270737
4   medium     good     increase       0.069523     0.278524       0.651953
5   medium  average       stable       0.079163     0.463660       0.457177
6   medium     poor     increase       0.069523     0.278524       0.651953
7   medium     good     increase       0.054684     0.424319       0.520997
8      low     poor     increase       0.069523     0.278524       0.651953
9   medium   



Validation Accuracy for 8000 samples: 0.5217
Test Accuracy for 8000 samples: 0.4850
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 8000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good     decrease       0.386739     0.340688       0.272573
1     high     poor       stable       0.174050     0.709941       0.116009
2   medium     good       stable       0.338429     0.366376       0.295195
3   medium     good     decrease       0.386739     0.340688       0.272573
4      low     poor       stable       0.174050     0.709941       0.116009
5   medium     good     increase       0.249725     0.216810       0.533465
6     high     poor       stable       0.338429     0.366376       0.295195
7   medium     good       stable       0.174050     0.709941       0.116009
8     high     good       stable       0.174050     0.709941       0.116009
9     high   



Validation Accuracy for 8500 samples: 0.4541
Test Accuracy for 8500 samples: 0.4635
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 8500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     poor     increase       0.137728     0.262114       0.600158
1      low     poor     increase       0.417623     0.130552       0.451825
2      low     poor     increase       0.137728     0.262114       0.600158
3      low     poor     increase       0.218148     0.141088       0.640763
4      low  average     increase       0.417623     0.130552       0.451825
5   medium     poor     decrease       0.471980     0.325000       0.203020
6      low  average       stable       0.424002     0.432571       0.143428
7     high  average     decrease       0.426956     0.406862       0.166182
8   medium     good     increase       0.137728     0.262114       0.600158
9      low   



Validation Accuracy for 9000 samples: 0.5081
Test Accuracy for 9000 samples: 0.4741
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 9000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     poor       stable       0.303665     0.450745       0.245590
1   medium     good     decrease       0.394312     0.251016       0.354672
2      low     poor     decrease       0.523760     0.242248       0.233992
3   medium  average       stable       0.338132     0.511933       0.149935
4     high     poor       stable       0.237191     0.403132       0.359677
5      low     poor       stable       0.205977     0.541235       0.252787
6      low     poor       stable       0.338132     0.511933       0.149935
7      low     poor       stable       0.237191     0.403132       0.359677
8      low     poor       stable       0.323967     0.391830       0.284204
9      low   



Validation Accuracy for 9500 samples: 0.4933
Test Accuracy for 9500 samples: 0.4947
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 9500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good     increase       0.151972     0.420493       0.427535
1      low     good     decrease       0.453110     0.256497       0.290393
2     high  average       stable       0.255299     0.376072       0.368629
3     high  average       stable       0.041479     0.733495       0.225025
4      low     good       stable       0.255299     0.376072       0.368629
5     high  average     increase       0.151972     0.420493       0.427535
6     high  average     increase       0.231452     0.306137       0.462411
7     high     good       stable       0.272090     0.415344       0.312566
8   medium     poor     decrease       0.453110     0.256497       0.290393
9      low   



Validation Accuracy for 10000 samples: 0.4593
Test Accuracy for 10000 samples: 0.4580
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 10000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good       stable       0.363933     0.372413       0.263655
1      low  average       stable       0.363933     0.372413       0.263655
2      low     good     decrease       0.501489     0.182556       0.315955
3   medium     poor       stable       0.354805     0.491198       0.153997
4      low     good     decrease       0.501489     0.182556       0.315955
5      low     good       stable       0.363933     0.372413       0.263655
6   medium     poor       stable       0.278338     0.453451       0.268210
7      low     good       stable       0.354805     0.491198       0.153997
8      low  average       stable       0.346791     0.416422       0.236787
9      low

# Hypothesis Model 500, 1000, 1500, ..., 10000 Samples (sparse) 1 hidden Layer, 10 Neurons Relu

In [50]:
# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Define the Neural Network architecture
def create_nn_model(hidden_layers=1, nodes_per_layer=10):
    model = models.Sequential()

    # Input layer (2 input features: IR_encoded and EI_encoded)
    model.add(layers.InputLayer(input_shape=(2,)))

    # Hidden layers
    for layer_num in range(hidden_layers):
        model.add(layers.Dense(nodes_per_layer, activation='relu', name=f"hidden_layer_{layer_num + 1}"))

    # Output layer (3 classes: decrease, stable, increase)
    model.add(layers.Dense(3, activation='softmax', name="output_layer"))

    # Compile the model
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

# Loop through each sample size
for size in sample_sizes:
    # Load data for the current sample size (adjust the file paths for sparse data)
    outcomes_file = f'outcomes_sparse_{size}.csv'

    df = pd.read_csv(outcomes_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df['IR_encoded'] = df['IR'].map(ir_map)
    df['EI_encoded'] = df['EI'].map(ei_map)
    df['SP_encoded'] = df['SP'].map(sp_map)

    # Features (IR and EI) and labels (SP)
    X = df[['IR_encoded', 'EI_encoded']]
    y = df['SP_encoded']

    # Refresh the data split for each iteration
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=False, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False, random_state=42)

    # Show split confirmation
    print(f"\nSample size: {size}")
    print("Training Data:", X_train.shape, y_train.shape)
    print("Validation Data:", X_val.shape, y_val.shape)
    print("Test Data:", X_test.shape, y_test.shape)

    # Create the Neural Network model
    nn_model = create_nn_model(hidden_layers=1, nodes_per_layer=10)

    # Train the model
    history = nn_model.fit(X_train, y_train,
                           epochs=50,
                           batch_size=32,
                           validation_data=(X_val, y_val),
                           verbose=0)  # Set verbose=0 to avoid too much output

    # Evaluate on the validation set
    val_loss, val_accuracy = nn_model.evaluate(X_val, y_val, verbose=0)
    print(f"Validation Accuracy for {size} samples: {val_accuracy:.4f}")

    # Evaluate on the test set
    test_loss, test_accuracy = nn_model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy for {size} samples: {test_accuracy:.4f}")

    # Make predictions on the test set
    predictions = nn_model.predict(X_test)

    # Convert the predicted probabilities to class labels
    predicted_classes = predictions.argmax(axis=1)

    # Create a list to map integers back to the original SP labels
    sp_reverse_map = ['decrease', 'stable', 'increase']

    # Convert the predicted classes to the original labels
    predicted_labels = [sp_reverse_map[label] for label in predicted_classes]

    # Create a DataFrame for the predicted probabilities
    probs_df = pd.DataFrame(predictions, columns=['Prob_decrease', 'Prob_stable', 'Prob_increase'])

    # Output the IR, EI, predicted SP, and the NN probabilities
    result_df = pd.DataFrame({
        'IR': df['IR'][:len(predicted_labels)],  # IR column from the original dataframe
        'EI': df['EI'][:len(predicted_labels)],  # EI column from the original dataframe
        'Predicted_SP': predicted_labels         # Predicted SP labels
    })

    # Combine the result with the predicted probabilities
    combined_df = pd.concat([result_df, probs_df.reset_index(drop=True)], axis=1)

    # Show the first few rows of the results for this sample size
    print(f"\nPredicted Results and Probabilities for {size} samples (First 5 rows):")
    print(combined_df.head(15))

# After the loop is done, print this message
print("\nLooping through all sparse sample sizes complete!")


Sample size: 500
Training Data: (350, 2) (350,)
Validation Data: (75, 2) (75,)
Test Data: (75, 2) (75,)




Validation Accuracy for 500 samples: 0.5733
Test Accuracy for 500 samples: 0.4800
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step

Predicted Results and Probabilities for 500 samples (First 5 rows):
      IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0    low  average     increase       0.143132     0.213182       0.643686
1    low     poor     increase       0.075034     0.122164       0.802802
2    low  average     increase       0.143132     0.213182       0.643686
3    low  average     increase       0.266842     0.273357       0.459801
4   high  average     increase       0.243131     0.309835       0.447034
5    low  average     increase       0.075034     0.122164       0.802802
6    low  average     increase       0.143132     0.213182       0.643686
7    low  average     increase       0.143132     0.213182       0.643686
8    low  average     increase       0.143132     0.213182       0.643686
9    low  average     increase       



Validation Accuracy for 1000 samples: 0.3933
Test Accuracy for 1000 samples: 0.5067
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step

Predicted Results and Probabilities for 1000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     good     increase       0.298668     0.323521       0.377811
1   medium  average     decrease       0.582541     0.251349       0.166110
2   medium     good     increase       0.353570     0.257290       0.389139
3   medium     good     decrease       0.424754     0.334132       0.241114
4     high     poor     decrease       0.705424     0.172635       0.121940
5     high     poor     increase       0.353570     0.257290       0.389139
6   medium  average     increase       0.298668     0.323521       0.377811
7     high     poor     decrease       0.582541     0.251349       0.166110
8     high  average     increase       0.298668     0.323521       0.377811
9   medium    



Validation Accuracy for 1500 samples: 0.4756
Test Accuracy for 1500 samples: 0.4889
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 

Predicted Results and Probabilities for 1500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good     increase       0.195549     0.285216       0.519235
1   medium     poor       stable       0.203763     0.492334       0.303903
2      low  average       stable       0.203763     0.492334       0.303903
3   medium     poor     increase       0.145176     0.180595       0.674228
4   medium     poor     increase       0.195549     0.285216       0.519235
5      low     good       stable       0.203763     0.492334       0.303903
6   medium     good       stable       0.266151     0.418630       0.315219
7   medium     poor       stable       0.271597     0.437392       0.291011
8      low  average     increase       0.195549     0.285216       0.519235
9      low  av



Validation Accuracy for 2000 samples: 0.5100
Test Accuracy for 2000 samples: 0.4833
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 

Predicted Results and Probabilities for 2000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average     decrease       0.456307     0.194505       0.349188
1   medium     poor     increase       0.167476     0.330511       0.502014
2     high     poor     decrease       0.440657     0.272951       0.286392
3      low     poor     decrease       0.562813     0.226753       0.210434
4      low  average     increase       0.167476     0.330511       0.502014
5   medium     good     decrease       0.456307     0.194505       0.349188
6      low     poor     increase       0.209542     0.373828       0.416630
7      low  average     decrease       0.456307     0.194505       0.349188
8   medium     poor     increase       0.250649     0.286707       0.462644
9      low  



Validation Accuracy for 2500 samples: 0.4747
Test Accuracy for 2500 samples: 0.4320
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step

Predicted Results and Probabilities for 2500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     poor     increase       0.207123     0.255693       0.537184
1   medium  average     increase       0.302808     0.338714       0.358478
2     high  average       stable       0.223994     0.390249       0.385757
3     high     poor     increase       0.207123     0.255693       0.537184
4      low     good     increase       0.240246     0.259965       0.499788
5     high     good     increase       0.207123     0.255693       0.537184
6     high  average     increase       0.207123     0.255693       0.537184
7   medium     good     increase       0.207123     0.255693       0.537184
8     high     good     increase       0.215506     0.290271       0.494222
9     high   



Validation Accuracy for 3000 samples: 0.5133
Test Accuracy for 3000 samples: 0.5311
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Predicted Results and Probabilities for 3000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low  average     decrease       0.593663     0.233875       0.172462
1     high     good     decrease       0.478554     0.397575       0.123871
2     high     poor     decrease       0.478554     0.397575       0.123871
3      low     good     increase       0.223189     0.364249       0.412561
4      low     good     decrease       0.478554     0.397575       0.123871
5   medium  average       stable       0.140026     0.472614       0.387360
6     high  average     decrease       0.593663     0.233875       0.172462
7     high     poor       stable       0.268563     0.515502       0.215935
8   medium  average     increase       0.082583     0.386054       0.531363
9     high   



Validation Accuracy for 3500 samples: 0.4514
Test Accuracy for 3500 samples: 0.4552
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Predicted Results and Probabilities for 3500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average       stable       0.355309     0.373811       0.270880
1      low     poor       stable       0.355309     0.373811       0.270880
2      low  average     decrease       0.448963     0.277495       0.273541
3      low  average       stable       0.429962     0.430252       0.139786
4   medium     good       stable       0.355309     0.373811       0.270880
5   medium  average     increase       0.294335     0.297392       0.408273
6   medium     good       stable       0.332437     0.562865       0.104698
7   medium     good       stable       0.380488     0.560466       0.059046
8   medium  average       stable       0.205259     0.404674       0.390067
9   medium  a



Validation Accuracy for 4000 samples: 0.5317
Test Accuracy for 4000 samples: 0.4767
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

Predicted Results and Probabilities for 4000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     poor     increase       0.251790     0.277014       0.471196
1      low     good     decrease       0.390547     0.348368       0.261085
2   medium     poor     increase       0.402254     0.187782       0.409965
3   medium     poor     increase       0.251790     0.277014       0.471196
4      low  average     increase       0.223699     0.315413       0.460888
5   medium     poor     decrease       0.390547     0.348368       0.261085
6      low  average     decrease       0.390547     0.348368       0.261085
7      low     good     increase       0.402254     0.187782       0.409965
8     high     poor     decrease       0.390547     0.348368       0.261085
9      low  



Validation Accuracy for 4500 samples: 0.5081
Test Accuracy for 4500 samples: 0.4919
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 4500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     good       stable       0.320971     0.450670       0.228359
1   medium     good       stable       0.320971     0.450670       0.228359
2     high  average     decrease       0.463144     0.237587       0.299268
3   medium  average     increase       0.272885     0.093126       0.633988
4     high     good     increase       0.335809     0.232453       0.431737
5   medium  average     increase       0.272885     0.093126       0.633988
6   medium     poor     decrease       0.463144     0.237587       0.299268
7     high     good     increase       0.272885     0.093126       0.633988
8   medium     good     increase       0.272885     0.093126       0.633988
9     high   



Validation Accuracy for 5000 samples: 0.4093
Test Accuracy for 5000 samples: 0.4453
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 5000 samples (First 5 rows):
      IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0    low     good       stable       0.280387     0.384944       0.334669
1   high     good       stable       0.195250     0.624451       0.180298
2   high  average     increase       0.322615     0.315833       0.361552
3   high  average       stable       0.280387     0.384944       0.334669
4   high  average     increase       0.322615     0.315833       0.361552
5   high     poor       stable       0.207444     0.433443       0.359112
6    low     poor       stable       0.195250     0.624451       0.180298
7   high     good       stable       0.207444     0.433443       0.359112
8   high     good       stable       0.149462     0.475282       0.375257
9    low     good     increase   



Validation Accuracy for 5500 samples: 0.4788
Test Accuracy for 5500 samples: 0.4921
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 5500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good     increase       0.178425     0.330437       0.491138
1   medium     good     increase       0.300481     0.184253       0.515266
2      low     poor     decrease       0.522914     0.143166       0.333920
3   medium     good     decrease       0.522914     0.143166       0.333920
4   medium  average     decrease       0.416610     0.185794       0.397596
5   medium     good     increase       0.112593     0.300079       0.587328
6   medium  average     decrease       0.522914     0.143166       0.333920
7   medium     poor     increase       0.300481     0.184253       0.515266
8      low     good     increase       0.300481     0.184253       0.515266
9      low   



Validation Accuracy for 6000 samples: 0.5700
Test Accuracy for 6000 samples: 0.5344
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 6000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average       stable       0.120793     0.545260       0.333947
1      low  average       stable       0.172695     0.512183       0.315122
2      low     good     increase       0.167777     0.274405       0.557818
3      low  average       stable       0.081065     0.863521       0.055414
4      low  average       stable       0.081065     0.863521       0.055414
5   medium     good     decrease       0.416721     0.401627       0.181652
6      low     good       stable       0.172695     0.512183       0.315122
7      low  average       stable       0.081911     0.464558       0.453531
8      low  average     decrease       0.416721     0.401627       0.181652
9   medium  a



Validation Accuracy for 6500 samples: 0.5251
Test Accuracy for 6500 samples: 0.5210
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Predicted Results and Probabilities for 6500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     poor     decrease       0.419639     0.338115       0.242245
1   medium     poor     increase       0.392943     0.145522       0.461535
2      low  average     increase       0.342805     0.258882       0.398313
3      low     poor     decrease       0.553372     0.271065       0.175563
4      low  average     decrease       0.597699     0.190434       0.211868
5   medium  average     increase       0.317903     0.255766       0.426331
6   medium     good     increase       0.317903     0.255766       0.426331
7     high     poor     decrease       0.553372     0.271065       0.175563
8     high  average     decrease       0.656705     0.100456       0.242839
9   medium  a



Validation Accuracy for 7000 samples: 0.5171
Test Accuracy for 7000 samples: 0.5095
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 7000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     poor     decrease       0.513070     0.044129       0.442801
1      low  average     increase       0.037696     0.231133       0.731171
2     high  average     decrease       0.513070     0.044129       0.442801
3      low     poor     increase       0.366662     0.236774       0.396564
4   medium     poor     decrease       0.513070     0.044129       0.442801
5   medium  average     increase       0.118778     0.347659       0.533563
6      low     poor     increase       0.290651     0.325567       0.383782
7      low     poor       stable       0.352580     0.489972       0.157448
8   medium     poor     increase       0.366662     0.236774       0.396564
9      low   



Validation Accuracy for 7500 samples: 0.5022
Test Accuracy for 7500 samples: 0.5004
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 7500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good     increase       0.219064     0.339444       0.441492
1     high     poor     increase       0.219064     0.339444       0.441492
2     high     poor     increase       0.219064     0.339444       0.441492
3      low     poor     increase       0.166061     0.260449       0.573490
4      low     poor     increase       0.324701     0.318570       0.356729
5     high     poor     increase       0.219064     0.339444       0.441492
6     high     poor     decrease       0.568153     0.131171       0.300676
7      low     poor     decrease       0.483442     0.249696       0.266861
8     high  average     decrease       0.483442     0.249696       0.266861
9   medium   



Validation Accuracy for 8000 samples: 0.5392
Test Accuracy for 8000 samples: 0.5825
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 8000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good     decrease       0.606664     0.192423       0.200914
1   medium  average     increase       0.139075     0.205787       0.655137
2      low     poor     decrease       0.536859     0.315946       0.147195
3   medium     good     increase       0.318797     0.322181       0.359022
4      low     good     increase       0.077703     0.379917       0.542380
5   medium  average     increase       0.285550     0.298516       0.415934
6     high     good     decrease       0.606664     0.192423       0.200914
7   medium  average     increase       0.154946     0.372583       0.472471
8      low     poor     increase       0.103976     0.140955       0.755069
9   medium   



Validation Accuracy for 8500 samples: 0.5616
Test Accuracy for 8500 samples: 0.5427
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 8500 samples (First 5 rows):
      IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0    low     poor     increase       0.298659     0.293976       0.407365
1    low  average     increase       0.298659     0.293976       0.407365
2    low     poor     increase       0.108823     0.153601       0.737575
3    low     good     increase       0.466026     0.061738       0.472236
4   high     good     increase       0.221495     0.269761       0.508744
5   high     good     increase       0.419686     0.156366       0.423949
6    low     good     increase       0.466026     0.061738       0.472236
7   high  average       stable       0.370869     0.462462       0.166669
8   high     poor       stable       0.239091     0.723463       0.037446
9   high  average       stable   



Validation Accuracy for 9000 samples: 0.5207
Test Accuracy for 9000 samples: 0.5370
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 9000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium  average     increase       0.134901     0.264175       0.600923
1     high     poor     decrease       0.480053     0.280387       0.239560
2     high  average     increase       0.332436     0.164710       0.502854
3     high     poor     decrease       0.480053     0.280387       0.239560
4     high     good     increase       0.243307     0.149258       0.607435
5     high  average       stable       0.199392     0.448697       0.351911
6     high  average       stable       0.199392     0.448697       0.351911
7     high     good     increase       0.134901     0.264175       0.600923
8   medium     good     decrease       0.480053     0.280387       0.239560
9     high   



Validation Accuracy for 9500 samples: 0.5530
Test Accuracy for 9500 samples: 0.5614
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 9500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low  average       stable       0.082460     0.571710       0.345831
1   medium  average     decrease       0.509572     0.213250       0.277178
2      low  average     increase       0.394141     0.051339       0.554520
3      low     poor       stable       0.082460     0.571710       0.345831
4      low     poor     increase       0.428676     0.110906       0.460418
5   medium  average     increase       0.394141     0.051339       0.554520
6      low     poor     increase       0.394141     0.051339       0.554520
7     high     poor     decrease       0.509572     0.213250       0.277178
8      low  average     increase       0.394141     0.051339       0.554520
9      low  a



Validation Accuracy for 10000 samples: 0.5153
Test Accuracy for 10000 samples: 0.5373
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 10000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average     decrease       0.726915     0.159989       0.113096
1   medium     poor     increase       0.177329     0.220979       0.601692
2     high  average     decrease       0.524345     0.350878       0.124778
3   medium  average       stable       0.284289     0.381862       0.333849
4      low     good     increase       0.307857     0.320185       0.371959
5   medium  average     decrease       0.524345     0.350878       0.124778
6   medium     poor     increase       0.117047     0.293174       0.589779
7      low     poor     increase       0.177329     0.220979       0.601692
8      low     good     increase       0.177329     0.220979       0.601692
9     high

# ------------------------------------------------------------------------------------------------------------

# K-L Divergence NN Dense Data

## This is the entropy for dense data distribution with a sample size of 500 - 10000 and a NN with 1 hidden layer and 10 Neurons.

In [51]:
# Define the function to save K-L divergence to a file
def save_kl_divergence(sample_size, kl_div_value):
    file_name = 'kl_div_NN_1_10_dense.csv'

    # Append the K-L divergence for this sample size to the file
    with open(file_name, 'a') as f:
        f.write(f"{sample_size},{kl_div_value:.4f}\n")

# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)  # Loop from 500 to 10,000 in increments of 500

# Loop through each sample size
for sample_size in sample_sizes:
    # Load the ground truth probabilities for the current sample size
    ground_truth_probs_file = f'probabilities_dense_{sample_size}.csv'
    df_gt_probs = pd.read_csv(ground_truth_probs_file)

    # Extract the SP probabilities from the Ground Truth model (dense Bayesian network)
    # Placeholder to store K-L divergence values
    kl_divergences = []

    # Load the predictions from memory (already generated by the NN)
    outcomes_file = f'outcomes_dense_{sample_size}.csv'
    df = pd.read_csv(outcomes_file)

    # Manually encode categorical variables for IR and EI
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}

    df['IR_encoded'] = df['IR'].map(ir_map)
    df['EI_encoded'] = df['EI'].map(ei_map)

    # Assuming predictions were saved in memory from the NN run
    predictions = nn_model.predict(df[['IR_encoded', 'EI_encoded']])

    # Loop through the test set predictions
    for i in range(len(df)):
        # Neural Network predicted probabilities for SP (decrease, stable, increase)
        predicted_probs = predictions[i]

        # Get the IR and EI values for the current sample
        ir_value = df.iloc[i]['IR_encoded']
        ei_value = df.iloc[i]['EI_encoded']

        # Map encoded values back to original labels
        ir_value = {0: 'low', 1: 'medium', 2: 'high'}[ir_value]
        ei_value = {0: 'poor', 1: 'average', 2: 'good'}[ei_value]

        # Get the corresponding ground truth probabilities for SP given IR and EI
        col_prefix = f'SP_given_IR_{ir_value}_EI_{ei_value}'
        ground_truth_probs = df_gt_probs.filter(like=col_prefix).values.flatten()

        # Ensure the probabilities are non-zero to avoid division by zero
        epsilon = 1e-10
        ground_truth_probs = np.clip(ground_truth_probs, epsilon, 1)

        # Compute K-L divergence (Neural Network vs Ground Truth)
        kl_div = entropy(predicted_probs, ground_truth_probs)
        kl_divergences.append(kl_div)

    # Calculate the average K-L divergence over all samples for the current sample size
    average_kl_divergence = np.mean(kl_divergences)

    # Save the K-L divergence value to a CSV file
    save_kl_divergence(sample_size, average_kl_divergence)

    # Print confirmation
    print(f"Average K-L Divergence for {sample_size} samples: {average_kl_divergence:.4f}")

# Once all sample sizes are processed, the K-L divergences will be saved in 'kl_div_NN_1_10_dense.csv'
print("\nK-L divergence calculations complete.")

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Average K-L Divergence for 500 samples: 0.6167
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Average K-L Divergence for 1000 samples: 0.3102
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Average K-L Divergence for 1500 samples: 0.2202
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Average K-L Divergence for 2000 samples: 0.4451
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Average K-L Divergence for 2500 samples: 0.4494
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Average K-L Divergence for 3000 samples: 0.4133
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Average K-L Divergence for 3500 samples: 0.2037
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Average K-L Divergence for 4000 samples: 0.2856
[1m141/141[0m [32m━━━━━━

# K-L Divergence NN Sparse Data

## This is the entropy for sparse data distribution with a sample size of 500 - 10000 and a NN with 1 hidden layer and 10 Neurons.

In [52]:
# Define the function to save K-L divergence to a file at the end
def save_kl_divergences_to_file(kl_divergence_data):
    file_name = 'K-L Divergence sparse NN 1_10.csv'  # Changed file name to indicate sparse data

    # Save the K-L divergences to a CSV file
    with open(file_name, 'w') as f:
        f.write('Size,NN_Sparse_1_10_Entropy\n')  # Updated the header to indicate sparse data
        for sample_size, kl_div_value in kl_divergence_data.items():
            f.write(f"{sample_size},{kl_div_value:.4f}\n")

# Placeholder to store K-L divergence values for all sample sizes
kl_divergence_results = {}

# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Loop through each sample size
for sample_size in sample_sizes:
    # Load the ground truth probabilities for the current sample size (sparse data)
    ground_truth_probs_file = f'probabilities_sparse_{sample_size}.csv'  # Changed to sparse data file
    df_gt_probs = pd.read_csv(ground_truth_probs_file)

    # Load outcomes for the current sample size (sparse data)
    outcomes_file = f'outcomes_sparse_{sample_size}.csv'  # Changed to sparse data file
    df = pd.read_csv(outcomes_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}

    df['IR_encoded'] = df['IR'].map(ir_map)
    df['EI_encoded'] = df['EI'].map(ei_map)

    # Placeholder to store K-L divergence values
    kl_divergences = []

    # Loop through the test set predictions
    for i in range(len(df)):
        # Neural Network predicted probabilities for SP (decrease, stable, increase)
        predicted_probs = predictions[i]  # Assuming the NN predictions are already available in memory for this sample size

        # Get the IR and EI values for the current sample
        ir_value = df.iloc[i]['IR_encoded']
        ei_value = df.iloc[i]['EI_encoded']

        # Map encoded values back to original labels
        ir_value = {0: 'low', 1: 'medium', 2: 'high'}[ir_value]
        ei_value = {0: 'poor', 1: 'average', 2: 'good'}[ei_value]

        # Get the corresponding ground truth probabilities for SP given IR and EI
        col_prefix = f'SP_given_IR_{ir_value}_EI_{ei_value}'
        ground_truth_probs = df_gt_probs.filter(like=col_prefix).values.flatten()

        # Ensure the probabilities are non-zero to avoid division by zero
        epsilon = 1e-10
        ground_truth_probs = np.clip(ground_truth_probs, epsilon, 1)

        # Compute K-L divergence (Neural Network vs Ground Truth)
        kl_div = entropy(predicted_probs, ground_truth_probs)
        kl_divergences.append(kl_div)

    # Calculate the average K-L divergence over all samples for the current sample size
    average_kl_divergence = np.mean(kl_divergences)

    # Store the result for this sample size
    kl_divergence_results[sample_size] = average_kl_divergence

    # Print confirmation for each sample size
    print(f"Average K-L Divergence for {sample_size} sparse samples: {average_kl_divergence:.4f}")

# Once all sample sizes are processed, save the results to a CSV file
save_kl_divergences_to_file(kl_divergence_results)

# Print completion message
print("\nK-L divergence calculations complete and saved to 'K-L Divergence sparse NN 1_10.csv'.")

Average K-L Divergence for 500 sparse samples: 0.5892
Average K-L Divergence for 1000 sparse samples: 0.2951
Average K-L Divergence for 1500 sparse samples: 0.2273
Average K-L Divergence for 2000 sparse samples: 0.5658
Average K-L Divergence for 2500 sparse samples: 0.2875
Average K-L Divergence for 3000 sparse samples: 0.4919
Average K-L Divergence for 3500 sparse samples: 0.3117
Average K-L Divergence for 4000 sparse samples: 0.2632
Average K-L Divergence for 4500 sparse samples: 0.3158
Average K-L Divergence for 5000 sparse samples: 0.3611
Average K-L Divergence for 5500 sparse samples: 0.2915
Average K-L Divergence for 6000 sparse samples: 0.4458
Average K-L Divergence for 6500 sparse samples: 0.2278
Average K-L Divergence for 7000 sparse samples: 0.4178
Average K-L Divergence for 7500 sparse samples: 0.2448
Average K-L Divergence for 8000 sparse samples: 0.4186
Average K-L Divergence for 8500 sparse samples: 0.7007
Average K-L Divergence for 9000 sparse samples: 0.2301
Average K-L

# ------------------------------------------------------------------------------------------------------------