<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/NN_Dense_1_10_Relu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
pip install pgmpy

Collecting pgmpy
  Downloading pgmpy-0.1.26-py3-none-any.whl.metadata (9.1 kB)
Downloading pgmpy-0.1.26-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pgmpy
Successfully installed pgmpy-0.1.26


In [2]:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD
from pgmpy.sampling import BayesianModelSampling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import layers, models
from pgmpy.estimators import HillClimbSearch, BicScore, AICScore, MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
from sklearn.metrics import accuracy_score
from scipy.stats import entropy
import os
import matplotlib.pyplot as plt

# Bayesian Network Data Generation 500, 1000, 1500, ..., 10000 Samples (dense)

In [3]:
# Define the mappings for IR, EI, SP
ir_map = {0: 'low', 1: 'medium', 2: 'high'}
ei_map = {0: 'poor', 1: 'average', 2: 'good'}
sp_map = {0: 'decrease', 1: 'stable', 2: 'increase'}

# Define the dense Bayesian Network
dense_model = BayesianNetwork([('IR', 'EI'), ('EI', 'SP'), ('IR', 'SP')])

# Function to generate CPDs
def generate_cpds():
    ir_probs = np.random.rand(3)
    ir_probs /= ir_probs.sum()  # Normalize to make it a valid probability distribution

    ei_given_ir_probs = np.random.rand(3, 3)
    ei_given_ir_probs /= ei_given_ir_probs.sum(axis=0, keepdims=True)

    sp_probs = np.random.rand(3, 3, 3)
    sp_probs /= sp_probs.sum(axis=0, keepdims=True)

    sp_probs_reshaped = sp_probs.reshape(3, -1)

    return ir_probs, ei_given_ir_probs, sp_probs_reshaped

# Save probabilities in a single CSV file
def save_probabilities(ir_probs, ei_probs, sp_probs, filename):
    # Create a DataFrame for IR probabilities
    ir_df = pd.DataFrame({
        'IR_State': ['low', 'medium', 'high'],
        'IR_Prob': ir_probs
    })

    # Create a DataFrame for EI given IR probabilities
    ei_df = pd.DataFrame(ei_probs, columns=['EI_given_IR_low', 'EI_given_IR_medium', 'EI_given_IR_high'])
    ei_df['EI_State'] = ['poor', 'average', 'good']

    # Create a DataFrame for SP given IR and EI probabilities
    sp_df = pd.DataFrame(sp_probs, columns=[
        'SP_given_IR_low_EI_poor', 'SP_given_IR_low_EI_average', 'SP_given_IR_low_EI_good',
        'SP_given_IR_medium_EI_poor', 'SP_given_IR_medium_EI_average', 'SP_given_IR_medium_EI_good',
        'SP_given_IR_high_EI_poor', 'SP_given_IR_high_EI_average', 'SP_given_IR_high_EI_good'
    ])
    sp_df['SP_State'] = ['decrease', 'stable', 'increase']

    # Combine all data into a single DataFrame
    combined_df = pd.concat([ir_df, ei_df, sp_df], axis=1)

    # Save the combined DataFrame as a single CSV file
    combined_df.to_csv(filename, index=False)

# Save outcomes in a CSV file
def save_outcomes(data_dense, filename):
    data_dense['IR'] = data_dense['IR'].map(ir_map)
    data_dense['EI'] = data_dense['EI'].map(ei_map)
    data_dense['SP'] = data_dense['SP'].map(sp_map)
    data_dense.to_csv(filename, index=False)

# Generate datasets for different sample sizes for the dense model
sample_sizes = range(500, 10500, 500)
for size in sample_sizes:
    # Generate the CPDs
    ir_probs, ei_given_ir_probs, sp_probs_reshaped = generate_cpds()

    # Define CPDs for the dense model
    cpd_ir = TabularCPD(variable='IR', variable_card=3, values=[[ir_probs[0]], [ir_probs[1]], [ir_probs[2]]])
    cpd_ei_dense = TabularCPD(variable='EI', variable_card=3,
                              values=ei_given_ir_probs,
                              evidence=['IR'], evidence_card=[3])
    cpd_sp_dense = TabularCPD(variable='SP', variable_card=3,
                              values=sp_probs_reshaped,
                              evidence=['IR', 'EI'], evidence_card=[3, 3])

    dense_model.add_cpds(cpd_ir, cpd_ei_dense, cpd_sp_dense)

    # Check if the model is valid
    assert dense_model.check_model()

    # Generate samples
    sampler_dense = BayesianModelSampling(dense_model)
    data_dense = sampler_dense.forward_sample(size=size)

    # Save probabilities in one file
    save_probabilities(ir_probs, ei_given_ir_probs, sp_probs_reshaped, f'probabilities_dense_{size}.csv')

    # Save outcomes (low, medium, high) in another file
    save_outcomes(data_dense, f'outcomes_dense_{size}.csv')

# Notify the user that the process is done
print("Data generation and saving complete for the dense model!")

  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



Data generation and saving complete for the dense model!


# Hypothesis Model: 500, 1000, 1500, ..., 10000 Samples (dense) 1 hidden Layer, 10 Neurons Relu

In [4]:
# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)

# Define the Neural Network architecture
def create_nn_model(hidden_layers=1, nodes_per_layer=10):
    model = models.Sequential()

    # Input layer (2 input features: IR_encoded and EI_encoded)
    model.add(layers.InputLayer(input_shape=(2,)))

    # Hidden layers
    for layer_num in range(hidden_layers):
        model.add(layers.Dense(nodes_per_layer, activation='relu', name=f"hidden_layer_{layer_num + 1}"))

    # Output layer (3 classes: decrease, stable, increase)
    model.add(layers.Dense(3, activation='softmax', name="output_layer"))

    # Compile the model
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

# Loop through each sample size
for size in sample_sizes:
    # Load data for the current sample size (adjust the file paths if necessary)
    outcomes_file = f'outcomes_dense_{size}.csv'

    df = pd.read_csv(outcomes_file)

    # Manually encode categorical variables for IR, EI, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df['IR_encoded'] = df['IR'].map(ir_map)
    df['EI_encoded'] = df['EI'].map(ei_map)
    df['SP_encoded'] = df['SP'].map(sp_map)

    # Features (IR and EI) and labels (SP)
    X = df[['IR_encoded', 'EI_encoded']]
    y = df['SP_encoded']

    # Refresh the data split for each iteration
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=False, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False, random_state=42)

    # Show split confirmation
    print(f"\nSample size: {size}")
    print("Training Data:", X_train.shape, y_train.shape)
    print("Validation Data:", X_val.shape, y_val.shape)
    print("Test Data:", X_test.shape, y_test.shape)

    # Create the Neural Network model
    nn_model = create_nn_model(hidden_layers=1, nodes_per_layer=10)

    # Train the model
    history = nn_model.fit(X_train, y_train,
                           epochs=50,
                           batch_size=32,
                           validation_data=(X_val, y_val),
                           verbose=0)  # Set verbose=0 to avoid too much output

    # Evaluate on the validation set
    val_loss, val_accuracy = nn_model.evaluate(X_val, y_val, verbose=0)
    print(f"Validation Accuracy for {size} samples: {val_accuracy:.4f}")

    # Evaluate on the test set
    test_loss, test_accuracy = nn_model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy for {size} samples: {test_accuracy:.4f}")

    # Make predictions on the test set
    predictions = nn_model.predict(X_test)

    # Convert the predicted probabilities to class labels
    predicted_classes = predictions.argmax(axis=1)

    # Create a list to map integers back to the original SP labels
    sp_reverse_map = ['decrease', 'stable', 'increase']

    # Convert the predicted classes to the original labels
    predicted_labels = [sp_reverse_map[label] for label in predicted_classes]

    # Create a DataFrame for the predicted probabilities
    probs_df = pd.DataFrame(predictions, columns=['Prob_decrease', 'Prob_stable', 'Prob_increase'])

    # Output the IR, EI, predicted SP, and the NN probabilities
    result_df = pd.DataFrame({
        'IR': df['IR'][:len(predicted_labels)],  # IR column from the original dataframe
        'EI': df['EI'][:len(predicted_labels)],  # EI column from the original dataframe
        'Predicted_SP': predicted_labels         # Predicted SP labels
    })

    # Combine the result with the predicted probabilities
    combined_df = pd.concat([result_df, probs_df.reset_index(drop=True)], axis=1)

    # Show the first few rows of the results for this sample size
    print(f"\nPredicted Results and Probabilities for {size} samples (First 5 rows):")
    print(combined_df.head(15))

# After the loop is done, print this message
print("\nLooping through all sample sizes complete!")




Sample size: 500
Training Data: (350, 2) (350,)
Validation Data: (75, 2) (75,)
Test Data: (75, 2) (75,)
Validation Accuracy for 500 samples: 0.3733
Test Accuracy for 500 samples: 0.4267
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step

Predicted Results and Probabilities for 500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average     decrease       0.397317     0.324352       0.278331
1     high  average     decrease       0.397317     0.324352       0.278331
2   medium  average       stable       0.176859     0.427281       0.395859
3      low  average     decrease       0.397317     0.324352       0.278331
4     high     poor     decrease       0.397317     0.324352       0.278331
5      low     good     decrease       0.345505     0.324816       0.329679
6   medium  average     decrease       0.397317     0.324352       0.278331
7     high  average     decrease       0.345505     0.324816    



Validation Accuracy for 1000 samples: 0.6133
Test Accuracy for 1000 samples: 0.5200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 

Predicted Results and Probabilities for 1000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good     decrease       0.637446     0.244309       0.118245
1      low     good     decrease       0.637446     0.244309       0.118245
2   medium     poor       stable       0.306514     0.478301       0.215185
3     high     good       stable       0.306514     0.478301       0.215185
4      low     good     decrease       0.448088     0.422940       0.128972
5      low     good     decrease       0.448088     0.422940       0.128972
6     high  average       stable       0.197167     0.621409       0.181424
7      low     good     decrease       0.448088     0.422940       0.128972
8     high  average       stable       0.306514     0.478301       0.215185
9      low  av



Validation Accuracy for 1500 samples: 0.4844
Test Accuracy for 1500 samples: 0.4844
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 





Predicted Results and Probabilities for 1500 samples (First 5 rows):
      IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   high     poor     increase       0.240406     0.133639       0.625955
1   high     good     increase       0.240406     0.133639       0.625955
2    low     good     decrease       0.670969     0.129108       0.199923
3   high     good     increase       0.394471     0.181965       0.423565
4   high     poor     increase       0.380773     0.187053       0.432174
5    low     good     decrease       0.466308     0.216449       0.317242
6    low  average     decrease       0.502784     0.208432       0.288784
7   high  average     increase       0.309082     0.163291       0.527627
8    low  average     increase       0.380773     0.187053       0.432174
9    low     good     increase       0.380773     0.187053       0.432174
10  high     poor     decrease       0.502784     0.208432       0.288784
11  high     poor     increase       0.309



Validation Accuracy for 2000 samples: 0.5500
Test Accuracy for 2000 samples: 0.5500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 





Predicted Results and Probabilities for 2000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium  average     decrease       0.561782     0.221001       0.217218
1      low     good       stable       0.212091     0.549913       0.237996
2     high  average     decrease       0.466032     0.363487       0.170481
3     high     good     decrease       0.561782     0.221001       0.217218
4      low     good     decrease       0.561782     0.221001       0.217218
5   medium     poor     decrease       0.466032     0.363487       0.170481
6      low     good     decrease       0.466032     0.363487       0.170481
7   medium     poor     decrease       0.561782     0.221001       0.217218
8   medium     good     decrease       0.602151     0.316104       0.081745
9   medium     poor     decrease       0.561782     0.221001       0.217218
10    high  average       stable       0.147619     0.586623       0.265758
11     low     poo



Validation Accuracy for 3000 samples: 0.4089
Test Accuracy for 3000 samples: 0.3933
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

Predicted Results and Probabilities for 3000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     poor       stable       0.255047     0.415497       0.329456
1     high     poor     increase       0.262323     0.243874       0.493804
2     high     good     increase       0.323275     0.302433       0.374292
3     high     good       stable       0.255047     0.415497       0.329456
4     high     poor       stable       0.286213     0.381320       0.332467
5   medium     good     increase       0.262323     0.243874       0.493804
6   medium     poor       stable       0.255047     0.415497       0.329456
7     high     poor       stable       0.255047     0.415497       0.329456
8   medium  average     increase       0.250260     0.340986       0.408754
9   medium  



Validation Accuracy for 3500 samples: 0.6629
Test Accuracy for 3500 samples: 0.6305
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

Predicted Results and Probabilities for 3500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good       stable       0.118759     0.833203       0.048037
1     high  average       stable       0.263944     0.467105       0.268951
2   medium     poor       stable       0.263944     0.467105       0.268951
3      low     good     increase       0.269289     0.167283       0.563429
4     high     good     increase       0.269289     0.167283       0.563429
5   medium     poor       stable       0.193571     0.461658       0.344771
6     high  average       stable       0.118759     0.833203       0.048037
7   medium     poor       stable       0.263944     0.467105       0.268951
8   medium     poor       stable       0.118759     0.833203       0.048037
9   medium  



Validation Accuracy for 4000 samples: 0.6150
Test Accuracy for 4000 samples: 0.5450
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 4000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium  average     increase       0.302044     0.342154       0.355802
1   medium  average       stable       0.181888     0.640084       0.178029
2   medium  average       stable       0.181888     0.640084       0.178029
3   medium  average       stable       0.370244     0.437903       0.191853
4   medium  average       stable       0.181888     0.640084       0.178029
5     high  average       stable       0.181888     0.640084       0.178029
6   medium     good       stable       0.181888     0.640084       0.178029
7   medium     poor       stable       0.181888     0.640084       0.178029
8   medium  average     increase       0.302044     0.342154       0.355802
9      low  a



Validation Accuracy for 4500 samples: 0.5541
Test Accuracy for 4500 samples: 0.5496
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Predicted Results and Probabilities for 4500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     poor       stable       0.197346     0.489837       0.312816
1   medium  average     decrease       0.445613     0.240261       0.314127
2     high     good     decrease       0.648520     0.134380       0.217100
3     high     poor       stable       0.343928     0.478721       0.177351
4      low     poor     decrease       0.447719     0.224472       0.327809
5   medium     poor     decrease       0.433678     0.253714       0.312608
6   medium  average     decrease       0.433678     0.253714       0.312608
7      low     poor     decrease       0.433678     0.253714       0.312608
8   medium     good       stable       0.197346     0.489837       0.312816
9   medium  a



Validation Accuracy for 5000 samples: 0.4453
Test Accuracy for 5000 samples: 0.4680
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 5000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     poor     decrease       0.445916     0.333274       0.220810
1     high     good     decrease       0.445916     0.333274       0.220810
2   medium     poor       stable       0.070414     0.471757       0.457828
3      low     poor     decrease       0.496766     0.298099       0.205135
4      low     good     decrease       0.445916     0.333274       0.220810
5     high     good     decrease       0.496766     0.298099       0.205135
6     high     good       stable       0.070414     0.471757       0.457828
7   medium     good       stable       0.070414     0.471757       0.457828
8   medium  average     decrease       0.471277     0.315613       0.213109
9   medium  a



Validation Accuracy for 5500 samples: 0.4994
Test Accuracy for 5500 samples: 0.4994
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 5500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     poor       stable       0.204005     0.483302       0.312694
1   medium     poor       stable       0.292188     0.491234       0.216579
2     high     good     decrease       0.367008     0.310726       0.322266
3      low     good       stable       0.176205     0.614912       0.208883
4   medium     good       stable       0.204005     0.483302       0.312694
5     high     good     decrease       0.493715     0.296638       0.209648
6      low     good       stable       0.204005     0.483302       0.312694
7     high     good     decrease       0.493715     0.296638       0.209648
8   medium     good       stable       0.133192     0.444642       0.422166
9      low  a



Validation Accuracy for 6000 samples: 0.6011
Test Accuracy for 6000 samples: 0.5878
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 6000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average     increase       0.333378     0.267549       0.399072
1   medium     poor       stable       0.403492     0.573033       0.023475
2     high     good       stable       0.203424     0.527101       0.269475
3   medium  average     increase       0.182155     0.172275       0.645570
4   medium     poor     increase       0.267599     0.255916       0.476485
5   medium  average       stable       0.203424     0.527101       0.269475
6   medium  average     increase       0.182155     0.172275       0.645570
7   medium     good     increase       0.267599     0.255916       0.476485
8      low     poor       stable       0.179021     0.700806       0.120173
9      low   



Validation Accuracy for 6500 samples: 0.5282
Test Accuracy for 6500 samples: 0.5897
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 6500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average     decrease       0.520068     0.036148       0.443784
1   medium  average     decrease       0.520068     0.036148       0.443784
2   medium     good     increase       0.387501     0.172758       0.439740
3      low     good     decrease       0.520068     0.036148       0.443784
4   medium     good     decrease       0.672795     0.124670       0.202536
5   medium  average     increase       0.361246     0.104399       0.534355
6      low  average     decrease       0.672795     0.124670       0.202536
7   medium     good     decrease       0.672795     0.124670       0.202536
8   medium     good     increase       0.250817     0.161373       0.587810
9     high   



Validation Accuracy for 7000 samples: 0.5400
Test Accuracy for 7000 samples: 0.5933
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 7000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good       stable       0.298737     0.618920       0.082343
1   medium     good       stable       0.023996     0.491241       0.484763
2   medium     poor     increase       0.070807     0.372091       0.557102
3   medium     poor     increase       0.302844     0.199146       0.498010
4   medium     good       stable       0.023996     0.491241       0.484763
5      low  average       stable       0.298737     0.618920       0.082343
6      low  average       stable       0.298737     0.618920       0.082343
7      low  average       stable       0.298737     0.618920       0.082343
8   medium     poor       stable       0.023996     0.491241       0.484763
9      low  a



Validation Accuracy for 7500 samples: 0.5787
Test Accuracy for 7500 samples: 0.5644
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 7500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium  average     decrease       0.343607     0.340411       0.315983
1      low  average     increase       0.185304     0.252565       0.562131
2     high  average     decrease       0.343607     0.340411       0.315983
3   medium     good     increase       0.272150     0.225334       0.502516
4   medium     good     increase       0.185304     0.252565       0.562131
5     high     good       stable       0.098532     0.783021       0.118447
6      low     good     decrease       0.504212     0.246856       0.248932
7     high     good     increase       0.272150     0.225334       0.502516
8      low  average     increase       0.272150     0.225334       0.502516
9      low  a



Validation Accuracy for 8000 samples: 0.4625
Test Accuracy for 8000 samples: 0.4433
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 8000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     poor     increase       0.086264     0.450336       0.463401
1   medium     good     decrease       0.464080     0.464039       0.071881
2   medium  average     decrease       0.541623     0.123711       0.334666
3   medium     good     increase       0.086264     0.450336       0.463401
4      low     poor     decrease       0.541623     0.123711       0.334666
5   medium     poor     decrease       0.464080     0.464039       0.071881
6   medium  average     decrease       0.464080     0.464039       0.071881
7   medium     good     decrease       0.464080     0.464039       0.071881
8   medium  average     decrease       0.541623     0.123711       0.334666
9     high   



Validation Accuracy for 8500 samples: 0.4580
Test Accuracy for 8500 samples: 0.4745
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 8500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average       stable       0.203614     0.499765       0.296621
1     high  average       stable       0.203614     0.499765       0.296621
2      low  average       stable       0.283534     0.391162       0.325304
3     high     good       stable       0.371592     0.397870       0.230538
4     high     good       stable       0.283534     0.391162       0.325304
5      low     poor       stable       0.283534     0.391162       0.325304
6   medium     poor       stable       0.185447     0.630981       0.183572
7   medium  average     increase       0.246054     0.223063       0.530883
8     high     poor       stable       0.283534     0.391162       0.325304
9     high  a



Validation Accuracy for 9000 samples: 0.4385
Test Accuracy for 9000 samples: 0.4896
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 9000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good       stable       0.266259     0.594636       0.139105
1   medium     poor       stable       0.266259     0.594636       0.139105
2      low  average       stable       0.310328     0.389605       0.300067
3      low  average     decrease       0.466722     0.307848       0.225430
4      low     good       stable       0.310328     0.389605       0.300067
5   medium     poor       stable       0.310328     0.389605       0.300067
6   medium     poor       stable       0.310328     0.389605       0.300067
7     high  average       stable       0.310328     0.389605       0.300067
8      low     good     decrease       0.466722     0.307848       0.225430
9      low  a



Validation Accuracy for 9500 samples: 0.4807
Test Accuracy for 9500 samples: 0.4751
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 9500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good     decrease       0.590618     0.196784       0.212598
1     high     good     increase       0.347501     0.241407       0.411092
2      low     good     decrease       0.590618     0.196784       0.212598
3      low     good       stable       0.356821     0.406637       0.236542
4      low     poor     increase       0.347501     0.241407       0.411092
5   medium     poor     increase       0.347501     0.241407       0.411092
6      low  average     decrease       0.610260     0.302997       0.086743
7   medium     good       stable       0.356821     0.406637       0.236542
8     high     good     decrease       0.432996     0.174910       0.392094
9     high   



Validation Accuracy for 10000 samples: 0.5560
Test Accuracy for 10000 samples: 0.5540
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 10000 samples (First 5 rows):
      IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   high  average     increase       0.339590     0.253652       0.406758
1   high     poor     increase       0.339590     0.253652       0.406758
2   high  average     increase       0.339590     0.253652       0.406758
3   high     poor       stable       0.403729     0.425745       0.170526
4   high     poor     increase       0.339590     0.253652       0.406758
5   high     poor     increase       0.081799     0.039577       0.878624
6   high     poor       stable       0.403729     0.425745       0.170526
7   high  average     increase       0.339590     0.253652       0.406758
8   high     poor     increase       0.339590     0.253652       0.406758
9   high     poor     increase

# K-L Divergence NN Dense Data

In [14]:
# Define the function to save K-L divergence and std dev to a file
def save_kl_divergence(sample_size, kl_div_value, std_kl_div_value, first_run=False):
    file_name = 'kl_div_NN_1_10_dense.csv'

    # If it's the first run, write the headers
    if first_run:
        with open(file_name, 'w') as f:
            f.write('Size,NN_Dense_1_10_Entropy,Std_Dev\n')  # Write headers for the CSV

    # Append the K-L divergence and std dev for this sample size to the file
    with open(file_name, 'a') as f:
        f.write(f"{sample_size},{kl_div_value:.4f},{std_kl_div_value:.4f}\n")

# Sample sizes to loop through
sample_sizes = range(500, 10500, 500)  # Loop from 500 to 10,000 in increments of 500

# Loop through each sample size
first_run = True  # Flag to indicate the first run for writing headers

for sample_size in sample_sizes:
    # Load the ground truth probabilities for the current sample size
    ground_truth_probs_file = f'probabilities_dense_{sample_size}.csv'
    df_gt_probs = pd.read_csv(ground_truth_probs_file)

    # Placeholder to store K-L divergence values
    kl_divergences = []

    # Load the predictions from memory (already generated by the NN)
    outcomes_file = f'outcomes_dense_{sample_size}.csv'
    df = pd.read_csv(outcomes_file)

    # Manually encode categorical variables for IR and EI
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}

    df['IR_encoded'] = df['IR'].map(ir_map)
    df['EI_encoded'] = df['EI'].map(ei_map)

    # Assuming predictions were saved in memory from the NN run
    predictions = nn_model.predict(df[['IR_encoded', 'EI_encoded']])

    # Loop through the test set predictions
    for i in range(len(df)):
        # Neural Network predicted probabilities for SP (decrease, stable, increase)
        predicted_probs = predictions[i]

        # Get the IR and EI values for the current sample
        ir_value = df.iloc[i]['IR_encoded']
        ei_value = df.iloc[i]['EI_encoded']

        # Map encoded values back to original labels
        ir_value = {0: 'low', 1: 'medium', 2: 'high'}[ir_value]
        ei_value = {0: 'poor', 1: 'average', 2: 'good'}[ei_value]

        # Get the corresponding ground truth probabilities for SP given IR and EI
        col_prefix = f'SP_given_IR_{ir_value}_EI_{ei_value}'
        ground_truth_probs = df_gt_probs.filter(like=col_prefix).values.flatten()

        # Ensure the probabilities are non-zero to avoid division by zero
        epsilon = 1e-10
        ground_truth_probs = np.clip(ground_truth_probs, epsilon, 1)

        # Compute K-L divergence (Neural Network vs Ground Truth)
        kl_div = entropy(predicted_probs, ground_truth_probs)
        kl_divergences.append(kl_div)

    # Calculate the average K-L divergence over all samples for the current sample size
    average_kl_divergence = np.mean(kl_divergences)

    # Calculate the standard deviation of the K-L divergence values
    std_kl_divergence = np.std(kl_divergences)

    # Save the K-L divergence and std dev value to a CSV file
    save_kl_divergence(sample_size, average_kl_divergence, std_kl_divergence, first_run=first_run)

    # Print confirmation
    print(f"Average K-L Divergence for {sample_size} samples: {average_kl_divergence:.4f}, Std Dev: {std_kl_divergence:.4f}")

    # After the first run, set `first_run` to False
    first_run = False

# Once all sample sizes are processed, the K-L divergences and std devs will be saved in 'kl_div_NN_1_10_dense.csv'
print("\nK-L divergence and standard deviation calculations complete.")

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Average K-L Divergence for 500 samples: 0.3831, Std Dev: 0.6113
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Average K-L Divergence for 1000 samples: 0.6829, Std Dev: 0.5498
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Average K-L Divergence for 1500 samples: 0.4083, Std Dev: 0.4773
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Average K-L Divergence for 2000 samples: 0.3955, Std Dev: 0.3888
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Average K-L Divergence for 2500 samples: 0.3123, Std Dev: 0.3544
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Average K-L Divergence for 3000 samples: 0.2282, Std Dev: 0.4320
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Average K-L Divergence for 3500 samples: 0.8830, Std Dev: 0.7247
[1m125/125[0m [32m━━━━━━━━━━