<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/NN_Sparse_1_10_Relu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
pip install pgmpy

Collecting pgmpy
  Downloading pgmpy-0.1.26-py3-none-any.whl.metadata (9.1 kB)
Downloading pgmpy-0.1.26-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pgmpy
Successfully installed pgmpy-0.1.26


In [None]:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD
from pgmpy.sampling import BayesianModelSampling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import layers, models
from pgmpy.estimators import HillClimbSearch, BicScore, AICScore, MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
from sklearn.metrics import accuracy_score
from scipy.stats import entropy
import os
import matplotlib.pyplot as plt

# Bayesian Network Data Generation 1000, 2000, ..., 10000 Samples (sparse)

In [None]:
# Integer state index -> human-readable label for each variable (IR, EI, SP)
ir_map = dict(enumerate(['low', 'medium', 'high']))
ei_map = dict(enumerate(['poor', 'average', 'good']))
sp_map = dict(enumerate(['decrease', 'stable', 'increase']))

# Sparse Bayesian Network: IR and EI are the two parents of SP,
# with no edge between IR and EI.
sparse_model = BayesianNetwork([(parent, 'SP') for parent in ('IR', 'EI')])

# Function to generate CPDs for the sparse model
def generate_cpds_sparse():
    """Draw random probability tables for the sparse IR -> SP <- EI network.

    Returns:
        A tuple ``(ir_probs, ei_probs, sp_probs_reshaped)`` where
        ``ir_probs`` and ``ei_probs`` are length-3 arrays summing to 1 and
        ``sp_probs_reshaped`` is a (3, 9) array whose columns each sum to 1.
        Rows index the SP state; columns enumerate the (IR, EI) pairs with
        EI varying fastest.
    """
    def _random_marginal():
        # Three uniform draws rescaled so they form a valid distribution.
        weights = np.random.rand(3)
        return weights / weights.sum()

    # Draw order (IR, EI, then SP) determines which random numbers go where.
    ir_probs = _random_marginal()
    ei_probs = _random_marginal()

    # Axis 0 is the SP state; normalising over it makes every (IR, EI)
    # slice a distribution over SP.
    raw_sp = np.random.rand(3, 3, 3)
    conditional_sp = raw_sp / raw_sp.sum(axis=0, keepdims=True)

    # Flatten the two parent axes into nine columns.
    return ir_probs, ei_probs, conditional_sp.reshape(3, -1)

# Save probabilities in a single CSV file
def save_probabilities_sparse(ir_probs, ei_probs, sp_probs, filename):
    """Write the IR, EI and SP|IR,EI probability tables to one CSV file.

    Args:
        ir_probs: Three probabilities for IR (low, medium, high).
        ei_probs: Three probabilities for EI (poor, average, good).
        sp_probs: 3x9 table; one row per SP state, one column per (IR, EI)
            pair with EI varying fastest.
        filename: Destination CSV path.
    """
    ir_states = ['low', 'medium', 'high']
    ei_states = ['poor', 'average', 'good']

    # Marginals for the two parent variables, side by side.
    marginals = pd.DataFrame({
        'IR_State': ir_states,
        'IR_Prob': ir_probs,
        'EI_State': ei_states,
        'EI_Prob': ei_probs,
    })

    # Column names follow the same IR-major / EI-minor order as the table.
    sp_columns = [
        f'SP_given_IR_{ir_state}_EI_{ei_state}'
        for ir_state in ir_states
        for ei_state in ei_states
    ]
    conditionals = pd.DataFrame(sp_probs, columns=sp_columns)
    conditionals['SP_State'] = ['decrease', 'stable', 'increase']

    # All tables have three rows, so they line up column-wise.
    pd.concat([marginals, conditionals], axis=1).to_csv(filename, index=False)

# Save outcomes in a CSV file
def save_outcomes_sparse(data_sparse, filename):
    """Write sampled outcomes to CSV with integer states replaced by labels.

    The IR, EI and SP columns are translated through the module-level
    ``ir_map``, ``ei_map`` and ``sp_map`` dictionaries.

    The caller's DataFrame is left untouched. Relabelling it in place would
    make a second call on the same frame map the already-translated strings
    through the int-keyed dictionaries and write all-NaN columns.

    Args:
        data_sparse: DataFrame with 'IR', 'EI' and 'SP' columns holding
            integer state indices.
        filename: Destination CSV path.
    """
    # assign() returns a new frame and keeps the original column order.
    labelled = data_sparse.assign(
        IR=data_sparse['IR'].map(ir_map),
        EI=data_sparse['EI'].map(ei_map),
        SP=data_sparse['SP'].map(sp_map),
    )
    labelled.to_csv(filename, index=False)

# Generate datasets for different sample sizes for the sparse model
# One independently parameterised dataset per sample size: 1000, 2000, ..., 10000
sample_sizes = range(1000, 10500, 1000)
for size in sample_sizes:
    # Fresh random probability tables for this dataset
    ir_probs, ei_probs, sp_probs_reshaped = generate_cpds_sparse()

    # Root nodes take one single-element row per state; SP takes the 3x9
    # table with IR and EI as evidence.
    cpd_ir = TabularCPD(variable='IR', variable_card=3,
                        values=[[prob] for prob in ir_probs])
    cpd_ei_sparse = TabularCPD(variable='EI', variable_card=3,
                               values=[[prob] for prob in ei_probs])
    cpd_sp_sparse = TabularCPD(variable='SP', variable_card=3,
                               values=sp_probs_reshaped,
                               evidence=['IR', 'EI'], evidence_card=[3, 3])

    # Attach this iteration's CPDs to the shared module-level network
    sparse_model.add_cpds(cpd_ir, cpd_ei_sparse, cpd_sp_sparse)

    # Stop immediately if the network and its CPDs are inconsistent
    assert sparse_model.check_model()

    # Persist the ground-truth probabilities used for this dataset
    save_probabilities_sparse(ir_probs, ei_probs, sp_probs_reshaped, f'probabilities_sparse_{size}.csv')

    # Draw `size` joint samples of (IR, EI, SP) from the network
    sampler_sparse = BayesianModelSampling(sparse_model)
    data_sparse = sampler_sparse.forward_sample(size=size)

    # Persist the sampled outcomes with human-readable state labels
    save_outcomes_sparse(data_sparse, f'outcomes_sparse_{size}.csv')

# Tell the user that every dataset has been written
print("Data generation and saving complete for the sparse model!")

  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



Data generation and saving complete for the sparse model!


# Hypothesis Model 1000, 2000, ..., 10000 Samples (sparse): 1 Hidden Layer, 10 Neurons, ReLU

In [None]:
# Sample sizes to loop through: 1,000 to 10,000 in steps of 1,000
sample_sizes = range(1000, 10500, 1000)

# Define the Neural Network architecture
def create_nn_model(hidden_layers=1, nodes_per_layer=10):
    """Build and compile a small dense classifier for SP given (IR, EI).

    Args:
        hidden_layers: Number of ReLU hidden layers to stack.
        nodes_per_layer: Number of units in each hidden layer.

    Returns:
        A compiled Keras ``Sequential`` model with two input features and a
        3-way softmax output, using the Adam optimizer and
        sparse categorical cross-entropy loss.
    """
    model = models.Sequential()

    # Two input features: IR_encoded and EI_encoded.
    # `layers.Input(shape=...)` replaces `InputLayer(input_shape=...)`,
    # whose `input_shape` argument is deprecated in Keras 3.
    model.add(layers.Input(shape=(2,)))

    # Hidden layers, named hidden_layer_1, hidden_layer_2, ...
    for layer_num in range(hidden_layers):
        model.add(layers.Dense(nodes_per_layer, activation='relu', name=f"hidden_layer_{layer_num + 1}"))

    # Output layer (3 classes: decrease, stable, increase)
    model.add(layers.Dense(3, activation='softmax', name="output_layer"))

    # Labels are integer class ids, hence the sparse cross-entropy loss
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

# Loop through each sample size
# Label -> integer encodings are identical for every dataset, so build them once.
ir_map = {'low': 0, 'medium': 1, 'high': 2}
ei_map = {'poor': 0, 'average': 1, 'good': 2}
sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

# Class id -> SP label, used to turn argmax output back into labels
sp_reverse_map = ['decrease', 'stable', 'increase']

for size in sample_sizes:
    # Load the sampled outcomes for the current sample size
    outcomes_file = f'outcomes_sparse_{size}.csv'

    df = pd.read_csv(outcomes_file)

    # Manually encode categorical variables for IR, EI, and SP
    df['IR_encoded'] = df['IR'].map(ir_map)
    df['EI_encoded'] = df['EI'].map(ei_map)
    df['SP_encoded'] = df['SP'].map(sp_map)

    # Features (IR and EI) and labels (SP)
    X = df[['IR_encoded', 'EI_encoded']]
    y = df['SP_encoded']

    # Ordered 70/15/15 split: first 70% train, next 15% validation, last 15% test.
    # `random_state` is omitted because it has no effect when shuffle=False.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=False)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)

    # Show split confirmation
    print(f"\nSample size: {size}")
    print("Training Data:", X_train.shape, y_train.shape)
    print("Validation Data:", X_val.shape, y_val.shape)
    print("Test Data:", X_test.shape, y_test.shape)

    # Create a fresh Neural Network model for this dataset
    nn_model = create_nn_model(hidden_layers=1, nodes_per_layer=10)

    # Train the model
    history = nn_model.fit(X_train, y_train,
                           epochs=50,
                           batch_size=32,
                           validation_data=(X_val, y_val),
                           verbose=0)  # Set verbose=0 to avoid too much output

    # Evaluate on the validation set
    val_loss, val_accuracy = nn_model.evaluate(X_val, y_val, verbose=0)
    print(f"Validation Accuracy for {size} samples: {val_accuracy:.4f}")

    # Evaluate on the test set
    test_loss, test_accuracy = nn_model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy for {size} samples: {test_accuracy:.4f}")

    # Predicted class probabilities for the test set
    predictions = nn_model.predict(X_test)

    # Convert the predicted probabilities to class ids, then to SP labels
    predicted_classes = predictions.argmax(axis=1)
    predicted_labels = [sp_reverse_map[label] for label in predicted_classes]

    # Create a DataFrame for the predicted probabilities
    probs_df = pd.DataFrame(predictions, columns=['Prob_decrease', 'Prob_stable', 'Prob_increase'])

    # Take IR/EI from the *test* rows. The test set is the tail of `df`, so
    # slicing the first len(predictions) rows would pair each prediction with
    # the inputs of an unrelated training row.
    test_inputs = df.loc[X_test.index, ['IR', 'EI']].reset_index(drop=True)
    result_df = pd.DataFrame({
        'IR': test_inputs['IR'],
        'EI': test_inputs['EI'],
        'Predicted_SP': predicted_labels
    })

    # Combine the result with the predicted probabilities (both 0..n-1 indexed)
    combined_df = pd.concat([result_df, probs_df.reset_index(drop=True)], axis=1)

    # Show the first few rows of the results for this sample size
    print(f"\nPredicted Results and Probabilities for {size} samples (First 15 rows):")
    print(combined_df.head(15))

# After the loop is done, print this message
print("\nLooping through all sparse sample sizes complete!")




Sample size: 500
Training Data: (350, 2) (350,)
Validation Data: (75, 2) (75,)
Test Data: (75, 2) (75,)
Validation Accuracy for 500 samples: 0.4133
Test Accuracy for 500 samples: 0.4933
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step

Predicted Results and Probabilities for 500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good     decrease       0.496978     0.344048       0.158974
1   medium     good     decrease       0.457540     0.423755       0.118705
2     high     good     decrease       0.496978     0.344048       0.158974
3      low     good     decrease       0.496978     0.344048       0.158974
4     high     good     decrease       0.496978     0.344048       0.158974
5     high     good     decrease       0.496978     0.344048       0.158974
6      low     good     increase       0.259504     0.292808       0.447688
7   medium     good     increase       0.259504     0.292808    



Validation Accuracy for 1000 samples: 0.5467
Test Accuracy for 1000 samples: 0.5400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 

Predicted Results and Probabilities for 1000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium  average       stable       0.168973     0.620075       0.210952
1     high  average       stable       0.168973     0.620075       0.210952
2     high     poor       stable       0.168973     0.620075       0.210952
3      low  average       stable       0.168973     0.620075       0.210952
4     high  average     increase       0.109114     0.185783       0.705103
5      low  average       stable       0.185938     0.466974       0.347088
6     high     good       stable       0.168973     0.620075       0.210952
7     high  average       stable       0.168973     0.620075       0.210952
8      low  average     increase       0.123192     0.184484       0.692323
9      low  av



Validation Accuracy for 1500 samples: 0.4444
Test Accuracy for 1500 samples: 0.4044




[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 





Predicted Results and Probabilities for 1500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     poor     decrease       0.365887     0.336350       0.297763
1   medium  average     decrease       0.365887     0.336350       0.297763
2   medium  average       stable       0.327540     0.462064       0.210396
3   medium     good       stable       0.264430     0.447402       0.288168
4   medium  average       stable       0.404396     0.422506       0.173097
5   medium  average       stable       0.414093     0.415351       0.170556
6   medium  average       stable       0.404396     0.422506       0.173097
7     high     good     decrease       0.365887     0.336350       0.297763
8      low  average       stable       0.264430     0.447402       0.288168
9      low     poor       stable       0.414093     0.415351       0.170556
10     low     poor       stable       0.414093     0.415351       0.170556
11     low     poo



Validation Accuracy for 2000 samples: 0.4100
Test Accuracy for 2000 samples: 0.4033
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 





Predicted Results and Probabilities for 2000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium  average     increase       0.191611     0.341228       0.467161
1     high     good     increase       0.326833     0.289545       0.383623
2     high  average     decrease       0.389495     0.348137       0.262369
3   medium     poor     increase       0.191611     0.341228       0.467161
4     high     good     increase       0.298240     0.265085       0.436674
5   medium     good     decrease       0.367290     0.274827       0.357882
6   medium  average     decrease       0.367290     0.274827       0.357882
7     high     good     decrease       0.367290     0.274827       0.357882
8     high  average     decrease       0.542678     0.219997       0.237325
9     high  average     decrease       0.367290     0.274827       0.357882
10  medium     good     increase       0.298240     0.265085       0.436674
11    high     goo



Validation Accuracy for 3000 samples: 0.4444
Test Accuracy for 3000 samples: 0.4378
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

Predicted Results and Probabilities for 3000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     poor     increase       0.250165     0.258599       0.491236
1      low     poor       stable       0.210465     0.455233       0.334302
2     high  average       stable       0.347911     0.394741       0.257348
3      low     good     decrease       0.525648     0.281973       0.192379
4   medium  average     decrease       0.487319     0.335727       0.176954
5   medium  average       stable       0.210465     0.455233       0.334302
6      low     poor     decrease       0.487319     0.335727       0.176954
7     high     good     decrease       0.487319     0.335727       0.176954
8   medium     poor     decrease       0.487319     0.335727       0.176954
9      low  



Validation Accuracy for 3500 samples: 0.5448
Test Accuracy for 3500 samples: 0.5448
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 3500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     good     decrease       0.450153     0.335386       0.214461
1     high  average     decrease       0.365524     0.359311       0.275165
2      low     poor     increase       0.227364     0.373448       0.399188
3     high  average     decrease       0.450153     0.335386       0.214461
4     high  average       stable       0.208037     0.612177       0.179786
5     high     poor     decrease       0.365524     0.359311       0.275165
6     high     good       stable       0.208037     0.612177       0.179786
7      low  average     decrease       0.478813     0.174708       0.346479
8      low  average       stable       0.059848     0.878502       0.061650
9   medium   



Validation Accuracy for 4000 samples: 0.4750
Test Accuracy for 4000 samples: 0.4867
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step

Predicted Results and Probabilities for 4000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high     poor       stable       0.289886     0.445021       0.265093
1     high     poor       stable       0.227802     0.526423       0.245775
2     high  average       stable       0.164335     0.470710       0.364955
3   medium     poor     decrease       0.422051     0.407486       0.170463
4     high     poor     increase       0.186914     0.389347       0.423739
5     high     poor       stable       0.227802     0.526423       0.245775
6   medium     poor       stable       0.164335     0.470710       0.364955
7     high     poor     increase       0.186914     0.389347       0.423739
8     high  average       stable       0.382581     0.490074       0.127345
9   medium   



Validation Accuracy for 4500 samples: 0.4622
Test Accuracy for 4500 samples: 0.5156
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 4500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low  average     decrease       0.554843     0.405054       0.040103
1     high  average     decrease       0.495023     0.141591       0.363386
2     high     good     increase       0.294706     0.313007       0.392287
3     high  average     increase       0.375728     0.177354       0.446918
4     high     poor     decrease       0.554843     0.405054       0.040103
5   medium     good     decrease       0.554843     0.405054       0.040103
6      low     good     decrease       0.554843     0.405054       0.040103
7     high  average     decrease       0.514308     0.291972       0.193720
8      low     good     decrease       0.497691     0.344608       0.157701
9      low   



Validation Accuracy for 5000 samples: 0.4133
Test Accuracy for 5000 samples: 0.4133
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 5000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     good     decrease       0.487669     0.255126       0.257205
1   medium     good     decrease       0.444973     0.343075       0.211952
2   medium     good     increase       0.340478     0.252276       0.407246
3     high     good     increase       0.385144     0.210375       0.404482
4     high  average     decrease       0.432837     0.307164       0.259998
5   medium     good     decrease       0.444973     0.343075       0.211952
6   medium     good     increase       0.340478     0.252276       0.407246
7      low     poor     decrease       0.444973     0.343075       0.211952
8   medium     good     decrease       0.444973     0.343075       0.211952
9      low  a



Validation Accuracy for 5500 samples: 0.5248
Test Accuracy for 5500 samples: 0.5503
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 5500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     poor     decrease       0.489137     0.336573       0.174290
1     high     good     decrease       0.489137     0.336573       0.174290
2     high     poor     decrease       0.489137     0.336573       0.174290
3   medium     good     decrease       0.464223     0.177410       0.358366
4     high     poor     decrease       0.489137     0.336573       0.174290
5   medium  average     increase       0.356462     0.230929       0.412609
6     high     good       stable       0.196202     0.558475       0.245323
7     high  average       stable       0.196202     0.558475       0.245323
8   medium     poor     decrease       0.675788     0.258734       0.065478
9   medium   



Validation Accuracy for 6000 samples: 0.5367
Test Accuracy for 6000 samples: 0.5444
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 6000 samples (First 5 rows):
      IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   high     poor     increase       0.240767     0.276580       0.482653
1    low  average     increase       0.240767     0.276580       0.482653
2    low  average     increase       0.020374     0.350564       0.629062
3    low     poor       stable       0.119759     0.507450       0.372791
4    low     poor       stable       0.158966     0.610554       0.230480
5    low  average       stable       0.119759     0.507450       0.372791
6    low  average     increase       0.020374     0.350564       0.629062
7    low  average     decrease       0.468065     0.264712       0.267223
8    low  average     increase       0.191270     0.306651       0.502080
9   high     good     increase   



Validation Accuracy for 6500 samples: 0.4287
Test Accuracy for 6500 samples: 0.4205
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Predicted Results and Probabilities for 6500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good     decrease       0.476812     0.282433       0.240755
1   medium  average     decrease       0.476812     0.282433       0.240755
2      low     good     decrease       0.406010     0.278938       0.315053
3   medium     poor       stable       0.350987     0.393263       0.255750
4   medium     good     decrease       0.476812     0.282433       0.240755
5      low     poor     increase       0.071712     0.453313       0.474975
6   medium  average       stable       0.301693     0.420982       0.277326
7   medium     poor       stable       0.301693     0.420982       0.277326
8   medium     poor     decrease       0.476812     0.282433       0.240755
9      low   



Validation Accuracy for 7000 samples: 0.5429
Test Accuracy for 7000 samples: 0.5514
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 7000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     good     decrease       0.457534     0.293614       0.248852
1     high     good     decrease       0.600377     0.311088       0.088535
2   medium     good     decrease       0.600377     0.311088       0.088535
3      low     good       stable       0.429938     0.479056       0.091006
4     high  average     decrease       0.457534     0.293614       0.248852
5     high     poor     decrease       0.467400     0.214051       0.318549
6   medium     good     decrease       0.483825     0.285581       0.230594
7      low  average     decrease       0.600377     0.311088       0.088535
8     high     poor     decrease       0.600377     0.311088       0.088535
9      low   



Validation Accuracy for 7500 samples: 0.4560
Test Accuracy for 7500 samples: 0.4418
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Predicted Results and Probabilities for 7500 samples (First 5 rows):
      IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0    low  average     increase       0.216004     0.362057       0.421939
1    low  average       stable       0.153911     0.494049       0.352040
2    low  average     increase       0.216004     0.362057       0.421939
3   high     good     increase       0.350529     0.294211       0.355260
4   high  average     increase       0.216004     0.362057       0.421939
5    low     poor     decrease       0.444908     0.355594       0.199498
6    low  average       stable       0.191550     0.435219       0.373231
7    low  average       stable       0.191550     0.435219       0.373231
8    low  average     decrease       0.444908     0.355594       0.199498
9   high     good     increase   



Validation Accuracy for 8000 samples: 0.5275
Test Accuracy for 8000 samples: 0.5125
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 8000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average     increase       0.265637     0.232978       0.501385
1      low  average       stable       0.154516     0.589992       0.255493
2      low  average       stable       0.166759     0.526788       0.306454
3     high  average     increase       0.265637     0.232978       0.501385
4      low     poor       stable       0.166759     0.526788       0.306454
5      low  average       stable       0.218286     0.432429       0.349286
6      low     poor       stable       0.154516     0.589992       0.255493
7     high     poor     increase       0.282754     0.198482       0.518764
8   medium     poor       stable       0.205616     0.405052       0.389332
9     high   



Validation Accuracy for 8500 samples: 0.5051
Test Accuracy for 8500 samples: 0.4957
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 8500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0     high  average     increase       0.119833     0.334487       0.545680
1     high     good     decrease       0.509121     0.254996       0.235883
2   medium     good     decrease       0.510321     0.383102       0.106577
3     high     poor     decrease       0.447641     0.284971       0.267388
4   medium  average     decrease       0.509121     0.254996       0.235883
5   medium  average       stable       0.350175     0.396339       0.253487
6     high  average     decrease       0.509121     0.254996       0.235883
7     high  average       stable       0.350175     0.396339       0.253487
8   medium     good       stable       0.126606     0.523549       0.349845
9   medium   



Validation Accuracy for 9000 samples: 0.4867
Test Accuracy for 9000 samples: 0.5052
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Predicted Results and Probabilities for 9000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0      low     poor     increase       0.085371     0.399740       0.514890
1   medium     poor     increase       0.085371     0.399740       0.514890
2     high  average     decrease       0.411834     0.237105       0.351061
3      low  average       stable       0.201882     0.463360       0.334758
4      low  average     decrease       0.482289     0.388086       0.129625
5      low  average     decrease       0.483693     0.394013       0.122293
6     high  average     increase       0.294175     0.280271       0.425554
7      low  average       stable       0.011546     0.658772       0.329682
8      low     poor       stable       0.201882     0.463360       0.334758
9      low  a



Validation Accuracy for 9500 samples: 0.4632
Test Accuracy for 9500 samples: 0.4498
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 9500 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium  average     increase       0.381421     0.194572       0.424007
1     high     poor     increase       0.371281     0.255671       0.373048
2     high     poor     increase       0.381421     0.194572       0.424007
3      low  average     increase       0.178109     0.328714       0.493177
4   medium     poor     decrease       0.427614     0.216051       0.356335
5   medium     good       stable       0.310035     0.616957       0.073009
6     high     poor       stable       0.310035     0.616957       0.073009
7      low     poor       stable       0.310035     0.616957       0.073009
8      low  average     increase       0.371281     0.255671       0.373048
9      low   



Validation Accuracy for 10000 samples: 0.4580
Test Accuracy for 10000 samples: 0.4500
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Predicted Results and Probabilities for 10000 samples (First 5 rows):
        IR       EI Predicted_SP  Prob_decrease  Prob_stable  Prob_increase
0   medium     poor     decrease       0.398184     0.326436       0.275380
1      low     poor     decrease       0.398184     0.326436       0.275380
2     high     poor     increase       0.298641     0.189217       0.512142
3     high  average     increase       0.298641     0.189217       0.512142
4      low  average     decrease       0.539115     0.299848       0.161037
5     high     poor     increase       0.325414     0.275082       0.399504
6      low     good     decrease       0.398184     0.326436       0.275380
7   medium     poor       stable       0.336777     0.370899       0.292323
8     high  average     decrease       0.398184     0.326436       0.275380
9   medium

# K-L Divergence NN Sparse Data

In [None]:
# Define the function to save K-L divergence and std dev to a file
def save_kl_divergence(sample_size, kl_div_value, std_kl_div_value, first_run=False):
    """Record one sample size's K-L divergence statistics in the results CSV.

    Args:
        sample_size: Number of samples in the dataset being reported.
        kl_div_value: Mean K-L divergence, written with 4 decimals.
        std_kl_div_value: Standard deviation of the K-L divergence, written
            with 4 decimals.
        first_run: When true, the file is recreated and starts with the
            header row; otherwise the row is appended.
    """
    file_name = 'kl_div_NN_1_10_sparse.csv'

    # A first run truncates the file and emits the header before the data row.
    rows = []
    if first_run:
        rows.append('Size,NN_Sparse_1_10_Entropy,Std_Dev\n')
    rows.append(f"{sample_size},{kl_div_value:.4f},{std_kl_div_value:.4f}\n")

    mode = 'w' if first_run else 'a'
    with open(file_name, mode) as f:
        f.writelines(rows)

# Sample sizes to loop through: 1,000 to 10,000 in steps of 1,000
sample_sizes = range(1000, 10500, 1000)

# Label -> integer encodings shared by every dataset
ir_map = {'low': 0, 'medium': 1, 'high': 2}
ei_map = {'poor': 0, 'average': 1, 'good': 2}
sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

# Lower bound for ground-truth probabilities, to avoid division by zero
epsilon = 1e-10

# Loop through each sample size
first_run = True  # Flag to indicate the first run for writing headers

for sample_size in sample_sizes:
    # Load the ground truth probabilities for the current sample size
    ground_truth_probs_file = f'probabilities_sparse_{sample_size}.csv'
    df_gt_probs = pd.read_csv(ground_truth_probs_file)

    # Load the sampled outcomes for the current sample size
    outcomes_file = f'outcomes_sparse_{sample_size}.csv'
    df = pd.read_csv(outcomes_file)

    # Manually encode categorical variables for IR, EI and SP
    df['IR_encoded'] = df['IR'].map(ir_map)
    df['EI_encoded'] = df['EI'].map(ei_map)
    df['SP_encoded'] = df['SP'].map(sp_map)
    features = df[['IR_encoded', 'EI_encoded']]

    # Every sample size has its own randomly generated ground-truth tables,
    # so the model must be trained on *this* dataset. Reusing the global
    # `nn_model` would score the model left over from the last training run
    # against distributions it never saw. The split and hyper-parameters
    # mirror the training cell (first 70% of rows, 50 epochs, batch size 32).
    X_fit, _, y_fit, _ = train_test_split(features, df['SP_encoded'], test_size=0.3, shuffle=False)
    size_model = create_nn_model(hidden_layers=1, nodes_per_layer=10)
    size_model.fit(X_fit, y_fit, epochs=50, batch_size=32, verbose=0)

    # Predicted SP probabilities (decrease, stable, increase) for every row
    predictions = size_model.predict(features)

    # Ground-truth P(SP | IR, EI) for each of the nine parent combinations,
    # looked up once per dataset instead of once per row.
    ground_truth_by_parents = {
        (ir_label, ei_label): np.clip(
            df_gt_probs[f'SP_given_IR_{ir_label}_EI_{ei_label}'].to_numpy(dtype=float),
            epsilon, 1)
        for ir_label in ir_map
        for ei_label in ei_map
    }

    # K-L divergence of the predicted distribution from the ground truth,
    # one value per row of the dataset (training rows included)
    kl_divergences = [
        entropy(predicted_probs, ground_truth_by_parents[(ir_label, ei_label)])
        for predicted_probs, ir_label, ei_label in zip(predictions, df['IR'], df['EI'])
    ]

    # Calculate the average K-L divergence over all samples for the current sample size
    average_kl_divergence = np.mean(kl_divergences)

    # Calculate the standard deviation of the K-L divergence values
    std_kl_divergence = np.std(kl_divergences)

    # Save the K-L divergence and std dev value to a CSV file
    save_kl_divergence(sample_size, average_kl_divergence, std_kl_divergence, first_run=first_run)

    # Print confirmation
    print(f"Average K-L Divergence for {sample_size} samples: {average_kl_divergence:.4f}, Std Dev: {std_kl_divergence:.4f}")

    # After the first run, set `first_run` to False
    first_run = False

# Once all sample sizes are processed, the K-L divergences and std devs will be saved in 'kl_div_NN_1_10_sparse.csv'
print("\nK-L divergence and standard deviation calculations complete.")

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Average K-L Divergence for 500 samples: 0.2801, Std Dev: 0.2647
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Average K-L Divergence for 1000 samples: 0.4282, Std Dev: 0.3127
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Average K-L Divergence for 1500 samples: 0.3050, Std Dev: 0.3148
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Average K-L Divergence for 2000 samples: 0.2219, Std Dev: 0.1781
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Average K-L Divergence for 2500 samples: 0.0796, Std Dev: 0.0306
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Average K-L Divergence for 3000 samples: 0.2201, Std Dev: 0.2397
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Average K-L Divergence for 3500 samples: 0.6165, Std Dev: 0.5699
[1m125/125[0m [32m━━━━━━━━━━