<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/NN_Sparse_1_3_Relu_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
pip install pgmpy

Collecting pgmpy
  Downloading pgmpy-0.1.26-py3-none-any.whl.metadata (9.1 kB)
Downloading pgmpy-0.1.26-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pgmpy
Successfully installed pgmpy-0.1.26


In [2]:
import numpy as np
import pandas as pd
from pgmpy.estimators import HillClimbSearch, BicScore, MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import BayesianEstimator
from sklearn.model_selection import train_test_split
from scipy.stats import entropy
from tabulate import tabulate

from tensorflow.keras import models, layers, regularizers, callbacks
from sklearn.model_selection import train_test_split

# Bayesian Network Data Generation 50, ..., 20000 Samples (sparse)

In [27]:
# Function to generate random CPDs for the sparse structure: IR -> {EI, IRT, MS, GEO}, and all five -> SP
def generate_cpds_sparse_5_total_nodes(rng=None):
    """Draw random conditional probability tables (CPDs) for the six-node network.

    Every node has 3 states. In each returned table, axis 0 indexes the node's
    own state and the remaining axes index its parents' states, so every slice
    along axis 0 sums to 1.

    Parameters
    ----------
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global generator is used
        (the original behaviour), so ``np.random.seed`` still controls the draws.

    Returns
    -------
    tuple of numpy.ndarray
        ``(ir_probs, ei_given_ir_probs, irt_given_ir_probs, ms_given_ir_probs,
        geo_given_ir_probs, sp_probs)`` with shapes ``(3,)``, four times
        ``(3, 3)`` and ``(3, 3, 3, 3, 3, 3)``.
    """
    # Both Generator.random and RandomState.random_sample take a shape tuple;
    # random_sample consumes the global stream exactly like np.random.rand.
    draw = np.random.random_sample if rng is None else rng.random

    def _random_cpd(*shape):
        # Uniform random weights, normalised over the node's own states (axis 0).
        table = draw(shape)
        return table / table.sum(axis=0, keepdims=True)

    # IR is the only root node: a plain 3-state marginal.
    ir_probs = _random_cpd(3)

    # EI, IRT, MS and GEO each depend on IR only: column j is P(node | IR = j).
    ei_given_ir_probs = _random_cpd(3, 3)
    irt_given_ir_probs = _random_cpd(3, 3)
    ms_given_ir_probs = _random_cpd(3, 3)
    geo_given_ir_probs = _random_cpd(3, 3)

    # SP depends on all five other nodes: sp_probs[:, ir, ei, irt, ms, geo] is one distribution.
    sp_probs = _random_cpd(3, 3, 3, 3, 3, 3)

    return ir_probs, ei_given_ir_probs, irt_given_ir_probs, ms_given_ir_probs, geo_given_ir_probs, sp_probs

# Function to draw samples from the network (IR first, then its children, then SP) and save them as CSV
def generate_and_save_samples_sparse_5_total_nodes(ir_probs, ei_probs, irt_probs, ms_probs, geo_probs, sp_probs, sample_size, filename):
    """Sample `sample_size` rows from the given CPDs, write them to `filename`, and preview them.

    IR is drawn from `ir_probs`; EI, IRT, MS and GEO are each drawn from the
    column of their table selected by the sampled IR state; SP is drawn from
    `sp_probs[:, ir, ei, irt, ms, geo]`. Each row stores the state labels, the
    SP distribution that was used (4 decimals) and the sampled SP label.
    The function returns nothing; its results are the CSV file and the printed table.
    """
    ir_labels = ('low', 'medium', 'high')
    sp_labels = ('decrease', 'stable', 'increase')
    sp_column = 'SP_Probabilities (decrease, stable, increase)'

    # (output column, CPD conditioned on IR, state labels) in sampling order.
    child_specs = (
        ('EI_State', ei_probs, ('poor', 'average', 'good')),
        ('IRT_State', irt_probs, ('weak', 'moderate', 'strong')),
        ('MS_State', ms_probs, ('low', 'medium', 'high')),
        ('GEO_State', geo_probs, ('urban', 'suburban', 'rural')),
    )

    rows = []
    for _ in range(sample_size):
        # Root node first.
        ir_idx = np.random.choice(3, p=ir_probs)
        row = {'IR_State': ir_labels[ir_idx]}

        # Children of IR, each drawn from the column selected by the IR state.
        child_indices = []
        for column, cpd, labels in child_specs:
            idx = np.random.choice(3, p=cpd[:, ir_idx])
            child_indices.append(idx)
            row[column] = labels[idx]

        # SP distribution for this exact parent configuration.
        sp_distribution = sp_probs[(slice(None), ir_idx, *child_indices)]
        sp_idx = np.random.choice(3, p=sp_distribution)

        row[sp_column] = ', '.join(f'{value:.4f}' for value in sp_distribution)
        row['Chosen_SP_State'] = sp_labels[sp_idx]
        rows.append(row)

    samples = pd.DataFrame(rows)

    # Persist the samples, then show the head for visual confirmation.
    samples.to_csv(filename, index=False)
    print(f"\nSample size: {sample_size} - First few rows of generated samples:\n")
    print(tabulate(samples.head(), headers='keys', tablefmt='grid'))

# Generate and save samples for sample sizes
sample_sizes = [50, 100, 500, 1000, 5000, 10000, 15000, 20000]

# Seed NumPy's global generator so that re-running this cell on a fresh kernel
# regenerates the same CPDs and therefore the same CSV files.
DATA_SEED = 42
np.random.seed(DATA_SEED)

for size in sample_sizes:
    # NOTE(review): a fresh set of random CPDs is drawn for every sample size, so each CSV is
    # sampled from a different ground-truth network. If the study is meant to isolate the effect
    # of sample size, draw the CPDs once before this loop instead — confirm the intent.
    ir_probs, ei_probs, irt_probs, ms_probs, geo_probs, sp_probs = generate_cpds_sparse_5_total_nodes()
    generate_and_save_samples_sparse_5_total_nodes(ir_probs, ei_probs, irt_probs, ms_probs, geo_probs, sp_probs, size, f'combined_probabilities_{size}.csv')

print("\nGeneration and saving of individual samples complete for all sample sizes!")


Sample size: 50 - First few rows of generated samples:

+----+------------+------------+-------------+------------+-------------+-------------------------------------------------+-------------------+
|    | IR_State   | EI_State   | IRT_State   | MS_State   | GEO_State   | SP_Probabilities (decrease, stable, increase)   | Chosen_SP_State   |
|  0 | medium     | good       | strong      | medium     | suburban    | 0.0527, 0.6158, 0.3315                          | stable            |
+----+------------+------------+-------------+------------+-------------+-------------------------------------------------+-------------------+
|  1 | low        | good       | weak        | low        | suburban    | 0.2405, 0.6238, 0.1357                          | stable            |
+----+------------+------------+-------------+------------+-------------+-------------------------------------------------+-------------------+
|  2 | medium     | good       | strong      | medium     | suburban    | 0.052

# NN & KL-Div

In [28]:
# Sample sizes to loop through
sample_sizes = [50, 100, 500, 1000, 5000, 10000, 15000, 20000]


# Define the Neural Network architecture with L2 regularization
def create_nn_model(hidden_layers=1, nodes_per_layer=3, l2_lambda=0.01,
                    input_dim=5, num_classes=3, dropout_rate=0.2):
    """Build and compile a small fully connected softmax classifier.

    Parameters
    ----------
    hidden_layers : int
        Number of (Dense + Dropout) hidden blocks.
    nodes_per_layer : int
        Units in every hidden Dense layer.
    l2_lambda : float
        L2 penalty applied to the kernels of the hidden and output layers.
    input_dim : int
        Number of input features (default 5: IR, EI, IRT, MS, GEO).
    num_classes : int
        Number of output classes (default 3: decrease, stable, increase).
    dropout_rate : float
        Dropout rate applied after each hidden layer.

    Returns
    -------
    A compiled Sequential model using the Adam optimizer,
    sparse categorical cross-entropy loss and an accuracy metric.
    """
    model = models.Sequential()
    # `Input(shape=...)` replaces `InputLayer(input_shape=...)`, whose `input_shape`
    # argument is deprecated in Keras 3.
    model.add(layers.Input(shape=(input_dim,)))

    # Hidden layers with L2 regularization and Dropout
    for layer_num in range(hidden_layers):
        model.add(layers.Dense(
            nodes_per_layer,
            activation='relu',
            kernel_regularizer=regularizers.l2(l2_lambda),  # L2 regularization
            name=f"hidden_layer_{layer_num + 1}"
        ))
        model.add(layers.Dropout(dropout_rate))  # Dropout layer to reduce overfitting

    # Output layer (one unit per class) with L2 regularization
    model.add(layers.Dense(
        num_classes,
        activation='softmax',
        kernel_regularizer=regularizers.l2(l2_lambda),
        name="output_layer"
    ))

    # Sparse loss: labels are integer class ids, not one-hot vectors.
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

# Prepare a dictionary to store extracted data for each sample size
extracted_data = {}

# Columns taken from every generated CSV
required_columns = ['IR_State', 'EI_State', 'IRT_State', 'MS_State', 'GEO_State', 'Chosen_SP_State']

# Label -> integer code for IR, EI, IRT, MS, GEO, and SP (built once; identical for every sample size)
ir_map = {'low': 0, 'medium': 1, 'high': 2}
ei_map = {'poor': 0, 'average': 1, 'good': 2}
irt_map = {'weak': 0, 'moderate': 1, 'strong': 2}
ms_map = {'low': 0, 'medium': 1, 'high': 2}
geo_map = {'urban': 0, 'suburban': 1, 'rural': 2}
sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

# encoded column -> (source column, mapping)
column_encoders = {
    'IR_encoded': ('IR_State', ir_map),
    'EI_encoded': ('EI_State', ei_map),
    'IRT_encoded': ('IRT_State', irt_map),
    'MS_encoded': ('MS_State', ms_map),
    'GEO_encoded': ('GEO_State', geo_map),
    'SP_encoded': ('Chosen_SP_State', sp_map),
}

# Extract the required columns from all sample sizes first
for size in sample_sizes:
    outcomes_file = f'combined_probabilities_{size}.csv'
    df = pd.read_csv(outcomes_file)

    # Explicit copy: the encoded columns are added to an independent frame rather than to a
    # selection of `df`, which avoids pandas' SettingWithCopyWarning.
    df_extracted = df[required_columns].copy()

    for encoded_column, (source_column, mapping) in column_encoders.items():
        encoded = df_extracted[source_column].map(mapping)

        # `.map` turns unknown labels into NaN; fail loudly instead of training on NaN.
        unmapped = encoded.isna()
        if unmapped.any():
            unknown_labels = sorted(df_extracted.loc[unmapped, source_column].astype(str).unique())
            raise ValueError(
                f"{outcomes_file}: unexpected value(s) {unknown_labels} in column '{source_column}'"
            )

        df_extracted[encoded_column] = encoded.astype(int)

    extracted_data[size] = df_extracted

# Initialize list to store K-L divergence and standard deviation results
results = []
epsilon = 1e-10  # Small value for smoothing

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch shuffling
# are repeatable on a fresh kernel (the train/val/test splits are already seeded below).
from tensorflow.keras import utils as keras_utils
keras_utils.set_random_seed(42)

feature_columns = ['IR_encoded', 'EI_encoded', 'IRT_encoded', 'MS_encoded', 'GEO_encoded']
display_columns = ['IR_State', 'EI_State', 'IRT_State', 'MS_State', 'GEO_State', 'Chosen_SP_State']

# Position in this list == integer class id used in 'SP_encoded'
sp_reverse_map = ['decrease', 'stable', 'increase']
# Always compare distributions over all three classes, even if one is absent from a small test split
all_categories = list(range(len(sp_reverse_map)))

for size in sample_sizes:
    df = extracted_data[size]

    # Features (IR, EI, IRT, MS, GEO) and labels (SP)
    X = df[feature_columns]
    y = df['SP_encoded']

    # Split into training (70%), validation (15%), and test (15%) sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=True, random_state=42)

    # Create and train the Neural Network model
    nn_model = create_nn_model(hidden_layers=1, nodes_per_layer=3, l2_lambda=0.01)
    early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    nn_model.fit(X_train, y_train, epochs=25, batch_size=16, validation_data=(X_val, y_val), callbacks=[early_stopping], verbose=0)

    # Evaluate model accuracy
    train_loss, train_accuracy = nn_model.evaluate(X_train, y_train, verbose=0)
    val_loss, val_accuracy = nn_model.evaluate(X_val, y_val, verbose=0)
    test_loss, test_accuracy = nn_model.evaluate(X_test, y_test, verbose=0)

    print(f"\nSample size: {size}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # Predict on test data (verbose=0 keeps the progress bar out of the cell output)
    predictions = nn_model.predict(X_test, verbose=0)
    predicted_classes = predictions.argmax(axis=1)

    # Class frequencies of the true labels and of the arg-max predictions, aligned on the
    # same three classes; classes with no occurrences are smoothed with `epsilon`.
    ground_truth_probabilities = (
        y_test.value_counts(normalize=True).reindex(all_categories, fill_value=epsilon)
    )
    predicted_probabilities = (
        pd.Series(predicted_classes).value_counts(normalize=True).reindex(all_categories, fill_value=epsilon)
    )

    # Calculate K-L divergence and standard deviation
    # NOTE(review): when the network never predicts a class that occurs in y_test, that class's
    # term is p * ln(p / epsilon), so the reported divergence is driven by the choice of epsilon
    # rather than by the data. Comparing against the mean soft-max output
    # (predictions.mean(axis=0)) would avoid this — confirm which quantity the study needs.
    kl_divergence = entropy(pk=ground_truth_probabilities, qk=predicted_probabilities)
    std_dev = np.std(predicted_probabilities - ground_truth_probabilities)

    results.append({
        'Sample_Size': size,
        'K-L_Divergence': kl_divergence,
        'Standard_Deviation': std_dev
    })

    print(f"K-L Divergence: {kl_divergence:.4f}")
    print(f"Standard Deviation: {std_dev:.4f}")

    # Map integers back to the original SP labels
    predicted_labels = [sp_reverse_map[label] for label in predicted_classes]

    # Create DataFrame for displaying nodes, predicted SP, and chosen SP.
    # X_test.index holds index *labels*, so select with .loc; .iloc would only be right
    # while labels and positions happen to coincide.
    result_df = (
        df.loc[X_test.index, display_columns]
        .rename(columns={'Chosen_SP_State': 'Chosen_SP'})
        .assign(Predicted_SP=predicted_labels)
    )
    print(f"\nPredicted Results for {size} samples (First 10 rows):")
    print(result_df.head(10))

    # Save results for this sample size in a dedicated CSV
    result_df.to_csv(f'test_results_{size}.csv', index=False)

# Save only K-L and Standard Deviation results to a summary file
results_df = pd.DataFrame(results)
results_df.to_csv('kl_std_results_summary.csv', index=False)

print("\nAll K-L divergence and standard deviation results have been saved in 'kl_std_results_summary.csv'.")




Sample size: 50
Training Accuracy: 0.4571
Validation Accuracy: 0.5714
Test Accuracy: 0.2500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
K-L Divergence: 8.1443
Standard Deviation: 0.3680

Predicted Results for 50 samples (First 10 rows):
   IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
19      low     poor  moderate   medium     rural    stable       stable
4    medium  average    strong   medium  suburban    stable     decrease
13   medium     good  moderate      low     rural  decrease       stable
8       low     good  moderate   medium     rural  increase       stable
48   medium     good  moderate      low     urban  decrease     decrease
32   medium     poor  moderate   medium     urban  increase       stable
30   medium     good    strong   medium  suburban  increase       stable
39   medium     good  moderate   medium  suburban  decrease       stable





Sample size: 100
Training Accuracy: 0.2714
Validation Accuracy: 0.4000
Test Accuracy: 0.4000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
K-L Divergence: 4.6611
Standard Deviation: 0.3810

Predicted Results for 100 samples (First 10 rows):
   IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
96     high     poor  moderate      low  suburban  decrease     decrease
4      high  average  moderate      low     rural    stable     decrease
42     high  average  moderate     high  suburban    stable     decrease
77     high  average      weak   medium  suburban  decrease     decrease
10     high     poor  moderate     high     rural  decrease     decrease
0       low     poor    strong     high     rural  decrease     decrease
9      high  average      weak   medium  suburban  decrease     decrease
69     high  average  moderate     high     rural    stable     decrease
73      low  average  moderate      low     rural  decrease     decrease
83




Sample size: 500
Training Accuracy: 0.3829
Validation Accuracy: 0.4533
Test Accuracy: 0.3467
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
K-L Divergence: 0.0149
Standard Deviation: 0.0576

Predicted Results for 500 samples (First 10 rows):
    IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
290      low     good      weak     high     urban  increase     decrease
316     high     poor  moderate   medium     urban    stable     decrease
117     high     poor    strong      low  suburban  increase       stable
455     high     good  moderate   medium     urban    stable     decrease
268   medium     poor  moderate     high  suburban    stable     increase
336     high     poor  moderate      low  suburban  increase       stable
79       low  average      weak      low     rural  increase     increase
208     high     good  moderate   medium  suburban  increase     decrease
238   medium  average      weak   medium     rural  decrease      




Sample size: 1000
Training Accuracy: 0.4100
Validation Accuracy: 0.4067
Test Accuracy: 0.3200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
K-L Divergence: 0.6502
Standard Deviation: 0.3544

Predicted Results for 1000 samples (First 10 rows):
    IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
557   medium     poor  moderate   medium     urban    stable     increase
798   medium     poor  moderate   medium     rural  increase     increase
977      low     good      weak   medium     urban  increase     increase
136      low  average    strong   medium     urban    stable     increase
575   medium     good  moderate   medium     urban  decrease     increase
544      low     good      weak      low     rural    stable     decrease
332      low     good  moderate     high     urban    stable     increase
917   medium     poor  moderate     high     urban  decrease     increase
678   medium     poor  moderate   medium  suburban    stable    




Sample size: 5000
Training Accuracy: 0.3831
Validation Accuracy: 0.3680
Test Accuracy: 0.3960
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
K-L Divergence: 12.8182
Standard Deviation: 0.4272

Predicted Results for 5000 samples (First 10 rows):
     IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
790    medium     poor    strong      low  suburban  increase       stable
2879   medium     poor    strong   medium     urban  decrease       stable
2372   medium     poor    strong      low     urban  decrease       stable
1351   medium     good    strong      low  suburban  decrease       stable
3382      low     good  moderate   medium  suburban    stable       stable
3433      low  average    strong      low  suburban  increase       stable
1129   medium     good    strong      low  suburban  decrease       stable
549    medium     poor  moderate      low  suburban  decrease       stable
2835   medium     good  moderate      low  suburban  




Sample size: 10000
Training Accuracy: 0.3967
Validation Accuracy: 0.3833
Test Accuracy: 0.3840
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
K-L Divergence: 7.0338
Standard Deviation: 0.2714

Predicted Results for 10000 samples (First 10 rows):
     IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
2697     high  average      weak      low     rural  increase     decrease
6871   medium  average      weak     high     rural  decrease       stable
3487     high     good  moderate   medium     rural  decrease       stable
92       high     good    strong      low     rural    stable       stable
9537   medium     poor      weak   medium     rural  decrease     decrease
3205     high     poor      weak   medium  suburban    stable     decrease
6641     high  average      weak   medium     rural  decrease     decrease
8909     high  average  moderate     high     rural  increase       stable
2884     high  average      weak     high     rural 




Sample size: 15000
Training Accuracy: 0.3667
Validation Accuracy: 0.3573
Test Accuracy: 0.3773
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
K-L Divergence: 7.1294
Standard Deviation: 0.3778

Predicted Results for 15000 samples (First 10 rows):
      IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
8602       low     good    strong   medium     rural  decrease       stable
438        low     poor    strong     high  suburban  increase       stable
8094      high     good    strong      low  suburban  increase       stable
14355     high     good    strong   medium  suburban  increase       stable
8581       low     good  moderate   medium  suburban    stable       stable
12358      low     poor  moderate     high     rural  increase       stable
511        low     good    strong     high     urban  increase       stable
6594       low  average    strong     high     rural  increase       stable
5245       low  average  moderate     high 




Sample size: 20000
Training Accuracy: 0.3764
Validation Accuracy: 0.3763
Test Accuracy: 0.3657
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
K-L Divergence: 7.1880
Standard Deviation: 0.3933

Predicted Results for 20000 samples (First 10 rows):
      IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
5348    medium  average      weak      low  suburban    stable     increase
339     medium     good    strong     high     rural  increase     increase
13591     high     good      weak     high     urban  increase     increase
8153    medium     good      weak   medium     rural  increase     increase
16345     high     poor  moderate   medium  suburban  decrease     increase
16404      low     poor  moderate     high  suburban    stable     increase
17185   medium  average    strong   medium     rural  increase     increase
5709       low     good    strong   medium     rural  decrease     increase
13020   medium     poor    strong   medium 