<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/NN_Sparse_1_3_Relu_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [3]:
pip install pgmpy



In [4]:
import numpy as np
import pandas as pd
from pgmpy.estimators import HillClimbSearch, BicScore, MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import BayesianEstimator
from sklearn.model_selection import train_test_split
from scipy.stats import entropy
from tabulate import tabulate

from tensorflow.keras import models, layers, regularizers, callbacks
from sklearn.model_selection import train_test_split

# Bayesian Network Data Generation: 50, ..., 20000 Samples (sparse)

In [11]:
# Function to generate CPDs for the sparse structure: 5 parent nodes (IR, EI, IRT, MS, GEO) influencing SP, 6 nodes in total
def generate_cpds_sparse_6_total_nodes():
    """Draw a random set of conditional probability tables for the 6-node network.

    Every node has three states. Each table is filled with uniform random
    numbers and then normalised along axis 0, so for any fixed configuration
    of the conditioning nodes the probabilities over the node's own states
    sum to 1.

    Returns:
        tuple: ``(ir_probs, ei_given_ir_probs, irt_given_ei_probs,
        ms_given_irt_probs, geo_given_ms_probs, sp_probs)`` where
        ``ir_probs`` has shape ``(3,)``, the four pairwise tables have shape
        ``(3, 3)`` (child state on axis 0, parent state on axis 1) and
        ``sp_probs`` has shape ``(3, 3, 3, 3, 3, 3)`` (SP state on axis 0,
        followed by one axis per parent node).
    """
    n_states = 3
    pairwise = (n_states, n_states)

    # One shape per table, in the order the tables are drawn and returned.
    table_shapes = (
        (n_states,),      # P(IR)
        pairwise,         # P(EI | IR)
        pairwise,         # P(IRT | EI)
        pairwise,         # P(MS | IRT)
        pairwise,         # P(GEO | MS)
        (n_states,) * 6,  # P(SP | five parent nodes)
    )

    cpds = []
    for shape in table_shapes:
        raw = np.random.rand(*shape)
        # Normalise over the node's own states (axis 0).
        cpds.append(raw / raw.sum(axis=0, keepdims=True))

    return tuple(cpds)

# Function to generate and save samples with the sparse structure of 6 nodes total
def generate_and_save_samples_sparse_6_total_nodes(ir_probs, ei_probs, irt_probs, ms_probs, geo_probs, sp_probs, sample_size, filename):
    """Sample rows from the 6-node network, write them to CSV and print a preview.

    Nodes are sampled in the order IR, EI, IRT, MS, GEO, SP. Each of
    EI/IRT/MS/GEO is drawn from the column of its table selected by the
    previously sampled node; SP is drawn from the slice of ``sp_probs``
    selected by all five sampled parent states.

    Args:
        ir_probs: length-3 probability vector for IR.
        ei_probs: 3x3 table, indexed ``[ei_state, ir_state]``.
        irt_probs: 3x3 table, indexed ``[irt_state, ei_state]``.
        ms_probs: 3x3 table, indexed ``[ms_state, irt_state]``.
        geo_probs: 3x3 table, indexed ``[geo_state, ms_state]``.
        sp_probs: 6-D table, indexed
            ``[sp_state, ir, ei, irt, ms, geo]``.
        sample_size: number of rows to generate.
        filename: path of the CSV file to write.

    Returns:
        None. The samples are written to ``filename`` and the first rows
        are printed as a grid table.
    """
    # Human-readable labels, indexed by the sampled state index.
    ir_labels = ('low', 'medium', 'high')
    ei_labels = ('poor', 'average', 'good')
    irt_labels = ('weak', 'moderate', 'strong')
    ms_labels = ('low', 'medium', 'high')
    geo_labels = ('urban', 'suburban', 'rural')
    sp_labels = ('decrease', 'stable', 'increase')

    records = []
    for _ in range(sample_size):
        # Walk the chain: each draw conditions on the previously drawn state.
        ir_idx = np.random.choice(3, p=ir_probs)
        ei_idx = np.random.choice(3, p=ei_probs[:, ir_idx])
        irt_idx = np.random.choice(3, p=irt_probs[:, ei_idx])
        ms_idx = np.random.choice(3, p=ms_probs[:, irt_idx])
        geo_idx = np.random.choice(3, p=geo_probs[:, ms_idx])

        # SP distribution for this exact combination of parent states.
        sp_distribution = sp_probs[:, ir_idx, ei_idx, irt_idx, ms_idx, geo_idx]
        sp_idx = np.random.choice(3, p=sp_distribution)

        records.append({
            'IR_State': ir_labels[ir_idx],
            'EI_State': ei_labels[ei_idx],
            'IRT_State': irt_labels[irt_idx],
            'MS_State': ms_labels[ms_idx],
            'GEO_State': geo_labels[geo_idx],
            'SP_Probabilities (decrease, stable, increase)': ', '.join(map('{:.4f}'.format, sp_distribution)),
            'Chosen_SP_State': sp_labels[sp_idx],
        })

    samples_df = pd.DataFrame(records)

    # Persist the samples (without the row index).
    samples_df.to_csv(filename, index=False)

    # Show the first few rows for visual confirmation.
    print(f"\nSample size: {sample_size} - First few rows of generated samples:\n")
    print(tabulate(samples_df.head(), headers='keys', tablefmt='grid'))

# Generate and save samples for sample sizes

# Seed NumPy's global RNG so the random CPDs and the sampled rows (and hence
# every downstream result) are reproducible across kernel restarts.
np.random.seed(42)

sample_sizes = [50, 100, 500, 1000, 5000, 10000, 15000, 20000]

for size in sample_sizes:
    # NOTE(review): a fresh random set of CPDs is drawn for every sample size,
    # so each CSV is sampled from a different network. If results are meant to
    # be compared across sample sizes on the same network, draw the CPDs once
    # before this loop instead - confirm the intended design.
    (ir_probs, ei_probs, irt_probs, ms_probs, geo_probs, sp_probs) = generate_cpds_sparse_6_total_nodes()
    generate_and_save_samples_sparse_6_total_nodes(ir_probs, ei_probs, irt_probs, ms_probs, geo_probs, sp_probs, size, f'combined_probabilities_{size}.csv')

print("\nGeneration and saving of individual samples complete for all sample sizes!")


Sample size: 50 - First few rows of generated samples:

+----+------------+------------+-------------+------------+-------------+-------------------------------------------------+-------------------+
|    | IR_State   | EI_State   | IRT_State   | MS_State   | GEO_State   | SP_Probabilities (decrease, stable, increase)   | Chosen_SP_State   |
|  0 | low        | average    | weak        | high       | urban       | 0.4110, 0.2860, 0.3030                          | increase          |
+----+------------+------------+-------------+------------+-------------+-------------------------------------------------+-------------------+
|  1 | low        | good       | moderate    | medium     | rural       | 0.3282, 0.6537, 0.0181                          | stable            |
+----+------------+------------+-------------+------------+-------------+-------------------------------------------------+-------------------+
|  2 | medium     | poor       | moderate    | low        | suburban    | 0.240

# NN & KL-Div

In [16]:
# Sample sizes to loop through. Each size is used below to build the file name
# 'combined_probabilities_{size}.csv', so a CSV with that name must exist.
sample_sizes = [50, 100, 500, 1000, 5000, 10000, 15000, 20000]
# Define the Neural Network architecture with L2 regularization
def create_nn_model(hidden_layers=1, nodes_per_layer=3, l2_lambda=0.01,
                    input_dim=5, num_classes=3, dropout_rate=0.2):
    """Build and compile a small fully connected softmax classifier.

    Args:
        hidden_layers: number of hidden Dense layers.
        nodes_per_layer: units in each hidden Dense layer.
        l2_lambda: L2 penalty applied to the kernel of every Dense layer.
        input_dim: number of input features (default 5: IR, EI, IRT, MS, GEO).
        num_classes: number of output classes (default 3: decrease, stable,
            increase).
        dropout_rate: dropout rate applied after each hidden layer.

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer,
        sparse categorical cross-entropy loss and an accuracy metric, so
        labels are expected as integer class indices.
    """
    model = models.Sequential()
    # `layers.Input(shape=...)` replaces `InputLayer(input_shape=...)`, whose
    # `input_shape` argument is deprecated in Keras 3.
    model.add(layers.Input(shape=(input_dim,)))

    # Hidden layers with L2 regularization, each followed by Dropout
    for layer_num in range(hidden_layers):
        model.add(layers.Dense(
            nodes_per_layer,
            activation='relu',
            kernel_regularizer=regularizers.l2(l2_lambda),  # L2 regularization
            name=f"hidden_layer_{layer_num + 1}"
        ))
        model.add(layers.Dropout(dropout_rate))  # Dropout layer to reduce overfitting

    # Output layer (one unit per class) with L2 regularization
    model.add(layers.Dense(
        num_classes,
        activation='softmax',
        kernel_regularizer=regularizers.l2(l2_lambda),
        name="output_layer"
    ))

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

# Prepare a dictionary to store extracted data for each sample size
extracted_data = {}

# Columns pulled from every generated CSV (the five parent nodes and the SP label)
required_columns = ['IR_State', 'EI_State', 'IRT_State', 'MS_State', 'GEO_State','Chosen_SP_State']

# Integer encodings for IR, EI, IRT, MS, GEO, and SP. These do not depend on
# the sample size, so they are defined once outside the loop.
ir_map = {'low': 0, 'medium': 1, 'high': 2}
ei_map = {'poor': 0, 'average': 1, 'good': 2}
irt_map = {'weak': 0, 'moderate': 1, 'strong': 2}
ms_map = {'low': 0, 'medium': 1, 'high': 2}
geo_map = {'urban': 0, 'suburban': 1, 'rural': 2}
sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

encoded_columns = ['IR_encoded', 'EI_encoded', 'IRT_encoded', 'MS_encoded', 'GEO_encoded', 'SP_encoded']

# Extract the required columns from all sample sizes first
for size in sample_sizes:
    outcomes_file = f'combined_probabilities_{size}.csv'
    df = pd.read_csv(outcomes_file)

    # Take an explicit copy: adding columns to a slice of `df` raises
    # SettingWithCopyWarning and may not write where intended.
    df_extracted = df[required_columns].copy()

    df_extracted['IR_encoded'] = df_extracted['IR_State'].map(ir_map)
    df_extracted['EI_encoded'] = df_extracted['EI_State'].map(ei_map)
    df_extracted['IRT_encoded'] = df_extracted['IRT_State'].map(irt_map)
    df_extracted['MS_encoded'] = df_extracted['MS_State'].map(ms_map)
    df_extracted['GEO_encoded'] = df_extracted['GEO_State'].map(geo_map)
    df_extracted['SP_encoded'] = df_extracted['Chosen_SP_State'].map(sp_map)

    # `.map` turns any label missing from its dictionary into NaN; fail here
    # rather than passing NaN features/labels on to model training.
    if df_extracted[encoded_columns].isna().any().any():
        raise ValueError(f"Unrecognised state label found in '{outcomes_file}'")

    extracted_data[size] = df_extracted

# Initialize list to store K-L divergence and standard deviation results
results = []
epsilon = 1e-10  # Small value for smoothing

# Integer class index -> SP label (inverse of the SP encoding); loop-invariant
sp_reverse_map = ['decrease', 'stable', 'increase']

# Original (string) columns reported next to the prediction for each test row
report_columns = ['IR_State', 'EI_State', 'IRT_State', 'MS_State', 'GEO_State', 'Chosen_SP_State']

for size in sample_sizes:
    df = extracted_data[size]

    # Features (IR, EI, IRT, MS, GEO) and labels (SP)
    X = df[['IR_encoded', 'EI_encoded', 'IRT_encoded', 'MS_encoded', 'GEO_encoded']]
    y = df['SP_encoded']

    # Split into training, validation, and test sets (70% / 15% / 15%)
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=True, random_state=42)

    # Create and train the Neural Network model
    nn_model = create_nn_model(hidden_layers=1, nodes_per_layer=3, l2_lambda=0.01)
    early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    nn_model.fit(X_train, y_train, epochs=25, batch_size=16, validation_data=(X_val, y_val), callbacks=[early_stopping], verbose=0)

    # Evaluate model accuracy
    train_loss, train_accuracy = nn_model.evaluate(X_train, y_train, verbose=0)
    val_loss, val_accuracy = nn_model.evaluate(X_val, y_val, verbose=0)
    test_loss, test_accuracy = nn_model.evaluate(X_test, y_test, verbose=0)

    print(f"\nSample size: {size}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # Predict on test data (verbose=0 keeps progress bars out of the cell output)
    predictions = nn_model.predict(X_test, verbose=0)
    predicted_classes = predictions.argmax(axis=1)

    # Calculate ground truth and predicted probabilities
    # NOTE(review): both distributions are class frequencies over the test set
    # (true labels vs. argmax predictions), not per-row conditional
    # probabilities. When the model never predicts a class, that class gets
    # probability `epsilon` below and the K-L value becomes very large -
    # confirm this is the intended metric.
    ground_truth_probabilities = y_test.value_counts(normalize=True).sort_index()
    predicted_probabilities = pd.Series(predicted_classes).value_counts(normalize=True).sort_index()

    # Reindex both distributions and add smoothing
    all_categories = sorted(set(ground_truth_probabilities.index).union(set(predicted_probabilities.index)))
    ground_truth_probabilities = ground_truth_probabilities.reindex(all_categories, fill_value=epsilon)
    predicted_probabilities = predicted_probabilities.reindex(all_categories, fill_value=epsilon)

    # Calculate K-L divergence and standard deviation
    kl_divergence = entropy(pk=ground_truth_probabilities, qk=predicted_probabilities)
    std_dev = np.std(predicted_probabilities - ground_truth_probabilities)

    results.append({
        'Sample_Size': size,
        'K-L_Divergence': kl_divergence,
        'Standard_Deviation': std_dev
    })

    print(f"K-L Divergence: {kl_divergence:.4f}")
    print(f"Standard Deviation: {std_dev:.4f}")

    # Map integers back to the original SP labels
    predicted_labels = [sp_reverse_map[label] for label in predicted_classes]

    # Create DataFrame for displaying nodes, predicted SP, and chosen SP.
    # X_test.index holds index *labels*, so select with .loc; .iloc would treat
    # them as positions and only works while the index is a default RangeIndex.
    result_df = (
        df.loc[X_test.index, report_columns]
        .rename(columns={'Chosen_SP_State': 'Chosen_SP'})
        .assign(Predicted_SP=predicted_labels)
    )
    print(f"\nPredicted Results for {size} samples (First 10 rows):")
    print(result_df.head(10))

    # Save results for this sample size in a dedicated CSV
    result_df.to_csv(f'test_results_{size}.csv', index=False)

# Save only K-L and Standard Deviation results to a summary file
results_df = pd.DataFrame(results)
results_df.to_csv('kl_std_results_summary.csv', index=False)

print("\nAll K-L divergence and standard deviation results have been saved in 'kl_std_results_summary.csv'.")




Sample size: 50
Training Accuracy: 0.2571
Validation Accuracy: 0.4286
Test Accuracy: 0.2500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
K-L Divergence: 2.9163
Standard Deviation: 0.3680

Predicted Results for 50 samples (First 10 rows):
   IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
19     high     good  moderate      low     urban  decrease       stable
4       low     good    strong   medium  suburban  decrease       stable
13      low  average    strong   medium     rural  decrease       stable
8    medium     good  moderate      low     urban  decrease       stable
48      low     poor    strong      low  suburban    stable     decrease
32      low     poor  moderate     high     urban  decrease     decrease
30      low     good      weak      low  suburban  increase       stable
39      low  average      weak      low  suburban    stable       stable





Sample size: 100
Training Accuracy: 0.2286
Validation Accuracy: 0.3333
Test Accuracy: 0.3333
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
K-L Divergence: 5.0582
Standard Deviation: 0.4838

Predicted Results for 100 samples (First 10 rows):
   IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
96     high     good  moderate     high     urban  decrease     increase
4      high     good    strong   medium     rural    stable     increase
42     high     good  moderate     high     urban  decrease     increase
77     high     good    strong     high     rural  decrease     increase
10     high     good    strong      low     urban    stable     increase
0      high     poor  moderate   medium     urban    stable     increase
9      high     good  moderate   medium  suburban    stable     increase
69   medium  average    strong     high     urban    stable     increase
73      low     poor    strong   medium     rural  increase     increase
83




Sample size: 500
Training Accuracy: 0.2886
Validation Accuracy: 0.3200
Test Accuracy: 0.2933
[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 47ms/step



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
K-L Divergence: 0.0692
Standard Deviation: 0.1147

Predicted Results for 500 samples (First 10 rows):
    IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
290      low  average    strong      low     rural  increase       stable
316   medium     poor  moderate      low     rural    stable     decrease
117      low  average      weak      low     rural  increase     decrease
455      low  average    strong   medium     rural  increase       stable
268   medium     poor    strong      low     urban  increase     increase
336      low     good    strong   medium     urban    stable     increase
79    medium     poor  moderate   medium     rural    stable     decrease
208      low     good  moderate      low  suburban  decrease       stable
238      low     poor    strong   medium  suburban  decrease     decrease
477      low     good    strong      low     rur




Sample size: 1000
Training Accuracy: 0.3643
Validation Accuracy: 0.3000
Test Accuracy: 0.3533
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step




K-L Divergence: 0.5729
Standard Deviation: 0.2151

Predicted Results for 1000 samples (First 10 rows):
    IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
557      low     good    strong   medium  suburban  decrease     decrease
798      low     poor    strong     high     rural    stable     increase
977      low     poor    strong     high     urban    stable     increase
136   medium     good    strong   medium     rural  decrease     increase
575      low     poor      weak      low     urban    stable     decrease
544      low  average    strong   medium  suburban  increase     increase
332      low  average      weak      low     rural  increase     decrease
917      low     good    strong   medium  suburban    stable     decrease
678      low     good    strong   medium     rural  increase     increase
363   medium  average    strong     high     urban    stable     increase

Sample size: 5000
Training Accuracy: 0.3466
Validation Accuracy: 0.3707
Test Accur




Sample size: 10000
Training Accuracy: 0.3911
Validation Accuracy: 0.3907
Test Accuracy: 0.3747
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
K-L Divergence: 13.3051
Standard Deviation: 0.4424

Predicted Results for 10000 samples (First 10 rows):
     IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
2697   medium     poor    strong      low     urban  decrease     decrease
6871      low  average  moderate      low  suburban  increase     decrease
3487      low     good  moderate      low     urban  decrease     decrease
92        low     good  moderate      low     rural  decrease     decrease
9537     high     poor    strong   medium     rural    stable     decrease
3205   medium  average  moderate      low     urban  increase     decrease
6641      low     poor  moderate      low  suburban  increase     decrease
8909   medium     good  moderate      low     urban    stable     decrease
2884      low     poor    strong   medium     rural




Sample size: 15000
Training Accuracy: 0.3785
Validation Accuracy: 0.4022
Test Accuracy: 0.3716
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
K-L Divergence: 0.5317
Standard Deviation: 0.2654

Predicted Results for 15000 samples (First 10 rows):
      IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
8602      high     good      weak      low     urban  decrease       stable
438       high     good      weak   medium     rural  decrease       stable
8094      high     good      weak      low     rural    stable       stable
14355   medium     poor      weak   medium  suburban  increase       stable
8581      high     good      weak   medium     rural    stable       stable
12358   medium     poor  moderate   medium  suburban  increase     increase
511       high  average    strong      low     urban  increase     increase
6594    medium  average    strong      low     urban    stable     increase
5245    medium  average  moderate      low 




Sample size: 20000
Training Accuracy: 0.3971
Validation Accuracy: 0.4030
Test Accuracy: 0.3947
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
K-L Divergence: 6.0782
Standard Deviation: 0.2132

Predicted Results for 20000 samples (First 10 rows):
      IR_State EI_State IRT_State MS_State GEO_State Chosen_SP Predicted_SP
5348      high     good  moderate     high     rural  decrease     increase
339     medium  average    strong   medium     urban  increase     decrease
13591     high     good      weak     high     urban    stable     increase
8153       low     poor      weak      low     rural  increase     decrease
16345     high     good  moderate      low  suburban  increase     increase
16404     high     good  moderate     high     urban  decrease     increase
17185   medium  average  moderate   medium  suburban    stable     decrease
5709      high     good  moderate   medium  suburban  increase     increase
13020      low     poor      weak      low 