<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/NN_Sparse_2_6_Relu_RQ2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
pip install pgmpy

Collecting pgmpy
  Downloading pgmpy-0.1.26-py3-none-any.whl.metadata (9.1 kB)
Downloading pgmpy-0.1.26-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pgmpy
Successfully installed pgmpy-0.1.26


In [2]:
import numpy as np
import pandas as pd
from scipy.stats import entropy
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from tensorflow.keras import models, layers, regularizers, callbacks

# Bayesian Network Data Generation 500, ..., 20000 Samples (sparse)

In [4]:
# Function to generate CPDs for the sparse structure with 5 nodes influencing SP
def generate_cpds_sparse_6_total_nodes():
    # Generate random probabilities for each of the 5 independent nodes
    ir_probs = np.random.rand(3)
    ir_probs /= ir_probs.sum()

    irt_given_ir_probs = np.random.rand(3, 3)
    irt_given_ir_probs /= irt_given_ir_probs.sum(axis=0, keepdims=True)

    ei_given_ir_irt_probs = np.random.rand(3, 3, 3)
    ei_given_ir_irt_probs /= ei_given_ir_irt_probs.sum(axis=0, keepdims=True)

    geo_probs = np.random.rand(3)
    geo_probs /= geo_probs.sum()

    ue_given_geo_probs = np.random.rand(3, 3)
    ue_given_geo_probs /= ue_given_geo_probs.sum(axis=0, keepdims=True)

    # SP depends on the 5 other nodes without interactions between them
    sp_probs = np.random.rand(3, 3, 3, 3, 3, 3)
    sp_probs /= sp_probs.sum(axis=0, keepdims=True)

    return ir_probs, irt_given_ir_probs, ei_given_ir_irt_probs, geo_probs, ue_given_geo_probs, sp_probs

# Function to generate and save samples with the sparse structure of 6 nodes total
def generate_and_save_samples_sparse_6_total_nodes(ir_probs, irt_probs, ei_probs, geo_probs, ue_probs, sp_probs, sample_size, filename):
    output_data = []

    # Generate `sample_size` random samples
    for _ in range(sample_size):
        # Sample each of the 5 nodes individually
        ir_state_idx = np.random.choice(3, p=ir_probs)
        ir_state = ['low', 'medium', 'high'][ir_state_idx]

        irt_probs_given_ir = irt_probs[:, ir_state_idx]
        irt_state_idx = np.random.choice(3, p=irt_probs_given_ir)
        irt_state = ['decreasing', 'steady', 'increasing'][irt_state_idx]

        ei_probs_given_ir_irt = ei_probs[:, ir_state_idx, irt_state_idx]
        ei_state_idx = np.random.choice(3, p=ei_probs_given_ir_irt)
        ei_state = ['poor', 'average', 'good'][ei_state_idx]

        geo_state_idx = np.random.choice(3, p=geo_probs)
        geo_state = ['recession', 'stable', 'growth'][geo_state_idx]

        ue_probs_given_geo = ue_probs[:, geo_state_idx]
        ue_state_idx = np.random.choice(3, p=ue_probs_given_geo)
        ue_state = ['high', 'medium', 'low'][ue_state_idx]

        # Calculate SP probability based on the state of each node (sparse dependency on each)
        sp_probs_given_all = sp_probs[:, ir_state_idx, irt_state_idx, ei_state_idx, geo_state_idx, ue_state_idx]
        sp_state_idx = np.random.choice(3, p=sp_probs_given_all)
        sp_state = ['decrease', 'stable', 'increase'][sp_state_idx]

        # Append sample data to output list including probabilities for all nodes
        output_data.append({
            'IR_State': ir_state,
            'IRT_State': irt_state,
            'EI_State': ei_state,
            'GEO_State': geo_state,
            'UE_State': ue_state,
            'SP_Probabilities (decrease, stable, increase)': ', '.join([f'{prob:.4f}' for prob in sp_probs_given_all]),
            'Chosen_SP_State': sp_state
        })

    # Create a DataFrame from the output data
    output_df = pd.DataFrame(output_data)

    # Save the output DataFrame to a CSV file
    output_df.to_csv(filename, index=False)

    # Print the first few rows for visual confirmation
    print(f"\nSample size: {sample_size} - First few rows of generated samples:\n")
    print(tabulate(output_df.head(), headers='keys', tablefmt='grid'))

# Generate and save samples for sample sizes
sample_sizes = [50, 100, 150, 200, 500, 1000, 5000, 10000, 15000, 20000]
for size in sample_sizes:
    ir_probs, irt_probs, ei_probs, geo_probs, ue_probs, sp_probs = generate_cpds_sparse_6_total_nodes()
    generate_and_save_samples_sparse_6_total_nodes(ir_probs, irt_probs, ei_probs, geo_probs, ue_probs, sp_probs, size, f'combined_probabilities_{size}.csv')

print("\nGeneration and saving of individual samples complete for all sample sizes!")


Sample size: 50 - First few rows of generated samples:

+----+------------+-------------+------------+-------------+------------+-------------------------------------------------+-------------------+
|    | IR_State   | IRT_State   | EI_State   | GEO_State   | UE_State   | SP_Probabilities (decrease, stable, increase)   | Chosen_SP_State   |
|  0 | low        | increasing  | poor       | growth      | low        | 0.0698, 0.5591, 0.3711                          | increase          |
+----+------------+-------------+------------+-------------+------------+-------------------------------------------------+-------------------+
|  1 | low        | steady      | poor       | stable      | low        | 0.0461, 0.4955, 0.4584                          | increase          |
+----+------------+-------------+------------+-------------+------------+-------------------------------------------------+-------------------+
|  2 | high       | increasing  | good       | stable      | low        | 0.164

# NN & KL-Div

In [5]:
# Sample sizes to loop through
sample_sizes = [50, 100, 150, 200, 500, 1000, 5000, 10000, 15000, 20000]

# Define the Neural Network architecture with L2 regularization
def create_nn_model(hidden_layers=2, nodes_per_layer=6, l2_lambda=0.01):
    model = models.Sequential()
    model.add(layers.InputLayer(input_shape=(5,)))  # 5 input features: IR, IRT, EI, GEO, UE

    # Hidden layers with L2 regularization and Dropout
    for layer_num in range(hidden_layers):
        model.add(layers.Dense(
            nodes_per_layer,
            activation='relu',
            kernel_regularizer=regularizers.l2(l2_lambda),
            name=f"hidden_layer_{layer_num + 1}"
        ))
        model.add(layers.Dropout(0.2))  # Dropout layer to reduce overfitting

    # Output layer (3 classes: decrease, stable, increase) with L2 regularization
    model.add(layers.Dense(
        3,
        activation='softmax',
        kernel_regularizer=regularizers.l2(l2_lambda),
        name="output_layer"
    ))

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

# Prepare a dictionary to store extracted data for each sample size
extracted_data = {}

# Extract the required columns from all sample sizes first
for size in sample_sizes:
    outcomes_file = f'combined_probabilities_{size}.csv'
    df = pd.read_csv(outcomes_file)

    required_columns = ['IR_State', 'IRT_State', 'EI_State', 'GEO_State', 'UE_State', 'Chosen_SP_State']
    df_extracted = df[required_columns]

    # Encode categorical variables for IR, IRT, EI, GEO, UE, and SP
    ir_map = {'low': 0, 'medium': 1, 'high': 2}
    irt_map = {'decreasing': 0, 'steady': 1, 'increasing': 2}
    ei_map = {'poor': 0, 'average': 1, 'good': 2}
    geo_map = {'recession': 0, 'stable': 1, 'growth': 2}
    ue_map = {'high': 0, 'medium': 1, 'low': 2}
    sp_map = {'decrease': 0, 'stable': 1, 'increase': 2}

    df_extracted['IR_encoded'] = df_extracted['IR_State'].map(ir_map)
    df_extracted['IRT_encoded'] = df_extracted['IRT_State'].map(irt_map)
    df_extracted['EI_encoded'] = df_extracted['EI_State'].map(ei_map)
    df_extracted['GEO_encoded'] = df_extracted['GEO_State'].map(geo_map)
    df_extracted['UE_encoded'] = df_extracted['UE_State'].map(ue_map)
    df_extracted['SP_encoded'] = df_extracted['Chosen_SP_State'].map(sp_map)

    extracted_data[size] = df_extracted

# Initialize list to store K-L divergence and standard deviation results
results = []
epsilon = 1e-10  # Small value for smoothing

for size in sample_sizes:
    df = extracted_data[size]

    # Features (IR, IRT, EI, GEO, UE) and labels (SP)
    X = df[['IR_encoded', 'IRT_encoded', 'EI_encoded', 'GEO_encoded', 'UE_encoded']]
    y = df['SP_encoded']

    # Split into training, validation, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=True, random_state=42)

    # Create and train the Neural Network model
    nn_model = create_nn_model(hidden_layers=2, nodes_per_layer=6, l2_lambda=0.01)
    early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    nn_model.fit(X_train, y_train, epochs=25, batch_size=16, validation_data=(X_val, y_val), callbacks=[early_stopping], verbose=0)

    # Evaluate model accuracy
    train_loss, train_accuracy = nn_model.evaluate(X_train, y_train, verbose=0)
    val_loss, val_accuracy = nn_model.evaluate(X_val, y_val, verbose=0)
    test_loss, test_accuracy = nn_model.evaluate(X_test, y_test, verbose=0)

    print(f"\nSample size: {size}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # Predict on test data
    predictions = nn_model.predict(X_test)
    predicted_classes = predictions.argmax(axis=1)

    # Calculate ground truth and predicted probabilities
    ground_truth_probabilities = y_test.value_counts(normalize=True).sort_index()
    predicted_probabilities = pd.Series(predicted_classes).value_counts(normalize=True).sort_index()

    # Reindex both distributions and add smoothing
    all_categories = sorted(set(ground_truth_probabilities.index).union(set(predicted_probabilities.index)))
    ground_truth_probabilities = ground_truth_probabilities.reindex(all_categories, fill_value=epsilon)
    predicted_probabilities = predicted_probabilities.reindex(all_categories, fill_value=epsilon)

    # Calculate K-L divergence and standard deviation
    kl_divergence = entropy(pk=ground_truth_probabilities, qk=predicted_probabilities)
    std_dev = np.std(predicted_probabilities - ground_truth_probabilities)

    results.append({
        'Sample_Size': size,
        'K-L_Divergence': kl_divergence,
        'Standard_Deviation': std_dev
    })

    print(f"K-L Divergence: {kl_divergence:.4f}")
    print(f"Standard Deviation: {std_dev:.4f}")

    # Map integers back to the original SP labels
    sp_reverse_map = ['decrease', 'stable', 'increase']
    predicted_labels = [sp_reverse_map[label] for label in predicted_classes]

    # Create DataFrame for displaying nodes, predicted SP, and chosen SP
    result_df = pd.DataFrame({
        'IR_State': df['IR_State'].iloc[X_test.index],
        'IRT_State': df['IRT_State'].iloc[X_test.index],
        'EI_State': df['EI_State'].iloc[X_test.index],
        'GEO_State': df['GEO_State'].iloc[X_test.index],
        'UE_State': df['UE_State'].iloc[X_test.index],
        'Chosen_SP': df['Chosen_SP_State'].iloc[X_test.index],
        'Predicted_SP': predicted_labels
    })
    print(f"\nPredicted Results for {size} samples (First 10 rows):")
    print(result_df.head(10))

    # Save results for this sample size in a dedicated CSV
    result_df.to_csv(f'test_results_{size}.csv', index=False)

# Save only K-L and Standard Deviation results to a summary file
results_df = pd.DataFrame(results)
results_df.to_csv('kl_std_results_summary.csv', index=False)

print("\nAll K-L divergence and standard deviation results have been saved in 'kl_std_results_summary.csv'.")




Sample size: 50
Training Accuracy: 0.4286
Validation Accuracy: 0.1429
Test Accuracy: 0.2500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
K-L Divergence: 13.7949
Standard Deviation: 0.4677

Predicted Results for 50 samples (First 10 rows):
   IR_State   IRT_State EI_State  GEO_State UE_State Chosen_SP Predicted_SP
19      low  decreasing     good     growth   medium  decrease     increase
4       low  increasing  average  recession      low  decrease     increase
13     high  increasing     poor     stable     high    stable       stable
8    medium  decreasing     good  recession     high  decrease       stable
48     high  increasing     poor     growth   medium  decrease     increase
32     high      steady     good     growth      low  increase     increase
30      low  decreasing     good     growth   medium    stable     increase
39      low      steady     good  recession     high  decrease       stable





Sample size: 100
Training Accuracy: 0.3000
Validation Accuracy: 0.5333
Test Accuracy: 0.7333
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
K-L Divergence: 5.3755
Standard Deviation: 0.1886

Predicted Results for 100 samples (First 10 rows):
   IR_State   IRT_State EI_State  GEO_State UE_State Chosen_SP Predicted_SP
96     high  increasing     poor  recession      low  decrease       stable
4      high  increasing  average  recession   medium    stable       stable
42     high  increasing     poor     growth     high    stable       stable
77     high  decreasing     good     growth   medium  increase       stable
10     high  increasing     good     growth      low  decrease       stable
0      high  increasing     good     stable     high    stable       stable
9      high      steady     good     stable     high    stable       stable
69     high  increasing     good  recession     high    stable       stable
73     high  increasing     good     stable   me




Sample size: 150
Training Accuracy: 0.1905
Validation Accuracy: 0.5000
Test Accuracy: 0.2609
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
K-L Divergence: 1.1062
Standard Deviation: 0.4642

Predicted Results for 150 samples (First 10 rows):
    IR_State   IRT_State EI_State  GEO_State UE_State Chosen_SP Predicted_SP
32    medium      steady     good     growth     high  decrease       stable
145   medium      steady     good     growth      low  increase       stable
108   medium      steady     good     growth     high  increase       stable
16    medium  decreasing     good     growth     high  decrease       stable
146      low  increasing     poor     growth     high  decrease       stable
85    medium  increasing     poor     growth      low  increase       stable
76    medium  decreasing  average     growth   medium  increase       stable
36    medium      steady     good     stable   medium    stable       stable
68    medium      steady     poor     g




Sample size: 200
Training Accuracy: 0.3643
Validation Accuracy: 0.3333
Test Accuracy: 0.2333
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
K-L Divergence: 0.2064
Standard Deviation: 0.2228

Predicted Results for 200 samples (First 10 rows):
    IR_State   IRT_State EI_State  GEO_State UE_State Chosen_SP Predicted_SP
95       low      steady  average     stable     high  decrease       stable
115   medium  increasing     good     stable   medium  increase       stable
135   medium  increasing     good  recession   medium  decrease     increase
195     high  decreasing  average  recession   medium    stable     increase
78       low  decreasing     poor     growth   medium  increase     decrease
117   medium  increasing     good  recession   medium  decrease     increase
75       low  decreasing     poor     growth   medium    stable     decrease
143   medium  increasing     good     growth   medium    stable       stable
165   medium  increasing  average     s




Sample size: 500
Training Accuracy: 0.3229
Validation Accuracy: 0.3600
Test Accuracy: 0.3200
[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 56ms/step



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step




K-L Divergence: 0.6362
Standard Deviation: 0.3583

Predicted Results for 500 samples (First 10 rows):
    IR_State   IRT_State EI_State  GEO_State UE_State Chosen_SP Predicted_SP
290   medium  decreasing     poor     growth      low  increase       stable
316   medium      steady     good     growth      low  increase     increase
117   medium  decreasing  average     growth   medium  increase       stable
455   medium  increasing     good  recession   medium    stable     increase
268     high      steady     good     growth      low    stable     increase
336     high      steady     good     growth   medium    stable     increase
79      high      steady  average     growth      low  increase       stable
208   medium      steady     good  recession     high  increase     increase
238   medium  increasing     good  recession     high  increase     increase
477     high      steady  average     stable     high    stable     increase

Sample size: 1000
Training Accuracy: 0.3657
Valida




Sample size: 5000
Training Accuracy: 0.3411
Validation Accuracy: 0.3147
Test Accuracy: 0.3307
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
K-L Divergence: 14.3139
Standard Deviation: 0.4734

Predicted Results for 5000 samples (First 10 rows):
     IR_State   IRT_State EI_State  GEO_State UE_State Chosen_SP Predicted_SP
790      high  decreasing  average     growth   medium    stable     increase
2879     high  decreasing     poor     growth   medium  increase     increase
2372     high      steady     good     growth   medium  increase     increase
1351      low  decreasing     good     growth   medium    stable     increase
3382     high  decreasing  average     growth   medium    stable     increase
3433     high  decreasing     good  recession     high    stable     increase
1129     high  decreasing  average     growth   medium    stable     increase
549      high      steady     good     growth   medium  increase     increase
2835     high  increasing 




Sample size: 10000
Training Accuracy: 0.3599
Validation Accuracy: 0.3380
Test Accuracy: 0.3700
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
K-L Divergence: 13.4107
Standard Deviation: 0.4455

Predicted Results for 10000 samples (First 10 rows):
     IR_State   IRT_State EI_State  GEO_State UE_State Chosen_SP Predicted_SP
2697      low  decreasing     poor  recession     high    stable       stable
6871     high  decreasing  average     stable   medium  decrease       stable
3487   medium  increasing  average     stable      low  decrease       stable
92       high      steady     poor     stable     high    stable       stable
9537   medium  increasing  average     stable      low    stable       stable
3205     high  increasing     good     stable     high  decrease       stable
6641     high  decreasing     good     stable   medium  decrease       stable
8909      low  decreasing     poor     stable      low    stable       stable
2884     high  increasin




Sample size: 15000
Training Accuracy: 0.3650
Validation Accuracy: 0.3413
Test Accuracy: 0.3680
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
K-L Divergence: 13.4565
Standard Deviation: 0.4469

Predicted Results for 15000 samples (First 10 rows):
      IR_State   IRT_State EI_State  GEO_State UE_State Chosen_SP Predicted_SP
8602    medium  decreasing     good  recession     high  decrease     decrease
438     medium  decreasing     poor  recession     high  increase     decrease
8094    medium      steady     good     growth   medium    stable     decrease
14355   medium  decreasing     good  recession     high  decrease     decrease
8581      high  decreasing  average     growth   medium  decrease     decrease
12358   medium      steady     good     stable      low  increase     decrease
511       high  decreasing     good     stable      low  decrease     decrease
6594      high      steady     poor  recession     high  increase     decrease
5245      high 




Sample size: 20000
Training Accuracy: 0.3566
Validation Accuracy: 0.3627
Test Accuracy: 0.3570
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
K-L Divergence: 13.7086
Standard Deviation: 0.4548

Predicted Results for 20000 samples (First 10 rows):
      IR_State   IRT_State EI_State  GEO_State UE_State Chosen_SP Predicted_SP
5348      high  increasing  average     stable   medium  increase       stable
339        low  increasing  average     stable     high  increase       stable
13591     high  decreasing     good  recession     high  increase       stable
8153       low      steady     poor  recession      low  decrease       stable
16345      low      steady     good     stable     high    stable       stable
16404      low      steady     poor  recession      low  decrease       stable
17185      low      steady     poor     stable      low  increase       stable
5709    medium  increasing  average     stable   medium  decrease       stable
13020     high 