<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/code_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [78]:
import numpy as np
from tabulate import tabulate
import pandas as pd
import networkx as nx
import plotly.graph_objects as go

import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder

from pgmpy.estimators import HillClimbSearch, BicScore, MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork
from pgmpy.inference import VariableElimination
from sklearn.metrics import mean_squared_error

# Research Question

##### In this research, how does the predictive accuracy of a neural network compare to that of a Bayesian network in predicting stock prices, particularly when trained on varying sample sizes of data generated by a Bayesian network?

# Expectations

1. With larger training samples, the neural network's performance is expected to improve due to having sufficient data for effective learning, while the Bayesian network may outperform the neural network on smaller samples.
2. The Bayesian network is anticipated to show more consistent performance across different sample sizes due to its probabilistic nature and reliance on prior knowledge.
3. The neural network might require more computational resources and time to train, especially with increasing sample sizes, compared to the Bayesian network.

# Ground Truth

## Bayesian Network Data Generation

In [68]:
# Define the number of samples
num_samples = 50

# Define the possible values for each variable
values = {
    'IR': ['low', 'medium', 'high'],
    'EI': ['poor', 'average', 'good'],
    'SP': ['decrease', 'stable', 'increase']
}

#-------------------------------------------------------------------------------

#Normal Data Generation
# Functions to sample each variable with probabilities
# def sample_IR():
#     probabilities = np.random.dirichlet(np.ones(len(values['IR'])))
#     rounded_probs = [round(p, 2) for p in probabilities]
#     chosen_index = np.argmax(probabilities)
#     chosen_value = values['IR'][chosen_index]
#     return chosen_value, rounded_probs

# def sample_EI(ir=None):
#     probabilities = np.random.dirichlet(np.ones(len(values['EI'])))
#     rounded_probs = [round(p, 2) for p in probabilities]
#     chosen_index = np.argmax(probabilities)
#     chosen_value = values['EI'][chosen_index]
#     return chosen_value, rounded_probs

# def sample_SP(ir, ei):
#     # Generate probabilities based on a joint influence of IR and EI
#     probabilities = np.random.dirichlet(np.ones(len(values['SP'])))
#     rounded_probs = [round(p, 2) for p in probabilities]
#     chosen_index = np.argmax(probabilities)
#     chosen_value = values['SP'][chosen_index]
#     return chosen_value, rounded_probs

#-------------------------------------------------------------------------------

#-------------------------------------------------------------------------------

#Introducing Noise
# Define a function to add Gaussian noise
def add_noise(probabilities, noise_level=0.05):
    noisy_probs = probabilities + np.random.normal(0, noise_level, len(probabilities))
    noisy_probs = np.clip(noisy_probs, 0, 1)  # Ensure probabilities are within [0, 1]
    noisy_probs /= noisy_probs.sum()  # Normalize to ensure they sum to 1
    return noisy_probs

def sample_IR():
    probabilities = np.random.dirichlet(np.ones(len(values['IR'])))
    probabilities = add_noise(probabilities, noise_level=0.05)  # Add noise
    rounded_probs = [round(p, 2) for p in probabilities]
    chosen_index = np.argmax(probabilities)
    chosen_value = values['IR'][chosen_index]
    return chosen_value, rounded_probs

def sample_EI(ir=None):
    probabilities = np.random.dirichlet(np.ones(len(values['EI'])))
    probabilities = add_noise(probabilities, noise_level=0.05)  # Add noise
    rounded_probs = [round(p, 2) for p in probabilities]
    chosen_index = np.argmax(probabilities)
    chosen_value = values['EI'][chosen_index]
    return chosen_value, rounded_probs

def sample_SP(ir, ei):
    probabilities = np.random.dirichlet(np.ones(len(values['SP'])))
    probabilities = add_noise(probabilities, noise_level=0.05)  # Add noise
    rounded_probs = [round(p, 2) for p in probabilities]
    chosen_index = np.argmax(probabilities)
    chosen_value = values['SP'][chosen_index]
    return chosen_value, rounded_probs

#-------------------------------------------------------------------------------

# Randomly determine the structure (edges)
edges = []
if np.random.rand() > 0.5:
    edges.append(('IR', 'EI'))
if np.random.rand() > 0.5:
    edges.append(('EI', 'IR'))
if np.random.rand() > 0.5:
    edges.append(('IR', 'SP'))
if np.random.rand() > 0.5:
    edges.append(('EI', 'SP'))

# Ensure there's at least one edge to SP (either from IR or EI)
if not any(edge[1] == 'SP' for edge in edges):
    edges.append(np.random.choice([('IR', 'SP'), ('EI', 'SP')]))

# Generate the data and capture probabilities
data = []
probabilities_data = []

for _ in range(num_samples):
    ir, ir_probs = sample_IR()
    ei, ei_probs = sample_EI(ir)
    sp, sp_probs = sample_SP(ir, ei)

    data.append([ir, ei, sp])
    probabilities_data.append([
        ','.join(map(str, ir_probs)),
        ir,
        ','.join(map(str, ei_probs)),
        ei,
        ','.join(map(str, sp_probs)),
        sp
    ])

# Convert to DataFrame for the main data
df = pd.DataFrame(data, columns=['IR', 'EI', 'SP'])

# Save the main data to a CSV file
df.to_csv('bn_data_structure.csv', index=False)

# Convert to DataFrame for probabilities and chosen values
probabilities_df = pd.DataFrame(probabilities_data, columns=[
    'IR_Probabilities', 'Chosen_IR',
    'EI_Probabilities', 'Chosen_EI',
    'SP_Probabilities', 'Chosen_SP'
])

# Save the probabilities and chosen values to a CSV file
#probabilities_df.to_csv('bn_probabilities.csv', index=False)

# Display the DataFrame as a fancy table
print("\nProbabilities and chosen values:")
print(tabulate(probabilities_df.head(), headers='keys', tablefmt='fancy_grid'))

#-----------------------------------------------------------------------------------------------------

# Extract the necessary columns from the probabilities_df DataFrame
test_data = probabilities_df[['Chosen_IR', 'Chosen_EI', 'Chosen_SP']]

# Find the SP Probability corresponding to the chosen SP
test_data['SP_Probability'] = probabilities_df.apply(
    lambda row: float(row['SP_Probabilities'].split(',')[values['SP'].index(row['Chosen_SP'])]),
    axis=1
)

# Rename the columns to match the intended output format
test_data.rename(columns={
    'Chosen_IR': 'IR',
    'Chosen_EI': 'EI',
    'Chosen_SP': 'SP'
}, inplace=True)

# Save the test data to a new CSV file with only the specified columns
test_data.to_csv('bn_test_data_for_NN.csv', index=False)

# Print confirmation
print("Test data saved successfully as bn_test_data_for_NN.csv.")


Probabilities and chosen values:
╒════╤════════════════════╤═════════════╤════════════════════╤═════════════╤════════════════════╤═════════════╕
│    │ IR_Probabilities   │ Chosen_IR   │ EI_Probabilities   │ Chosen_EI   │ SP_Probabilities   │ Chosen_SP   │
╞════╪════════════════════╪═════════════╪════════════════════╪═════════════╪════════════════════╪═════════════╡
│  0 │ 0.79,0.2,0.01      │ low         │ 0.5,0.19,0.31      │ poor        │ 0.14,0.86,0.0      │ stable      │
├────┼────────────────────┼─────────────┼────────────────────┼─────────────┼────────────────────┼─────────────┤
│  1 │ 0.62,0.23,0.16     │ low         │ 0.04,0.39,0.56     │ good        │ 0.59,0.0,0.41      │ decrease    │
├────┼────────────────────┼─────────────┼────────────────────┼─────────────┼────────────────────┼─────────────┤
│  2 │ 0.27,0.15,0.58     │ high        │ 0.39,0.48,0.13     │ average     │ 0.19,0.31,0.51     │ increase    │
├────┼────────────────────┼─────────────┼────────────────────┼────────



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [69]:
# Visualize the Bayesian Network structure using Plotly
G = nx.DiGraph()

# Add nodes and edges
G.add_edges_from(edges)

# Extract node positions for Plotly
pos = nx.spring_layout(G)
edge_x = []
edge_y = []
arrow_x = []
arrow_y = []

for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.append(x0)
    edge_x.append(x1)
    edge_x.append(None)
    edge_y.append(y0)
    edge_y.append(y1)
    edge_y.append(None)

    # Move arrows closer to the target node (x1, y1)
    arrow_x.append(0.90 * x1 + 0.10 * x0)
    arrow_y.append(0.90 * y1 + 0.10 * x0)


edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=2, color='gray'),
    hoverinfo='none',
    mode='lines')

node_x = []
node_y = []
node_text = []
node_hovertext = []
node_color = []

for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(node)

    # Customize hover text for each node
    if node == 'IR':
        node_hovertext.append('Interest Rates')
    elif node == 'EI':
        node_hovertext.append('Economic Indicators')
    elif node == 'SP':
        node_hovertext.append('Stock Price')
    else:
        node_hovertext.append(node)

    # Highlight the SP node with a different color
    if node == 'SP':
        node_color.append('pink')
    else:
        node_color.append('purple')

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    text=node_text,
    textposition="top center",
    hoverinfo='text',
    hovertext=node_hovertext,
    marker=dict(size=50, color=node_color, line=dict(width=2)))

# Adding the arrow heads, placing them correctly outside the nodes
#arrow_trace = go.Scatter(
    #x=arrow_x, y=arrow_y,
    #mode='markers',
    #marker=dict(size=10, color='black', symbol='triangle-up'),
    #hoverinfo='none'
#)

#fig = go.Figure(data=[edge_trace, node_trace, arrow_trace],
fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20, l=20, r=20, t=50),  # Adjusted margins to fit the title
                xaxis=dict(showgrid=False, zeroline=False),
                yaxis=dict(showgrid=False, zeroline=False),
                plot_bgcolor='aliceblue')
                )

# Update layout to include a proper title
fig.update_layout(title_text="Bayesian Network Structure", title_x=0.5)

fig.show()

# Neural Network

## Neural Network Training

In [72]:
# Load the data
data = pd.read_csv('bn_data_structure.csv')

# Preprocess the data
# One-hot encode IR and EI using a separate encoder
encoder_X = OneHotEncoder(sparse=False)
X = encoder_X.fit_transform(data[['IR', 'EI']])

# One-hot encode SP with a specific order of categories
encoder_y = OneHotEncoder(sparse=False, categories=[['decrease', 'stable', 'increase']])
y = encoder_y.fit_transform(data[['SP']])

# Build and train the neural network
model = tf.keras.Sequential([
    tf.keras.layers.Dense(30, input_dim=X.shape[1], activation='relu'), # 30 nodes in hidden layer
    #tf.keras.layers.Dense(25, input_dim=X.shape[1], activation='relu'),  # More neurons, additional layer
    #tf.keras.layers.Dense(40, activation='relu'),  # Additional layer, more neurons
    #tf.keras.layers.Dense(15, input_dim=X.shape[1], activation='relu'),  # More neurons, additional layer
    #tf.keras.layers.Dense(10, activation='relu'),  # Additional layer, more neurons
    tf.keras.layers.Dense(y.shape[1], activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
#model.fit(X, y, epochs=50, batch_size=16, verbose=1)  # Train with 50 epochs
#model.fit(X, y, epochs=100, batch_size=16, verbose=1)  # Train with 100 epochs
model.fit(X, y, epochs=20, batch_size=10, verbose=1)  # Train with 50 epochs

# Get predicted probabilities from the NN
y_pred_probs = model.predict(X)

# Convert probabilities to predicted SP categories
y_pred = np.argmax(y_pred_probs, axis=1)

# Use numpy to map the indices to the actual category labels
predicted_sp = np.array(encoder_y.categories_[0])[y_pred]

# Get the final probability for the predicted SP
predicted_sp_probs = [y_pred_probs[i, idx] for i, idx in enumerate(y_pred)]

# Prepare the output DataFrame
output_df = pd.DataFrame({
    'IR': data['IR'],
    'EI': data['EI'],
    'Actual SP': data['SP'],
    'Predicted SP': predicted_sp,
    'Predicted SP Probability': predicted_sp_probs
})

# Display the first 10 rows of the output
print(output_df.head(10))

# Optionally save the output to a CSV file
output_df.to_csv('nn_output_with_probabilities.csv', index=False)

# Display the full range of predicted probabilities as a separate output
print("\nFull Range of Predicted SP Probabilities:")
for i in range(10):  # Display the first 10 rows for brevity
    print(f"Sample {i+1} - IR: {data['IR'].iloc[i]}, EI: {data['EI'].iloc[i]}")
    print(f"Predicted SP Probabilities: {y_pred_probs[i]}")
    print(f"Actual SP: {data['SP'].iloc[i]}, Predicted SP: {predicted_sp[i]}")
    print("------")

Epoch 1/20



`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4225 - loss: 1.0198  
Epoch 2/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3542 - loss: 1.0541 
Epoch 3/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4397 - loss: 1.0446 
Epoch 4/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4258 - loss: 1.0412 
Epoch 5/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4189 - loss: 1.0337 
Epoch 6/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3925 - loss: 1.0838 
Epoch 7/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4119 - loss: 1.0425 
Epoch 8/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5064 - loss: 0.9796 
Epoch 9/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4

## NN and BN (Ground Truth) MSE

In [73]:
# Load the test data with BN probabilities
bn_test_data = pd.read_csv('bn_test_data_for_NN.csv')

# Load the NN output data with NN probabilities
nn_output_data = pd.read_csv('nn_output_with_probabilities.csv')

# Ensure the data is aligned by checking for matching IR, EI, and SP
assert np.all(bn_test_data['IR'] == nn_output_data['IR'])
assert np.all(bn_test_data['EI'] == nn_output_data['EI'])
assert np.all(bn_test_data['SP'] == nn_output_data['Actual SP'])

# Calculate the squared differences between BN and NN probabilities
bn_prob = bn_test_data['SP_Probability']
nn_prob = nn_output_data['Predicted SP Probability']
squared_diffs = (bn_prob - nn_prob) ** 2

# Combine the relevant columns into a DataFrame for display
comparison_df = pd.DataFrame({
    'BN Probability': bn_prob,
    'NN Probability': round(nn_prob,3),
    'Squared Difference': round(squared_diffs,3)
})

# Display the first few rows to see the comparison
print("Comparison of BN and NN probabilities (first few rows):")
print(comparison_df.head(10))

# Calculate the Mean Squared Error (MSE)
mse = squared_diffs.mean()

# Display the MSE value
print(f"\nMean Squared Error (MSE) between BN and NN probabilities: {round(mse,3)}")

Comparison of BN and NN probabilities (first few rows):
   BN Probability  NN Probability  Squared Difference
0            0.86           0.389               0.222
1            0.59           0.452               0.019
2            0.51           0.559               0.002
3            0.59           0.452               0.019
4            0.51           0.559               0.002
5            0.76           0.582               0.032
6            0.63           0.375               0.065
7            0.51           0.486               0.001
8            0.44           0.452               0.000
9            0.53           0.559               0.001

Mean Squared Error (MSE) between BN and NN probabilities: 0.044


# Learned BN

In [84]:
import pandas as pd

# Load the NN output data
nn_output_data = pd.read_csv('nn_output_with_probabilities.csv')

# Load the ground truth BN probabilities
bn_test_data = pd.read_csv('bn_test_data_for_NN.csv')

# Display the first few rows of each dataset to ensure they are loaded correctly
print("NN Output Data:")
print(nn_output_data.head())

print("\nGround Truth BN Data:")
print(bn_test_data.head())

NN Output Data:
     IR       EI Actual SP Predicted SP  Predicted SP Probability
0   low     poor    stable     decrease                  0.389251
1   low     good  decrease     decrease                  0.452134
2  high  average  increase     increase                  0.559337
3   low     good  decrease     decrease                  0.452134
4  high  average    stable     increase                  0.559337

Ground Truth BN Data:
     IR       EI        SP  SP_Probability
0   low     poor    stable            0.86
1   low     good  decrease            0.59
2  high  average  increase            0.51
3   low     good  decrease            0.59
4  high  average    stable            0.51


In [87]:
# Step 2: Learn the BN Structure from the NN Predictions
hc = HillClimbSearch(nn_output_data)
learned_model = hc.estimate(scoring_method=BicScore(nn_output_data))

# Display the learned structure (edges)
print("Learned BN Structure:")
print(learned_model.edges())

# Create a directed graph from the learned BN structure
G = nx.DiGraph()

# Add edges to the graph
G.add_edges_from(learned_model.edges())

# Extract node positions for the plot
pos = nx.spring_layout(G)

# Initialize edge and node traces
edge_x = []
edge_y = []

for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=2, color='gray'),
    hoverinfo='none',
    mode='lines')

node_x = []
node_y = []
node_text = []

for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(node)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    text=node_text,
    mode='markers+text',
    textposition='top center',
    hoverinfo='text',
    marker=dict(
        showscale=False,
        color='purple',
        size=30,
        line_width=2)
)

# Create the figure
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title="Learned Bayesian Network Structure",
                    title_x=0.5,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20, l=20, r=20, t=40),
                    xaxis=dict(showgrid=False, zeroline=False),
                    yaxis=dict(showgrid=False, zeroline=False)
                ))

# Display the figure
fig.show()

  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned BN Structure:
[('IR', 'Predicted SP Probability'), ('Predicted SP', 'IR'), ('Predicted SP', 'Actual SP'), ('Predicted SP Probability', 'EI')]
