<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/code_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [5]:
import numpy as np
import pandas as pd
import networkx as nx
import plotly.graph_objects as go

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score

# Research Question

##### In this research, how does the predictive accuracy of a neural network compare to that of a Bayesian network in predicting stock prices, particularly when trained on varying sample sizes of data generated by a Bayesian network?

# Expectations

1. With larger training samples, the neural network's performance is expected to improve due to having sufficient data for effective learning, while the Bayesian network may outperform the neural network on smaller samples.
2. The Bayesian network is anticipated to show more consistent performance across different sample sizes due to its probabilistic nature and reliance on prior knowledge.
3. The neural network might require more computational resources and time to train, especially with increasing sample sizes, compared to the Bayesian network.

# Data: 3 Nodes, 500 Samples

## Bayesian Network Data Generation

In [16]:
# Define the number of samples
num_samples = 500

# Define the possible values for each variable
values = {
    'IR': ['low', 'medium', 'high'],
    'EI': ['poor', 'average', 'good'],
    'SP': ['decrease', 'stable', 'increase']
}

# Functions to sample each variable with probabilities
def sample_IR():
    probabilities = np.random.dirichlet(np.ones(len(values['IR'])))
    rounded_probs = [round(p, 2) for p in probabilities]

    # Ensure that the chosen IR is the one with the highest probability
    chosen_index = np.argmax(probabilities)
    chosen_value = values['IR'][chosen_index]

    return chosen_value, rounded_probs

def sample_EI(ir=None):
    probabilities = np.random.dirichlet(np.ones(len(values['EI'])))
    rounded_probs = [round(p, 2) for p in probabilities]

    # Ensure that the chosen EI is the one with the highest probability
    chosen_index = np.argmax(probabilities)
    chosen_value = values['EI'][chosen_index]

    return chosen_value, rounded_probs

def sample_SP(ir=None, ei=None):
    probabilities = np.random.dirichlet(np.ones(len(values['SP'])))
    rounded_probs = [round(p, 2) for p in probabilities]

    # Ensure that the chosen SP is the one with the highest probability
    chosen_index = np.argmax(probabilities)
    chosen_value = values['SP'][chosen_index]

    return chosen_value, rounded_probs

# Randomly determine the structure
edges = []
if np.random.rand() > 0.5:
    edges.append(('IR', 'EI'))
if np.random.rand() > 0.5:
    edges.append(('EI', 'IR'))
if np.random.rand() > 0.5:
    edges.append(('IR', 'SP'))
if np.random.rand() > 0.5:
    edges.append(('EI', 'SP'))

# Ensure there's at least one edge to SP (either from IR or EI)
if not any(edge[1] == 'SP' for edge in edges):
    edges.append(np.random.choice([('IR', 'SP'), ('EI', 'SP')]))

# Generate the data and capture probabilities
data = []
probabilities_data = []

for _ in range(num_samples):
    ir, ir_probs = sample_IR()
    ei, ei_probs = sample_EI(ir)
    sp, sp_probs = sample_SP(ir, ei)

    data.append([ir, ei, sp])
    probabilities_data.append([
        ','.join(map(str, ir_probs)),
        ir,
        ','.join(map(str, ei_probs)),
        ei,
        ','.join(map(str, sp_probs)),
        sp
    ])

# Convert to DataFrame for the main data
df = pd.DataFrame(data, columns=['IR', 'EI', 'SP'])

# Save the main data to a CSV file
df.to_csv('bn_data_structure.csv', index=False)

# Convert to DataFrame for probabilities and chosen values
probabilities_df = pd.DataFrame(probabilities_data, columns=[
    'IR_Probabilities', 'Chosen_IR',
    'EI_Probabilities', 'Chosen_EI',
    'SP_Probabilities', 'Chosen_SP'
])

# Save the probabilities and chosen values to a CSV file
probabilities_df.to_csv('bn_probabilities.csv', index=False)

# Display the first 5 rows of each DataFrame
print("Generated data:")
print(df.head())

print("\nProbabilities and chosen values:")
print(probabilities_df.head())

print("\nMain data and probabilities saved successfully.")

Generated data:
       IR       EI        SP
0     low     good  increase
1     low     good    stable
2  medium  average  increase
3     low     good  increase
4  medium     good  increase

Probabilities and chosen values:
  IR_Probabilities Chosen_IR EI_Probabilities Chosen_EI SP_Probabilities  \
0   0.54,0.45,0.01       low   0.29,0.03,0.67      good    0.06,0.34,0.6   
1    0.49,0.21,0.3       low   0.41,0.08,0.51      good   0.14,0.82,0.04   
2   0.21,0.53,0.26    medium   0.32,0.62,0.06   average   0.32,0.23,0.45   
3    0.6,0.25,0.15       low   0.04,0.35,0.61      good    0.4,0.05,0.54   
4    0.1,0.89,0.01    medium   0.13,0.12,0.75      good   0.11,0.31,0.58   

  Chosen_SP  
0  increase  
1    stable  
2  increase  
3  increase  
4  increase  

Main data and probabilities saved successfully.


In [12]:
# Visualize the Bayesian Network structure using Plotly
G = nx.DiGraph()

# Add nodes and edges
G.add_edges_from(edges)

# Extract node positions for Plotly
pos = nx.spring_layout(G)
edge_x = []
edge_y = []
arrow_x = []
arrow_y = []

for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.append(x0)
    edge_x.append(x1)
    edge_x.append(None)
    edge_y.append(y0)
    edge_y.append(y1)
    edge_y.append(None)

    # Move arrows closer to the target node (x1, y1)
    arrow_x.append(0.85 * x1 + 0.15 * x0)
    arrow_y.append(0.85 * y1 + 0.15 * y0)


edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=2, color='gray'),
    hoverinfo='none',
    mode='lines')

node_x = []
node_y = []
node_text = []
node_color = []

for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(node)

    # Highlight the SP node with a different color
    if node == 'SP':
        node_color.append('pink')
    else:
        node_color.append('purple')

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    text=node_text,
    textposition="top center",
    hoverinfo='text',
    marker=dict(size=50, color=node_color, line=dict(width=2)))

# Adding the arrow heads, placing them correctly outside the nodes
arrow_trace = go.Scatter(
    x=arrow_x, y=arrow_y,
    mode='markers',
    marker=dict(size=10, color='black', symbol='triangle-up'),
    hoverinfo='none'
)

fig = go.Figure(data=[edge_trace, node_trace, arrow_trace],
             layout=go.Layout(
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20, l=20, r=20, t=50),  # Adjusted margins to fit the title
                xaxis=dict(showgrid=False, zeroline=False),
                yaxis=dict(showgrid=False, zeroline=False),
                plot_bgcolor='aliceblue')
                )

# Update layout to include a proper title
fig.update_layout(title_text="Bayesian Network Structure", title_x=0.5)

fig.show()

## Neural Network

### Neural Network Training

In [13]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder

# Load the data
data = pd.read_csv('bn_data_structure.csv')
prob_data = pd.read_csv('bn_probabilities.csv')

# Preprocess the data
encoder = OneHotEncoder(sparse=False)
X = encoder.fit_transform(data[['IR', 'EI']])
y = encoder.fit_transform(data[['SP']])

# Build and train the neural network
model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, input_dim=X.shape[1], activation='relu'),
    tf.keras.layers.Dense(y.shape[1], activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=50, batch_size=16)

# Get predicted probabilities from the NN
y_pred_probs = model.predict(X)

# Extract the actual SP index from one-hot encoded y
actual_sp_indices = y.argmax(axis=1)

# Create a DataFrame to hold the actual and predicted SP along with probabilities
output_data = []

for i in range(len(data)):
    # Ground truth probability from BN
    actual_sp = data['SP'].iloc[i]
    actual_sp_index = actual_sp_indices[i]
    bn_prob = float(prob_data.iloc[i]['SP_Probabilities'].split(',')[actual_sp_index])

    # Predicted probability from NN
    nn_prob = y_pred_probs[i][actual_sp_index]

    # Add to output data
    output_data.append({
        'IR': data['IR'].iloc[i],
        'EI': data['EI'].iloc[i],
        'Actual SP': actual_sp,
        'BN Probability': bn_prob,
        'NN Probability': nn_prob,
        'Predicted SP': encoder.categories_[-1][y_pred_probs[i].argmax()],
        'Predicted SP Probabilities': ','.join(map(str, np.round(y_pred_probs[i], 2)))
    })

# Convert output data to DataFrame
output_df = pd.DataFrame(output_data)

# Save the output to a CSV file
output_df.to_csv('nn_output_with_probabilities.csv', index=False)

# Show the first 10 rows
print(output_df.head(10))

print("NN output with probabilities saved to 'nn_output_with_probabilities.csv'")

Epoch 1/50



`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.2914 - loss: 1.1632
Epoch 2/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3209 - loss: 1.1428 
Epoch 3/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3781 - loss: 1.1062 
Epoch 4/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3200 - loss: 1.1195
Epoch 5/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3644 - loss: 1.1018 
Epoch 6/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3147 - loss: 1.1152 
Epoch 7/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3323 - loss: 1.1037 
Epoch 8/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3106 - loss: 1.1144
Epoch 9/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

### Neural Network Structure Learning