In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from src.bayesian_network import BayesianNetwork

# Load the data
data = pd.read_csv('data/diabetes.csv')

# Display basic information about the dataset
print(data.info())
print("\nSample data:")
print(data.head())

# Check for missing values
print("\nMissing values:")
print(data.isnull().sum())

# Basic statistics
print("\nBasic statistics:")
print(data.describe())

# Correlation matrix
print("\nCorrelation matrix:")
print(data.corr())

# Split the data
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

Sample data:
   Pregnancies  Glucose  BloodPressure  ...  DiabetesPedigreeFunction  Age  Outcome
0            6      148             72  ...                     0.627   50        1
1            1       85             66  ...          

In [3]:
# Create and fit the Bayesian Network
bn = BayesianNetwork(method='hill_climb', max_parents=3)

# Define categorical columns
categorical_columns = ['Outcome']
bn.set_categorical_columns(categorical_columns)

# Fit the model
bn.fit(train_data)

# Evaluate the model
test_ll = bn.log_likelihood(test_data)
print(f"\nTest log-likelihood: {test_ll:.4f}")

# Cross-validation
mean_ll, std_ll = bn.cross_validate(data, k_folds=5)
print(f"Cross-validation: Mean LL = {mean_ll:.4f}, Std = {std_ll:.4f}")

# Compute sensitivity for the 'Outcome' variable
sensitivity = bn.compute_sensitivity('Outcome', num_samples=1000)
print("\nSensitivity analysis for 'Outcome':")
for node, value in sorted(sensitivity.items(), key=lambda x: x[1], reverse=True):
    print(f"Sensitivity to {node}: {value:.4f}")

# Network structure
print("\nNetwork structure:")
for node, parents in bn.explain_structure().items():
    print(f"{node}: {parents}")

AttributeError: 'BayesianNetwork' object has no attribute 'set_categorical_columns'