In [3]:
import sys
import os

# Make the project root importable so the `src.*` imports further down resolve.
# os.path.abspath('') resolves to the current working directory, so this
# assumes the kernel was started in the notebook's own folder, one level below
# the project root — TODO confirm when launching from another location.
notebook_dir = os.path.abspath('')
project_dir = os.path.dirname(notebook_dir)
sys.path.append(project_dir)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
from scipy import stats
import networkx as nx
import json
from pgmpy.estimators import PC
from pgmpy.models import BayesianNetwork as PgmpyBN
from pgmpy.estimators import BayesianEstimator
from pgmpy.inference import VariableElimination
from datetime import datetime

from src.bayesian_network import BayesianNetwork
from src.bayesian_node import BayesianNode, CategoricalNode

# Configure the root logger so INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)

# Lift pandas' display truncation: None removes the cap on printed columns/rows.
# NOTE(review): with no row cap, displaying a large frame prints every row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from src.data_processing import prepare_data

In [2]:
# Prepare data: prepare_data() returns the frame used for learning plus the
# names of the categorical columns and their category definitions.
print("Preparing data...")
data, categorical_columns, categories = prepare_data()
print(categories)

# Learn the graph structure with the PC algorithm; edges are decided by
# conditional-independence tests at the 0.05 significance level.
pc = PC(data)
edges = pc.estimate(significance_level=0.05)

# Create a Bayesian Network using pgmpy from the learned structure
model = PgmpyBN(edges)

# Print out the nodes in the model
print("Nodes in the Bayesian Network model:")
print(model.nodes())

# An empty structure makes the later parameter fit and every inference query
# meaningless, so surface it here instead of letting it pass silently.
if len(model.nodes()) == 0:
    logging.warning(
        "PC structure learning produced a network with no nodes; "
        "parameter fitting and inference on this model will not be meaningful. "
        "Check the prepared data and the independence-test settings."
    )


Preparing data...
Data loaded successfully
Features selected successfully
Data merged successfully
Merged data shape: (1113, 25)
Original data types:
Subject                   int64
FS_TotCort_GM_Vol         int64
FS_SubCort_GM_Vol         int64
FS_Total_GM_Vol           int64
FS_Tot_WM_Vol             int64
FS_BrainStem_Vol          int64
FS_L_Hippo_Vol            int64
FS_R_Hippo_Vol            int64
FS_L_Amygdala_Vol         int64
FS_R_Amygdala_Vol         int64
FS_L_Caudate_Vol          int64
FS_R_Caudate_Vol          int64
FS_L_Putamen_Vol          int64
FS_R_Putamen_Vol          int64
Age                      object
Gender                   object
CogFluidComp_Unadj      float64
CogCrystalComp_Unadj    float64
MMSE_Score                int64
NEOFAC_O                float64
NEOFAC_C                float64
ProcSpeed_Unadj         float64
CardSort_Unadj          float64
PicVocab_Unadj          float64
ReadEng_Unadj           float64
dtype: object

Missing values:
Subject            

  0%|          | 0/5 [00:00<?, ?it/s]

Nodes in the Bayesian Network model:
[]


In [None]:

# Estimate the conditional probability tables from the data using a
# Bayesian estimator with a BDeu prior.
model.fit(data, estimator=BayesianEstimator, prior_type="BDeu")

# Inference engine (variable elimination) over the fitted model
inference = VariableElimination(model)

# Distinct observed values of every column, as plain Python lists
unique_values = {name: data[name].unique().tolist() for name in data.columns}

# Example: distribution of CogFluidComp_Unadj for each (Age, Gender) pair.
# Note: For comprehensive analysis, you might want to run this for all combinations
predictions = {}

# Nodes that made it into the learned structure
nodes_in_model = model.nodes()

# Both evidence variables must be in the model; this cannot change while
# looping, so it is evaluated once up front.
evidence_available = 'Age' in nodes_in_model and 'Gender' in nodes_in_model

for age_category in unique_values['Age']:
    for gender_category in unique_values['Gender']:
        key = f"Age_{age_category}_Gender_{gender_category}"

        if not evidence_available:
            predictions[key] = "Error: Evidence node not in model"
            continue

        evidence = {'Age': age_category, 'Gender': gender_category}
        try:
            prediction = inference.query(['CogFluidComp_Unadj'], evidence=evidence)
            predictions[key] = str(prediction)
        except Exception as e:
            # Best effort: record the failure for this combination and keep going.
            predictions[key] = f"Error: {str(e)}"

# Map each node that has parents to the list of those parents
influences = {}
for node in model.nodes():
    parents = model.get_parents(node)
    if parents:
        influences[node] = parents

# Collect everything to be written out
results = {
    "learned_structure": list(model.edges()),
    "unique_values": unique_values,
    "predictions": predictions,
    "influences": influences
}

# Minute-resolution timestamp prefix for the output file name
timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M')
filename = f"{timestamp}-findings.json"
with open(filename, 'w') as fh:
    json.dump(results, fh, indent=4)

print(f"Results saved to {filename}")

In [4]:
def diagnose_categorical_data(data, categorical_columns, categories):
    """Report inconsistencies between categorical columns and their declared categories.

    For every name in ``categorical_columns`` the following checks are made:

    * the column exists in ``data``;
    * if the column has a numeric dtype, it contains no negative values;
    * ``categories`` has an entry for the column, and every non-null value
      found in the data is one of the declared categories.

    Args:
        data: DataFrame holding the columns to inspect.
        categorical_columns: Iterable of column names expected to be categorical.
        categories: Mapping from column name to the collection of allowed values.

    Returns:
        list[str]: One human-readable message per problem found, in the order
        the columns were inspected. Empty when nothing is wrong.
    """
    issues = []

    for col in categorical_columns:
        if col not in data.columns:
            issues.append(f"Column '{col}' not found in data")
            continue

        column = data[col]

        # Take the minimum only for numeric columns: calling min() on an
        # unordered Categorical raises TypeError, and the negative-value check
        # is meaningless for non-numeric data anyway.
        if pd.api.types.is_numeric_dtype(column):
            min_val = column.min()
            # min() yields a missing value for an empty/all-null column.
            if pd.notna(min_val) and min_val < 0:
                issues.append(f"Negative values found in categorical column '{col}'. Min value: {min_val}")

        if col in categories:
            observed = set(column.dropna().unique())
            if not set(categories[col]).issuperset(observed):
                issues.append(f"Mismatch in categories for '{col}'. Data contains values not in specified categories.")
        else:
            issues.append(f"No categories specified for categorical column '{col}'")

    return issues

In [5]:
def check_distributions(data):
    """Flag numeric columns whose distribution departs from normality.

    Every column except ``'Subject'`` is examined. Non-numeric columns
    (including boolean and pandas ``category`` columns) are skipped, as are
    columns with fewer than three non-null values, because the Shapiro-Wilk
    test cannot be evaluated on them.

    A column is reported when any of the following holds for its non-null
    values: ``|skewness| > 2``, ``|excess kurtosis| > 7``, or the
    Shapiro-Wilk p-value is below 0.05.

    Args:
        data: DataFrame to inspect.

    Returns:
        tuple[list, dict]: the names of the flagged columns (in column order)
        and a dict mapping each flagged column to its ``'skewness'``,
        ``'kurtosis'`` and ``'shapiro_pvalue'``.
    """
    # Shapiro-Wilk needs at least this many observations.
    min_observations = 3

    problematic_columns = []
    stats_dict = {}

    for column in data.columns:
        if column == 'Subject':
            continue

        series = data[column]

        # Use the pandas-aware dtype test: np.issubdtype raises TypeError on
        # extension dtypes such as `category`. Booleans are excluded so plain
        # NumPy columns are selected exactly as with the np.number check.
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            continue

        # Drop missing values once and reuse the result for every statistic.
        observed = series.dropna()
        if len(observed) < min_observations:
            continue

        # Calculate statistics
        skewness = stats.skew(observed)
        kurtosis = stats.kurtosis(observed)
        shapiro_test = stats.shapiro(observed)

        # Check for extreme values
        is_problematic = (
            abs(skewness) > 2 or
            abs(kurtosis) > 7 or
            shapiro_test.pvalue < 0.05
        )

        if is_problematic:
            problematic_columns.append(column)
            stats_dict[column] = {
                'skewness': skewness,
                'kurtosis': kurtosis,
                'shapiro_pvalue': shapiro_test.pvalue
            }

    return problematic_columns, stats_dict

# Run the distribution check on the prepared `data` frame.
# The summary dict is bound to `distribution_stats`, not `stats`: rebinding
# `stats` would overwrite the `scipy.stats` module that check_distributions
# relies on and make this cell fail when it is run a second time.
problematic_cols, distribution_stats = check_distributions(data)

print("Columns with problematic distributions:")
for col in problematic_cols:
    col_stats = distribution_stats[col]
    print(f"\n{col}:")
    print(f"  Skewness: {col_stats['skewness']:.2f}")
    print(f"  Kurtosis: {col_stats['kurtosis']:.2f}")
    print(f"  Shapiro-Wilk p-value: {col_stats['shapiro_pvalue']:.4f}")

Columns with problematic distributions:

FS_TotCort_GM_Vol:
  Skewness: 0.23
  Kurtosis: -0.14
  Shapiro-Wilk p-value: 0.0015

FS_SubCort_GM_Vol:
  Skewness: 0.21
  Kurtosis: -0.15
  Shapiro-Wilk p-value: 0.0014

FS_Total_GM_Vol:
  Skewness: 0.19
  Kurtosis: -0.29
  Shapiro-Wilk p-value: 0.0021

FS_Tot_WM_Vol:
  Skewness: 0.36
  Kurtosis: 0.01
  Shapiro-Wilk p-value: 0.0000

FS_BrainStem_Vol:
  Skewness: 0.49
  Kurtosis: 1.17
  Shapiro-Wilk p-value: 0.0000

FS_L_Hippo_Vol:
  Skewness: -0.14
  Kurtosis: 2.08
  Shapiro-Wilk p-value: 0.0000

FS_R_Hippo_Vol:
  Skewness: 0.13
  Kurtosis: 0.27
  Shapiro-Wilk p-value: 0.0059

FS_L_Amygdala_Vol:
  Skewness: 0.37
  Kurtosis: 0.01
  Shapiro-Wilk p-value: 0.0000

FS_R_Amygdala_Vol:
  Skewness: 0.50
  Kurtosis: 0.80
  Shapiro-Wilk p-value: 0.0000

FS_L_Caudate_Vol:
  Skewness: 0.37
  Kurtosis: 0.06
  Shapiro-Wilk p-value: 0.0000

FS_R_Caudate_Vol:
  Skewness: 0.36
  Kurtosis: 0.12
  Shapiro-Wilk p-value: 0.0000

Age:
  Skewness: -0.11
  Kurtosis: 