In [1]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import ExpectationMaximization
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
# Data Preprocessing

diabetes_dataset = fetch_ucirepo(id=891)
X_features = diabetes_dataset.data.features
y_targets = diabetes_dataset.data.targets

# Discretize BMI with categories
X_features.loc[:, "BMI"] = pd.cut(
    X_features.loc[:, "BMI"],
    bins = [0, 18.5, 25, 30, 35, float('inf')],
    labels = [0, 1, 2, 3, 4]
).astype(int)

# Categorize GenHlth, MentHlth, PhysHlth
X_features.loc[:, "GenHlth"] = pd.cut(
    X_features.loc[:, "GenHlth"],
    bins = [0, 2, 3, 5],
    labels = [0, 1, 2]
).astype(int)

X_features.loc[:, "MentHlth"] = pd.cut(
    X_features.loc[:, "MentHlth"],
    bins=[-1, 5, 15, 30],
    labels=[0, 1, 2]
).astype(int)

X_features.loc[:, "PhysHlth"] = pd.cut(
    X_features.loc[:, "PhysHlth"],
    bins=[-1, 5, 15, 30],
    labels=[0, 1, 2]
).astype(int)

# Shift features to start from 0
X_features.loc[:, "Age"] = X_features.loc[:, "Age"] - 1
X_features.loc[:, "Education"] = X_features.loc[:, "Education"] - 1
X_features.loc[:, "Income"] = X_features.loc[:, "Income"] - 1
X_features.loc[:, "GenHlth"] = X_features.loc[:, "GenHlth"] - 1

# Split into train, test, validation sets
X_train, X_test_validate, y_train, y_test_validate = train_test_split(X_features, y_targets, test_size = 0.2,
                                                                      random_state = 42, stratify = y_targets)
X_test, X_validate, y_test, y_validate = train_test_split(X_test_validate, y_test_validate, test_size = 0.5,
                                                          random_state = 42, stratify = y_test_validate)

# Account for class imbalance
print("Imbalanced: \n", y_train.value_counts())

smote = SMOTE(random_state = 42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print("Balanced: \n", y_train.value_counts())

Imbalanced: 
 Diabetes_binary
0                  174667
1                   28277
Name: count, dtype: int64
Balanced: 
 Diabetes_binary
0                  174667
1                  174667
Name: count, dtype: int64


In [12]:
X_train.columns

Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [3]:
#use trained model if set to true
useSaved = False

In [9]:
# -------------------------------
# BUILD MODEL STRUCTURE
# -------------------------------
latent_nodes = {
    "health_stats",
    "demographics",
    "risk_factors",
    "personal_traits"
}

edges = [
    ("BMI", "health_stats"),
    ("PhysActivity", "health_stats"),
    ("DiffWalk", "health_stats"),
    ("HighBP", "health_stats"),
    ("HighChol", "health_stats"),

    ("Fruits", "risk_factors"),
    ("Veggies", "risk_factors"),
    ("HvyAlcoholConsump", "risk_factors"),
    ("HeartDiseaseorAttack", "risk_factors"),
    ("Smoker", "risk_factors"),
    ("Stroke", "risk_factors"),

    ("AnyHealthcare", "personal_traits"),
    ("NoDocbcCost", "personal_traits"),
    ("CholCheck", "personal_traits"),

    ("Education", "demographics"),
    ("Income", "demographics"),
    ("Sex", "demographics"),
    ("Age", "demographics"),

    ("health_stats", "Diabetes_binary"),
    ("demographics", "Diabetes_binary"),
    ("risk_factors", "Diabetes_binary"),
    ("personal_traits", "Diabetes_binary"),
]

model = DiscreteBayesianNetwork(edges, latents=latent_nodes)

# -------------------------------
# CARDINALITIES
# -------------------------------
cardinalities = {col: X_train[col].nunique() for col in X_train.columns}
cardinalities.update({
    "Diabetes_binary": 2,
    "health_stats": 5,
    "demographics": 4,
    "risk_factors": 6,
    "personal_traits": 3
})

# -------------------------------
# FAST UNIFORM CPD GENERATION
# -------------------------------
from pgmpy.factors.discrete import TabularCPD
import numpy as np

cpds = []

for var in model.nodes():
    parents = model.get_parents(var)
    var_card = cardinalities[var]

    if len(parents) == 0:
        # Fast: each CPD = uniform distribution
        values = np.full((var_card, 1), 1.0 / var_card)
        cpds.append(TabularCPD(variable=var, variable_card=var_card, values=values))

    else:
        parent_cards = [cardinalities[p] for p in parents]
        num_parent_configs = np.prod(parent_cards)

        # Uniform CPD for all parent configurations
        values = np.tile(
            np.full((var_card,), 1.0 / var_card).reshape(var_card, 1),
            num_parent_configs
        )

        cpds.append(
            TabularCPD(
                variable=var,
                variable_card=var_card,
                values=values,
                evidence=parents,
                evidence_card=parent_cards
            )
        )

model.add_cpds(*cpds)
model.check_model()

# -------------------------------
# EM TRAINING
# -------------------------------
if not useSaved:
    combined = X_train.join(y_train)
    small_train = combined.sample(20000)

    model.fit(small_train, estimator=ExpectationMaximization)

    model.save("model_bill.bif", filetype="bif")
else:
    model = DiscreteBayesianNetwork.load("model_bill.bif", filetype="bif")


INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'HighBP': 'N', 'HighChol': 'N', 'CholCheck': 'N', 'BMI': 'N', 'Smoker': 'N', 'Stroke': 'N', 'HeartDiseaseorAttack': 'N', 'PhysActivity': 'N', 'Fruits': 'N', 'Veggies': 'N', 'HvyAlcoholConsump': 'N', 'AnyHealthcare': 'N', 'NoDocbcCost': 'N', 'GenHlth': 'N', 'MentHlth': 'N', 'PhysHlth': 'N', 'DiffWalk': 'N', 'Sex': 'N', 'Age': 'N', 'Education': 'N', 'Income': 'N', 'Diabetes_binary': 'N'}


  0%|          | 0/100 [00:00<?, ?it/s]



In [14]:
from pgmpy.inference import VariableElimination

# --- Inference object ---
model_infer = VariableElimination(model)

# If you have an explicit set of latent variables, define it:
latents = {
    "PhysicalHealth",
    "Diet",
    "HealthcareAccess",
    "SocioeconomicStatus",
    "RiskFactor",
    "Demographic",
    "UnderlyingCondition",
}

# Use only observed variables that are both:
#   - in the model, AND
#   - in your X_* DataFrames
observed_vars = [
    col for col in X_test.columns        # or X_train.columns, same set of features
    if col in model.nodes() and col not in latents
]

print("Observed vars actually used as evidence:", observed_vars)

# ------------- TEST SET -------------
preds_test = []

# (Optional but recommended) handle any NaNs:
X_test_clean = X_test.copy()
X_test_clean = X_test_clean[observed_vars].fillna(0)

for _, row in X_test_clean.iterrows():
    # Build evidence dict: only observed vars, as ints
    evidence = {var: int(row[var]) for var in observed_vars}

    q = model_infer.query(variables=["Diabetes_binary"], evidence=evidence)
    # argmax over [P(D=0), P(D=1)]
    preds_test.append(int(q.values.argmax()))

print("Test accuracy:", accuracy_score(y_test, preds_test))

# ------------- VALIDATION SET -------------
preds_val = []

X_val_clean = X_validate.copy()
X_val_clean = X_val_clean[observed_vars].fillna(0)

for _, row in X_val_clean.iterrows():
    evidence = {var: int(row[var]) for var in observed_vars}

    q = model_infer.query(variables=["Diabetes_binary"], evidence=evidence)
    preds_val.append(int(q.values.argmax()))

print("Validation accuracy:", accuracy_score(y_validate, preds_val))

Observed vars actually used as evidence: ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']
Test accuracy: 0.6930778934090193
Validation accuracy: 0.6885840428886787
