In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
from ucimlrepo import fetch_ucirepo

  from .autonotebook import tqdm as notebook_tqdm


In [61]:
dataset = fetch_ucirepo(id=891)
X = dataset.data.features
y = dataset.data.targets

In [62]:
#Data cleaning (so it plays nice with pgmpy)
#convert BMI into underweight, normal, overweight, obese, and extremely obese (respectively 0 through 4)
X.loc[:, "BMI"] = pd.cut(
    X.loc[:, "BMI"],
    bins=[0, 18.5, 25, 30, 35, 40, 45, float("inf")],
    labels=[0, 1, 2, 3, 4, 5, 6]
).astype(int)

#shift age to be 0 through 12 instead of 1 through 13
X.loc[:, "Age"] = X.loc[:, "Age"] - 1

#shift education to be 0 through 5 instead of 1 through 6
X.loc[:, "Education"] = X.loc[:, "Education"] - 1

#shift income to be 0 through 7 instead of 1 through 8 
X.loc[:, "Income"] = X.loc[:, "Income"] - 1

# GenHlth: 1–5 → 0 ("good"), 1 ("average"), 2 ("poor")
X.loc[:, "GenHlth"] = pd.cut(
    X["GenHlth"],
    bins=[0, 2, 3, 5],       # (0,2], (2,3], (3,5]
    labels=[0, 1, 2]
).astype(int)

# MentHlth: 0–30 days → good / medium / poor
X.loc[:, "MentHlth"] = pd.cut(
    X["MentHlth"],
    bins=[-1, 5, 15, 30],    # (-1,5], (5,15], (15,30]
    labels=[0, 1, 2]
).astype(int)

# PhysHlth: 0–30 days → good / medium / poor
X.loc[:, "PhysHlth"] = pd.cut(
    X["PhysHlth"],
    bins=[-1, 5, 15, 30],
    labels=[0, 1, 2]
).astype(int)

In [63]:
X.loc[:, "Diabetes_binary"] = y

X_train, X_test_validate, y_train, y_test_validate = train_test_split(X, y, train_size=0.8, random_state=42)
X_test, X_validate, y_test, y_validate = train_test_split(X_test_validate, y_test_validate, train_size=0.5, random_state=42)

X_train.loc[:, "Diabetes_binary"] = y_train
X_validate.loc[:, "Diabetes_binary"] = y_validate
X_test.loc[:, "Diabetes_binary"] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, "Diabetes_binary"] = y


In [64]:
edges_to_keep = [
    ("GenHlth", "BMI"),
    ("DiffWalk", "BMI"),
    ("Sex", "BMI"),
    ("HighBP", "BMI"),
    ("HighChol", "BMI"),
    ("BMI", "Diabetes_binary")
]
nodes_to_keep = ["BMI", "Diabetes_binary", "GenHlth", "DiffWalk", "Sex", "HighBP", "HighChol"]

train_subset = X_train[nodes_to_keep]

model = DiscreteBayesianNetwork(edges_to_keep)
model.fit(train_subset, estimator=MaximumLikelihoodEstimator)
model.check_model()

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'BMI': 'N', 'Diabetes_binary': 'N', 'GenHlth': 'N', 'DiffWalk': 'N', 'Sex': 'N', 'HighBP': 'N', 'HighChol': 'N'}


True

In [65]:
model.save("MLE.bif", filetype="bif")

In [66]:
def bn_predict(model, X, y, target="Diabetes_binary"):
    infer = VariableElimination(model)
    preds = []

    for _, row in X.iterrows():
        evidence = row.to_dict()
        evidence.pop(target, None)
        q = infer.query([target], evidence=evidence, show_progress=False)
        preds.append(int(q.values.argmax()))

    acc = accuracy_score(y, preds)
    return preds, acc

predict_nodes = [n for n in nodes_to_keep if n != "Diabetes_binary"]

preds_val, acc_val = bn_predict(model, X_validate[predict_nodes], y_validate)
preds_test, acc_test = bn_predict(model, X_test[predict_nodes], y_test)
print("Validation accuracy:", acc_val)
print("Test accuracy:", acc_test)

Validation accuracy: 0.8626616209397666
Test accuracy: 0.8615184484389783


In [67]:
proportion_1 = 1 - y_validate["Diabetes_binary"].mean()
proportion_2 = 1 - y_test["Diabetes_binary"].mean()
print(proportion_1)
print(proportion_2)

0.8626616209397666
0.8615184484389782
