In [None]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import ExpectationMaximization
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [None]:
# Data Preprocessing

diabetes_dataset = fetch_ucirepo(id=891)
# Copy the feature frame so the binning below edits our own data rather than
# the frame held inside the fetched dataset object.
X_features = diabetes_dataset.data.features.copy()
y_targets = diabetes_dataset.data.targets

# Discretize BMI with categories. right = False makes every bin left-closed
# ([18.5, 25), [25, 30), ...), so a BMI of exactly 25, 30 or 35 is placed in
# the higher category instead of the one below it.
X_features["BMI"] = pd.cut(
    X_features["BMI"],
    bins = [0, 18.5, 25, 30, 35, float('inf')],
    labels = [0, 1, 2, 3, 4],
    right = False
).astype(int)

# Categorize GenHlth into three levels: (0, 2] -> 0, (2, 3] -> 1, (3, 5] -> 2.
# The labels already start at 0, so GenHlth must NOT be shifted again below.
X_features["GenHlth"] = pd.cut(
    X_features["GenHlth"],
    bins = [0, 2, 3, 5],
    labels = [0, 1, 2]
).astype(int)

# Categorize MentHlth and PhysHlth with the same bins; the lower edge of -1
# keeps a value of 0 inside the first (right-closed) bin.
for health_col in ("MentHlth", "PhysHlth"):
    X_features[health_col] = pd.cut(
        X_features[health_col],
        bins = [-1, 5, 15, 30],
        labels = [0, 1, 2]
    ).astype(int)

# Shift features to start from 0
for shifted_col in ("Age", "Education", "Income"):
    X_features[shifted_col] = X_features[shifted_col] - 1

# Split into train, test, validation sets (80% / 10% / 10%, stratified on the target)
X_train, X_test_validate, y_train, y_test_validate = train_test_split(
    X_features, y_targets, test_size = 0.2, random_state = 42, stratify = y_targets
)
X_test, X_validate, y_test, y_validate = train_test_split(
    X_test_validate, y_test_validate, test_size = 0.5, random_state = 42, stratify = y_test_validate
)

# Account for class imbalance (training split only; test/validation stay untouched)
print("Imbalanced: \n", y_train.value_counts())

# NOTE(review): every feature here is binary or binned; confirm that plain SMOTE
# is the intended sampler for categorical data.
smote = SMOTE(random_state = 42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print("Balanced: \n", y_train.value_counts())

In [None]:
# Feature Selection

def _seeded_mutual_info(X, y):
    """Score features by mutual information with a fixed seed so the selection is repeatable."""
    return mutual_info_classif(X, y, random_state = 42)

# Flatten the target once; both selectors below are fitted on this 1-D array
y_train_flat = y_train.values.ravel()

# Select 15 most relevant features
kbest_features = SelectKBest(_seeded_mutual_info, k = 15)
X_kbest_features = kbest_features.fit_transform(X_train, y_train_flat)
selected_features_i = kbest_features.get_support(indices = True)
selected_feature_names = X_train.columns[selected_features_i]

# RFE to finalize feature selection
# NOTE(review): n_features_to_select equals the 15 features kept above, so this
# step cannot eliminate anything as written -- confirm the intended count.
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
rfe = RFE(estimator = rf, n_features_to_select = 15)
rfe.fit(X_train[selected_feature_names], y_train_flat)
selected_features = selected_feature_names[rfe.get_support()]

print("Selected features:", selected_features)

# Include only selected features in every split that is used downstream.
# X_validate (not X_test_validate) is the frame later handed to evaluate_model.
X_train = X_train[selected_features]
X_test = X_test[selected_features]
X_validate = X_validate[selected_features]

In [9]:
# Develop Bayesian Network

def model_setup(useSaved, X_train, y_train, n_samples = 20000, random_state = 42, model_path = "sganti-model.bif"):
    """Build (or load) the diabetes Bayesian network and wrap it for inference.

    Parameters
    ----------
    useSaved : bool
        If falsy, the network is built, fitted with ExpectationMaximization and
        written to ``model_path``. If truthy, the network is loaded from
        ``model_path`` instead and the training arguments are not used.
    X_train : pandas.DataFrame
        Training features; joined with ``y_train`` on the index before fitting.
    y_train : pandas.DataFrame
        Training target, joined onto ``X_train``.
    n_samples : int, optional
        Maximum number of training rows drawn for EM. Capped at the number of
        available rows so a small training set does not raise.
    random_state : int, optional
        Seed for the random initial CPDs and for the row subsample.
    model_path : str, optional
        BIF file the fitted model is saved to / loaded from.

    Returns
    -------
    VariableElimination
        Inference engine built on the fitted or loaded model.

    Notes
    -----
    The node cardinalities are read from the notebook-level ``X_features``
    frame, so the preprocessing cell must have been run first.
    """
    if not useSaved:
        # NOTE(review): "PhysHlth" is declared latent here but is also a column of
        # the feature data -- confirm whether it should be observed or renamed.
        model = DiscreteBayesianNetwork(
            [
                # Lifestyle latent variables
                ("GenHlth", "Lifestyle"),
                ("Smoker", "Lifestyle"),
                ("PhysActivity", "Lifestyle"),

                # Physical health latent variables
                ("HighBP", "PhysHlth"),
                ("HighChol", "PhysHlth"),
                ("BMI", "PhysHlth"),
                ("DiffWalk", "PhysHlth"),

                # Diet latent variables
                ("Fruits", "Diet"),
                ("Veggies", "Diet"),

                # Healthcare access latent variables
                ("CholCheck", "HealthcareAccess"),
                ("AnyHealthcare", "HealthcareAccess"),

                # Socioeconomic latent variables
                ("Sex", "SocioecStatus"),
                ("Age", "SocioecStatus"),
                ("Education", "SocioecStatus"),
                ("Income", "SocioecStatus"),

                # Final diagnosis latent variables
                ("Lifestyle", "Diabetes_binary"),
                ("PhysHlth", "Diabetes_binary"),
                ("Diet", "Diabetes_binary"),
                ("HealthcareAccess", "Diabetes_binary"),
                ("SocioecStatus", "Diabetes_binary"),
            ],
            latents = {"Lifestyle", "PhysHlth", "Diet", "HealthcareAccess", "SocioecStatus"}
        )

        # Acquire cardinalities: observed columns from the data, the rest set by hand
        cardinalities = {col: X_features[col].nunique() for col in X_features.columns}
        cardinalities.update({
            "Diabetes_binary" : 2,   # Prediabetes/diabetes, no diabetes
            "Lifestyle" : 3,         # Good, average, poor
            "PhysHlth" : 3,          # Good, average, poor
            "Diet" : 3,              # Good, average, poor
            "HealthcareAccess" : 3,  # Good, poor, none
            "SocioecStatus" : 3,     # Upper, middle, lower class
        })

        # Generate CPTs: one Dirichlet-distributed column per parent configuration.
        # A seeded generator keeps the starting tables identical between runs.
        rng = np.random.default_rng(random_state)
        cpts = []
        for var in model.nodes():
            var_card = cardinalities[var]
            parents = list(model.get_parents(var))
            parent_cards = [cardinalities[parent] for parent in parents]
            # np.prod([]) is 1, so a root node gets exactly one column
            num_parent_configs = int(np.prod(parent_cards))

            # dirichlet(..., size = k) has shape (k, var_card); transpose so each
            # column is one probability distribution over the variable's states
            probs = rng.dirichlet(np.ones(var_card), size = num_parent_configs).T

            cpts.append(
                TabularCPD(
                    variable = var,
                    variable_card = var_card,
                    values = probs,
                    evidence = parents or None,
                    evidence_card = parent_cards or None,
                )
            )

        model.add_cpds(*cpts)

        # check model for faults
        model.check_model()

        # EM on a seeded subsample, never asking for more rows than exist
        train_combined = X_train.join(y_train)
        subset_train_combined = train_combined.sample(
            n = min(n_samples, len(train_combined)), random_state = random_state
        )
        # NOTE(review): the CPDs and latent cardinalities set above are not passed
        # to the estimator here -- confirm ExpectationMaximization uses them.
        model.fit(data = subset_train_combined, estimator = ExpectationMaximization)
        model.save(model_path, filetype = "bif")

    else:
        model = DiscreteBayesianNetwork.load(model_path, filetype = "bif")

    # Perform inference
    return VariableElimination(model)

In [None]:
# Model Evaluation

def _predict_diabetes(model_infer, X):
    """Return one predicted Diabetes_binary class index per row of X.

    Each row is turned into an evidence dict (column name -> str(value)) and
    the most probable state of "Diabetes_binary" is taken from the query result.
    """
    # NOTE(review): every column of X is sent as evidence and every state is
    # stringified -- confirm all columns are observed nodes of the model and
    # that the model's state names are strings.
    preds = []
    for _, row in X.iterrows():
        evidence = {k: str(v) for k, v in row.to_dict().items()}
        q = model_infer.query(["Diabetes_binary"], evidence = evidence)
        preds.append(int(q.values.argmax()))
    return preds


def _print_scores(split_name, y_true, preds):
    """Print accuracy, precision and binary F1 for one data split."""
    print(f"{split_name} accuracy: ", accuracy_score(y_true, preds))
    print(f"{split_name} precision: ", precision_score(y_true, preds))

    # F1 Score
    print(f"{split_name} F1: ", f1_score(y_true, preds, average="binary"))


def evaluate_model(model_infer, X_test, X_validate, test_labels = None, validate_labels = None):
    """Print accuracy, precision and F1 on the test and validation splits.

    Parameters
    ----------
    model_infer : object
        Inference engine exposing ``query(variables, evidence=...)``.
    X_test, X_validate : pandas.DataFrame
        Feature rows to predict.
    test_labels, validate_labels : array-like, optional
        True labels for each split. When omitted, the notebook-level
        ``y_test`` / ``y_validate`` are used, so existing three-argument
        calls keep working.

    Returns
    -------
    None
        The metrics are printed only.
    """
    if test_labels is None:
        test_labels = y_test
    if validate_labels is None:
        validate_labels = y_validate

    # Test set
    preds_test = _predict_diabetes(model_infer, X_test)
    _print_scores("Test", np.asarray(test_labels).ravel(), preds_test)

    # Validation set
    preds_valid = _predict_diabetes(model_infer, X_validate)
    _print_scores("Validation", np.asarray(validate_labels).ravel(), preds_valid)

In [None]:
# Build Pipeline

# Train a fresh network (nothing is loaded from disk) and wrap it for inference
model_infer = model_setup(useSaved = False, X_train = X_train, y_train = y_train)

# Print test and validation metrics for the fitted model
evaluate_model(model_infer = model_infer, X_test = X_test, X_validate = X_validate)