In [2]:
import pandas as pd
import torch
import numpy as np
import random
from itertools import product

In [3]:
def load_dataset():
    # Load the dataset from the CSV file
    df = pd.read_csv("data/diabetes.csv")
    return df

In [4]:
data = load_dataset()
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Set outcome to random probability
data["Outcome"] = data["Outcome"].apply(lambda x: random.random())

# Rename outcome column
data = data.rename(columns={"Outcome": "Probability"})

# Calculate Uncertainty
data["Uncertainty"] = 1 - 2 * np.abs(data["Probability"] - 0.5)

data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Probability,Uncertainty
0,6,148,72,35,0,33.6,0.627,50,0.229367,0.458734
1,1,85,66,29,0,26.6,0.351,31,0.990484,0.019032
2,8,183,64,0,0,23.3,0.672,32,0.161301,0.322602
3,1,89,66,23,94,28.1,0.167,21,0.822908,0.354183
4,0,137,40,35,168,43.1,2.288,33,0.188979,0.377958


In [6]:
def mean_uncertainty_deviation(subgroup, overall_mean):
    """
    Calculate the Mean Uncertainty Deviation of a subgroup.
    """
    subgroup_mean = subgroup["Uncertainty"].mean()
    deviation = subgroup_mean - overall_mean
    return deviation


def z_score_uncertainty(subgroup, overall_mean, overall_std):
    """
    Calculate the Z-Score of the subgroup's uncertainty.
    """
    subgroup_mean = subgroup["Uncertainty"].mean()
    z_score = (subgroup_mean - overall_mean) / overall_std
    return z_score


def wracc_uncertainty(subgroup, overall_mean):
    """
    Calculate the Weighted Relative Accuracy (WRAcc) using uncertainty.
    """
    p_s = len(subgroup) / len(data)
    subgroup_mean = subgroup["Uncertainty"].mean()
    wracc_value = p_s * (subgroup_mean - overall_mean)
    return wracc_value


def lift(subgroup, overall_mean):
    """
    Calculate the lift of a subgroup.
    """
    subgroup_mean = subgroup["Uncertainty"].mean()
    lift_value = subgroup_mean / overall_mean
    return lift_value

In [7]:
class BeamSearchEMM:
    def __init__(self, data, attributes, beam_width=5, max_depth=3, min_support=0.1):
        self.data = data
        self.attributes = attributes
        self.beam_width = beam_width
        self.max_depth = max_depth
        self.min_support = min_support
        self.subgroups = []

    def find_subgroups(self, quality_measure):
        overall_mean = self.data["Uncertainty"].mean()
        overall_std = self.data["Uncertainty"].std()

        # Initialize beam with empty condition
        beam = [({"conditions": [], "depth": 0}, self.data)]

        for depth in range(1, self.max_depth + 1):
            candidates = []
            for parent_cond, parent_data in beam:
                # Generate possible conditions for each attribute
                for attr in self.attributes:
                    # Create conditions using percentiles
                    percentiles = [25, 50, 75]
                    values = np.percentile(self.data[attr], percentiles)
                    operators = ["<=", ">"]

                    for op, val in product(operators, values):
                        new_condition = f"{attr} {op} {val}"
                        if new_condition in parent_cond["conditions"]:
                            continue  # Skip if condition already exists

                        # Combine new condition with parent conditions
                        conditions = parent_cond["conditions"] + [new_condition]
                        subgroup_data = self.apply_conditions(self.data, conditions)

                        # Check if subgroup meets minimum support
                        if len(subgroup_data) / len(self.data) >= self.min_support:
                            # Calculate the quality of the subgroup
                            qm_value = self.calculate_quality(
                                subgroup_data,
                                quality_measure,
                                overall_mean,
                                overall_std,
                            )
                            candidates.append(
                                (
                                    (
                                        {"conditions": conditions, "depth": depth},
                                        subgroup_data,
                                    ),
                                    qm_value,
                                )
                            )

            # Keep top candidates based on quality measure
            candidates.sort(key=lambda x: x[1], reverse=True)
            beam = [candidate for candidate, qm in candidates[: self.beam_width]]

            # Store the best subgroups
            for candidate, qm_value in candidates[: self.beam_width]:
                subgroup_info = {
                    "conditions": candidate[0]["conditions"],
                    "quality": qm_value,
                    "size": len(candidate[1]),
                }
                self.subgroups.append(subgroup_info)

        # Return all subgroups found, sorted by quality
        self.subgroups.sort(key=lambda x: x["quality"], reverse=True)
        return self.subgroups

    def apply_conditions(self, data, conditions):
        """
        Apply a list of conditions to filter the data.
        """
        condition_str = " & ".join(
            [
                f"(data['{cond.split()[0]}'] {cond.split()[1]} {cond.split()[2]})"
                for cond in conditions
            ]
        )
        return data[eval(condition_str)]

    def calculate_quality(
        self, subgroup_data, quality_measure, overall_mean, overall_std
    ):
        """
        Calculate the quality of a subgroup using the specified quality measure.
        """
        if quality_measure == "mean_uncertainty_deviation":
            return mean_uncertainty_deviation(subgroup_data, overall_mean)
        elif quality_measure == "z_score_uncertainty":
            return z_score_uncertainty(subgroup_data, overall_mean, overall_std)
        elif quality_measure == "wracc_uncertainty":
            return wracc_uncertainty(subgroup_data, overall_mean)
        elif quality_measure == "lift":
            return lift(subgroup_data, overall_mean)
        else:
            raise ValueError(f"Unknown quality measure: {quality_measure}")

In [8]:
# Define the attributes to consider
attributes = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]

In [9]:
# Initialize the BeamSearchEMM object
beam_search_emm = BeamSearchEMM(
    data, attributes, beam_width=3, max_depth=2, min_support=0.05
)

# Find subgroups using 'mean_uncertainty_deviation' as the quality measure
subgroups_mud = beam_search_emm.find_subgroups("mean_uncertainty_deviation")

# Display the top subgroups
print("Subgroups based on Mean Uncertainty Deviation:")
for idx, subgroup in enumerate(subgroups_mud, start=1):
    print(f"Subgroup {idx}:")
    print(f"  Conditions: {' AND '.join(subgroup['conditions'])}")
    print(f"  Quality (Mean Uncertainty Deviation): {subgroup['quality']:.4f}")
    print(f"  Subgroup Size: {subgroup['size']}\n")

Subgroups based on Mean Uncertainty Deviation:
Subgroup 1:
  Conditions: SkinThickness > 32.0 AND Glucose <= 117.0
  Quality (Mean Uncertainty Deviation): 0.0895
  Subgroup Size: 71

Subgroup 2:
  Conditions: SkinThickness > 32.0 AND Pregnancies <= 3.0
  Quality (Mean Uncertainty Deviation): 0.0885
  Subgroup Size: 106

Subgroup 3:
  Conditions: SkinThickness > 32.0 AND Pregnancies <= 1.0
  Quality (Mean Uncertainty Deviation): 0.0745
  Subgroup Size: 70

Subgroup 4:
  Conditions: SkinThickness > 32.0
  Quality (Mean Uncertainty Deviation): 0.0448
  Subgroup Size: 188

Subgroup 5:
  Conditions: Glucose > 140.25
  Quality (Mean Uncertainty Deviation): 0.0286
  Subgroup Size: 192

Subgroup 6:
  Conditions: BMI > 36.6
  Quality (Mean Uncertainty Deviation): 0.0268
  Subgroup Size: 188



In [10]:
# Initialize the BeamSearchEMM object
beam_search_emm = BeamSearchEMM(
    data, attributes, beam_width=3, max_depth=2, min_support=0.1
)
# Find subgroups using 'mean_uncertainty_deviation' as the quality measure
subgroups_wracc = beam_search_emm.find_subgroups("wracc_uncertainty")

# Display the top subgroups
print("Subgroups based on wracc:")
for idx, subgroup in enumerate(subgroups_wracc, start=1):
    print(f"Subgroup {idx}:")
    print(f"  Conditions: {' AND '.join(subgroup['conditions'])}")
    print(f"  Quality (Mean Uncertainty Deviation): {subgroup['quality']:.4f}")
    print(f"  Subgroup Size: {subgroup['size']}\n")

Subgroups based on wracc:
Subgroup 1:
  Conditions: BMI > 27.3 AND Age <= 29.0
  Quality (Mean Uncertainty Deviation): 0.0133
  Subgroup Size: 275

Subgroup 2:
  Conditions: SkinThickness > 32.0 AND Pregnancies <= 6.0
  Quality (Mean Uncertainty Deviation): 0.0123
  Subgroup Size: 137

Subgroup 3:
  Conditions: BMI > 27.3 AND Age <= 41.0
  Quality (Mean Uncertainty Deviation): 0.0123
  Subgroup Size: 440

Subgroup 4:
  Conditions: BMI > 27.3
  Quality (Mean Uncertainty Deviation): 0.0113
  Subgroup Size: 574

Subgroup 5:
  Conditions: SkinThickness > 32.0
  Quality (Mean Uncertainty Deviation): 0.0110
  Subgroup Size: 188

Subgroup 6:
  Conditions: Glucose > 99.0
  Quality (Mean Uncertainty Deviation): 0.0074
  Subgroup Size: 571

