In [None]:
import logging
import numpy as np
import pandas as pd
from typing import Union
import random

In [None]:
# Adjust logging level if needed
logging.basicConfig(level=logging.DEBUG)

# Set random seed for reproducibility
random.seed(42)

In [1]:
class Description:

    def __init__(
        self, attribute: str = None, value: Union[str, float, int, bool, list] = None
    ):
        self.description = {}
        if attribute is not None:
            self.description[attribute] = value

    def __contains__(self, col):
        return col in self.description

    def extend(self, attribute, value):
        self.description[attribute] = value
        return self

    def decrypt(self, translation):
        for key, value in self.description.items():
            if key in translation:
                self.description[key] = translation[key][value]

    def get_mask(self, df):
        if not self.description:
            return pd.Series([True] * len(df), index=df.index)
        mask = pd.Series([True] * len(df), index=df.index)
        for attribute, value in self.description.items():
            if isinstance(value, list) and len(value) == 2:
                # Apply range condition
                mask &= (df[attribute] > value[0]) & (df[attribute] <= value[1])
            else:
                # Apply equality condition
                mask &= df[attribute] == value
        return mask

    def __str__(self):
        if not self.description:
            return "all"
        else:
            result = []
            for key, value in self.description.items():
                if isinstance(value, list) and len(value) == 2:
                    result.append(f"{value[0]:.2f} < {key} ≤ {value[1]:.2f}")
                else:
                    result.append(f"{key} = {value}")
            return " AND ".join(result)


class Subgroup:

    def __init__(self, data: pd.DataFrame, description: Description):
        self.data = data
        self.description = description
        self.score = None
        self.coverage = None

    @classmethod
    def create(cls, df: pd.DataFrame, description: Description):
        mask = description.get_mask(df)
        data = df[mask]
        return cls(data, description)

    def evaluate_quality(
        self, overall_stats: dict, quality_measure_func, min_size, target
    ):
        if self.size < min_size:
            self.score = -np.inf  # Esclude sottogruppi troppo piccoli
        else:
            self.score = quality_measure_func(self.data, overall_stats, target)

    def decrypt_description(self, translation):
        self.description.decrypt(translation)

    @property
    def size(self):
        return len(self.data)

    def print(self):
        logging.debug(f"{str(self.description)} {self.score} ({self.size})")


class Beam:

    def __init__(self, settings: dict):
        self.subgroups = []
        self.candidates = []
        self.max_items = settings["width"]
        self.candidate_size = int(
            settings.get("candidate_size", settings["width"] ** 2)
        )
        self.strategy = settings["strategy"]
        self.min_score = None
        self.scores = []

    def add(self, subgroup: Subgroup):
        if subgroup.score == -np.inf:
            return  # Skip subgroups with invalid scores

        if len(self.candidates) < self.candidate_size:
            self.candidates.append(subgroup)
            self.scores.append(subgroup.score)
            self.min_score = (
                min(self.scores) if self.strategy == "maximize" else max(self.scores)
            )
        elif (self.strategy == "maximize" and subgroup.score > self.min_score) or (
            self.strategy == "minimize" and subgroup.score < self.min_score
        ):
            idx = self.scores.index(self.min_score)
            del self.scores[idx]
            del self.candidates[idx]
            self.candidates.append(subgroup)
            self.scores.append(subgroup.score)
            self.min_score = (
                min(self.scores) if self.strategy == "maximize" else max(self.scores)
            )

    def sort(self, attribute: str = "score") -> None:
        if attribute == "score":
            self.candidates.sort(
                key=lambda x: x.score, reverse=(self.strategy == "maximize")
            )
            self.subgroups.sort(
                key=lambda x: x.score, reverse=(self.strategy == "maximize")
            )
        elif attribute == "coverage":
            self.candidates.sort(
                key=lambda x: x.score * x.coverage,
                reverse=(self.strategy == "maximize"),
            )
        else:
            raise ValueError("Invalid sort attribute")

    def select_cover_based(self):
        self.sort()
        if len(self.candidates) > self.max_items:
            index = np.array([], dtype=int)
            for subgroup in self.candidates:
                overlap_size = np.intersect1d(subgroup.data.index.values, index).size
                if subgroup.data.index.size == 0:
                    subgroup.coverage = 0
                else:
                    subgroup.coverage = 1 - (overlap_size / subgroup.data.index.size)
                index = np.unique(np.concatenate((index, subgroup.data.index.values)))
            self.sort(attribute="coverage")
        self.subgroups = self.candidates[: self.max_items]
        self.scores = [s.score for s in self.subgroups]
        self.min_score = (
            min(self.scores) if self.strategy == "maximize" else max(self.scores)
        )

    def decrypt_descriptions(self, translation):
        for s in self.subgroups:
            s.decrypt_description(translation)

    def print(self):
        self.sort(attribute="coverage")
        logging.debug("-" * 20)
        for s in self.subgroups:
            s.print()


# Quality Measures


def mean_uncertainty_deviation(
    subgroup_data: pd.DataFrame, overall_stats: dict, target: str
) -> float:
    subgroup_mean = subgroup_data[target].mean()
    return subgroup_mean - overall_stats["mean"]


def z_score_uncertainty(
    subgroup_data: pd.DataFrame, overall_stats: dict, target: str
) -> float:
    subgroup_mean = subgroup_data[target].mean()
    subgroup_std = subgroup_data[target].std()
    subgroup_stderr = subgroup_std / np.sqrt(len(subgroup_data))
    if subgroup_std == 0:
        return 0
    return (subgroup_mean - overall_stats["mean"]) / subgroup_stderr


def calculate_entropy(values, num_bins=4):
    counts, _ = np.histogram(values, bins=num_bins)
    probabilities = counts / counts.sum()
    probabilities = probabilities[probabilities > 0]  # Avoid log(0)
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy


def wracc_uncertainty(
    subgroup_data: pd.DataFrame, overall_stats: dict, target: str
) -> float:
    p = len(subgroup_data) / overall_stats["total_count"]
    subgroup_mean = subgroup_data[target].mean()
    entropy_weight = calculate_entropy(
        subgroup_data[target], num_bins=overall_stats["num_bins"]
    )
    entropy_ratio = (
        entropy_weight / overall_stats["entropy"]
        if overall_stats["entropy"] != 0
        else 1
    )
    return p * (subgroup_mean - overall_stats["mean"]) * entropy_ratio


def expand_description(
    description: Description, df: pd.DataFrame, target: str, num_bins=4
):
    new_descriptions = []
    used_attributes = set(description.description.keys())
    for attribute in df.columns:
        if attribute in used_attributes or attribute == target:
            continue
        if pd.api.types.is_numeric_dtype(df[attribute]):
            # For numerical attributes, create bins
            # bins = np.linspace(df[attribute].min(), df[attribute].max(), num_bins + 1)
            bins = np.quantile(df[attribute].dropna(), np.linspace(0, 1, num_bins + 1))
            for i in range(len(bins) - 1):
                bin_range = [bins[i], bins[i + 1]]
                new_desc = Description()
                new_desc.description = description.description.copy()
                new_desc.extend(attribute, bin_range)
                new_descriptions.append(new_desc)
        else:
            # For categorical attributes, use unique values
            unique_values = df[attribute].unique()
            for value in unique_values:
                new_desc = Description()
                new_desc.description = description.description.copy()
                new_desc.extend(attribute, value)
                new_descriptions.append(new_desc)
    return new_descriptions


def beam_search(df: pd.DataFrame, settings: dict):

    if "target" not in settings:
        raise ValueError("Target attribute not specified in settings")

    target = settings["target"]

    # If settings have attributes key, filter the dataframe
    if "attributes" in settings:
        df = df[settings["attributes"] + [target]]

    overall_mean = df[target].mean()
    overall_std = df[target].std()
    overall_entropy = calculate_entropy(
        df[target], num_bins=settings.get("num_bins", 4)
    )
    overall_stats = {
        "mean": overall_mean,
        "std": overall_std,
        "entropy": overall_entropy,
        "total_count": len(df),
        "num_bins": settings.get("num_bins", 4),
    }

    # Select the quality measure function
    quality_measure_name = settings.get("quality_measure", "mean_uncertainty_deviation")
    if quality_measure_name == "mean_uncertainty_deviation":
        quality_measure_func = mean_uncertainty_deviation
    elif quality_measure_name == "z_score_uncertainty":
        quality_measure_func = z_score_uncertainty
    elif quality_measure_name == "wracc_uncertainty":
        quality_measure_func = wracc_uncertainty
    else:
        raise ValueError(f"Unknown quality measure: {quality_measure_name}")

    initial_description = Description()
    initial_subgroup = Subgroup.create(df, initial_description)
    initial_subgroup.evaluate_quality(
        overall_stats, quality_measure_func, settings.get("min_size", 1), target
    )

    beam = Beam(settings)
    beam.add(initial_subgroup)

    max_depth = settings.get("max_depth", 3)
    for depth in range(max_depth):
        logging.debug(f"Depth {depth+1}")
        for subgroup in beam.subgroups:
            expanded_descriptions = expand_description(
                subgroup.description, df, target, num_bins=settings.get("num_bins", 4)
            )
            for description in expanded_descriptions:
                new_subgroup = Subgroup.create(df, description)
                new_subgroup.evaluate_quality(
                    overall_stats,
                    quality_measure_func,
                    settings.get("min_size", 1),
                    target,
                )
                beam.add(new_subgroup)
        beam.select_cover_based()

    beam.print()
    return beam.subgroups

In [8]:
def load_dataset():
    # Load the dataset from the CSV file
    data = pd.read_csv("data/diabetes.csv")

    # Set outcome to random probability
    data["Outcome"] = data["Outcome"].apply(lambda x: random.random())

    # Rename outcome column
    data = data.rename(columns={"Outcome": "Probability"})

    # Calculate Uncertainty
    data["Uncertainty"] = 1 - 2 * np.abs(data["Probability"] - 0.5)
    return data

In [9]:
data = load_dataset()
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Probability,Uncertainty
0,6,148,72,35,0,33.6,0.627,50,0.121426,0.242852
1,1,85,66,29,0,26.6,0.351,31,0.973147,0.053706
2,8,183,64,0,0,23.3,0.672,32,0.608872,0.782257
3,1,89,66,23,94,28.1,0.167,21,0.239297,0.478595
4,0,137,40,35,168,43.1,2.288,33,0.158378,0.316756


In [5]:
settings = {
    "width": 10,
    "candidate_size": 100,
    "strategy": "maximize",
    "max_depth": 3,
    "num_bins": 4,  # Number of bins for numerical attributes
    "quality_measure": "z_score_uncertainty",  # Choose the quality measure here
    # "quality_measure": "wracc_uncertainty",  # Choose the quality measure here
    # "quality_measure": "mean_uncertainty_deviation",  # Choose the quality measure here
    "min_size": 10,  # Minimum size of subgroups
    "target": "Uncertainty",  # Target attribute for quality measure
    "attributes": [
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
        "Age",
    ],
}

best_subgroups = beam_search(data, settings)

# Print the best subgroups
total_size = 0
for idx, sg in enumerate(best_subgroups, 1):
    print(f"Subgroup {idx}: {sg.description}")
    print(f"  Score ({settings['quality_measure']}): {sg.score}")
    print(f"  Size: {sg.size}")
    total_size += sg.size
    print()
print(total_size)

DEBUG:root:Depth 1
DEBUG:root:Depth 2
DEBUG:root:Depth 3
DEBUG:root:--------------------
DEBUG:root:3.00 < Pregnancies ≤ 6.00 AND 62.00 < BloodPressure ≤ 72.00 3.739635202163876 (54)
DEBUG:root:0.00 < BMI ≤ 27.30 AND 0.00 < BloodPressure ≤ 62.00 3.182958763920357 (67)
DEBUG:root:0.24 < DiabetesPedigreeFunction ≤ 0.37 AND 3.00 < Pregnancies ≤ 6.00 3.5915818610342254 (48)
DEBUG:root:0.24 < DiabetesPedigreeFunction ≤ 0.37 AND 117.00 < Glucose ≤ 140.25 2.3015156753916606 (47)
DEBUG:root:29.00 < Age ≤ 41.00 AND 30.50 < Insulin ≤ 127.25 2.1611644993712966 (36)
DEBUG:root:127.25 < Insulin ≤ 846.00 AND 24.00 < Age ≤ 29.00 1.8023910505156877 (45)
DEBUG:root:117.00 < Glucose ≤ 140.25 AND 32.00 < BMI ≤ 36.60 1.8841491775745831 (54)
DEBUG:root:3.00 < Pregnancies ≤ 6.00 2.1372566322027766 (175)
DEBUG:root:0.00 < BMI ≤ 27.30 AND 99.00 < Glucose ≤ 117.00 2.0965716418514644 (54)
DEBUG:root:3.00 < Pregnancies ≤ 6.00 AND 21.00 < Age ≤ 24.00 2.5171293062084836 (14)


Subgroup 1: 3.00 < Pregnancies ≤ 6.00 AND 62.00 < BloodPressure ≤ 72.00
  Score (z_score_uncertainty): 3.739635202163876
  Size: 54

Subgroup 2: 0.00 < BMI ≤ 27.30 AND 0.00 < BloodPressure ≤ 62.00
  Score (z_score_uncertainty): 3.182958763920357
  Size: 67

Subgroup 3: 0.24 < DiabetesPedigreeFunction ≤ 0.37 AND 3.00 < Pregnancies ≤ 6.00
  Score (z_score_uncertainty): 3.5915818610342254
  Size: 48

Subgroup 4: 0.24 < DiabetesPedigreeFunction ≤ 0.37 AND 117.00 < Glucose ≤ 140.25
  Score (z_score_uncertainty): 2.3015156753916606
  Size: 47

Subgroup 5: 29.00 < Age ≤ 41.00 AND 30.50 < Insulin ≤ 127.25
  Score (z_score_uncertainty): 2.1611644993712966
  Size: 36

Subgroup 6: 127.25 < Insulin ≤ 846.00 AND 24.00 < Age ≤ 29.00
  Score (z_score_uncertainty): 1.8023910505156877
  Size: 45

Subgroup 7: 117.00 < Glucose ≤ 140.25 AND 32.00 < BMI ≤ 36.60
  Score (z_score_uncertainty): 1.8841491775745831
  Size: 54

Subgroup 8: 3.00 < Pregnancies ≤ 6.00
  Score (z_score_uncertainty): 2.13725663220277