In [None]:
import numpy as np
import pandas as pd
import openml

def _nice_round(v: float) -> int:
    """Snap to human-friendly thresholds like 500, 1k, 2k, 5k, 10k, 30k, ..."""
    if v <= 0:
        return 500
    base = 10 ** np.floor(np.log10(v))
    m = v / base
    if m < 1.5:
        step = 1
    elif m < 3.5:
        step = 2
    elif m < 7.5:
        step = 5
    else:
        step = 10
    return int(step * base)

def compute_unified_size_bins(
    suites=("OpenML-CC18", 353),    # CC-18 (classification) + CTR23 (regression)
    quantiles=(0.33, 0.66),         # tertiles on log10(#instances)
    min_small=0,                    # optional floor for the small/medium cut
    return_details=True
):
    """
    Derive shared small/medium/large instance-count cutoffs for several OpenML suites.

    The dataset ids of every task in every suite are pooled (duplicates removed),
    their ``NumberOfInstances`` values are log10-transformed, the two requested
    quantiles are taken, and each quantile is mapped back to a count and snapped
    with ``_nice_round``.

    Parameters
    ----------
    suites : iterable of suite names/ids accepted by ``openml.study.get_suite``.
    quantiles : two quantile levels applied to log10(#instances).
    min_small : lower bound enforced on the small/medium cutoff.
    return_details : when true, a dict with the intermediate numbers and a
        ready-made methods note is returned as a third element.

    Returns
    -------
    ``(bins, labels, details)`` if ``return_details`` else ``(bins, labels)``,
    where ``bins`` is ``[small_cut, medium_cut, inf]``.

    Raises
    ------
    RuntimeError
        If the metadata table has no ``NumberOfInstances`` column or fewer than
        ten datasets have a positive instance count.
    """
    # One pass over all suites; the set removes datasets shared between tasks/suites.
    dataset_ids = list({
        openml.tasks.get_task(task_id, download_data=False).dataset_id
        for suite_id in suites
        for task_id in openml.study.get_suite(suite_id).tasks
    })

    # Fetch the instance counts for the pooled datasets.
    listing = openml.datasets.list_datasets(data_id=dataset_ids, output_format="dataframe")
    if "NumberOfInstances" not in listing.columns:
        raise RuntimeError("OpenML response missing NumberOfInstances.")
    counts = pd.to_numeric(listing["NumberOfInstances"], errors="coerce").dropna().values
    counts = counts[counts > 0]  # log10 is only defined for positive counts
    if len(counts) < 10:
        raise RuntimeError("Too few datasets to compute stable quantiles.")

    # Quantiles are taken in log space, then mapped back and rounded.
    q_low, q_high = np.quantile(np.log10(counts), quantiles)
    raw_low, raw_high = 10 ** q_low, 10 ** q_high
    small_cut = max(min_small, _nice_round(raw_low))
    medium_cut = _nice_round(raw_high)
    if medium_cut <= small_cut:
        # Rounding collapsed the two cutoffs; push the upper one above the lower.
        medium_cut = _nice_round(small_cut * 3.1)

    bins = [small_cut, medium_cut, np.inf]
    labels = ["small", "medium", "large"]

    note = (f"We defined small/medium/large by tertiles of log10(#instances) computed "
            f"on the combined CC-18 and CTR23 suites, then rounded to convenient "
            f"thresholds (small ≤ {small_cut:,}, medium ≤ {medium_cut:,}, large > {medium_cut:,}). "
            f"These cutoffs were fixed prior to running experiments.")

    if not return_details:
        return bins, labels

    details = {
        "n_datasets": int(len(counts)),
        "quantiles_log10": (float(q_low), float(q_high)),
        "raw_thresholds": (float(raw_low), float(raw_high)),
        "rounded_thresholds": (int(small_cut), int(medium_cut)),
        "paper_note": note,
    }
    return bins, labels, details

def annotate_size_bin(df: pd.DataFrame, bins, labels, col="NumberOfInstances", out_col="size_bin"):
    """Return a copy of ``df`` with a categorical size label in ``out_col``.

    Only ``bins[0]`` and ``bins[1]`` are read; the intervals are
    ``(-inf, bins[0]]``, ``(bins[0], bins[1]]`` and ``(bins[1], inf)``, mapped to
    ``labels`` in that order. Values in ``col`` that cannot be parsed as numbers
    end up unlabelled (NaN). The input frame is left unmodified.
    """
    annotated = df.copy()
    sizes = pd.to_numeric(annotated[col], errors="coerce")
    edges = [-np.inf, bins[0], bins[1], np.inf]
    annotated[out_col] = pd.cut(
        sizes,
        bins=edges,
        labels=labels,
        right=True,
        include_lowest=True,
    )
    return annotated

# --- Example (use once at the start of your pipeline) ---
if __name__ == "__main__":
    # Compute the cutoffs once and report them together with the methods note.
    bins, labels, info = compute_unified_size_bins(return_details=True)
    print(f"UNIFIED SIZE_BINS: {bins}")
    print(f"UNIFIED SIZE_LABELS: {labels}")
    print(info["paper_note"])
    # Use `annotate_size_bin(meta_df, bins, labels)` for both CC-18 and CTR23 tables.


In [None]:
# pip install openml pandas numpy

import os
import re
import json
from collections import OrderedDict, defaultdict
from pathlib import Path
from typing import Tuple, List, Any, Dict

import numpy as np
import pandas as pd
import openml


# ============================ Config ==========================================
# Instance-count bin edges handed to pd.cut (right-closed, lowest edge included):
# [0, 2000] -> "small", (2000, 10000] -> "medium", above 10000 -> "large".
SIZE_BINS = [0, 2000, 10000, np.inf]
SIZE_LABELS = ["small", "medium", "large"]
# A dataset whose categorical-feature fraction reaches this value is labelled
# "high_categorical"; anything below is "high_numerical".
CAT_FRAC_THRESHOLD = 0.6  # >= → high_categorical


# ============================ Small helpers ===================================
def _ensure_columns(df, cols, fill=np.nan):
    for c in cols:
        if c not in df.columns:
            df[c] = fill
    return df

def _safe_num(s):
    return pd.to_numeric(s, errors="coerce")

def _compute_missing(df: pd.DataFrame) -> Tuple[int, float]:
    missing_cells = int(df.isna().sum().sum())
    total_cells = int(df.shape[0] * df.shape[1]) if df.size else 0
    missing_pct = float(missing_cells / total_cells) if total_cells > 0 else 0.0
    return missing_cells, missing_pct

def _safe_imbalance(y: pd.Series) -> float | None:
    try:
        vc = y.value_counts(dropna=False)
        if len(vc) < 2:
            return None
        maj, minc = vc.max(), vc.min()
        return float(maj / minc) if minc > 0 else None
    except Exception:
        return None

def _task_type_from_classes(n_classes: int | float | None) -> str:
    if pd.isna(n_classes):
        return "unknown"
    return "binary" if int(n_classes) == 2 else "multiclass"

def _openml_page_url(did: int) -> str:
    return f"https://www.openml.org/d/{did}"

def _file_safe_exact(name: str) -> str:
    # Keep exact appearance; only neutralize path-breaking chars.
    if os.sep in name:
        name = name.replace(os.sep, "⧸")
    return name.replace(":", "：")

def _openml_download_url_placeholder(did: int, dataset_name: str) -> str:
    # A stable placeholder like your example (OpenML uses a file id internally, but this works as a readable reference).
    fname = re.sub(r"[^\w\-.]", "_", dataset_name.strip())
    return f"https://api.openml.org/data/v1/download/{did}/{fname}.arff"

def _make_task_intro(dataset, did: int, dataset_name: str) -> str:
    """Compose the markdown intro block stored as ``task_intro``.

    ``dataset`` must expose ``creator`` (a string, an iterable of strings, or
    a falsy value) and may expose ``original_data_url``. ``dataset_name`` is
    accepted but not used in the generated text. The citation link and the
    abstract line are fixed placeholder text.
    """
    creator = dataset.creator
    if isinstance(creator, str):
        creators = creator
    elif creator:
        creators = ", ".join(creator)
    else:
        creators = "Unknown"

    page_url = _openml_page_url(did)
    original = getattr(dataset, "original_data_url", None)
    original_md = "[Original](N/A)" if not original else f"[Original]({original})"
    source_md = f"[OpenML]({page_url}) · {original_md}"

    # Two trailing spaces before "\n" force a markdown line break.
    parts = [
        f"**Author**: {creators}  \n",
        f"**Source**: {source_md}  \n",
        "**Please cite**: https://archive.ics.uci.edu/ml/citation_policy.html  \n\n",
        "* Abstract:\n\n",
        "(Optional) Add a short abstract/description here.\n",
    ]
    return "".join(parts)

def _split_numeric_categorical(features) -> Tuple[List[str], List[str]]:
    num, cat = [], []
    for f in features:
        dt = (f.data_type or "").lower()
        if dt in {"numeric", "real", "integer"}:
            num.append(f.name)
        elif dt in {"nominal", "string", "date"}:
            cat.append(f.name)
        else:
            cat.append(f.name)  # conservative default
    return num, cat

def _ensure_dir(p: str | Path) -> Path:
    p = Path(p)
    p.mkdir(parents=True, exist_ok=True)
    return p


# ============================ 1) Inspect suite attributes =====================
def inspect_suite_attributes(suite_name: str) -> pd.DataFrame:
    """
    Return one row of dataset-level attributes per dataset of an OpenML suite.

    The listing columns (sizes, feature counts, class sizes, missingness,
    version, status) come from ``openml.datasets.list_datasets``; columns the
    listing lacks are added as NaN. Per-dataset details (default target,
    licence, creator, citation, original URL, description length, number of
    declared features) are fetched one dataset at a time without downloading
    the data.

    Parameters
    ----------
    suite_name : suite name or id accepted by ``openml.study.get_suite``
        (e.g. 'OpenML-CC18').

    Returns
    -------
    pd.DataFrame
        Listing columns left-joined with the per-dataset details on ``did``.
        If the detail lookup fails for a dataset, its detail columns are
        None/NaN instead of aborting the whole inspection.
    """
    task_ids = openml.study.get_suite(suite_name).tasks
    dids = [openml.tasks.get_task(tid, download_data=False).dataset_id for tid in task_ids]

    meta = openml.datasets.list_datasets(data_id=dids, output_format="dataframe")
    cols = [
        "did","name","NumberOfInstances","NumberOfFeatures","NumberOfClasses",
        "NumberOfNumericFeatures","NumberOfSymbolicFeatures",
        "MajorityClassSize","MinorityClassSize","PercentageOfMissingValues","version","status"
    ]
    meta = _ensure_columns(meta, cols)
    df = meta[cols].copy()

    # "did" and "name" are identifiers and "status" is text; coercing "status"
    # to numeric would silently turn every value into NaN, so it is skipped.
    text_cols = {"did", "name", "status"}
    for c in cols:
        if c not in text_cols:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Detail values used when the per-dataset lookup fails.
    blank_details: Dict[str, Any] = {
        "default_target": None,
        "licence": None,
        "creator": None,
        "citation": None,
        "original_data_url": None,
        "description_len": np.nan,
        "n_features_declared": np.nan,
    }

    # Add what the dataset-level API offers (target, licence, creator, url, description length).
    extra_rows: List[Dict[str, Any]] = []
    for raw_did in df["did"]:
        did = int(raw_did)
        try:
            ds = openml.datasets.get_dataset(did, download_data=False)
            creator = ds.creator
            if not isinstance(creator, str):
                creator = ", ".join(creator) if creator else None
            features = getattr(ds, "features", None)
            details: Dict[str, Any] = {
                "default_target": ds.default_target_attribute,
                "licence": getattr(ds, "licence", None),
                "creator": creator,
                "citation": getattr(ds, "citation", None),
                "original_data_url": getattr(ds, "original_data_url", None),
                "description_len": len(getattr(ds, "description", "") or ""),
                "n_features_declared": len(features) if features else np.nan,
            }
        except Exception:
            # Best effort: keep the dataset in the table with empty details.
            details = dict(blank_details)
        extra_rows.append({"did": did, **details})
    extra = pd.DataFrame(extra_rows)

    return df.merge(extra, on="did", how="left")


# ============================ 2) Selection (your criteria) ====================
def build_selection_from_suite(suite_name: str) -> pd.DataFrame:
    """
    Select representative datasets from an OpenML suite, up to two per pick.

    Datasets are grouped into cells of (class_type, size_bin), where class_type
    is "binary"/"multiclass" and size_bin comes from SIZE_BINS/SIZE_LABELS.
    Within each cell the datasets are split into "high_categorical" and
    "high_numerical" by CAT_FRAC_THRESHOLD, then:

      1. The mix whose largest NumberOfFeatures is higher is the "winner";
         on a tie the mix whose best candidate has the smaller
         (missing_pct, name) key wins, categorical winning exact ties.
      2. From the winner mix, up to two datasets *tied at the maximum*
         NumberOfFeatures are taken (ordered by missing_pct, then name).
      3. From the other mix, up to two datasets whose cat_frac is closest to
         0.6 (high_categorical) or 0.0 (high_numerical) are taken; if that mix
         has nothing left, the same rule is applied to whatever remains in the
         cell.

    A dataset is never selected twice. The result carries the derived columns
    (class_type, size_bin, cat_frac, feature_mix, missing_pct, imbalance_ratio)
    plus ``task_ids``, the list of suite task ids that use each dataset.
    """
    suite = openml.study.get_suite(suite_name)
    task_ids = suite.tasks
    dids = [openml.tasks.get_task(tid, download_data=False).dataset_id for tid in task_ids]

    meta = openml.datasets.list_datasets(data_id=dids, output_format="dataframe")
    cols = [
        "did","name","NumberOfInstances","NumberOfFeatures","NumberOfClasses",
        "NumberOfNumericFeatures","NumberOfSymbolicFeatures",
        "MajorityClassSize","MinorityClassSize","PercentageOfMissingValues"
    ]
    meta = _ensure_columns(meta, cols)
    df = meta[cols].copy()
    # Everything after "did" and "name" is expected to be numeric.
    for c in cols[2:]:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # Rows with a missing NumberOfClasses also land in "multiclass" (NaN == 2 is False).
    df["class_type"] = np.where(df["NumberOfClasses"] == 2, "binary", "multiclass")
    df["size_bin"] = pd.cut(
        df["NumberOfInstances"], bins=SIZE_BINS, labels=SIZE_LABELS,
        right=True, include_lowest=True
    )
    # Categorical fraction = symbolic / (symbolic + numeric); 0.0 when both counts are 0/missing.
    den = (df["NumberOfSymbolicFeatures"].fillna(0) + df["NumberOfNumericFeatures"].fillna(0))
    df["cat_frac"] = (df["NumberOfSymbolicFeatures"].fillna(0) / den.replace(0, np.nan)).fillna(0.0)
    df["feature_mix"] = np.where(df["cat_frac"] >= CAT_FRAC_THRESHOLD, "high_categorical", "high_numerical")
    df["missing_pct"] = _safe_num(df["PercentageOfMissingValues"]).fillna(0.0)

    # Majority/minority ratio; infinite results (minority size 0) are stored as NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        df["imbalance_ratio"] = (df["MajorityClassSize"] / df["MinorityClassSize"]).replace([np.inf, -np.inf], np.nan)

    # Running state shared by the helpers below: picks so far and their dataset ids.
    selected_rows: list[pd.DataFrame] = []
    selected_dids: set[int] = set()

    # ---- modified helpers: return up to k rows instead of 1 ----
    def _pick_max_features(candidates: pd.DataFrame, k: int = 2) -> pd.DataFrame | None:
        """Return up to k rows tied at the maximum NumberOfFeatures (None if no candidates).

        Missing feature counts are treated as -inf; the returned rows carry that
        substituted value. Ties are ordered by missing_pct, then name.
        """
        if candidates.empty:
            return None
        c = candidates.copy()
        c["NumberOfFeatures"] = _safe_num(c["NumberOfFeatures"]).fillna(-np.inf)
        c["missing_pct"] = _safe_num(c["PercentageOfMissingValues"]).fillna(0.0)
        maxF = c["NumberOfFeatures"].max()
        at_max = c[c["NumberOfFeatures"] == maxF]
        # sort by missing_pct then name, then take top-k
        return at_max.sort_values(["missing_pct", "name"], ascending=[True, True]).head(k)

    def _pick_centered(candidates: pd.DataFrame, fmix: str, k: int = 2) -> pd.DataFrame | None:
        """Return up to k rows whose cat_frac is closest to the target for ``fmix``.

        The target is 0.6 for "high_categorical" and 0.0 otherwise. The returned
        rows gain a ``center_distance`` column. None if there are no candidates.
        """
        if candidates.empty:
            return None
        target_center = 0.6 if fmix == "high_categorical" else 0.0
        cc = candidates.copy()
        cc["missing_pct"] = _safe_num(cc["PercentageOfMissingValues"]).fillna(0.0)
        cc = cc.assign(center_distance=(cc["cat_frac"] - target_center).abs())
        # sort by distance to target cat_frac, then missing_pct, then name; take top-k
        return cc.sort_values(
            ["center_distance", "missing_pct", "name"],
            ascending=[True, True, True]
        ).head(k)

    def _avail(pool: pd.DataFrame) -> pd.DataFrame:
        """Drop rows whose dataset id has already been selected."""
        return pool[~pool["did"].isin(selected_dids)]

    # Every (class_type, size_bin) combination is processed independently, in this order.
    cells = [(t, s) for t in ["binary", "multiclass"] for s in SIZE_LABELS]

    for (task, size) in cells:
        block = _avail(df[(df["class_type"] == task) & (df["size_bin"] == size)])
        if block.empty:
            continue

        block_cat = block[block["feature_mix"] == "high_categorical"]
        block_num = block[block["feature_mix"] == "high_numerical"]

        # candidates for "winner" side (up to 2 each)
        top_cat = _pick_max_features(block_cat, k=2) if not block_cat.empty else None
        top_num = _pick_max_features(block_num, k=2) if not block_num.empty else None

        # The winner is decided on the largest feature count of each mix;
        # an empty mix counts as -inf so the other mix wins outright.
        max_cat = _safe_num(block_cat["NumberOfFeatures"]).max() if not block_cat.empty else -np.inf
        max_num = _safe_num(block_num["NumberOfFeatures"]).max() if not block_num.empty else -np.inf

        if max_cat > max_num:
            winner_mix, winner_pick = "high_categorical", top_cat
            loser_mix = "high_numerical"
        elif max_num > max_cat:
            winner_mix, winner_pick = "high_numerical", top_num
            loser_mix = "high_categorical"
        else:
            # Equal maxima (or a NaN maximum, for which both comparisons above are False).
            if top_cat is None and top_num is None:
                continue
            elif top_cat is None:
                winner_mix, winner_pick, loser_mix = "high_numerical", top_num, "high_categorical"
            elif top_num is None:
                winner_mix, winner_pick, loser_mix = "high_categorical", top_cat, "high_numerical"
            else:
                # compare the *best* candidate of each
                c_row, n_row = top_cat.iloc[0], top_num.iloc[0]
                if (c_row["missing_pct"], c_row["name"]) <= (n_row["missing_pct"], n_row["name"]):
                    winner_mix, winner_pick, loser_mix = "high_categorical", top_cat, "high_numerical"
                else:
                    winner_mix, winner_pick, loser_mix = "high_numerical", top_num, "high_categorical"

        # add up to TWO "winner" datasets
        if winner_pick is not None and not winner_pick.empty:
            selected_rows.append(winner_pick)
            selected_dids.update(winner_pick["did"].tolist())

        # now recompute availability and pick up to TWO "loser" datasets
        block = _avail(block)
        loser_pool = block[block["feature_mix"] == loser_mix]

        loser_pick = _pick_centered(loser_pool, loser_mix, k=2)
        if (loser_pick is None or loser_pick.empty) and not block.empty:
            # fallback: centered within whole block (the loser mix has no rows left,
            # so these come from the remaining winner-mix datasets)
            loser_pick = _pick_centered(block, loser_mix, k=2)

        if loser_pick is not None and not loser_pick.empty:
            selected_rows.append(loser_pick)
            selected_dids.update(loser_pick["did"].tolist())

    # Stack all picks; with no picks at all, return an empty frame with the same columns.
    sel = (
        pd.concat(selected_rows, ignore_index=True)
        .drop_duplicates(subset=["did"])
        .reset_index(drop=True)
        if selected_rows else pd.DataFrame(columns=df.columns)
    )

    # Attach task_ids (traceability)
    # NOTE(review): this fetches every task a second time; the mapping could be
    # collected while `dids` is built above.
    did_to_tasks = defaultdict(list)
    for tid in task_ids:
        t = openml.tasks.get_task(tid, download_data=False)
        did_to_tasks[t.dataset_id].append(tid)
    sel["task_ids"] = sel["did"].map(did_to_tasks)
    return sel


# ============================ 3) Save datasets + info.json ====================
def save_selected_datasets(selection: pd.DataFrame, base_dir: str = "./data") -> pd.DataFrame:
    """
    Download every selected dataset and store it as CSV plus an ``info.json`` sidecar.

    Files are written to ``<base_dir>/<task_type>/<name>/<name>.csv`` and
    ``<base_dir>/<task_type>/<name>/info.json``, where ``<name>`` is the dataset
    name made path-safe by ``_file_safe_exact``.

    Parameters
    ----------
    selection : frame with at least a ``did`` column. The task type is read from
        ``class_type`` (classification selectors) or, failing that, from
        ``task_type`` (the CTR23 selector labels its rows "regression"). When
        neither holds a string it is inferred from the class count.
        ``NumberOfClasses`` is used when present.
    base_dir : root directory of the output tree.

    Returns
    -------
    pd.DataFrame
        One row per dataset that was saved (did, name, task_type, saved_csv,
        saved_info). A dataset that raises anywhere along the way is reported
        with a ``[WARN]`` line and skipped, so one failure does not abort the run.
    """
    records = []

    for _, row in selection.iterrows():
        did = int(row["did"])
        try:
            ds = openml.datasets.get_dataset(did, download_data=True)
            dataset_name = ds.name  # exact
            dataset_name_path = _file_safe_exact(dataset_name)

            target_name = ds.default_target_attribute
            # IMPORTANT: use the categorical_indicator returned by get_data
            X, y, categorical_indicator, _ = ds.get_data(
                dataset_format="dataframe",
                target=target_name if target_name else None
            )

            # --- robust feature typing using categorical_indicator ---
            # categorical_indicator aligns with X.columns order
            if categorical_indicator is None:
                # very rare, fallback to "object→cat, else numeric"
                # (isinstance check replaces the deprecated is_categorical_dtype)
                cat_mask = [
                    pd.api.types.is_object_dtype(X[c]) or isinstance(X[c].dtype, pd.CategoricalDtype)
                    for c in X.columns
                ]
            else:
                cat_mask = list(categorical_indicator)

            cat_feats = [col for col, is_cat in zip(X.columns, cat_mask) if is_cat]
            num_feats = [col for col, is_cat in zip(X.columns, cat_mask) if not is_cat]

            # assemble dataframe to save
            if y is not None:
                # guard y name when target_name is None
                y_name = target_name if isinstance(target_name, str) and len(target_name) else "target"
                df_all = pd.concat([X, y.rename(y_name)], axis=1)
            else:
                df_all = X.copy()

            # missingness (same definition as everywhere else in this file)
            missing_cells, missing_pct = _compute_missing(df_all)

            n_num = len(num_feats)
            n_cat = len(cat_feats)
            n_features = n_num + n_cat
            sample_size = int(df_all.shape[0])
            cat_frac_est = float(n_cat / n_features) if n_features > 0 else 0.0

            # Task type declared by the selector: classification selectors use
            # "class_type", the CTR23 selector uses "task_type" ("regression").
            declared_type = row.get("class_type")
            if not isinstance(declared_type, str):
                declared_type = row.get("task_type")
            if not isinstance(declared_type, str):
                declared_type = None
            is_regression = declared_type == "regression"

            # classes / task type / imbalance (class counts are meaningless for regression)
            n_classes = None
            if not is_regression:
                if not pd.isna(row.get("NumberOfClasses")):
                    n_classes = int(row["NumberOfClasses"])
                elif y is not None and y.notna().any():
                    n_classes = int(y.nunique())

            if declared_type is not None:
                task_type = declared_type
            else:
                task_type = "binary" if (n_classes == 2) else "multiclass"

            imbalance_ratio = None
            if y is not None and task_type in {"binary", "multiclass"}:
                vc = y.value_counts(dropna=False)
                if len(vc) >= 2 and vc.min() > 0:
                    imbalance_ratio = float(vc.max() / vc.min())

            # URLs & intro
            source_url = _openml_download_url_placeholder(did, dataset_name)
            openml_page = _openml_page_url(did)
            task_intro = _make_task_intro(ds, did, dataset_name)

            # Output
            out_dir = _ensure_dir(Path(base_dir) / task_type / dataset_name_path)
            csv_path = out_dir / f"{dataset_name_path}.csv"
            info_path = out_dir / "info.json"
            df_all.to_csv(csv_path, index=False)

            info = OrderedDict()
            info["name"] = dataset_name
            info["n_num_features"] = n_num
            info["n_cat_features"] = n_cat
            info["sample_size"] = sample_size
            info["n_features"] = n_features
            info["cat_frac_est"] = cat_frac_est
            info["missing_cells_estimated_from_file"] = missing_cells
            info["missing_pct_estimated_from_file"] = missing_pct
            info["source"] = source_url
            info["openml_page"] = openml_page
            info["task_intro"] = task_intro
            info["task_type"] = task_type
            info["openml_id"] = did
            info["imbalance_ratio"] = imbalance_ratio
            info["n_classes"] = n_classes
            info["target_variable"] = target_name if target_name else "target"
            info["target_description"] = None
            info["num_feature_intro"] = OrderedDict((f, "numeric feature") for f in num_feats)
            info["cat_feature_intro"] = OrderedDict((f, "categorical feature") for f in cat_feats)

            with open(info_path, "w", encoding="utf-8") as f:
                json.dump(info, f, indent=2, ensure_ascii=False)

            records.append({
                "did": did,
                "name": dataset_name,
                "task_type": task_type,
                "saved_csv": str(csv_path),
                "saved_info": str(info_path),
            })
            print(f"Saved: {dataset_name} → {csv_path}")

        except Exception as e:
            # Deliberate best effort: report the failure and continue with the next dataset.
            print(f"[WARN] Skipped DID={did} due to error: {e}")

    return pd.DataFrame.from_records(records)



# ============================ 4) Example run ==================================
if __name__ == "__main__":
    # Any OpenML suite/collection name can be used here.
    SUITE = "OpenML-CC18"

    # Step 1: show the raw attributes available for the suite (first 12 rows).
    print("Inspecting suite attributes...")
    suite_attrs = inspect_suite_attributes(SUITE)
    print(suite_attrs.head(12).to_string(index=False))

    # Step 2: run the selection and list what was picked.
    print("\nBuilding selection with your criteria...")
    sel = build_selection_from_suite(SUITE)
    print(
        sel[["did", "name", "class_type", "size_bin", "feature_mix"]]
        .to_string(index=False)
    )

    # Step 3: download and persist the picked datasets.
    print("\nSaving selected datasets + info.json...")
    summary = save_selected_datasets(sel, base_dir="./data")

    print("\nSummary of saved datasets:")
    print(summary.to_string(index=False) if len(summary) else "(no datasets saved)")


In [None]:
from typing import Optional
from collections import defaultdict

import numpy as np
import pandas as pd
import openml


# Binning only for labeling (no filtering)
# Instance-count bin edges handed to pd.cut (right-closed, lowest edge included):
# [0, 2000] -> "small", (2000, 10000] -> "medium", above 10000 -> "large".
SIZE_BINS = [0, 2000, 10000, np.inf]
SIZE_LABELS = ["small", "medium", "large"]
# NOTE(review): the other cells of this file set this threshold to 0.6, and the
# "centered" pick below still targets cat_frac 0.6 for high_categorical.
# Re-running this cell rebinds the module-level name — confirm 0.5 is intended.
CAT_FRAC_THRESHOLD = 0.5  # >= → high_categorical


def build_ctr23_feature_winner_normal_loser_selection() -> pd.DataFrame:
    """
    CTR23 (regression suite, id=353) selector with *no caps*:
      For each size_bin in [small, medium, large]:
        1) Compare the *maximum NumberOfFeatures* available between the two mixes
           ("high_categorical" vs "high_numerical") within that size bin.
           The mix with the higher max is the *feature-winner* for that bin.
           On a tie, the mix whose best candidate has the smaller
           (missing_pct, name) key wins; categorical wins exact ties.
        2) Pick up to TWO datasets from the winning mix:
             - only datasets tied at the highest NumberOfFeatures qualify
               (ordered by lower missing_pct → name)
        3) From the losing mix, pick up to TWO datasets using the centered mix rule:
             - cat_frac closest to 0.6 (if high_categorical) or 0.0 (if high_numerical)
             - ties → lower missing_pct → name
        4) If the losing mix has no candidates in the size bin, fall back to
           whatever is left in the bin (still avoiding duplicates), using the
           same centered rule.
      Returns up to 12 datasets total (4 per size bin).

    The mixes are split at the module-level CAT_FRAC_THRESHOLD. Missing
    NumberOfFeatures values are ranked below every real count, so a mix made up
    only of such datasets can no longer break the tie handling.

    Returns
    -------
    pd.DataFrame
        Selected rows with the derived columns (task_type, size_bin, cat_frac,
        feature_mix, missing_pct) and ``task_ids`` (CTR23 task ids per dataset),
        sorted by size_bin, feature_mix and name.
    """

    # --- tiny helpers (scoped, so this cell runs on its own) ---
    def _ensure_columns(df: pd.DataFrame, cols, fill=np.nan) -> pd.DataFrame:
        """Add missing columns (filled with ``fill``) in place and return the frame."""
        for c in cols:
            if c not in df.columns:
                df[c] = fill
        return df

    def _safe_num(s: pd.Series) -> pd.Series:
        """Numeric conversion where unparseable entries become NaN."""
        return pd.to_numeric(s, errors="coerce")

    # --- Load CTR23 task set (regression) ---
    # A single pass collects both the dataset ids and the did -> task ids map,
    # so no task has to be fetched twice.
    suite = openml.study.get_suite(353)  # CTR23
    task_ids = suite.tasks
    dids = []
    did_to_tasks = defaultdict(list)
    for tid in task_ids:
        dataset_id = openml.tasks.get_task(tid, download_data=False).dataset_id
        dids.append(dataset_id)
        did_to_tasks[dataset_id].append(tid)

    # --- Pull dataset metadata ---
    meta = openml.datasets.list_datasets(data_id=dids, output_format="dataframe")
    cols_wanted = [
        "did", "name",
        "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses",
        "NumberOfNumericFeatures", "NumberOfSymbolicFeatures",
        "PercentageOfMissingValues"
    ]
    meta = _ensure_columns(meta, cols_wanted)
    df = meta[cols_wanted].copy()

    # numeric coercion (everything after "did" and "name")
    for c in cols_wanted[2:]:
        df[c] = _safe_num(df[c])

    # task type label (CTR23 is regression; kept for consistency)
    df["task_type"] = "regression"

    # size bins (no filtering)
    df["size_bin"] = pd.cut(
        df["NumberOfInstances"],
        bins=SIZE_BINS,
        labels=SIZE_LABELS,
        right=True,
        include_lowest=True
    )

    # feature mix & cat fraction: symbolic / (symbolic + numeric), 0.0 when both are 0/missing
    den = df["NumberOfSymbolicFeatures"].fillna(0) + df["NumberOfNumericFeatures"].fillna(0)
    df["cat_frac"] = (df["NumberOfSymbolicFeatures"].fillna(0) / den.replace(0, np.nan)).fillna(0.0)
    df["feature_mix"] = np.where(df["cat_frac"] >= CAT_FRAC_THRESHOLD, "high_categorical", "high_numerical")

    # missingness (percentage)
    df["missing_pct"] = _safe_num(df["PercentageOfMissingValues"]).fillna(0.0)

    # running state: picks so far and their dataset ids (a DID is never reused)
    selected_rows = []
    selected_dids = set()

    def avail(pool: pd.DataFrame) -> pd.DataFrame:
        """Drop rows whose dataset id has already been selected."""
        return pool[~pool["did"].isin(selected_dids)]

    def max_features(pool: pd.DataFrame) -> float:
        """Largest NumberOfFeatures in ``pool``; -inf when empty or all missing."""
        if pool.empty:
            return -np.inf
        return pool["NumberOfFeatures"].fillna(-np.inf).max()

    # tie-break helpers (now with k)
    def pick_max_features(candidates: pd.DataFrame, k: int = 2) -> Optional[pd.DataFrame]:
        """
        Up to k rows tied at the maximum NumberOfFeatures,
        ordered by missing_pct asc, then name asc.
        Missing feature counts rank below every real count.
        Returns None when there are no candidates.
        """
        if candidates is None or candidates.empty:
            return None
        n_features = candidates["NumberOfFeatures"].fillna(-np.inf)
        at_max = candidates[n_features == n_features.max()]
        return at_max.sort_values(["missing_pct", "name"], ascending=[True, True]).head(k)

    def pick_centered(candidates: pd.DataFrame, fmix: str, k: int = 2) -> Optional[pd.DataFrame]:
        """
        Centered mix rule:
          - target cat_frac = 0.6 if high_categorical, else 0.0
          - sort by |cat_frac - target|, then missing_pct, then name
        Returns up to k rows (with an added center_distance column), or None
        when there are no candidates.
        """
        if candidates is None or candidates.empty:
            return None
        target = 0.6 if fmix == "high_categorical" else 0.0
        cc = candidates.copy()
        cc["center_distance"] = (cc["cat_frac"] - target).abs()
        return cc.sort_values(
            ["center_distance", "missing_pct", "name"],
            ascending=[True, True, True]
        ).head(k)

    # Iterate each size bin
    for size in SIZE_LABELS:
        block = avail(df[df["size_bin"] == size])
        if block.empty:
            continue

        # Split by mix within this size bin
        block_cat = block[block["feature_mix"] == "high_categorical"]
        block_num = block[block["feature_mix"] == "high_numerical"]

        # Max features per mix (never NaN, so the comparisons below are total)
        max_cat = max_features(block_cat)
        max_num = max_features(block_num)

        # Precompute the "max-features" picks (up to 2) for both mixes
        top_cat = pick_max_features(block_cat, k=2)
        top_num = pick_max_features(block_num, k=2)

        # Choose winner mix with deterministic tie-break (based on best candidate of each)
        if max_cat != max_num:
            cat_wins = max_cat > max_num
        elif top_cat is None and top_num is None:
            continue
        elif top_cat is None:
            cat_wins = False
        elif top_num is None:
            cat_wins = True
        else:
            # Both exist and tie on max features; compare their best (first) candidates
            c_row = top_cat.iloc[0]
            n_row = top_num.iloc[0]
            cat_key = (c_row["missing_pct"], c_row["name"])
            num_key = (n_row["missing_pct"], n_row["name"])
            cat_wins = cat_key <= num_key

        if cat_wins:
            winner_pick, loser_mix, loser_pool = top_cat, "high_numerical", block_num
        else:
            winner_pick, loser_mix, loser_pool = top_num, "high_categorical", block_cat

        # 1) Add up to TWO winners (max-features within its mix)
        if winner_pick is not None and not winner_pick.empty:
            selected_rows.append(winner_pick)
            selected_dids.update(winner_pick["did"].tolist())

        # 2) Add up to TWO losers using *centered mix* rule
        loser_pick = pick_centered(avail(loser_pool), loser_mix, k=2)
        if loser_pick is None or loser_pick.empty:
            # Fallback: centered from whatever is left in the whole size-bin block
            loser_pick = pick_centered(avail(block), loser_mix, k=2)

        if loser_pick is not None and not loser_pick.empty:
            selected_rows.append(loser_pick)
            selected_dids.update(loser_pick["did"].tolist())

    # --- Build selection DF ---
    if selected_rows:
        sel = (
            pd.concat(selected_rows, ignore_index=True)
            .drop_duplicates(subset=["did"])
            .reset_index(drop=True)
        )
    else:
        sel = pd.DataFrame(columns=df.columns)

    # Map did -> CTR23 task IDs (all tasks per DID, collected while loading the suite)
    sel["task_ids"] = sel["did"].map(did_to_tasks)

    # Sort for readability
    sel = sel.sort_values(["size_bin", "feature_mix", "name"]).reset_index(drop=True)
    return sel

if __name__ == "__main__":
    # Root of the output tree written by save_selected_datasets.
    BASE_DIR = "./data"

    # Step 1: build the CTR23 selection and list what was picked.
    print("Building CTR23 selection (feature-winner + centered-loser, 2+2 per size bin)...")
    sel = build_ctr23_feature_winner_normal_loser_selection()
    print("\nSelected datasets:")
    print(
        sel[["did", "name", "size_bin", "feature_mix", "cat_frac"]]
        .to_string(index=False)
    )

    # Step 2: download and persist the picked datasets.
    print("\nSaving selected datasets + info.json...")
    summary = save_selected_datasets(sel, base_dir=BASE_DIR)

    print("\nSummary of saved datasets:")
    print(summary.to_string(index=False) if len(summary) else "(no datasets saved)")


In [None]:
from typing import Optional
from collections import defaultdict
import time

import numpy as np
import pandas as pd
import openml

# --- Configuration ---
# Instance-count bin edges and their labels (same values as the earlier cells).
SIZE_BINS = [0, 2000, 10000, np.inf]
SIZE_LABELS = ["small", "medium", "large"]
CAT_FRAC_THRESHOLD = 0.6  # >= → high_categorical (match your main script)
# Default pause, in seconds, that build_suite_all_datasets_selection sleeps
# after each task lookup.
DELAY_SECONDS = 3         # delay between OpenML calls to avoid overload


def build_suite_all_datasets_selection(
    suite_name: str,
    delay: int = DELAY_SECONDS
) -> pd.DataFrame:
    """
    Return ALL datasets in a given OpenML suite (e.g. 'OpenML-CC18'),
    one row per dataset, enriched with derived descriptor columns.

    Each task of the suite is resolved exactly once; the dataset IDs and the
    dataset -> task mapping are both collected in that single pass, with a
    sleep of ``delay`` seconds after every lookup to avoid overloading the
    server.

    Args:
        suite_name: Suite identifier, passed unchanged to
            ``openml.study.get_suite``.
        delay: Seconds to sleep after each per-task OpenML call.

    Returns:
        DataFrame sorted by (class_type, size_bin, feature_mix, name) with the
        raw metadata columns (did, name, NumberOf*, Majority/MinorityClassSize,
        PercentageOfMissingValues) plus:
          - class_type: "binary" when NumberOfClasses == 2, otherwise
            "multiclass" (this includes rows whose class count is missing)
          - size_bin: small / medium / large per SIZE_BINS / SIZE_LABELS
          - cat_frac: symbolic / (symbolic + numeric) features, 0.0 when the
            denominator is zero
          - feature_mix: high_categorical if cat_frac >= CAT_FRAC_THRESHOLD,
            else high_numerical
          - missing_pct: PercentageOfMissingValues with missing values as 0.0
          - imbalance_ratio: MajorityClassSize / MinorityClassSize, NaN when
            the ratio is undefined or infinite
          - task_ids: list of the suite's task IDs that use that dataset
    """

    # --- tiny helpers (scoped) ---
    def _ensure_columns(df: pd.DataFrame, cols, fill=np.nan) -> pd.DataFrame:
        # Add any column the OpenML response lacks so later selection is safe.
        for c in cols:
            if c not in df.columns:
                df[c] = fill
        return df

    def _safe_num(s: pd.Series) -> pd.Series:
        # Non-numeric entries become NaN instead of raising.
        return pd.to_numeric(s, errors="coerce")

    # --- Load suite & tasks ---
    print(f"Fetching suite metadata from OpenML: {suite_name!r} ...")
    suite = openml.study.get_suite(suite_name)
    task_ids = suite.tasks
    print(f"Loaded suite with {len(task_ids)} tasks. Fetching dataset IDs...")

    # Single pass over the tasks: record did -> [task ids] while resolving
    # each task, so no task has to be fetched (and slept on) a second time.
    did_to_tasks = defaultdict(list)
    for i, tid in enumerate(task_ids, 1):
        task = openml.tasks.get_task(tid, download_data=False)
        did_to_tasks[task.dataset_id].append(tid)
        print(f"  [{i}/{len(task_ids)}] Got task {tid} → dataset {task.dataset_id}")
        time.sleep(delay)  # throttle requests

    # Unique dataset IDs in first-seen order (several tasks may share one).
    dids: list[int] = list(did_to_tasks)

    # --- Pull dataset metadata for ALL dids ---
    print("Downloading dataset metadata table...")
    meta = openml.datasets.list_datasets(data_id=dids, output_format="dataframe")
    cols_wanted = [
        "did", "name",
        "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses",
        "NumberOfNumericFeatures", "NumberOfSymbolicFeatures",
        "MajorityClassSize", "MinorityClassSize",
        "PercentageOfMissingValues",
    ]
    meta = _ensure_columns(meta, cols_wanted)
    df = meta[cols_wanted].copy()

    # numeric coercion (everything after "did" and "name")
    for c in cols_wanted[2:]:
        df[c] = _safe_num(df[c])

    # class_type from NumberOfClasses (used by your saver)
    df["class_type"] = np.where(df["NumberOfClasses"] == 2, "binary", "multiclass")

    # size bins
    df["size_bin"] = pd.cut(
        df["NumberOfInstances"],
        bins=SIZE_BINS,
        labels=SIZE_LABELS,
        right=True,
        include_lowest=True,
    )

    # feature mix & cat fraction; a zero denominator is turned into NaN first
    # so the division yields NaN, which is then reported as 0.0
    n_symbolic = df["NumberOfSymbolicFeatures"].fillna(0)
    den = n_symbolic + df["NumberOfNumericFeatures"].fillna(0)
    df["cat_frac"] = (n_symbolic / den.replace(0, np.nan)).fillna(0.0)
    df["feature_mix"] = np.where(
        df["cat_frac"] >= CAT_FRAC_THRESHOLD,
        "high_categorical",
        "high_numerical",
    )

    # missingness
    df["missing_pct"] = _safe_num(df["PercentageOfMissingValues"]).fillna(0.0)

    # imbalance ratio (if sizes available)
    with np.errstate(divide="ignore", invalid="ignore"):
        df["imbalance_ratio"] = (
            df["MajorityClassSize"] / df["MinorityClassSize"]
        ).replace([np.inf, -np.inf], np.nan)

    # Map did -> suite task IDs using the mapping gathered above
    print("Mapping dataset IDs to their corresponding tasks...")
    df["task_ids"] = df["did"].map(did_to_tasks)

    df = df.sort_values(["class_type", "size_bin", "feature_mix", "name"]).reset_index(drop=True)
    print(f"\n✅ Finished collecting suite {suite_name!r} datasets: {len(df)} total.")
    return df


# ============================ Example run =====================================
if __name__ == "__main__":
    # Example run: collect every dataset of one suite, show it, then hand it
    # to the saver (save_selected_datasets is defined outside this cell).

    SUITE = "OpenML-CC18"   # or any other suite name
    BASE_DIR = "./data"

    print(f"Building {SUITE} selection with ALL datasets (delayed mode)...")
    sel = build_suite_all_datasets_selection(SUITE, delay=DELAY_SECONDS)

    # Preview only the columns that characterise each dataset.
    print("\nSelected datasets (all in suite):")
    print(
        sel.loc[:, ["did", "name", "class_type", "size_bin", "feature_mix", "cat_frac"]]
        .to_string(index=False)
    )

    print("\nSaving selected datasets + info.json...")
    summary = save_selected_datasets(sel, base_dir=BASE_DIR)

    # An empty summary gets a placeholder line instead of an empty table.
    print("\nSummary of saved datasets:")
    print(summary.to_string(index=False) if len(summary) else "(no datasets saved)")


In [None]:
from typing import Optional
from collections import defaultdict
import time

import numpy as np
import pandas as pd
import openml

# --- Configuration ---
# Bin edges on NumberOfInstances. They are passed to pd.cut with right=True and
# include_lowest=True, giving: small = [0, 2000], medium = (2000, 10000],
# large = (10000, inf).
SIZE_BINS = [0, 2000, 10000, np.inf]
SIZE_LABELS = ["small", "medium", "large"]  # one label per interval of SIZE_BINS
# cat_frac >= this → high_categorical, else high_numerical.
# NOTE(review): the suite-wide cell earlier in this file sets 0.6 for the same
# name; confirm which threshold is intended, since the last cell run wins.
CAT_FRAC_THRESHOLD = 0.5
DELAY_SECONDS = 3  # seconds slept after each per-task OpenML call to avoid overload


def build_ctr23_all_datasets_selection(delay: int = DELAY_SECONDS) -> pd.DataFrame:
    """
    Return ALL datasets in CTR23 (regression suite, id=353), one row per
    dataset, enriched with derived descriptor columns.

    Each task of the suite is resolved exactly once; the dataset IDs and the
    dataset -> task mapping are both collected in that single pass, with a
    sleep of ``delay`` seconds after every lookup to avoid overloading the
    server.

    Args:
        delay: Seconds to sleep after each per-task OpenML call.

    Returns:
        DataFrame sorted by (size_bin, feature_mix, name) with the raw
        metadata columns (did, name, NumberOf*, PercentageOfMissingValues)
        plus:
          - task_type: always "regression"
          - size_bin: small / medium / large per SIZE_BINS / SIZE_LABELS
          - cat_frac: symbolic / (symbolic + numeric) features, 0.0 when the
            denominator is zero
          - feature_mix: high_categorical if cat_frac >= CAT_FRAC_THRESHOLD,
            else high_numerical
          - missing_pct: PercentageOfMissingValues with missing values as 0.0
          - task_ids: list of CTR23 task IDs that use that dataset
    """
    def _ensure_columns(df: pd.DataFrame, cols, fill=np.nan) -> pd.DataFrame:
        # Add any column the OpenML response lacks so later selection is safe.
        for c in cols:
            if c not in df.columns:
                df[c] = fill
        return df

    def _safe_num(s: pd.Series) -> pd.Series:
        # Non-numeric entries become NaN instead of raising.
        return pd.to_numeric(s, errors="coerce")

    # --- Load CTR23 task set (regression) ---
    print("Fetching CTR23 suite metadata from OpenML...")
    suite = openml.study.get_suite(353)
    task_ids = suite.tasks
    print(f"Loaded suite with {len(task_ids)} tasks. Fetching dataset IDs...")

    # Single pass over the tasks: record did -> [task ids] while resolving
    # each task, so no task has to be fetched (and slept on) a second time.
    did_to_tasks = defaultdict(list)
    for i, tid in enumerate(task_ids, 1):
        task = openml.tasks.get_task(tid, download_data=False)
        did_to_tasks[task.dataset_id].append(tid)
        print(f"  [{i}/{len(task_ids)}] Got task {tid} → dataset {task.dataset_id}")
        time.sleep(delay)  # prevent hammering the API

    # Unique dataset IDs in first-seen order (several tasks may share one).
    dids = list(did_to_tasks)

    # --- Pull dataset metadata for ALL dids ---
    print("Downloading dataset metadata table...")
    meta = openml.datasets.list_datasets(data_id=dids, output_format="dataframe")
    cols_wanted = [
        "did", "name",
        "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses",
        "NumberOfNumericFeatures", "NumberOfSymbolicFeatures",
        "PercentageOfMissingValues"
    ]
    meta = _ensure_columns(meta, cols_wanted)
    df = meta[cols_wanted].copy()

    # numeric coercion (everything after "did" and "name")
    for c in cols_wanted[2:]:
        df[c] = _safe_num(df[c])

    # CTR23 is regression
    df["task_type"] = "regression"

    # size bins (for inspection)
    df["size_bin"] = pd.cut(
        df["NumberOfInstances"],
        bins=SIZE_BINS,
        labels=SIZE_LABELS,
        right=True,
        include_lowest=True
    )

    # feature mix & cat fraction; a zero denominator is turned into NaN first
    # so the division yields NaN, which is then reported as 0.0
    n_symbolic = df["NumberOfSymbolicFeatures"].fillna(0)
    den = n_symbolic + df["NumberOfNumericFeatures"].fillna(0)
    df["cat_frac"] = (n_symbolic / den.replace(0, np.nan)).fillna(0.0)
    df["feature_mix"] = np.where(df["cat_frac"] >= CAT_FRAC_THRESHOLD,
                                 "high_categorical", "high_numerical")

    # missingness
    df["missing_pct"] = _safe_num(df["PercentageOfMissingValues"]).fillna(0.0)

    # Map did -> CTR23 task IDs using the mapping gathered above
    print("Mapping dataset IDs to their corresponding tasks...")
    df["task_ids"] = df["did"].map(did_to_tasks)

    df = df.sort_values(["size_bin", "feature_mix", "name"]).reset_index(drop=True)
    print(f"\n✅ Finished collecting CTR23 datasets: {len(df)} total.")
    return df


# ============================ Example run =====================================
if __name__ == "__main__":
    # Example run: collect every CTR23 dataset, show it, then hand it to the
    # saver (save_selected_datasets is defined outside this cell).
    BASE_DIR = "./data"

    print("Building CTR23 selection with ALL datasets (delayed mode)...")
    sel = build_ctr23_all_datasets_selection(delay=3)

    # Preview only the columns that characterise each dataset.
    print("\nSelected datasets (all CTR23):")
    print(
        sel.loc[:, ["did", "name", "size_bin", "feature_mix", "cat_frac"]]
        .to_string(index=False)
    )

    print("\nSaving selected datasets + info.json...")
    summary = save_selected_datasets(sel, base_dir=BASE_DIR)

    # An empty summary gets a placeholder line instead of an empty table.
    print("\nSummary of saved datasets:")
    print(summary.to_string(index=False) if len(summary) else "(no datasets saved)")
