In [1]:
from typing import Literal, Optional, Tuple

import numpy as np

from utils.dataset import baseline_preprocessing, fetch_openml_dataset_by_id, nan_feat_preprocessing
from models.gradient_boosting import GradientBoostingClassifier


# Preprocessing variants accepted by `train_by_baseline`.
METHOD_CHOICES = ["baseline", "nan_feat"]


def train_by_baseline(
    openml_id: int,
    seed: Optional[int] = None,
    method: Literal["baseline", "nan_feat"] = "baseline"
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Train a gradient-boosting classifier on an OpenML dataset and predict its test split.

    The dataset is fetched with `fetch_openml_dataset_by_id`, passed through the
    preprocessing function selected by `method`, and used to fit a
    `GradientBoostingClassifier` with the training sample weights.

    Args:
        openml_id: OpenML dataset id forwarded to `fetch_openml_dataset_by_id`.
        seed: Seed forwarded to both the dataset fetch and the classifier.
        method: "baseline" selects `baseline_preprocessing`; "nan_feat" selects
            `nan_feat_preprocessing`.

    Returns:
        A 3-tuple `(preds, y_test, weight_test)`: the classifier's predictions on
        the test features, the test labels and the test sample weights.
        NOTE(review): the values are annotated as numpy arrays, but the concrete
        types are whatever the preprocessing helpers return -- confirm.

    Raises:
        ValueError: If `method` is not one of `METHOD_CHOICES`.
    """
    if method not in METHOD_CHOICES:
        raise ValueError(f"method must be in {METHOD_CHOICES}, but got {method}")

    preproc = baseline_preprocessing if method == "baseline" else nan_feat_preprocessing

    # id: {2, 41138} --> {anneal, APSFailure}
    X_train, X_test, y_train, y_test, weight_train, weight_test = preproc(
        *fetch_openml_dataset_by_id(openml_id, seed=seed)
    )
    # Show the training-matrix shape so the two preprocessing variants can be compared.
    print(X_train.shape)
    gb = GradientBoostingClassifier(seed=seed)
    gb.fit(X_train, y_train, sample_weight=weight_train)
    preds = gb.predict(X_test)
    return preds, y_test, weight_test


def accuracy(preds: np.ndarray, labels: np.ndarray, weights: Optional[np.ndarray] = None) -> float:
    """Return the (optionally weighted) share of predictions that equal the labels.

    Args:
        preds: Predicted labels.
        labels: Ground-truth labels, compared element-wise with `preds`.
        weights: Per-sample weights. They are used exactly as given (no
            normalisation is applied here). When omitted, every sample gets the
            uniform weight `1 / labels.size`.

    Returns:
        The dot product of the element-wise match indicator with the weights.
    """
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    if weights is None:
        # Build float weights from the shape only. `np.ones_like(labels)` would
        # inherit the label dtype, which cannot be divided for string labels.
        weights = np.ones(labels.shape, dtype=float) / labels.size
    else:
        weights = np.asarray(weights)
    return (preds == labels) @ weights

In [2]:
# OpenML ids: 2 -> anneal, 41138 -> APSFailure.
# For each dataset, run both preprocessing variants and print the weighted
# accuracy followed by the unweighted accuracy.
for data_id in (2, 41138):
    for method in ("baseline", "nan_feat"):
        preds, labels, weights = train_by_baseline(data_id, seed=0, method=method)
        weighted_acc = accuracy(preds, labels, weights)
        uniform_acc = accuracy(preds, labels)
        print(weighted_acc, uniform_acc)

(673, 83)
0.9576184300645093 0.9244444444444446
(673, 136)
0.9576184300645094 0.9244444444444446
(57000, 170)
0.9381362532426982 0.9590526315789469
(57000, 509)
0.9381362532426982 0.9590526315789469
