In [11]:
from typing import Optional, Tuple

import numpy as np

from utils.dataset import baseline_preprocessing, fetch_openml_dataset_by_id
from models.gradient_boosting import GradientBoostingClassifier


def train_by_baseline(
    openml_id: int,
    seed: Optional[int] = None
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Train the baseline gradient-boosting classifier on one OpenML dataset.

    The dataset is obtained with ``fetch_openml_dataset_by_id`` and its result
    is unpacked into ``baseline_preprocessing``, which yields the train/test
    features, labels and sample weights. The classifier is fitted on the
    training split (using the training sample weights) and then asked to
    predict the test split.

    Args:
        openml_id: OpenML dataset id, e.g. 2 (anneal) or 41138 (APSFailure).
        seed: Value forwarded unchanged to both the dataset fetch and the
            classifier constructor.

    Returns:
        A 3-tuple ``(preds, y_test, weight_test)``: the predictions for the
        test split, the test labels and the test sample weights.
    """
    # id: {2, 41138} --> {anneal, APSFailure}
    X_train, X_test, y_train, y_test, weight_train, weight_test = baseline_preprocessing(
        *fetch_openml_dataset_by_id(openml_id, seed=seed)
    )

    # Only the training weights take part in fitting; the test weights are
    # handed back to the caller so it can compute weighted metrics.
    gb = GradientBoostingClassifier(seed=seed)
    gb.fit(X_train, y_train, sample_weight=weight_train)
    preds = gb.predict(X_test)
    return preds, y_test, weight_test


def accuracy(preds: np.ndarray, labels: np.ndarray, weights: Optional[np.ndarray] = None) -> float:
    """Return the (optionally weighted) fraction of correct predictions.

    Args:
        preds: Predicted labels, one per sample.
        labels: Ground-truth labels, same shape as ``preds``.
        weights: Per-sample weights. They are used as given (no
            normalisation), so the result is a proper accuracy only when they
            sum to 1. When omitted, every sample gets weight
            ``1 / labels.size``.

    Returns:
        The sum of the weights of the samples where ``preds == labels``.
    """
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    if weights is None:
        # Build the uniform weights as floats from the *shape* of ``labels``.
        # Deriving them with ``np.ones_like(labels)`` would inherit the label
        # dtype, which breaks for string class labels.
        weights = np.ones(labels.shape, dtype=float) / labels.size

    return float((preds == labels) @ weights)

In [9]:
# Fit the baseline on OpenML dataset 2 with a fixed seed and keep the
# test-split predictions, labels and sample weights for evaluation.
preds, labels, weights = train_by_baseline(openml_id=2, seed=0)

In [13]:
# Displayed as (weighted accuracy, unweighted accuracy) on the test split.
(
    accuracy(preds, labels, weights=weights),
    accuracy(preds, labels),
)

(0.9576184300645093, 0.9244444444444446)