# Gradient Boosting Baseline
This notebook trains a gradient boosting classifier on synthetic ledger and payroll features
to validate the `ml_core` feature pipelines.

In [None]:
from datetime import date

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score

from ml_core.features import FeatureConfig
from ml_core.experiments.logging import log_experiment

# Mock executor returns deterministic features for demonstration
class MockExecutor:
    """Canned-data executor used in place of a real query backend.

    ``fetch_dataframe`` never runs the query; it only inspects the query text
    to decide which hard-coded frame to hand back.
    """

    def fetch_dataframe(self, query: str, *, params=None):
        """Return a fixed DataFrame selected by a substring of ``query``.

        Args:
            query: Query text. ``"Posting"`` is checked first, then
                ``"PayRun"``; anything else gets the two-row fallback frame.
            params: Mapping that must provide ``"as_of"`` (the last date of
                every generated date range) and, for the ``"Posting"`` frame,
                ``"org_id"``. Despite the ``None`` default, a mapping is
                required because it is subscripted on every path.

        Returns:
            A ``pandas.DataFrame`` with 6, 4 or 2 rows depending on the branch.
        """

        def days_ending_as_of(n_days):
            # Consecutive timestamps whose final entry is params["as_of"].
            return pd.date_range(end=params["as_of"], periods=n_days)

        if "Posting" in query:
            row_count = 6
            org = params["org_id"]
            columns = {
                "org_id": [org] * row_count,
                "occurred_date": days_ending_as_of(row_count),
                "account_id": ["acct"] * row_count,
                "net_amount_aud": np.linspace(-500, 500, row_count),
                "posting_count": [3, 4, 5, 3, 2, 1],
            }
        elif "PayRun" in query:
            pay_dates = days_ending_as_of(4)
            one_day = pd.Timedelta(days=1)
            columns = {
                "payment_date": pay_dates,
                # Each period ends the day before its payment date.
                "period_end": pay_dates - one_day,
                "payslip_count": [10, 12, 9, 11],
                "gross_pay_amount": [55000, 53000, 54000, 52000],
            }
        else:
            # Fallback frame for every query that matched neither keyword.
            event_dates = days_ending_as_of(2)
            columns = {
                "status": ["OPEN", "CLOSED"],
                "opened_at": event_dates,
                "resolved_at": event_dates,
                "source": ["reconciliation", "operations"],
            }

        return pd.DataFrame(columns)

# Build the feature frame from the mocked executor.
executor = MockExecutor()
config = FeatureConfig(org_id="demo-org", as_of_date=date.today())

from ml_core.features.pipeline import build_training_set
features = build_training_set(executor, config)

# Synthetic binary target
rng = np.random.default_rng(seed=42)
# Stack the feature rows repeatedly so the classifier has more than a handful
# of samples to fit on.
n_repeats = 50
X = np.tile(features.drop(columns=["org_id"]).to_numpy(), (n_repeats, 1))
# Draw one label per row of X. A hard-coded count of 50 only lines up when the
# feature frame has exactly one row; any other row count makes X and y
# disagree on the number of samples and fit() rejects them.
y = rng.integers(0, 2, size=X.shape[0])

model = GradientBoostingClassifier(random_state=42)
model.fit(X, y)
# NOTE: predictions are made on the same rows the model was fitted on, so the
# metrics below are in-sample and only show that the pipeline runs end to end.
preds = model.predict(X)
precision = precision_score(y, preds, zero_division=0)
recall = recall_score(y, preds, zero_division=0)

log_experiment(
    "gradient_boosting_baseline",
    params={"n_features": X.shape[1]},
    # Cast to built-in floats before handing the metrics to the logger.
    metrics={"precision": float(precision), "recall": float(recall)},
)

# Last expression of the cell: shown as the notebook cell output.
precision, recall
