# Isolation Forest Baseline
The anomaly-detection baseline evaluates an Isolation Forest on the same aggregated features
that `ml_core` exports.

In [None]:
from datetime import date

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score

from ml_core.features import FeatureConfig
from ml_core.experiments.logging import log_experiment
from ml_core.features.pipeline import build_training_set

class MockExecutor:
    """Stand-in query executor that serves small, hard-coded DataFrames.

    The query text is only inspected for the substrings ``"Posting"`` and
    ``"PayRun"`` (checked in that order); any other query receives a
    single-row fallback frame.
    """

    def fetch_dataframe(self, query: str, *, params=None):
        """Return a canned DataFrame selected by substring match on *query*.

        ``params`` must be a mapping providing ``"as_of"`` (the last date of
        every generated date range) and, for posting queries, ``"org_id"``.
        """

        def days_ending_as_of(periods):
            # Looked up lazily so a missing key surfaces only in the branch
            # that actually needs it.
            return pd.date_range(end=params["as_of"], periods=periods)

        if "Posting" in query:
            posting_columns = {
                "org_id": [params["org_id"]] * 3,
                "occurred_date": days_ending_as_of(3),
                "account_id": ["acct"] * 3,
                "net_amount_aud": [1000, -250, 500],
                "posting_count": [5, 4, 3],
            }
            return pd.DataFrame(posting_columns)

        if "PayRun" in query:
            payment_days = days_ending_as_of(3)
            # Each pay period closes two days before its payment date.
            period_ends = days_ending_as_of(3) - pd.Timedelta(days=2)
            pay_run_columns = {
                "payment_date": payment_days,
                "period_end": period_ends,
                "payslip_count": [9, 10, 11],
                "gross_pay_amount": [48000, 50000, 49500],
            }
            return pd.DataFrame(pay_run_columns)

        # Fallback: one open, unresolved record opened on the as-of date.
        fallback_columns = {
            "status": ["OPEN"],
            "opened_at": days_ending_as_of(1),
            "resolved_at": [pd.NaT],
            "source": ["reconciliation"],
        }
        return pd.DataFrame(fallback_columns)

# Build the feature matrix from the mocked executor.
executor = MockExecutor()
config = FeatureConfig(org_id="demo-org", as_of_date=date.today())
# Drop the `org_id` column before converting the training set to a NumPy array.
features = build_training_set(executor, config).drop(columns=["org_id"]).to_numpy()

# Experiment knobs, named once so the model, the labels and the logged params agree.
CONTAMINATION = 0.1  # anomaly fraction used for both the labels and the model
N_REPEATS = 100  # number of jittered copies of the feature rows
NOISE_SCALE = 0.1  # standard deviation of the Gaussian jitter
ANOMALY_SHIFT = 10 * NOISE_SCALE  # offset applied to the labelled anomalies

rng = np.random.default_rng(seed=123)
base = np.tile(features, (N_REPEATS, 1))
# Size the noise from the tiled matrix rather than a literal 100 so the cell
# also works when the training set has more than one row.
X = base + rng.normal(scale=NOISE_SCALE, size=base.shape)

n_samples = X.shape[0]
n_anomalies = round(CONTAMINATION * n_samples)
y = np.zeros(n_samples, dtype=int)
anomaly_idx = rng.choice(n_samples, size=n_anomalies, replace=False)
y[anomaly_idx] = 1  # synthetic anomalies
# Actually perturb the labelled rows. Without this the labels are independent
# of X and precision/recall only measure chance agreement.
X[anomaly_idx] += ANOMALY_SHIFT

model = IsolationForest(random_state=123, contamination=CONTAMINATION)
model.fit(X)
# NOTE: despite the name, `predict` returns labels (-1 = outlier, +1 = inlier),
# not continuous scores; the name is kept for any later cells that use it.
scores = model.predict(X)
preds = (scores == -1).astype(int)
precision = precision_score(y, preds, zero_division=0)
recall = recall_score(y, preds, zero_division=0)

log_experiment(
    "isolation_forest_baseline",
    params={"contamination": CONTAMINATION},
    metrics={"precision": float(precision), "recall": float(recall)},
)

# Last expression of the cell: displayed as the notebook output.
precision, recall
