In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Time-based hold-out: the most recent `val_days` days form the validation
# set, and the `gap_days` immediately before it are dropped from training so
# the two sets are separated by a buffer.
gap_days = 7
val_days = 90
max_dt = train["datetime"].max()
val_start = max_dt - pd.Timedelta(days=val_days)
gap_start = val_start - pd.Timedelta(days=gap_days)

# Training rows are strictly before the gap; validation rows start at val_start.
train_full = train.loc[train["datetime"].lt(gap_start)].copy()
valid_full = train.loc[train["datetime"].ge(val_start)].copy()

# Report the covered date range and row count of each split.
print("Train:", train_full["datetime"].min(), "→", train_full["datetime"].max(), len(train_full))
print("Valid:", valid_full["datetime"].min(), "→", valid_full["datetime"].max(), len(valid_full))

def make_balanced_subset_regression(
    df: pd.DataFrame,
    target_col: str = "target",
    group_cols=("prediction_unit_id", "is_consumption"),
    n_bins: int = 20,
    frac: float = 0.10,
    max_rows: int | None = 300_000,
    random_state: int = 343,
) -> pd.DataFrame:
    rng = np.random.default_rng(random_state)
    parts = []

    if max_rows is not None:
        frac = min(frac, max_rows / max(len(df), 1))

    for _, gdf in df.groupby(list(group_cols), sort=False):
        if len(gdf) < 100:
            parts.append(gdf)
            continue

        n_take = int(np.ceil(len(gdf) * frac))
        n_take = max(20, min(n_take, len(gdf)))

        y = gdf[target_col]
        try:
            y_binned = pd.qcut(y, q=min(n_bins, len(gdf)), duplicates="drop")
            if y_binned.nunique() < 2:
                idx = rng.choice(gdf.index.to_numpy(), size=n_take, replace=False)
                parts.append(gdf.loc[idx])
                continue

            sss = StratifiedShuffleSplit(n_splits=1, train_size=n_take, random_state=random_state)
            idx_take, _ = next(sss.split(np.zeros(len(gdf)), y_binned))
            parts.append(gdf.iloc[idx_take])

        except Exception:
            idx = rng.choice(gdf.index.to_numpy(), size=n_take, replace=False)
            parts.append(gdf.loc[idx])

    out = pd.concat(parts, axis=0)

    if max_rows is not None and len(out) > max_rows:
        out = out.sample(n=max_rows, random_state=random_state)

    return out.sort_values(["prediction_unit_id","is_consumption","datetime"]).reset_index(drop=True)

# Down-sample the training period per series, stratified on the target.
train_sub = make_balanced_subset_regression(
    train_full, n_bins=20, frac=0.10, max_rows=300_000, random_state=343
)

print("Balanced train subset rows:", len(train_sub))

# Model inputs: every column of `train` except the target and these
# non-feature columns.
DROP_COLS = ["target", "row_id", "datetime"]
feature_cols = train.columns[~train.columns.isin(DROP_COLS)].tolist()

# Training uses the balanced subset; validation uses the full hold-out period.
X_train, y_train = train_sub[feature_cols], train_sub["target"]
X_valid, y_valid = valid_full[feature_cols], valid_full["target"]

print("X_train:", X_train.shape, "X_valid:", X_valid.shape)