In [1]:
import enefit
env = enefit.make_env()
iter_test = env.iter_test()

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import xgboost as xgb

In [3]:
DATA_DIR = "/kaggle/input/predict-energy-behavior-of-prosumers/"
df_data = pd.read_csv(DATA_DIR + "train.csv")
df_client = pd.read_csv(DATA_DIR + "client.csv")
df_historical_weather = pd.read_csv(DATA_DIR + "historical_weather.csv")
df_forecast_weather = pd.read_csv(DATA_DIR + "forecast_weather.csv")
df_electricity_prices = pd.read_csv(DATA_DIR + "electricity_prices.csv")
df_gas_prices = pd.read_csv(DATA_DIR + "gas_prices.csv")
df_weather_station_to_county_mapping = pd.read_csv(DATA_DIR + "weather_station_to_county_mapping.csv")

In [4]:
def dbg(name, df):
    print(f"{name}: {len(df):,} rows")

def upsample_daily_to_hourly(df: pd.DataFrame, date_col: str) -> pd.DataFrame:
    # unsample (process) client data. Client data is reported daily, but we need it hourly.
    # So we take ever row (1 day) and duplicate (explode) it into 24 rows
    df_hourly = df.copy(deep=True) # deep copy to not make changes to dataframe from parameter
    df_hourly[date_col] = pd.to_datetime(df_hourly[date_col])

    # create a column of lists, where each list contains [00:00, 01:00, ... 23:00] for that day
    df_hourly['datetime'] = df_hourly[date_col].apply(lambda x: [x + pd.Timedelta(hours=i) for i in range(24)])
    df_hourly = df_hourly.explode('datetime')

    # drop the original daily date column as it's no longer needed
    df_hourly = df_hourly.drop(columns=[date_col])
    return df_hourly


# nb! needs more testing as this method was generated by AI.
def process_forecast_weather(df_forecast: pd.DataFrame, location_map: dict) -> pd.DataFrame:
    """
    Cleans, aggregates, and pivots forecast weather data.

    Logic:
    1. Map Lat/Lon to County ID.
    2. Convert 'Origin Time' (when forecast was made) to 'Target Time' (when weather happens).
    3. Group forecasts into 'batches' (Day 1 forecast vs Day 2 forecast).
    4. Average the values per county/hour/batch.
    5. Pivot so batches become columns (e.g., temperature_1, temperature_2).
    """
    # Safety Copy
    df = df_forecast.copy(deep=True)

    # 1. Map Coordinates to County
    # We use the dictionary passed in from the main function
    df['county'] = [location_map.get((x, y), -1) for x, y in zip(df['latitude'], df['longitude'])]

    # Filter out valid locations only
    df = df[df['county'] != -1]

    # 2. Calculate Target Time
    # Standardize origin time to 02:00:00 (removes minute/second noise)
    df['origin_datetime'] = pd.to_datetime(df['origin_datetime'])
    df['origin_datetime'] = pd.to_datetime(df['origin_datetime'].dt.date.astype(str) + ' 02:00:00')

    # Target Time = Origin + Hours Ahead
    df['forecast_datetime'] = df['origin_datetime'] + pd.to_timedelta(df['hours_ahead'], unit='h')

    # We don't need origin time anymore
    df.drop(columns=['origin_datetime'], inplace=True)

    # 3. Create "Batches" (cumcount) todo:rename
    # A batch represents how far out the forecast is (Day 1 vs Day 2)
    # (hours_ahead - 1) // 24 + 1 results in: 1 for 0-24h, 2 for 25-48h
    df['cumcount'] = (df['hours_ahead'] - 1) // 24 + 1

    # 4. Aggregate (Mean) by County, Time, and Batch
    # Identify feature columns (exclude IDs and Time)
    exclude_cols = ['latitude', 'longitude', 'hours_ahead', 'forecast_datetime', 'cumcount', 'county', 'data_block_id']
    feature_cols = [col for col in df.columns if col not in exclude_cols]

    agg_dict = {col: 'mean' for col in feature_cols}
    # We must keep 'cumcount' in the groupby keys, so we don't aggregate it

    df_grouped = df.groupby(['county', 'forecast_datetime', 'cumcount']).agg(agg_dict)

    # 5. Pivot (Unstack)
    # Moves 'cumcount' from a row index to a column suffix
    df_pivoted = df_grouped.unstack(level=-1)

    # Flatten MultiIndex columns: ('temperature', 1) -> 'temperature_1'
    df_pivoted.columns = [f'{col[0]}_{col[1]}' for col in df_pivoted.columns]

    df_pivoted.reset_index(inplace=True)
    df_pivoted.rename(columns={'forecast_datetime': 'datetime'}, inplace=True)

    # Handle missing values (if a forecast is missing, fill with 0 or strictly manage it)
    df_pivoted.fillna(0, inplace=True)

    return df_pivoted

def process_historical_weather(df_historical: pd.DataFrame, location_map: dict) -> pd.DataFrame:
    #compose historical weather data (averaging stations per county).
    df = df_historical.copy(deep=True)
    df['datetime'] = pd.to_datetime(df['datetime'])

    df['county'] = [location_map.get((x, y), -1) for x, y in zip(df['latitude'], df['longitude'])]
    df = df[df['county'] != -1]

    exclude_cols = ['latitude', 'longitude', 'datetime', 'county', 'data_block_id']
    agg_dict = {col: 'mean' for col in df.columns if col not in exclude_cols}
    df_grouped = df.groupby(['county', 'datetime']).agg(agg_dict)
    df_grouped.reset_index(inplace=True)

    return df_grouped


  #### CHANGE HERE NEW FUNCTION ####
def add_time_features(df: pd.DataFrame, datetime_col: str = "datetime") -> pd.DataFrame:
    out = df.copy(deep=True)
    dt = pd.to_datetime(out[datetime_col])

    # Basic calendar parts
    out["hour"] = dt.dt.hour.astype("int8")
    out["dayofweek"] = dt.dt.dayofweek.astype("int8")   
    out["month"] = dt.dt.month.astype("int8")          
    out["day"] = dt.dt.day.astype("int8")          
    out["is_weekend"] = (out["dayofweek"] >= 5).astype("int8")

    # Cyclical encodings
    out["hour_sin"] = np.sin(2 * np.pi * out["hour"] / 24)
    out["hour_cos"] = np.cos(2 * np.pi * out["hour"] / 24)

    out["month_sin"] = np.sin(2 * np.pi * out["month"] / 12)
    out["month_cos"] = np.cos(2 * np.pi * out["month"] / 12)

    return out

def generate_features(
    df_data: pd.DataFrame,
    df_client: pd.DataFrame,
    df_gas_prices: pd.DataFrame,
    df_electricity_prices: pd.DataFrame,
    df_historical_weather: pd.DataFrame,
    df_forecast_weather: pd.DataFrame,
    df_weather_station_to_county_mapping: pd.DataFrame,
    train_start = '2021-09-01 11:00:00'):
  
  print(dbg("start", df_data))

  # The weather data uses Latitude/Longitude, but the energy data uses "Counties".
  # We need a dictionary to translate coordinates into county IDs so we can join them later.
  # dictionary: {(lat, lon) -> county_id}
  df_weather_station_to_county_mapping = df_weather_station_to_county_mapping[
      df_weather_station_to_county_mapping.notnull().all(axis=1)
    ].sort_values(by="county")
  result_dict = dict(zip(
      zip(
        round(df_weather_station_to_county_mapping['latitude'],1),
        round(df_weather_station_to_county_mapping['longitude'],1)),
      df_weather_station_to_county_mapping['county']))
  df_historical_weather = df_historical_weather[df_historical_weather['datetime'] >= train_start]

  # ---------------------- client data -----------------------
  df_client_hourly = upsample_daily_to_hourly(df_client, date_col='date')
  if 'data_block_id' in df_client_hourly.columns:
    df_client_hourly.drop(columns=['data_block_id'], inplace=True)

  key = ["county", "product_type", "is_business", "datetime"]
  print(df_client_hourly.duplicated(key).sum())

  # merge into our main dataframe
  df_data['datetime'] = pd.to_datetime(df_data['datetime'])
  df_data = df_data.merge(df_client_hourly, on=['county','product_type','is_business','datetime'], how='left')

  print(dbg("after client merge", df_data))

  #### CHANGE HERE THREE ROWS ####
  # ---- client missing indicators + fill ----
  for col in ["eic_count", "installed_capacity"]:
    df_data[f"{col}_missing"] = df_data[col].isna().astype("int8")
    df_data[col] = df_data[col].fillna(0.0)

  # Filter data to ensure we don't go past the available client data
  client_end_date = df_client_hourly['datetime'].max()
  df_data = df_data[df_data['datetime'] <= client_end_date]

  # --------------- gas prices -------------------
  df_gas_hourly = upsample_daily_to_hourly(df_gas_prices, date_col='forecast_date')
  cols_to_drop = ['origin_date', 'data_block_id']
  df_gas_hourly.drop(columns=[c for c in cols_to_drop if c in df_gas_hourly.columns], inplace=True)

  gas_end_date = df_gas_hourly['datetime'].max()
  df_data = df_data[df_data['datetime'] <= gas_end_date]

  df_data = df_data.merge(df_gas_hourly, on=['datetime'], how='left')

  print(dbg("after gas merge", df_data))

  #--------------- electricity (already hourly) ------------------------------
  df_electricity_prices_try = df_electricity_prices.copy(deep=True)
  if 'origin_date' in df_electricity_prices_try.columns:
    df_electricity_prices_try.drop(columns=['origin_date'], inplace=True)
  if 'data_block_id' in df_electricity_prices_try.columns:
    df_electricity_prices_try.drop(columns=['data_block_id'], inplace=True)
  df_electricity_prices_try['forecast_date'] = pd.to_datetime(df_electricity_prices_try['forecast_date'])
  df_electricity_prices_try.rename(columns={"forecast_date": "datetime"}, inplace=True)
  df_data = df_data.merge(df_electricity_prices_try, on=['datetime'], how='left')

  print(dbg("after elec merge", df_data))

  # --- forecast weather ---
  df_forecast_processed = process_forecast_weather(df_forecast_weather, result_dict)
  df_data = df_data.merge(df_forecast_processed, on=['county', 'datetime'], how='left')

  print(dbg("after forecast merge", df_data))

  # --- process historical weather ---
  df_weather_processed = process_historical_weather(df_historical_weather, result_dict)
  df_data = df_data.merge(df_weather_processed, on=['county', 'datetime'], how='left')
  df_data = add_time_features(df_data, datetime_col="datetime")

  return df_data

In [5]:
def add_lag_rolling_features(
    df: pd.DataFrame,
    group_cols: list,
    target_col: str = "target",
    datetime_col: str = "datetime",
    lags: tuple = (1, 24, 48, 168),
    roll_windows: tuple = (24, 168),
    add_diff: bool = True
) -> pd.DataFrame:
    out = df.copy(deep=True)
    out[datetime_col] = pd.to_datetime(out[datetime_col])
    out = out.sort_values(group_cols + [datetime_col])

    g = out.groupby(group_cols, sort=False)[target_col]

    # Lags
    for L in lags:
        out[f"{target_col}_lag_{L}"] = g.shift(L)

    # Diffs
    if add_diff:
        out[f"{target_col}_diff_1"] = out[target_col] - out[f"{target_col}_lag_1"]
        if 24 in lags:
            out[f"{target_col}_diff_24"] = out[target_col] - out[f"{target_col}_lag_24"]

    # Rolling stats on past values (per group, no leakage)
    for W in roll_windows:
        out[f"{target_col}_roll_mean_{W}"] = g.shift(1).transform(
            lambda s: s.rolling(W, min_periods=max(3, W//10)).mean()
        )
        out[f"{target_col}_roll_std_{W}"] = g.shift(1).transform(
            lambda s: s.rolling(W, min_periods=max(3, W//10)).std()
        )

    return out

In [6]:
combined_df = generate_features(
    df_data,
    df_client,
    df_gas_prices,
    df_electricity_prices,
    df_historical_weather,
    df_forecast_weather,
    df_weather_station_to_county_mapping)

start: 2,018,352 rows
None
0
after client merge: 2,018,352 rows
None
after gas merge: 2,012,112 rows
None
after elec merge: 2,012,112 rows
None
after forecast merge: 2,012,112 rows
None


In [7]:
  # ---- lags & rolling stats ----
combined_df = add_lag_rolling_features(
    combined_df,
    group_cols=["county", "product_type", "is_business"],
    target_col="target",
    datetime_col="datetime",
    lags=(1, 24, 48, 168),
    roll_windows=(24, 168),
    add_diff=True
  )

In [8]:
# last n days as validation
gap_days = 7
val_days = 90
max_dt = combined_df["datetime"].max()
val_start = max_dt - pd.Timedelta(days=val_days)
gap_start = val_start - pd.Timedelta(days=gap_days)

train_full = combined_df[combined_df["datetime"] < gap_start].copy()
valid_full = combined_df[combined_df["datetime"] >= val_start].copy()

print("Train:", train_full["datetime"].min(), "→", train_full["datetime"].max(), len(train_full))
print("Valid:", valid_full["datetime"].min(), "→", valid_full["datetime"].max(), len(valid_full))

def make_balanced_subset_regression(
    df: pd.DataFrame,
    target_col: str = "target",
    group_cols=("prediction_unit_id", "is_consumption"),
    n_bins: int = 20,
    frac: float = 0.10,
    max_rows: int | None = 300_000,
    random_state: int = 343,
) -> pd.DataFrame:
    rng = np.random.default_rng(random_state)
    parts = []

    if max_rows is not None:
        frac = min(frac, max_rows / max(len(df), 1))

    for _, gdf in df.groupby(list(group_cols), sort=False):
        if len(gdf) < 100:
            parts.append(gdf)
            continue

        n_take = int(np.ceil(len(gdf) * frac))
        n_take = max(20, min(n_take, len(gdf)))

        y = gdf[target_col]
        try:
            y_binned = pd.qcut(y, q=min(n_bins, len(gdf)), duplicates="drop")
            if y_binned.nunique() < 2:
                idx = rng.choice(gdf.index.to_numpy(), size=n_take, replace=False)
                parts.append(gdf.loc[idx])
                continue

            sss = StratifiedShuffleSplit(n_splits=1, train_size=n_take, random_state=random_state)
            idx_take, _ = next(sss.split(np.zeros(len(gdf)), y_binned))
            parts.append(gdf.iloc[idx_take])

        except Exception:
            idx = rng.choice(gdf.index.to_numpy(), size=n_take, replace=False)
            parts.append(gdf.loc[idx])

    out = pd.concat(parts, axis=0)

    if max_rows is not None and len(out) > max_rows:
        out = out.sample(n=max_rows, random_state=random_state)

    return out.sort_values(["prediction_unit_id","is_consumption","datetime"]).reset_index(drop=True)

train_sub = make_balanced_subset_regression(
    train_full,
    frac=0.10,
    max_rows=300_000,
    n_bins=20,
    random_state=343
)

print("Balanced train subset rows:", len(train_sub))

# X and y for training
DROP_COLS = [
    "target", "row_id", "datetime"
]

#KEEP_COLS = [
#    "county", "product_type", "is_business", "is_consumption", "hour_sin", "hour_cos", "month_sin, month_cos",
#    "euros_per_mwh", "lowest_price_per_mwh", "highest_price_per_mwh"
#]

feature_cols = [c for c in combined_df.columns if c not in DROP_COLS]
#feature_cols = [c for c in combined_df.columns if c in KEEP_COLS]


X_train = train_sub[feature_cols]
y_train = train_sub["target"]

X_valid = valid_full[feature_cols]
y_valid = valid_full["target"]

print("X_train:", X_train.shape, "X_valid:", X_valid.shape)

Train: 2021-09-01 00:00:00 → 2023-02-21 22:00:00 1704108
Valid: 2023-02-28 23:00:00 → 2023-05-29 23:00:00 285588
Balanced train subset rows: 170480
X_train: (170480, 70) X_valid: (285588, 70)


In [9]:
#DROP_COLS = [
#    "target", "row_id", "datetime"
#]

#feature_cols = [c for c in combined_df.columns if c not in DROP_COLS]

#X_train = combined_df[feature_cols]
#y_train = combined_df["target"]

mask = y_train.notna()
X_train = X_train.loc[mask]
y_train = y_train.loc[mask]

xgbc = xgb.XGBRegressor()
fitted_xgbc = xgbc.fit(X_train, y_train)

In [10]:
import numpy as np
import pandas as pd

# series key -> last seen target
last_y = {}
printed = False

def update_state(revealed_targets: pd.DataFrame):
    if revealed_targets is None or len(revealed_targets) == 0:
        return

    rt = revealed_targets.copy()
    rt["is_business"] = rt["is_business"].astype(int)
    rt["is_consumption"] = rt["is_consumption"].astype(int)

    for r in rt.itertuples(index=False):
        key = (r.county, r.product_type, r.is_business, r.is_consumption)
        last_y[key] = float(r.target)

def add_lag1_from_state(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out["is_business"] = out["is_business"].astype(int)
    out["is_consumption"] = out["is_consumption"].astype(int)

    lag1 = []
    for r in out.itertuples(index=False):
        key = (r.county, r.product_type, r.is_business, r.is_consumption)
        lag1.append(last_y.get(key, np.nan))

    out["target_lag_1"] = lag1
    out["target_lag_1_missing"] = out["target_lag_1"].isna().astype("int8")
    out["target_lag_1"] = out["target_lag_1"].fillna(0.0)
    return out

def make_prediction_with_features(
    test, revealed_targets, client, historical_weather,
    forecast_weather, electricity_prices, gas_prices,
    sample_prediction, station_map, model, feature_cols
):
    global printed

    update_state(revealed_targets)

    feat = generate_features(
        df_data=test.copy(),
        df_client=client,
        df_gas_prices=gas_prices,
        df_electricity_prices=electricity_prices,
        df_historical_weather=historical_weather,
        df_forecast_weather=forecast_weather,
        df_weather_station_to_county_mapping=station_map
    )

    feat = add_lag1_from_state(feat)

    # ---- DEBUG ONCE ----
    if not printed:
        print("test rows:", len(test), "sample_prediction rows:", len(sample_prediction), "feat rows:", len(feat))
        print("test cols:", list(test.columns))
        print("sample_prediction cols:", list(sample_prediction.columns))
        printed = True

    # ---- ALIGN ROWS TO SAMPLE_PREDICTION ----
    # Enefit normally has row_id. If present, use it to guarantee correct shape/order.
    if "row_id" in sample_prediction.columns and "row_id" in feat.columns:
        feat_aligned = sample_prediction[["row_id"]].merge(feat, on="row_id", how="left")
    else:
        # fallback: assume same order/length as test; reindex to sample_prediction index
        feat_aligned = feat.reindex(sample_prediction.index)

    X_test = feat_aligned.reindex(columns=feature_cols, fill_value=np.nan)

    return X_test

all_preds = []
counter = 0
for (test, revealed_targets, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:

    test = test.rename(columns={'prediction_datetime':'datetime'})
    X_test = make_prediction_with_features(
        test, revealed_targets, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices,
        sample_prediction,
        station_map=df_weather_station_to_county_mapping,   # your mapping df
        model=fitted_xgbc,                                  # your trained model
        feature_cols=feature_cols
    )
    sample_prediction['target'] = fitted_xgbc.predict(X_test)
    env.predict(sample_prediction)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
start: 3,120 rows
None
0
after client merge: 3,120 rows
None
after gas merge: 0 rows
None
after elec merge: 0 rows
None
after forecast merge: 0 rows
None
test rows: 3120 sample_prediction rows: 3120 feat rows: 0
test cols: ['county', 'is_business', 'product_type', 'is_consumption', 'datetime', 'row_id', 'prediction_unit_id', 'currently_scored']
sample_prediction cols: ['row_id', 'target']
start: 3,120 rows
None
0
after client merge: 3,120 rows
None
after gas merge: 0 rows
None
after elec merge: 0 rows
None
after forecast merge: 0 rows
None
start: 3,120 rows
None
0
after client merge: 3,120 rows
None
after gas merge: 0 rows
None
after elec merge: 0 rows
None
after forecast merge: 0 rows
None
start: 3,120 rows
None
0
after client merge: 3,120 rows
None
after gas merge: 0 rows
None
after elec merge: 0 rows
None
after forecast merge: 0 rows
None
