In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import enefit
import warnings

warnings.filterwarnings('ignore')

DATA_DIR = "/kaggle/input/predict-energy-behavior-of-prosumers/"
ESTONIA_HOLIDAYS = [
    "2022-01-01", "2022-02-24", "2022-04-15", "2022-04-17", "2022-05-01",
    "2022-06-05", "2022-06-23", "2022-06-24", "2022-08-20", "2022-12-24",
    "2022-12-25", "2022-12-26", "2023-01-01", "2023-02-24", "2023-04-07",
    "2023-04-09", "2023-05-01", "2023-05-28", "2023-06-23", "2023-06-24",
    "2023-08-20", "2023-12-24", "2023-12-25", "2023-12-26"
]

def upsample_daily_to_hourly(df: pd.DataFrame, date_col: str) -> pd.DataFrame:
    if df.empty: return df
    df_hourly = df.copy()
    df_hourly[date_col] = pd.to_datetime(df_hourly[date_col])
    df_hourly['datetime'] = df_hourly[date_col].apply(
        lambda x: [x + pd.Timedelta(hours=i) for i in range(24)]
    )
    df_hourly = df_hourly.explode('datetime').drop(columns=[date_col])
    return df_hourly

def process_forecast_weather(df_forecast: pd.DataFrame, location_map: dict) -> pd.DataFrame:
    if df_forecast.empty: return pd.DataFrame()
    df = df_forecast.copy()
    
    df['county'] = [location_map.get((round(x,1), round(y,1)), -1) for x, y in zip(df['latitude'], df['longitude'])]
    df = df[df['county'] != -1]
    
    df['origin_datetime'] = pd.to_datetime(df['origin_datetime'])
    df['origin_date'] = df['origin_datetime'].dt.floor('D') 
    df['forecast_datetime'] = df['origin_date'] + pd.to_timedelta(df['hours_ahead'], unit='h')
    
    exclude = ['latitude', 'longitude', 'hours_ahead', 'forecast_datetime', 'origin_datetime', 'origin_date', 'county', 'data_block_id']
    feats = [c for c in df.columns if c not in exclude]
    
    df_grouped = df.groupby(['county', 'forecast_datetime'])[feats].mean().reset_index()
    df_grouped.rename(columns={'forecast_datetime': 'datetime'}, inplace=True)
    return df_grouped

def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    dt = df["datetime"].dt
    df["hour"] = dt.hour.astype("int8")
    df["month"] = dt.month.astype("int8")
    df["dayofweek"] = dt.dayofweek.astype("int8")
    
    holidays = pd.to_datetime(ESTONIA_HOLIDAYS)
    df["is_holiday"] = df["datetime"].dt.normalize().isin(holidays).astype("int8")
    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
    return df

# --- FEATURE GENERATION ---

def generate_features(
    df_data, df_client, df_gas, df_elec, df_forecast, location_map
):
    df_data['datetime'] = pd.to_datetime(df_data['datetime'])
    
    # 1. Client Data
    df_client_h = upsample_daily_to_hourly(df_client, 'date')
    if not df_client_h.empty:
        df_data = df_data.merge(
            df_client_h[['county', 'product_type', 'is_business', 'datetime', 'eic_count', 'installed_capacity']],
            on=['county', 'product_type', 'is_business', 'datetime'], 
            how='left'
        )
    
    # This handles the case where client data is slightly delayed
    df_data['eic_count'] = df_data.groupby(['county', 'is_business', 'product_type'])['eic_count'].ffill().fillna(0)
    df_data['installed_capacity'] = df_data.groupby(['county', 'is_business', 'product_type'])['installed_capacity'].ffill().fillna(0)

    # 2. Gas Prices
    df_gas_h = upsample_daily_to_hourly(df_gas, 'forecast_date')
    if not df_gas_h.empty:
        df_data = df_data.merge(df_gas_h[['datetime', 'lowest_price_per_mwh', 'highest_price_per_mwh']], on='datetime', how='left')
    df_data['lowest_price_per_mwh'] = df_data['lowest_price_per_mwh'].ffill().fillna(0)
    df_data['highest_price_per_mwh'] = df_data['highest_price_per_mwh'].ffill().fillna(0)

    if not df_elec.empty:
        df_elec = df_elec.rename(columns={'forecast_date': 'datetime'})
        df_elec['datetime'] = pd.to_datetime(df_elec['datetime'])
        df_data = df_data.merge(df_elec[['datetime', 'euros_per_mwh']], on='datetime', how='left')
        df_data['euros_per_mwh'] = df_data['euros_per_mwh'].ffill().fillna(0)

    df_weather = process_forecast_weather(df_forecast, location_map)
    if not df_weather.empty:
        df_data = df_data.merge(df_weather, on=['county', 'datetime'], how='left')
        # Fill weather NaNs (if any) with mean
        feat_cols = [c for c in df_weather.columns if c not in ['county', 'datetime']]
        df_data[feat_cols] = df_data[feat_cols].fillna(df_data[feat_cols].mean())

    df_data = add_time_features(df_data)
    
    return df_data

# --- LOAD DATA ---
print("Loading data...")
df_train = pd.read_csv(DATA_DIR + "train.csv")
df_client = pd.read_csv(DATA_DIR + "client.csv")
df_gas = pd.read_csv(DATA_DIR + "gas_prices.csv")
df_elec = pd.read_csv(DATA_DIR + "electricity_prices.csv")
df_forecast = pd.read_csv(DATA_DIR + "forecast_weather.csv")
df_map = pd.read_csv(DATA_DIR + "weather_station_to_county_mapping.csv")

df_map = df_map.dropna(subset=['latitude', 'longitude', 'county'])
loc_map = dict(zip(zip(round(df_map['latitude'],1), round(df_map['longitude'],1)), df_map['county']))

# --- PREPARE TRAINING ---
print("Generating features...")
df_all = generate_features(df_train, df_client, df_gas, df_elec, df_forecast, loc_map)
df_all = df_all.dropna(subset=['target']) # Only drop rows with no target during training

# split models
common_feats = [
    'county', 'product_type', 'is_business', 
    'eic_count', 'installed_capacity',
    'lowest_price_per_mwh', 'highest_price_per_mwh', 'euros_per_mwh',
    'temperature', 'dewpoint', 'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total',
    '10_metre_u_wind_component', '10_metre_v_wind_component', 'direct_solar_radiation',
    'hour', 'dayofweek', 'month', 'is_holiday', 'hour_sin', 'hour_cos'
]
# Filter features that actually exist in dataframe
features = [c for c in common_feats if c in df_all.columns]

print(f"Training on {len(df_all)} rows with features: {features}")

# Train Model 0 (Production)
X0 = df_all[df_all['is_consumption'] == 0][features]
y0 = df_all[df_all['is_consumption'] == 0]['target']
model_prod = xgb.XGBRegressor(
    n_estimators=1000, learning_rate=0.05, max_depth=6, 
    early_stopping_rounds=50, enable_categorical=True, device="cuda", tree_method="hist"
)
model_prod.fit(X0, y0, eval_set=[(X0, y0)], verbose=100)

# Train Model 1 (Consumption)
X1 = df_all[df_all['is_consumption'] == 1][features]
y1 = df_all[df_all['is_consumption'] == 1]['target']
model_cons = xgb.XGBRegressor(
    n_estimators=1000, learning_rate=0.05, max_depth=6, 
    early_stopping_rounds=50, enable_categorical=True, device="cuda", tree_method="hist"
)
model_cons.fit(X1, y1, eval_set=[(X1, y1)], verbose=100)

print("Training Done.")



Loading data...
Generating features...
Training on 2017824 rows with features: ['county', 'product_type', 'is_business', 'eic_count', 'installed_capacity', 'lowest_price_per_mwh', 'highest_price_per_mwh', 'euros_per_mwh', 'temperature', 'dewpoint', 'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total', '10_metre_u_wind_component', '10_metre_v_wind_component', 'direct_solar_radiation', 'hour', 'dayofweek', 'month', 'is_holiday', 'hour_sin', 'hour_cos']
Parameters: { "device" } are not used.

[0]	validation_0-rmse:378.81325
[100]	validation_0-rmse:102.59892
[200]	validation_0-rmse:87.70264
[300]	validation_0-rmse:80.60616
[400]	validation_0-rmse:75.14251
[500]	validation_0-rmse:71.01000
[600]	validation_0-rmse:67.44845
[700]	validation_0-rmse:64.67036
[800]	validation_0-rmse:62.39110
[900]	validation_0-rmse:60.35809
[999]	validation_0-rmse:58.43324
Parameters: { "device" } are not used.

[0]	validation_0-rmse:1223.78557
[100]	validation_0-rmse:183.76288
[200]	validati

In [2]:
# Storage class to accumulate API data
class HistoryStorage:
    def __init__(self):
        self.client = df_client.copy()
        self.gas = df_gas.copy()
        self.elec = df_elec.copy()
        self.forecast = df_forecast.copy()
        
    def update(self, client_new, gas_new, elec_new, forecast_new):
        # Concatenate and de-duplicate (keep latest)
        self.client = pd.concat([self.client, client_new]).drop_duplicates(subset=['county', 'is_business', 'product_type', 'date'], keep='last')
        self.gas = pd.concat([self.gas, gas_new]).drop_duplicates(subset=['forecast_date'], keep='last')
        self.elec = pd.concat([self.elec, elec_new]).drop_duplicates(subset=['forecast_date'], keep='last')
        self.forecast = pd.concat([self.forecast, forecast_new]).drop_duplicates(subset=['latitude', 'longitude', 'hours_ahead', 'origin_datetime'], keep='last')

storage = HistoryStorage()

env = enefit.make_env()
iter_test = env.iter_test()

In [3]:
print("Starting inference...")

for (test, revealed_targets, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:
    storage.update(client, gas_prices, electricity_prices, forecast_weather)
    
    test['datetime'] = pd.to_datetime(test['prediction_datetime'])
    X_test = generate_features(
        test, 
        storage.client, 
        storage.gas, 
        storage.elec, 
        storage.forecast, 
        loc_map
    )
    
    X_test_vals = X_test[features]
    
    cat_cols = ['county', 'product_type', 'is_business', 'is_consumption']
    for col in cat_cols:
        if col in X_test_vals.columns:
            X_test_vals[col] = X_test_vals[col].astype('category')
    
    mask_cons = X_test['is_consumption'] == 1
    preds = np.zeros(len(X_test))
    
    # Predict Consumption
    if mask_cons.any():
        X_cons = X_test_vals[mask_cons]
        dtest_cons = xgb.DMatrix(X_cons, enable_categorical=True, feature_names=features)
        preds[mask_cons] = model_cons.get_booster().predict(dtest_cons)
        
    # Predict Production
    if (~mask_cons).any():
        X_prod = X_test_vals[~mask_cons]
        dtest_prod = xgb.DMatrix(X_prod, enable_categorical=True, feature_names=features)
        preds[~mask_cons] = model_prod.get_booster().predict(dtest_prod)
        
    sample_prediction['target'] = np.clip(preds, 0, None)
    
    env.predict(sample_prediction)

print("Inference finished.")

Starting inference...
This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
Inference finished.
