In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import lightgbm

In [2]:
# Root folder of the competition files.
DATA_PATH = "/kaggle/input/predict-energy-behavior-of-prosumers"

# Read the training table (parsing `datetime` into timestamps) and, in the
# same expression, cast the id columns to int and the flag columns to float.
DATA = pd.read_csv(
    f"{DATA_PATH}/train.csv", parse_dates=["datetime"]
).astype(
    {
        "county": int,
        "product_type": int,
        "is_business": float,
        "is_consumption": float,
    }
)

## Load and preprocess training data

In [3]:
# Feature engineering
def temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with calendar features derived from ``datetime``.

    The float columns ``month``, ``day``, ``weekday``, ``hour`` and
    ``is_weekend`` are appended (``is_weekend`` is 1.0 when the weekday
    number is 5 or greater, otherwise 0.0). ``df`` itself is not modified
    because ``DataFrame.assign`` builds a new object.
    """
    stamps = df["datetime"].dt
    weekday_number = stamps.weekday

    calendar_columns = {
        "month": stamps.month.astype(float),
        "day": stamps.day.astype(float),
        "weekday": weekday_number.astype(float),
        "hour": stamps.hour.astype(float),
        "is_weekend": (weekday_number >= 5).astype(float),
    }
    return df.assign(**calendar_columns)


def fourier_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with sine/cosine encodings of the cyclic columns.

    For ``month`` (period 12, harmonics 1-2), ``weekday`` (period 7,
    harmonic 1) and ``hour`` (period 24, harmonics 1-2) the columns
    ``{column}_sin_{k}`` and ``{column}_cos_{k}`` are added, holding
    ``sin(2*pi*k*value/period)`` and ``cos(2*pi*k*value/period)``.

    The new columns are collected first and attached with
    ``DataFrame.assign``, so the frame passed in by the caller is never
    mutated.

    Args:
        df: Frame containing numeric ``month``, ``weekday`` and ``hour``
            columns.

    Returns:
        A new DataFrame with the original columns followed by the encodings.

    Raises:
        KeyError: If one of the three source columns is missing.
    """
    def harmonics(column: str, period: float, max_freq: int = 1) -> dict:
        # The base angle is the same for every harmonic, so compute it once.
        base_angle = 2 * np.pi * np.array(df[column] / period)
        encoded = {}
        for freq in range(1, max_freq + 1):
            encoded[f"{column}_sin_{freq}"] = np.sin(base_angle * freq)
            encoded[f"{column}_cos_{freq}"] = np.cos(base_angle * freq)
        return encoded

    new_columns = {
        **harmonics("month", 12, 2),
        **harmonics("weekday", 7, 1),
        **harmonics("hour", 24, 2),
    }
    return df.assign(**new_columns)


def lagged_target_features(df: pd.DataFrame, lags: list[int]) -> pd.DataFrame:
    """Append lagged copies of ``target`` looked up within ``df`` itself.

    For every ``lag`` (in hours) two columns are added:

    * ``datetime_lag_{lag}`` - the row's ``datetime`` minus ``lag`` hours;
    * ``target_lag_{lag}`` - the ``target`` of the row in ``df`` that has that
      datetime and the same ``prediction_unit_id`` / ``is_consumption``
      (NaN when no such row exists).

    Args:
        df: Frame with ``datetime``, ``prediction_unit_id``,
            ``is_consumption`` and ``target`` columns.
        lags: Lags in hours.

    Returns:
        A new frame with the same rows and index as ``df`` plus the lag
        columns, in the order of ``lags``.

    Raises:
        pandas.errors.MergeError: If a (``datetime``, ``prediction_unit_id``,
            ``is_consumption``) combination occurs more than once in ``df``;
            the lookup would otherwise multiply rows silently.
    """
    key_cols = ["prediction_unit_id", "is_consumption"]
    history = df[["datetime", *key_cols, "target"]]

    lagged_feature_dfs = []
    for lag in lags:
        lag_col = f"datetime_lag_{lag}"
        lookup = pd.DataFrame(
            {
                lag_col: df["datetime"] - pd.Timedelta(hours=lag),
                "prediction_unit_id": df["prediction_unit_id"],
                "is_consumption": df["is_consumption"],
            }
        )
        merged = pd.merge(
            lookup,
            history,
            how="left",
            left_on=[lag_col, *key_cols],
            right_on=["datetime", *key_cols],
            # Guarantees exactly one output row per input row.
            validate="many_to_one",
        )
        lagged = (
            merged[[lag_col, "target"]]
            .rename(columns={"target": f"target_lag_{lag}"})
            # `merge` returns a fresh 0..n-1 index; put the caller's index back
            # so the column-wise concat below aligns row by row even when `df`
            # does not carry a default RangeIndex.
            .set_axis(df.index, axis=0)
        )
        lagged_feature_dfs.append(lagged)
    return pd.concat([df] + lagged_feature_dfs, axis=1)
        

In [4]:
# Temporal features
DATA = temporal_features(DATA)

# Fourier features
DATA = fourier_features(DATA)

# Lagged target features (lags are expressed in hours)
lags = [1, 2, 7, 24, 7 * 24]
DATA = lagged_target_features(DATA, lags)

# Drop the helper datetime columns that were only needed for the lag lookup
lagged_datetime_cols = [f"datetime_lag_{lag}" for lag in lags]
DATA = DATA.drop(lagged_datetime_cols, axis=1)

# Drop rows with a missing target or a missing lagged target.
# The column list deliberately has its own name: binding it to
# `lagged_target_features` would replace the function of that name with a list,
# and re-running this cell would then fail with "'list' object is not callable".
lagged_target_cols = [f"target_lag_{lag}" for lag in lags]
DATA = DATA.dropna(subset=["target"] + lagged_target_cols)

## Baseline models

In [None]:
# Feature selection
CATEGORICAL_FEATURES = ["county", "product_type"]
LABEL = "target"

# Columns that must not be used as model inputs:
#  - the label itself and the raw timestamp;
#  - `row_id`, a per-row identifier;
#  - `data_block_id`, which exists in the training table but not in the test
#    frames served by the competition API, so `X_test[FEATURES]` could not be
#    built at prediction time.
NON_FEATURE_COLUMNS = [LABEL, "datetime", "row_id", "data_block_id"]
FEATURES = [c for c in DATA.columns if c not in NON_FEATURE_COLUMNS]

print(DATA[FEATURES].shape, DATA[LABEL].shape)

In [None]:
# LightGBM parameter reference:
# https://lightgbm.readthedocs.io/en/latest/Parameters.html
TRAINING_PARAMS = dict(
    objective="regression",
    metric="mae",
    boosting_type="gbdt",
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [None]:
# Wrap the selected feature columns and the label column in a LightGBM dataset.
# Feature names are left on "auto" and the columns listed in
# CATEGORICAL_FEATURES are declared categorical.
train_dataset = lightgbm.Dataset(
    DATA[FEATURES],
    label=DATA[LABEL],
    feature_name="auto",
    categorical_feature=CATEGORICAL_FEATURES,
)

In [None]:
# Fit the baseline booster on the whole training dataset for a fixed 1000
# boosting rounds; no validation set is passed to this call.
model = lightgbm.train(
    params=TRAINING_PARAMS,
    train_set=train_dataset,
    num_boost_round=1000,
)

In [None]:
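# NOTE: disabled cross-validation draft. It reads a frame named `DATASET`, which is
# not defined anywhere in the visible notebook (the training frame is `DATA`);
# rename it before re-enabling this cell.
# # Train / val split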
# start_datetime, stop_datetime = DATASET["datetime"].min(), DATASET["datetime"].max()
# total_days = int((stop_datetime - start_datetime).total_seconds() / 3600 / 24)

# n_folds = 10
# evaluation_days = 30
# max_training_days = 180

# start_evaluation_set = stop_datetime.floor("1D") - n_folds * pd.Timedelta(days=evaluation_days)
# start_training_set = start_evaluation_set - pd.Timedelta(days=max_training_days)

# scores, iterations = [], []
# for i in range(n_folds):
#     val_start = start_evaluation_set + i * pd.Timedelta(days=evaluation_days)
#     val_end = val_start + pd.Timedelta(days=evaluation_days)
#     train_start = val_end - pd.Timedelta(days=max_training_days)
    
#     # Select and transform training data
#     train = DATASET[(DATASET["datetime"] >= train_start) & (DATASET["datetime"] < val_start)]
#     X_train, y_train = train[FEATURES], train[LABEL]
#     train_lgbm = lightgbm.Dataset(data=X_train, label=y_train)
    
#     # Select and transform validation data
#     val = DATASET[(DATASET["datetime"] >= val_start) & (DATASET["datetime"] <= val_end)]
#     X_val, y_val = val[FEATURES], val[LABEL]
#     val_lgbm = lightgbm.Dataset(X_val, y_val)
    
#     model = lightgbm.train(
#         TRAINING_PARAMS, 
#         train_lgbm,
#         num_boost_round=1000,
#         valid_sets=[val_lgbm],
#         callbacks=[lightgbm.callback.early_stopping(10)]
#     )
#     y_hat = model.predict(X_val)
#     scores.append(np.mean(np.abs(y_hat - np.array(y_val))))
#     iterations.append(model.best_iteration)

# mean_score = np.mean(scores)
# mean_iterations = np.mean(iterations)

# print(mean_score)

## Final Model

In [None]:
# %%time

# # TODO: consider training only on the last few months rather than on the full dataset.
# # TODO: check which timestamps predictions actually have to be made for.
# model = lightgbm.train(
#     TRAINING_PARAMS, 
#     lightgbm.Dataset(DATASET[FEATURES], DATASET[LABEL]),
#     num_boost_round=int(mean_iterations),
# )

## Submission

In [6]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
import enefit

# Create the competition environment and the iterator over test batches that
# the submission loop consumes.
# NOTE(review): presumably `make_env()` can only be called once per kernel
# session - confirm against the competition API before re-running this cell.
env = enefit.make_env()
iter_test = env.iter_test()

In [47]:
def preprocess_test_data(df: pd.DataFrame) -> pd.DataFrame:
    """Bring a raw test batch into the layout used for the training data.

    ``county`` / ``product_type`` are cast to int, ``is_business`` /
    ``is_consumption`` to float, ``prediction_datetime`` is renamed to
    ``datetime`` and that column is converted with ``pd.to_datetime``.
    All other columns (including ``row_id`` and ``prediction_unit_id``) are
    kept. A new frame is returned; ``df`` is left unchanged.
    """
    column_dtypes = {"county": int, "product_type": int, "is_business": float, "is_consumption": float}
    renamed = df.astype(column_dtypes).rename(columns={"prediction_datetime": "datetime"})
    return renamed.assign(datetime=pd.to_datetime(renamed["datetime"]))


def lagged_test_target_features(
    test_df: pd.DataFrame, train_df: pd.DataFrame, lags: list[int]
) -> pd.DataFrame:
    """Append lagged targets to ``test_df``, looked up in ``train_df``.

    For every ``lag`` (in hours) two columns are added to the test rows:

    * ``datetime_lag_{lag}`` - the row's ``datetime`` minus ``lag`` hours;
    * ``target_lag_{lag}`` - the ``target`` of the ``train_df`` row that has
      that datetime and the same ``prediction_unit_id`` / ``is_consumption``
      (NaN when ``train_df`` holds no such row).

    Args:
        test_df: Frame with ``datetime``, ``prediction_unit_id`` and
            ``is_consumption`` columns.
        train_df: History frame with ``datetime``, ``prediction_unit_id``,
            ``is_consumption`` and ``target`` columns.
        lags: Lags in hours.

    Returns:
        A new frame with the same rows and index as ``test_df`` plus the lag
        columns, in the order of ``lags``.

    Raises:
        pandas.errors.MergeError: If a (``datetime``, ``prediction_unit_id``,
            ``is_consumption``) combination occurs more than once in
            ``train_df``; the lookup would otherwise multiply rows silently.
    """
    key_cols = ["prediction_unit_id", "is_consumption"]
    history = train_df[["datetime", *key_cols, "target"]]

    lagged_feature_dfs = []
    for lag in lags:
        lag_col = f"datetime_lag_{lag}"
        lookup = pd.DataFrame(
            {
                lag_col: test_df["datetime"] - pd.Timedelta(hours=lag),
                "prediction_unit_id": test_df["prediction_unit_id"],
                "is_consumption": test_df["is_consumption"],
            }
        )
        merged = pd.merge(
            lookup,
            history,
            how="left",
            left_on=[lag_col, *key_cols],
            right_on=["datetime", *key_cols],
            # Guarantees exactly one output row per test row.
            validate="many_to_one",
        )
        lagged = (
            merged[[lag_col, "target"]]
            .rename(columns={"target": f"target_lag_{lag}"})
            # `merge` returns a fresh 0..n-1 index; put the test frame's index
            # back so the column-wise concat below aligns row by row even when
            # `test_df` does not carry a default RangeIndex.
            .set_axis(test_df.index, axis=0)
        )
        lagged_feature_dfs.append(lagged)

    return pd.concat([test_df] + lagged_feature_dfs, axis=1)

In [56]:
# NOTE(review): `test_dfs` is only filled by the `iter_test` loop in a later
# cell of this notebook, so this cell fails on a fresh top-to-bottom run.
# Take the second collected test batch, normalise its columns and attach the
# 1-hour target lag looked up in the training frame.
test_df = test_dfs[1]
test_df = preprocess_test_data(test_df)
test_df = lagged_test_target_features(test_df, DATA, lags=[1])

In [57]:
# Make some lag features un-available
# (`.loc[:5]` slices by index label and includes the end label 5).
test_df.loc[:5, "target_lag_1"] = np.nan

In [58]:
test_df

Unnamed: 0,county,is_business,product_type,is_consumption,datetime,row_id,prediction_unit_id,currently_scored,datetime_lag_1,target_lag_1
0,0,0.0,1,0.0,2023-05-29 00:00:00,2008992,0,False,2023-05-28 23:00:00,
1,0,0.0,1,1.0,2023-05-29 00:00:00,2008993,0,False,2023-05-28 23:00:00,
2,0,0.0,2,0.0,2023-05-29 00:00:00,2008994,1,False,2023-05-28 23:00:00,
3,0,0.0,2,1.0,2023-05-29 00:00:00,2008995,1,False,2023-05-28 23:00:00,
4,0,0.0,3,0.0,2023-05-29 00:00:00,2008996,2,False,2023-05-28 23:00:00,
...,...,...,...,...,...,...,...,...,...,...
3115,15,1.0,0,1.0,2023-05-29 23:00:00,2012107,64,False,2023-05-29 22:00:00,212.626
3116,15,1.0,1,0.0,2023-05-29 23:00:00,2012108,59,False,2023-05-29 22:00:00,0.000
3117,15,1.0,1,1.0,2023-05-29 23:00:00,2012109,59,False,2023-05-29 22:00:00,33.624
3118,15,1.0,3,0.0,2023-05-29 23:00:00,2012110,60,False,2023-05-29 22:00:00,0.000


In [59]:
# Take the first collected test batch together with the prediction frame that
# was submitted for it, and normalise the batch's columns.
# NOTE(review): both lists are filled by the `iter_test` loop in a later cell.
previous_test_df = test_dfs[0]
previous_sample_prediction = sample_prediction_dfs[0]
previous_test_df = preprocess_test_data(previous_test_df)

In [61]:
# Attach the submitted predictions to the previous batch via `row_id`.
# NOTE(review): the cell rebinds its own input, so it is not idempotent -
# re-running it alone merges the prediction columns a second time.
previous_test_df = previous_test_df.merge(right=previous_sample_prediction, how="inner", on="row_id")

In [62]:
previous_test_df

Unnamed: 0,county,is_business,product_type,is_consumption,datetime,row_id,prediction_unit_id,currently_scored,target
0,0,0.0,1,0.0,2023-05-28 00:00:00,2005872,0,False,0.0
1,0,0.0,1,1.0,2023-05-28 00:00:00,2005873,0,False,0.0
2,0,0.0,2,0.0,2023-05-28 00:00:00,2005874,1,False,0.0
3,0,0.0,2,1.0,2023-05-28 00:00:00,2005875,1,False,0.0
4,0,0.0,3,0.0,2023-05-28 00:00:00,2005876,2,False,0.0
...,...,...,...,...,...,...,...,...,...
3115,15,1.0,0,1.0,2023-05-28 23:00:00,2008987,64,False,0.0
3116,15,1.0,1,0.0,2023-05-28 23:00:00,2008988,59,False,0.0
3117,15,1.0,1,1.0,2023-05-28 23:00:00,2008989,59,False,0.0
3118,15,1.0,3,0.0,2023-05-28 23:00:00,2008990,60,False,0.0


In [38]:
# NOTE(review): scratch cell. It rebinds the global `lags` that the training
# feature cell also assigns, and leaves `lagged_feature_dfs` empty, so the
# `lagged_feature_dfs[0]` lookup in the next cell cannot succeed on a fresh run.
lags = [1]
lagged_feature_dfs = []


        

In [39]:
pd.concat([test_df, lagged_feature_dfs[0]], axis=1)

Unnamed: 0,county,is_business,product_type,is_consumption,datetime,row_id,prediction_unit_id,currently_scored,datetime_lag_1,target_lag_1
0,0,0.0,1,0.0,2023-05-31 00:00:00,2015232,0,False,2023-05-30 23:00:00,3.105
1,0,0.0,1,1.0,2023-05-31 00:00:00,2015233,0,False,2023-05-30 23:00:00,559.781
2,0,0.0,2,0.0,2023-05-31 00:00:00,2015234,1,False,2023-05-30 23:00:00,0.000
3,0,0.0,2,1.0,2023-05-31 00:00:00,2015235,1,False,2023-05-30 23:00:00,4.874
4,0,0.0,3,0.0,2023-05-31 00:00:00,2015236,2,False,2023-05-30 23:00:00,34.055
...,...,...,...,...,...,...,...,...,...,...
3115,15,1.0,0,1.0,2023-05-31 23:00:00,2018347,64,False,2023-05-31 22:00:00,207.244
3116,15,1.0,1,0.0,2023-05-31 23:00:00,2018348,59,False,2023-05-31 22:00:00,0.000
3117,15,1.0,1,1.0,2023-05-31 23:00:00,2018349,59,False,2023-05-31 22:00:00,36.083
3118,15,1.0,3,0.0,2023-05-31 23:00:00,2018350,60,False,2023-05-31 22:00:00,0.001


In [33]:
DATA[
    (DATA["datetime"] == pd.Timestamp("2023-05-30 22:00:00"))
    & (DATA["is_consumption"] == 0.0)
    & (DATA["prediction_unit_id"] == 60)
]

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,month,...,weekday_cos_1,hour_sin_1,hour_cos_1,hour_sin_2,hour_cos_2,target_lag_1,target_lag_2,target_lag_7,target_lag_24,target_lag_168
2015100,15,1.0,3,0.0,0.0,2023-05-30 22:00:00,636,2015100,60,5.0,...,0.62349,-0.5,0.866025,-0.866025,0.5,6.438,38.308,541.976,0.0,0.0


In [7]:
# Keep every test batch and every submitted prediction frame for later inspection.
test_dfs = []
sample_prediction_dfs = []
# Each item of `iter_test` is unpacked as an 8-tuple; only the first element
# (test frame) and the last one (sample prediction frame) are used here.
for (test, _, _, _, _, _, _, sample_prediction) in iter_test:
    # Preprocessing and feature selection
#     X_test = preprocess_test_data(test)

#     X_test = temporal_features(X_test)
#     X_test = fourier_features(X_test)
    
#     # Predict on test set
#     y_hat = model.predict(X_test[FEATURES])
#     y_hat = np.maximum(y_hat, 0.0)
    # Placeholder submission: the model call above is disabled, so 0.0 is
    # submitted as the target for every row of the batch.
    sample_prediction['target'] = 0.0
    env.predict(sample_prediction)
    
    test_dfs.append(test)
    sample_prediction_dfs.append(sample_prediction)


This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [35]:
test_dfs[3]

Unnamed: 0,county,is_business,product_type,is_consumption,prediction_datetime,row_id,prediction_unit_id,currently_scored
0,0,False,1,False,2023-05-31 00:00:00,2015232,0,False
1,0,False,1,True,2023-05-31 00:00:00,2015233,0,False
2,0,False,2,False,2023-05-31 00:00:00,2015234,1,False
3,0,False,2,True,2023-05-31 00:00:00,2015235,1,False
4,0,False,3,False,2023-05-31 00:00:00,2015236,2,False
...,...,...,...,...,...,...,...,...
3115,15,True,0,True,2023-05-31 23:00:00,2018347,64,False
3116,15,True,1,False,2023-05-31 23:00:00,2018348,59,False
3117,15,True,1,True,2023-05-31 23:00:00,2018349,59,False
3118,15,True,3,False,2023-05-31 23:00:00,2018350,60,False


In [None]:
sample_prediction_dfs[0]

In [None]:
[
    (df["prediction_datetime"].min(), df["prediction_datetime"].max())
    for df in test_dfs
]

In [40]:
DATA["datetime"].max()

Timestamp('2023-05-31 23:00:00')