In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import lightgbm

In [None]:
# Directory holding the competition files.
DATA_PATH = "/kaggle/input/predict-energy-behavior-of-prosumers"

# Read the training table (parsing `datetime` while loading) and pin the dtypes
# of the id / flag columns in one chained expression.
DATA = pd.read_csv(
    f"{DATA_PATH}/train.csv",
    parse_dates=["datetime"],
).astype(
    dict(
        county=int,
        product_type=int,
        is_business=float,
        is_consumption=float,
    )
)

## Load and preprocess training data

In [None]:
# Feature engineering
def temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with calendar fields derived from the ``datetime`` column.

    Adds ``month``, ``day``, ``weekday``, ``hour`` and ``is_weekend``, all cast
    to float. ``is_weekend`` is 1.0 when the weekday index is at least 5 and
    0.0 otherwise. The input frame is not modified.
    """
    stamp = df["datetime"].dt
    day_of_week = stamp.weekday

    calendar = {
        "month": stamp.month.astype(float),
        "day": stamp.day.astype(float),
        "weekday": day_of_week.astype(float),
        "hour": stamp.hour.astype(float),
        "is_weekend": day_of_week.ge(5).astype(float),
    }
    return df.assign(**calendar)


def fourier_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with sin/cos encodings of the cyclical calendar columns.

    Expects the ``month``, ``weekday`` and ``hour`` columns to be present.
    For each of them, ``<column>_sin_<k>`` and ``<column>_cos_<k>`` are added
    for harmonics ``k = 1..max_freq`` (two harmonics for month and hour, one
    for weekday), using periods of 12, 7 and 24 respectively.

    The input frame is left untouched: the new columns are attached with
    ``DataFrame.assign``, matching how ``temporal_features`` behaves, instead
    of being written into the caller's object.
    """
    def harmonics(column: str, period: float, max_freq: int = 1) -> dict:
        # The base angle does not depend on the harmonic, so compute it once.
        angle = 2 * np.pi * np.array(df[column] / period)
        terms = {}
        for freq in range(1, max_freq + 1):
            terms[f"{column}_sin_{freq}"] = np.sin(angle * freq)
            terms[f"{column}_cos_{freq}"] = np.cos(angle * freq)
        return terms

    # Dict insertion order fixes the resulting column order:
    # month terms, then weekday terms, then hour terms.
    new_columns = {
        **harmonics("month", 12, 2),
        **harmonics("weekday", 7, 1),
        **harmonics("hour", 24, 2),
    }
    return df.assign(**new_columns)

In [None]:
# Temporal features (calendar fields derived from `datetime`)
DATA = temporal_features(DATA)

# Fourier features (sin/cos encodings of month, weekday and hour)
DATA = fourier_features(DATA)

# Drop irrelevant cols.
# errors="ignore" keeps this cell re-runnable: DATA is reassigned in place, so on
# a second execution these columns are already gone and a strict drop would
# raise KeyError. The first-run result is unchanged.
cols_to_drop = ["data_block_id", "row_id", "prediction_unit_id"]
DATA = DATA.drop(columns=cols_to_drop, errors="ignore")

# Drop missing target values
DATA = DATA.dropna(subset=["target"])

## Baseline models

In [None]:
# Feature selection: every column except the label and the raw timestamp is a model input
LABEL = "target"
CATEGORICAL_FEATURES = ["county", "product_type"]
FEATURES = [
    name
    for name in DATA.columns
    if name != "target" and name != "datetime"
]

print(DATA[FEATURES].shape, DATA[LABEL].shape)

In [None]:
# LightGBM training configuration; parameter reference:
# https://lightgbm.readthedocs.io/en/latest/Parameters.html
TRAINING_PARAMS = dict(
    objective="regression",
    metric="mae",
    boosting_type="gbdt",
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [None]:
# Wrap the training frame in a LightGBM Dataset: the FEATURES columns are the
# inputs, LABEL is the target, and the columns listed in CATEGORICAL_FEATURES
# are declared as categorical.
train_dataset = lightgbm.Dataset(
    data=DATA[FEATURES],
    label=DATA[LABEL],
    feature_name="auto",  # presumably takes names from the DataFrame columns — confirm in LightGBM docs
    categorical_feature=CATEGORICAL_FEATURES
)

In [None]:
# Fit the baseline booster on `train_dataset` for a fixed 250 boosting rounds.
# No validation set or early-stopping callback is passed here.
model = lightgbm.train(
    TRAINING_PARAMS, 
    train_dataset,
    num_boost_round=250,
)

In [None]:
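# # Train / val split (disabled). NOTE: the code below refers to `DATASET`, but the frame in this notebook is named `DATA`; rename before re-enabling.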
# start_datetime, stop_datetime = DATASET["datetime"].min(), DATASET["datetime"].max()
# total_days = int((stop_datetime - start_datetime).total_seconds() / 3600 / 24)

# n_folds = 10
# evaluation_days = 30
# max_training_days = 180

# start_evaluation_set = stop_datetime.floor("1D") - n_folds * pd.Timedelta(days=evaluation_days)
# start_training_set = start_evaluation_set - pd.Timedelta(days=max_training_days)

# scores, iterations = [], []
# for i in range(n_folds):
#     val_start = start_evaluation_set + i * pd.Timedelta(days=evaluation_days)
#     val_end = val_start + pd.Timedelta(days=evaluation_days)
#     train_start = val_end - pd.Timedelta(days=max_training_days)
    
#     # Select and transform training data
#     train = DATASET[(DATASET["datetime"] >= train_start) & (DATASET["datetime"] < val_start)]
#     X_train, y_train = train[FEATURES], train[LABEL]
#     train_lgbm = lightgbm.Dataset(data=X_train, label=y_train)
    
#     # Select and transform validation data
#     val = DATASET[(DATASET["datetime"] >= val_start) & (DATASET["datetime"] <= val_end)]
#     X_val, y_val = val[FEATURES], val[LABEL]
#     val_lgbm = lightgbm.Dataset(X_val, y_val)
    
#     model = lightgbm.train(
#         TRAINING_PARAMS, 
#         train_lgbm,
#         num_boost_round=1000,
#         valid_sets=[val_lgbm],
#         callbacks=[lightgbm.callback.early_stopping(10)]
#     )
#     y_hat = model.predict(X_val)
#     scores.append(np.mean(np.abs(y_hat - np.array(y_val))))
#     iterations.append(model.best_iteration)

# mean_score = np.mean(scores)
# mean_iterations = np.mean(iterations)

# print(mean_score)

## Final Model

In [None]:
# %%time

# # TODO: consider training on only the last few months instead of the full dataset.
# # TODO: check which timestamps predictions are actually required for.
# model = lightgbm.train(
#     TRAINING_PARAMS, 
#     lightgbm.Dataset(DATASET[FEATURES], DATASET[LABEL]),
#     num_boost_round=int(mean_iterations),
# )

## Submission

In [None]:
# NOTE(review): `enefit` is imported here rather than in the top import cell; it looks like
# the competition's submission API, likely only available in the Kaggle runtime — confirm.
import enefit

# Create the submission environment and the iterator that the prediction loop consumes.
env = enefit.make_env()
iter_test = env.iter_test()

In [None]:
def preprocess_test_data(df: pd.DataFrame) -> pd.DataFrame:
    """Bring a test batch into the column layout used for training.

    Casts ``county`` / ``product_type`` to int and ``is_business`` /
    ``is_consumption`` to float, renames ``prediction_datetime`` to
    ``datetime`` and parses it as a timestamp, then removes the ``row_id`` and
    ``prediction_unit_id`` columns. The input frame is not modified.
    """
    dtypes = {
        "county": int,
        "product_type": int,
        "is_business": float,
        "is_consumption": float,
    }
    return (
        df.astype(dtypes)
        .rename(columns={"prediction_datetime": "datetime"})
        .assign(datetime=lambda frame: pd.to_datetime(frame["datetime"]))
        .drop(["row_id", "prediction_unit_id"], axis=1)
    )

In [None]:
# Each item from `iter_test` is an 8-tuple; only the first element (the rows to
# predict) and the last (the submission template) are used here.
for (test, _, _, _, _, _, _, sample_prediction) in iter_test:
    # Same feature pipeline as for training: preprocessing, calendar fields, sin/cos terms
    X_test = (
        test
        .pipe(preprocess_test_data)
        .pipe(temporal_features)
        .pipe(fourier_features)
    )

    # Predict on the test batch and floor the predictions at zero
    y_hat = np.maximum(model.predict(X_test[FEATURES]), 0.0)

    # Fill in the template and hand it back to the environment
    sample_prediction["target"] = y_hat
    env.predict(sample_prediction)
