In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import lightgbm

# When True, the submission section resets the `enefit` environment state and
# re-creates `iter_test` (see the `if DEBUG:` cell) so the test loop can be re-run.
DEBUG = False

In [2]:
# Location of the competition files.
DATA_PATH = "/kaggle/input/predict-energy-behavior-of-prosumers"

# Read the training table (parsing `datetime`) and fix the column dtypes in one
# chained expression: the two identifier-like columns become ints and the two
# `is_*` columns become floats.
DATA = (
    pd.read_csv(f"{DATA_PATH}/train.csv", parse_dates=["datetime"])
    .astype(
        {
            "county": int,
            "product_type": int,
            "is_business": float,
            "is_consumption": float,
        }
    )
)

## Load and preprocess training data

In [3]:
# Feature engineering
def temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with calendar columns derived from ``df["datetime"]``.

    The added columns are ``month``, ``day``, ``weekday``, ``hour`` and
    ``is_weekend`` (1.0 when the weekday number is 5 or 6), all cast to float.
    The input frame is left untouched because ``DataFrame.assign`` returns a
    new object.
    """
    stamps = df["datetime"].dt
    day_of_week = stamps.weekday

    calendar_columns = {
        "month": stamps.month.astype(float),
        "day": stamps.day.astype(float),
        "weekday": day_of_week.astype(float),
        "hour": stamps.hour.astype(float),
        "is_weekend": (day_of_week >= 5).astype(float),
    }
    return df.assign(**calendar_columns)


def fourier_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with sine/cosine encodings of the cyclic calendar columns.

    For each of ``month`` (period 12, 2 harmonics), ``weekday`` (period 7,
    1 harmonic) and ``hour`` (period 24, 2 harmonics) the columns
    ``{column}_sin_{k}`` and ``{column}_cos_{k}`` are appended, where ``k`` is
    the harmonic number and the angle is ``2 * pi * k * value / period``.

    Args:
        df: Frame that already contains numeric ``month``, ``weekday`` and
            ``hour`` columns.

    Returns:
        A new DataFrame with the extra columns. The input frame is not
        modified, so re-running a cell on the same frame is safe.

    Raises:
        KeyError: If one of the required columns is missing.
    """

    def add_harmonics(
        frame: pd.DataFrame, column: str, period: float, max_freq: int = 1
    ) -> pd.DataFrame:
        # The base angle does not depend on the harmonic, so compute it once.
        base_angle = 2 * np.pi * np.array(frame[column] / period)

        harmonics = {}
        for freq in range(1, max_freq + 1):
            harmonics[f"{column}_sin_{freq}"] = np.sin(base_angle * freq)
            harmonics[f"{column}_cos_{freq}"] = np.cos(base_angle * freq)

        # `assign` builds a new frame instead of writing into the caller's one.
        return frame.assign(**harmonics)

    df = add_harmonics(df, "month", 12, 2)
    df = add_harmonics(df, "weekday", 7, 1)
    df = add_harmonics(df, "hour", 24, 2)
    return df


def lagged_target_features(df: pd.DataFrame, lags: list[int]) -> pd.DataFrame:
    """Append, for every lag, the target observed ``lag`` hours earlier.

    For each ``lag`` two columns are added: ``datetime_lag_{lag}`` (the
    timestamp that was looked up) and ``target_lag_{lag}`` (the ``target`` of
    the row with the same ``prediction_unit_id`` / ``is_consumption`` at that
    timestamp, or NaN when no such row exists).

    Args:
        df: Frame with ``datetime``, ``prediction_unit_id``,
            ``is_consumption`` and ``target`` columns.
        lags: Lags in hours.

    Returns:
        A new frame with the same rows and index as ``df`` plus the lag columns.
    """
    unit_keys = ["prediction_unit_id", "is_consumption"]
    history_keys = ["datetime", *unit_keys]

    # One history row per key: duplicated keys would multiply rows in the left
    # join and break the one-to-one row correspondence with `df`.
    history = df[[*history_keys, "target"]].drop_duplicates(
        subset=history_keys, keep="last"
    )

    lagged_feature_dfs = []
    for lag in lags:
        lag_column = f"datetime_lag_{lag}"
        lagged_df = pd.DataFrame(
            {
                lag_column: df["datetime"] - pd.Timedelta(hours=lag),
                "prediction_unit_id": df["prediction_unit_id"],
                "is_consumption": df["is_consumption"],
            }
        )
        lagged_features_df = pd.merge(
            lagged_df,
            history,
            how="left",
            left_on=[lag_column, *unit_keys],
            right_on=history_keys,
        )
        lagged_features_df = (
            lagged_features_df[[lag_column, "target"]]
            .rename(columns={"target": f"target_lag_{lag}"})
        )
        # `merge` returns a fresh RangeIndex while `concat(axis=1)` aligns on
        # the index; restore the caller's index so rows line up even when `df`
        # does not have a default index (e.g. after `dropna`).
        lagged_features_df.index = df.index
        lagged_feature_dfs.append(lagged_features_df)

    return pd.concat([df] + lagged_feature_dfs, axis=1)
        

In [4]:
# Add the calendar columns, then their Fourier encodings, and finally discard
# rows that have no target value.
#
# Lagged-target features are currently disabled. To enable them, run the
# following on DATA before the `dropna` step (lags are expressed in hours):
#     LAGS = [days * 24 for days in range(2, 8)]
#     DATA = lagged_target_features(DATA, LAGS)
#     DATA = DATA.drop([f"datetime_lag_{lag}" for lag in LAGS], axis=1)
DATA = (
    DATA
    .pipe(temporal_features)
    .pipe(fourier_features)
    .dropna(subset=["target"])
)

## Baseline models

In [5]:
# Feature selection: keep every column except the label and the
# bookkeeping/identifier columns listed in EXCLUDE_FEATURES.
LABEL = "target"
EXCLUDE_FEATURES = ["datetime", "data_block_id", "prediction_unit_id", "row_id"]
CATEGORICAL_FEATURES = ["county", "product_type"]

# Boolean mask over the column index; column order is preserved.
FEATURES = DATA.columns[~DATA.columns.isin([LABEL, *EXCLUDE_FEATURES])].tolist()

print(DATA[FEATURES].shape, DATA[LABEL].shape)

(2017824, 19) (2017824,)


In [6]:
# Display the final list of model input columns, in training order.
FEATURES

['county',
 'is_business',
 'product_type',
 'is_consumption',
 'month',
 'day',
 'weekday',
 'hour',
 'is_weekend',
 'month_sin_1',
 'month_cos_1',
 'month_sin_2',
 'month_cos_2',
 'weekday_sin_1',
 'weekday_cos_1',
 'hour_sin_1',
 'hour_cos_1',
 'hour_sin_2',
 'hour_cos_2']

In [7]:
# LightGBM training configuration; parameter reference:
# https://lightgbm.readthedocs.io/en/latest/Parameters.html
# NOTE(review): the objective is plain "regression" while the reported metric
# is "mae" -- confirm that optimising one and reporting the other is intended.
TRAINING_PARAMS = dict(
    objective="regression",
    metric="mae",
    boosting_type="gbdt",
    num_leaves=31,
    learning_rate=0.05,
    # Feature and row subsampling; row bagging is re-drawn every `bagging_freq`
    # iterations.
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [8]:
# Wrap the whole training table for LightGBM. `county` and `product_type`
# (CATEGORICAL_FEATURES) are declared categorical; feature names are taken
# from the DataFrame columns ("auto").
train_dataset = lightgbm.Dataset(
    DATA[FEATURES],
    label=DATA[LABEL],
    categorical_feature=CATEGORICAL_FEATURES,
    feature_name="auto",
)

In [9]:
# Fit the baseline on the full training set for a fixed number of boosting
# rounds; no validation set is passed, so there is no early stopping here.
model = lightgbm.train(
    params=TRAINING_PARAMS,
    train_set=train_dataset,
    num_boost_round=200,
)



You can set `force_col_wise=true` to remove the overhead.


In [10]:
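# # Train / val split (disabled).
# # NOTE: this cell refers to `DATASET`, which is not defined in the cells shown here;
# # the training frame in this notebook is named `DATA`. Rename before re-enabling.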
# start_datetime, stop_datetime = DATASET["datetime"].min(), DATASET["datetime"].max()
# total_days = int((stop_datetime - start_datetime).total_seconds() / 3600 / 24)

# n_folds = 10
# evaluation_days = 30
# max_training_days = 180

# start_evaluation_set = stop_datetime.floor("1D") - n_folds * pd.Timedelta(days=evaluation_days)
# start_training_set = start_evaluation_set - pd.Timedelta(days=max_training_days)

# scores, iterations = [], []
# for i in range(n_folds):
#     val_start = start_evaluation_set + i * pd.Timedelta(days=evaluation_days)
#     val_end = val_start + pd.Timedelta(days=evaluation_days)
#     train_start = val_end - pd.Timedelta(days=max_training_days)
    
#     # Select and transform training data
#     train = DATASET[(DATASET["datetime"] >= train_start) & (DATASET["datetime"] < val_start)]
#     X_train, y_train = train[FEATURES], train[LABEL]
#     train_lgbm = lightgbm.Dataset(data=X_train, label=y_train)
    
#     # Select and transform validation data
#     val = DATASET[(DATASET["datetime"] >= val_start) & (DATASET["datetime"] <= val_end)]
#     X_val, y_val = val[FEATURES], val[LABEL]
#     val_lgbm = lightgbm.Dataset(X_val, y_val)
    
#     model = lightgbm.train(
#         TRAINING_PARAMS, 
#         train_lgbm,
#         num_boost_round=1000,
#         valid_sets=[val_lgbm],
#         callbacks=[lightgbm.callback.early_stopping(10)]
#     )
#     y_hat = model.predict(X_val)
#     scores.append(np.mean(np.abs(y_hat - np.array(y_val))))
#     iterations.append(model.best_iteration)

# mean_score = np.mean(scores)
# mean_iterations = np.mean(iterations)

# print(mean_score)

## Final Model

In [11]:
# %%time

# # TODO: consider training only on the last few months instead of the full dataset.
# # TODO: check which timestamps predictions actually have to be made for.
# model = lightgbm.train(
#     TRAINING_PARAMS, 
#     lightgbm.Dataset(DATASET[FEATURES], DATASET[LABEL]),
#     num_boost_round=int(mean_iterations),
# )

## Submission

In [12]:
import enefit

# Create the competition environment and the iterator consumed by the
# submission loop below; each item it yields is unpacked there into a test
# frame, the revealed targets, five unused entries and a sample prediction.
env = enefit.make_env()
iter_test = env.iter_test()

In [13]:
# Debug-only: reset the environment so the test iterator can be created again
# in the same kernel session.
# NOTE(review): this reaches into private attributes of the `enefit` module
# (`__called__`, `_state`, and the `INIT` member of the state type) --
# presumably the API only allows one pass per session; confirm against the
# competition API before relying on it.
if DEBUG:
    enefit.make_env.__called__ = False
    type(env)._state = type(type(env)._state).__dict__['INIT']
    iter_test = env.iter_test()

In [14]:
def preprocess_test_data(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise a frame from the test iterator to the training schema.

    Casts ``county`` / ``product_type`` to int and ``is_business`` /
    ``is_consumption`` to float, renames ``prediction_datetime`` to
    ``datetime`` when that column is present, and parses ``datetime`` into
    pandas timestamps. A new frame is returned; the input is not modified.
    """
    column_dtypes = {
        "county": int,
        "product_type": int,
        "is_business": float,
        "is_consumption": float,
    }
    return (
        df.astype(column_dtypes)
        .rename(columns={"prediction_datetime": "datetime"})
        .assign(datetime=lambda frame: pd.to_datetime(frame["datetime"]))
    )


def lagged_test_target_features(
    test_df: pd.DataFrame, train_df: pd.DataFrame, lags: list[int]
) -> pd.DataFrame:
    """Append lagged targets to ``test_df`` by looking them up in ``train_df``.

    For each ``lag`` two columns are added to the result:
    ``datetime_lag_{lag}`` (``test_df["datetime"]`` minus ``lag`` hours) and
    ``target_lag_{lag}`` (the ``target`` found in ``train_df`` for the same
    ``prediction_unit_id`` / ``is_consumption`` at that timestamp, NaN if
    there is none).

    Args:
        test_df: Frame with ``datetime``, ``prediction_unit_id`` and
            ``is_consumption`` columns.
        train_df: History frame with ``datetime``, ``prediction_unit_id``,
            ``is_consumption`` and ``target`` columns.
        lags: Lags in hours.

    Returns:
        A new frame with the same rows and index as ``test_df`` plus the lag
        columns.
    """
    unit_keys = ["prediction_unit_id", "is_consumption"]
    history_keys = ["datetime", *unit_keys]

    # Keep a single (most recently appended) row per key so the left join
    # below cannot return more rows than `test_df` has.
    history = train_df[[*history_keys, "target"]].drop_duplicates(
        subset=history_keys, keep="last"
    )

    lagged_feature_dfs = []
    for lag in lags:
        lag_column = f"datetime_lag_{lag}"
        lagged_df = pd.DataFrame(
            {
                lag_column: test_df["datetime"] - pd.Timedelta(hours=lag),
                "prediction_unit_id": test_df["prediction_unit_id"],
                "is_consumption": test_df["is_consumption"],
            }
        )
        lagged_features_df = pd.merge(
            left=lagged_df,
            right=history,
            how="left",
            left_on=[lag_column, *unit_keys],
            right_on=history_keys,
        )
        lagged_features_df = (
            lagged_features_df[[lag_column, "target"]]
            .rename(columns={"target": f"target_lag_{lag}"})
        )
        # `merge` resets the index but `concat(axis=1)` aligns on it; restore
        # the index of `test_df` so every row keeps its own lag values.
        lagged_features_df.index = test_df.index
        lagged_feature_dfs.append(lagged_features_df)

    return pd.concat([test_df] + lagged_feature_dfs, axis=1)


def impute_lagged_target_features(features_df: pd.DataFrame, running_predictions: pd.DataFrame, lags: list[int]) -> pd.DataFrame:
    """Fill missing lagged targets with earlier model predictions.

    For each ``lag``, rows whose ``target_lag_{lag}`` is null receive the
    ``prediction_target`` found in ``running_predictions`` for the same
    ``prediction_unit_id`` / ``is_consumption`` at ``datetime_lag_{lag}``.
    Values that are already present are kept; rows without a matching
    prediction stay null.

    Args:
        features_df: Frame holding ``datetime_lag_{lag}``,
            ``target_lag_{lag}``, ``prediction_unit_id`` and
            ``is_consumption`` for every requested lag.
        running_predictions: Frame with ``prediction_datetime``,
            ``prediction_target``, ``prediction_unit_id`` and
            ``is_consumption`` columns.
        lags: Lags in hours.

    Returns:
        ``features_df`` itself; the ``target_lag_{lag}`` columns are updated
        in place, as in the original implementation.
    """
    unit_keys = ["prediction_unit_id", "is_consumption"]
    prediction_keys = ["prediction_datetime", *unit_keys]

    # One prediction per key (the most recently appended one) so the left join
    # returns exactly one row per feature row.
    predictions = running_predictions[[*prediction_keys, "prediction_target"]].drop_duplicates(
        subset=prediction_keys, keep="last"
    )

    for lag in lags:
        lag_column = f"datetime_lag_{lag}"
        target_column = f"target_lag_{lag}"

        # Merge current features (with missing values) and previous predictions
        impute_df = pd.merge(
            features_df[[lag_column, target_column, *unit_keys]],
            predictions,
            left_on=[lag_column, *unit_keys],
            right_on=prediction_keys,
            how="left",
        )

        # Where features are missing, impute with predictions
        imputed = impute_df[target_column].fillna(impute_df["prediction_target"])

        # Assign positionally: `impute_df` carries a fresh RangeIndex, so an
        # index-aligned assignment would scramble or blank the values whenever
        # `features_df` has a different index.
        features_df[target_column] = imputed.to_numpy()

    return features_df


In [15]:
# Per-iteration artefacts collected for inspection after the loop; they are
# not read inside the loop itself.
raw_test_dfs = []
X_test_dfs = []
revealed_target_dfs = []
sample_prediction_dfs = []

# Add to running df of targets as they are revealed
# NOTE(review): only consumed by the lag-feature call that is commented out
# below; until that is re-enabled this frame is built but never read here.
running_revealed_targets = pd.DataFrame()

# Each item yielded by `iter_test` is unpacked into eight values; only the
# first (test rows), second (revealed targets) and last (sample prediction
# frame) are used, the five in between are ignored.
for (
    raw_test_df,
    raw_revealed_targets,
    _, 
    _, 
    _, 
    _, 
    _, 
    sample_prediction,
) in iter_test:

    # Preprocessing and feature selection
    revealed_targets = preprocess_test_data(raw_revealed_targets)
    running_revealed_targets = pd.concat([running_revealed_targets, revealed_targets], axis=0)
    running_revealed_targets = running_revealed_targets.reset_index(drop=True)

    # Same feature pipeline as applied to the training frame.
    X_test = preprocess_test_data(raw_test_df)
    X_test = temporal_features(X_test)
    X_test = fourier_features(X_test)
#     X_test = lagged_test_target_features(X_test, running_revealed_targets, LAGS)
    
    # Predict on test set
    y_hat = model.predict(X_test[FEATURES])
    # Clamp predictions to [0, 15500]; the rationale for the upper bound is
    # not shown in this notebook -- TODO confirm where 15500 comes from.
    y_hat = np.clip(y_hat, 0.0, 15500.0)
    sample_prediction['target'] = y_hat
    
    # Replace any missing predictions with 0.0 before submitting.
    sample_prediction["target"] = sample_prediction["target"].fillna(0.0)
    
    # Hand the filled-in sample frame back to the competition environment.
    env.predict(sample_prediction)
    
    raw_test_dfs.append(raw_test_df)
    X_test_dfs.append(X_test)
    sample_prediction_dfs.append(sample_prediction)
    revealed_target_dfs.append(revealed_targets)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
