Baseline taken [Enefit Generic Notebook](https://www.kaggle.com/code/greysky/enefit-generic-notebook) 

In [1]:
import os
import gc
import pickle

import numpy as np
import pandas as pd
import polars as pl
import plotly.express as px
import holidays
import ephem
import pytz
from datetime import datetime

from sklearn.ensemble import VotingRegressor, StackingRegressor, BaggingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit, cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error

import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
import joblib

import warnings
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)



# Import Data

In [2]:
root = "/kaggle/input/predict-energy-behavior-of-prosumers"

data_cols = ['target', 'county', 'is_business', 'product_type', 'is_consumption', 'datetime', 'row_id']
client_cols = ['product_type', 'county', 'eic_count', 'installed_capacity', 'is_business', 'date']
gas_prices_cols = ['forecast_date', 'lowest_price_per_mwh', 'highest_price_per_mwh']
electricity_prices_cols = ['forecast_date', 'euros_per_mwh']
forecast_weather_cols = ['latitude', 'longitude', 'hours_ahead', 'temperature', 'dewpoint', 'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total', '10_metre_u_wind_component', '10_metre_v_wind_component', 'forecast_datetime', 'direct_solar_radiation', 'surface_solar_radiation_downwards', 'snowfall', 'total_precipitation']
historical_weather_cols = ['datetime', 'temperature', 'dewpoint', 'rain', 'snowfall', 'surface_pressure','cloudcover_total','cloudcover_low','cloudcover_mid','cloudcover_high','windspeed_10m','winddirection_10m','shortwave_radiation','direct_solar_radiation','diffuse_radiation','latitude','longitude']
location_cols = ['longitude', 'latitude', 'county']
target_cols = ['target', 'county', 'is_business', 'product_type', 'is_consumption', 'datetime']

In [3]:
df_data = pl.read_csv(os.path.join(root, "train.csv"), columns=data_cols, try_parse_dates=True)
df_client = pl.read_csv(os.path.join(root, "client.csv"), columns=client_cols, try_parse_dates=True)
df_gas_prices = pl.read_csv(os.path.join(root, "gas_prices.csv"), columns=gas_prices_cols, try_parse_dates=True)
df_electricity_prices = pl.read_csv(os.path.join(root, "electricity_prices.csv"), columns=electricity_prices_cols, try_parse_dates=True)
df_forecast_weather = pl.read_csv(os.path.join(root, "forecast_weather.csv"), columns=forecast_weather_cols, try_parse_dates=True)
df_historical_weather = pl.read_csv(os.path.join(root, "historical_weather.csv"), columns=historical_weather_cols, try_parse_dates=True)
df_weather_station_to_county_mapping = pl.read_csv(os.path.join(root, "weather_station_to_county_mapping.csv"), columns=location_cols, try_parse_dates=True)
df_target = df_data.select(target_cols)

schema_data = df_data.schema
schema_client = df_client.schema
schema_gas  = df_gas_prices.schema
schema_electricity = df_electricity_prices.schema
schema_forecast = df_forecast_weather.schema
schema_historical = df_historical_weather.schema
schema_target = df_target.schema

In [4]:
estonian_holidays = holidays.country_holidays('EE', years=range(2020, 2026))
estonian_holidays = list(estonian_holidays.keys())
df_estonian_holidays = pd.DataFrame(estonian_holidays, columns=['date'])

# Utilities
Functions to be used as aux

In [5]:
def extend_columns_fill(columns_fill, df):
    
    columns_substrings = ['temperature', 'dewpoint', 'cloudcover_high', 'cloudcover_low', 'cloudcover_mid',
                        'cloudcover_total', '10_metre_u_wind_component', '10_metre_v_wind_component', 'direct_solar_radiation',
                        'surface_solar_radiation_downwards', 'snowfall', 'total_precipitation_', 'temperature',
                         'surface_pressure', 'windspeed', 'winddirection', 'shortwave_radiation', 'diffuse_radiation', 
                         'rain', 'surface_pressure', 'windspeed']
    
    all_columns = df.columns.tolist()
    selected_columns = [name for name in all_columns if any(substring in name for substring in columns_substrings)]
    columns_fill.extend(selected_columns)
    
    return columns_fill

In [6]:
def create_sun_hours_df():
    start_date = datetime(2020, 1, 1)
    end_date = datetime(2026, 12, 31)
    date_range = pd.date_range(start_date, end_date, freq='D')
    df_sun_hours = pd.DataFrame({'date': date_range})
    df_sun_hours['date'] = pd.to_datetime(df_sun_hours['date'])
    
    df_sun_hours['year'] = df_sun_hours['date'].dt.year
    df_sun_hours['month'] = df_sun_hours['date'].dt.month
    df_sun_hours['day'] = df_sun_hours['date'].dt.day

    df_sun_hours['sunrise'], df_sun_hours['sunset'] = zip(*df_sun_hours['date'].apply(calculate_sunrise_sunset))

    df_sun_hours.drop(columns = ['date'], inplace = True)

    return df_sun_hours

In [7]:
def calculate_sunrise_sunset(date):
    observer = ephem.Observer()
    observer.lat = '59.4370'  # Latitude for Tallinn, Estonia
    observer.lon = '24.7536'  # Longitude for Tallinn, Estonia
    observer.date = date.strftime('%Y-%m-%d')
    
    # get sunrise and set based on the location and date
    sunrise = observer.next_rising(ephem.Sun()).datetime()
    sunset = observer.next_setting(ephem.Sun()).datetime()
    
    # converts UTC to EET/EEST
    tallinn_timezone = pytz.timezone('Europe/Tallinn')
    sunrise_local = sunrise.replace(tzinfo=pytz.utc).astimezone(tallinn_timezone).replace(tzinfo=None)
    sunset_local = sunset.replace(tzinfo=pytz.utc).astimezone(tallinn_timezone).replace(tzinfo=None)
    
    return sunrise_local, sunset_local

# Features Engineering

In [8]:
def create_moving_avg_target(df_target, windows_ma_days = [15,30]):
    windows_ma_target = [x * 24 for x in windows_ma_days]
    
    df_target_ma = df_target.clone()

    for window in windows_ma_target:
        df_target_ma = df_target_ma.with_columns(
            rolling_mean=pl.col("target").rolling_mean(window_size=window).over(['county', 'is_business', 'product_type', 'is_consumption']),
        )
    
        df_target_ma = df_target_ma.rename({"rolling_mean": f'rolling_mean_{int(window/24)}'})
        
    return df_target_ma

In [9]:
def generate_features(
        df_data, 
        df_client, 
        df_gas_prices, 
        df_electricity_prices, 
        df_forecast_weather, 
        df_historical_weather, 
        df_weather_station_to_county_mapping, 
        df_target,
        df_target_ma
):
    df_data = (
        df_data
        .with_columns(
            pl.col("datetime").cast(pl.Date).alias("date"),
        )
    )
    
    df_gas_prices = (
        df_gas_prices
        .rename({"forecast_date": "date"})
    )
    
    df_electricity_prices = (
        df_electricity_prices
        .rename({"forecast_date": "datetime"})
    )
    
    df_weather_station_to_county_mapping = (
        df_weather_station_to_county_mapping
        .with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32)
        )
    )
    
    # sum of all product_type targets related to ["datetime", "county", "is_business", "is_consumption"]
    df_target_all_type_sum = (
        df_target
        .group_by(["datetime", "county", "is_business", "is_consumption"]).sum()
        .drop("product_type")
    )
    
    df_forecast_weather = (
        df_forecast_weather
        .rename({"forecast_datetime": "datetime"})
        .filter(pl.col("hours_ahead") >= 24) # we don't need forecast for today
        .with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32),
            # datetime for forecast in a different timezone
            pl.col('datetime').dt.replace_time_zone(None).cast(pl.Datetime("us"))
        )
        .join(df_weather_station_to_county_mapping, how="left", on=["longitude", "latitude"])
        .drop("longitude", "latitude")
    )
    
    df_historical_weather = (
        df_historical_weather
        .with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32),
#            pl.col("datetime") + pl.duration(hours=37)
        )
        .join(df_weather_station_to_county_mapping, how="left", on=["longitude", "latitude"])
        .drop("longitude", "latitude")
    )
    
    # creating average forecast characteristics for all weather stations
    df_forecast_weather_date = (
        df_forecast_weather
        .group_by("datetime").mean()
        .drop("county")
    )
    
    # creating average forecast characteristics for weather stations related to county
    df_forecast_weather_local = (
        df_forecast_weather
        .filter(pl.col("county").is_not_null())
        .group_by("county", "datetime").mean()
    )
    
    # creating average historical characteristics for all weather stations
    df_historical_weather_date = (
        df_historical_weather
        .group_by("datetime").mean()
        .drop("county")
    )
    
    # creating average historical characteristics for weather stations related to county
    df_historical_weather_local = (
        df_historical_weather
        .filter(pl.col("county").is_not_null())
        .group_by("county", "datetime").mean()
    )
    
    df_data = (
        df_data
        # pl.duration(days=1) shifts datetime to join lag features (usually we join last available values)
        .join(df_gas_prices.with_columns((pl.col("date") + pl.duration(days=1)).cast(pl.Date)), on="date", how="left")
        .join(df_client.with_columns((pl.col("date") + pl.duration(days=2)).cast(pl.Date)), on=["county", "is_business", "product_type", "date"], how="left")
        .join(df_electricity_prices.with_columns(pl.col("datetime") + pl.duration(days=1)), on="datetime", how="left")
        
        # lag forecast_weather features (24 hours * days)
        .join(df_forecast_weather_date, on="datetime", how="left", suffix="_fd")
        .join(df_forecast_weather_local, on=["county", "datetime"], how="left", suffix="_fl")
        .join(df_forecast_weather_date.with_columns(pl.col("datetime") + pl.duration(days=7)), on="datetime", how="left", suffix="_fd_7d")
        .join(df_forecast_weather_local.with_columns(pl.col("datetime") + pl.duration(days=7)), on=["county", "datetime"], how="left", suffix="_fl_7d")

        # lag historical_weather features (24 hours * days)
        .join(df_historical_weather_date.with_columns(pl.col("datetime") + pl.duration(days=2)), on="datetime", how="left", suffix="_hd_2d")
        .join(df_historical_weather_local.with_columns(pl.col("datetime") + pl.duration(days=2)), on=["county", "datetime"], how="left", suffix="_hl_2d")
        .join(df_historical_weather_date.with_columns(pl.col("datetime") + pl.duration(days=7)), on="datetime", how="left", suffix="_hd_7d")
        .join(df_historical_weather_local.with_columns(pl.col("datetime") + pl.duration(days=7)), on=["county", "datetime"], how="left", suffix="_hl_7d")
        
        # lag target features (24 hours * days)
        .join(df_target_ma.with_columns(pl.col("datetime") + pl.duration(days=2)).rename({"target": "target_2"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=3)).rename({"target": "target_3"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=4)).rename({"target": "target_4"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=5)).rename({"target": "target_5"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=6)).rename({"target": "target_6"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=7)).rename({"target": "target_7"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=8)).rename({"target": "target_8"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=9)).rename({"target": "target_9"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=10)).rename({"target": "target_10"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=11)).rename({"target": "target_11"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=12)).rename({"target": "target_12"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=13)).rename({"target": "target_13"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=14)).rename({"target": "target_14"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        
        .join(df_target_all_type_sum.with_columns(pl.col("datetime") + pl.duration(days=2)).rename({"target": "target_1"}), on=["county", "is_business", "is_consumption", "datetime"], suffix="_all_type_sum", how="left")
        .join(df_target_all_type_sum.with_columns(pl.col("datetime") + pl.duration(days=3)).rename({"target": "target_2"}), on=["county", "is_business", "is_consumption", "datetime"], suffix="_all_type_sum", how="left")
        .join(df_target_all_type_sum.with_columns(pl.col("datetime") + pl.duration(days=7)).rename({"target": "target_6"}), on=["county", "is_business", "is_consumption", "datetime"], suffix="_all_type_sum", how="left")
        .join(df_target_all_type_sum.with_columns(pl.col("datetime") + pl.duration(days=14)).rename({"target": "target_7"}), on=["county", "is_business", "is_consumption", "datetime"], suffix="_all_type_sum", how="left")
        
        
        .with_columns(
            pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
            pl.col("datetime").dt.hour().alias("hour"),
            pl.col("datetime").dt.day().alias("day"),
            pl.col("datetime").dt.weekday().alias("weekday"),
            pl.col("datetime").dt.month().alias("month"),
            pl.col("datetime").dt.year().alias("year"),
        )
        
        .with_columns(
            pl.concat_str("county", "is_business", "product_type", "is_consumption", separator="_").alias("segment"),
        )
        
        # cyclical features encoding https://towardsdatascience.com/cyclical-features-encoding-its-about-time-ce23581845ca
        .with_columns(
            (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
            (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
            (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
            (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
        )
        
        .with_columns(
            pl.col(pl.Float64).cast(pl.Float32),
        )
        
        .drop("date", "hour", "dayofyear")
    )
    
    return df_data

In [10]:
def to_pandas(X, y = None):
    cat_cols = ["county", "is_business", "product_type", "is_consumption", "segment"]
    
    if y is not None:
        df = pd.concat([X.to_pandas(), y.to_pandas()], axis=1)
    else:
        df = X.to_pandas()    
    
    df = df.set_index("row_id")
    df[cat_cols] = df[cat_cols].astype("category")
    
    df["target_mean_1"] = df[[f"target_{i}" for i in range(2,8)]].mean(1)
    df["target_mean_2"] = df[[f"target_{i}" for i in range(2,15)]].mean(1)
    df["target_std_1"] = df[[f"target_{i}" for i in range(2, 8)]].std(1)
    df["target_std_2"] = df[[f"target_{i}" for i in range(2, 15)]].std(1)
    df["target_ratio_1"] = df["target_2"] / (df["target_7"] + 1e-3)
    df["target_ratio_2"] = df["target_7"] / (df["target_14"] + 1e-3)
    
    return df

In [11]:
def deal_null(df):
    # these will be filled be the mean of the county
    columns_fill = ['eic_count', 'installed_capacity', 'target_2','target_3','target_4','target_5', 'target_6','target_7','target_8',
                    'target_9','target_10','target_11','target_12','target_13','target_14','target_mean_1','target_std_1','target_mean_2','target_std_2',
                    'target_ratio_1','target_ratio_2',  'lowest_price_per_mwh', 'highest_price_per_mwh', 'euros_per_mwh']

    for column in columns_fill:
        df[column] = df.groupby(['county', 'is_business', 'product_type', 'is_consumption'])[column].fillna(method='ffill')
        df[column] = df.groupby(['county', 'is_business', 'product_type', 'is_consumption'])[column].fillna(method='bfill')
    
    
    # these will be filled be the mean of the datetime
    columns_fill = ['euros_per_mwh']
    columns_fill = extend_columns_fill(columns_fill, df)

    mean_values = df.groupby('datetime')[columns_fill].transform('mean')
    df = pd.concat([df, mean_values.add_suffix('_mean')], axis=1)
    
    for column in columns_fill:
        df[column] = df[column].fillna(df[column + '_mean'])
        
    df.drop(columns=[col + '_mean' for col in columns_fill], inplace=True, axis=1)
    
    #ffill with the ones still null
    for column in columns_fill:
        df[column] = df.groupby(['county', 'is_business', 'product_type', 'is_consumption'])[column].fillna(method='ffill')
        
    df.drop(columns = ['hours_ahead_fl', 'hours_ahead_fl_7d', 'hours_ahead_fd_7d'], inplace = True)
    
    return df

In [12]:
def get_holiday_day_night(df):
    df['datetime'] = pd.to_datetime(df['datetime'])
    df.loc[:,'date'] = df['datetime'].dt.date
    
    #holiday
    df = df.merge(df_estonian_holidays, on = ['date'], how = 'left', indicator = 'is_holiday')
    df.replace({'left_only': 0, 'both': 1, 'right_only': None}, inplace = True)
    
    # day/night
    df_sun_hours = create_sun_hours_df()
    df = df.merge(df_sun_hours, on = ['year', 'month', 'day'], how = 'left')
    df.loc[:,'is_night'] = np.where((df['datetime'] >= df['sunrise']) 
                            & (df['datetime'] <= df['sunset']), 0, 1)
    
    df.drop(columns = ['date', 'sunrise', 'sunset'], inplace = True)
    
    return df

In [13]:
def convert_dtypes(df):
    columns_change = ['county', 'is_business', 'product_type', 'is_consumption', 'is_holiday']
    
    for column in columns_change:
        df[column] = df[column].astype('int32')
        
    return df

In [14]:
def create_moving_avg_weather(df, window_sizes_avg = [8], window_sizes_std = [5]):
    columns_groupby = ['county', 'is_business', 'product_type', 'is_consumption']
    columns_ma = ['temperature', 'dewpoint','cloudcover_high','cloudcover_low','cloudcover_mid','cloudcover_total',
              '10_metre_u_wind_component','10_metre_v_wind_component','direct_solar_radiation',
              'surface_solar_radiation_downwards','snowfall','total_precipitation']
    
    for window_size in window_sizes_avg:
        for column in columns_ma:
            df[column + '_MA_' + str(window_size) + 'hours'] = df.groupby(columns_groupby)[column].transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())
            
    for window_size in window_sizes_std:
        for column in columns_ma:
            df[column + '_STD_' + str(window_size) + 'hours'] = df.groupby(columns_groupby)[column].transform(lambda x: x.rolling(window=window_size, min_periods=1).std())
        
    return df

In [15]:
df_data, y = df_data.drop("target"), df_data.select("target")

windows_ma_days = [15,30]
df_target_ma = create_moving_avg_target(df_target, windows_ma_days)

df_train_features = generate_features(
    df_data, 
    df_client, 
    df_gas_prices, 
    df_electricity_prices, 
    df_forecast_weather, 
    df_historical_weather, 
    df_weather_station_to_county_mapping, 
    df_target,
    df_target_ma
)

df_train_features = to_pandas(df_train_features, y)

# a little proportion of target values are null
df_train_features = df_train_features[df_train_features['target'].notnull()]

# filter old data
df_train_features = df_train_features[df_train_features.year >= 2022]

# deal with null values
df_train_features = deal_null(df_train_features)

# get holidays and day/night features
df_train_features = get_holiday_day_night(df_train_features)

window_sizes_avg = [8, 12]
window_sizes_std = [5]
df_train_features = create_moving_avg_weather(df_train_features, window_sizes_avg, window_sizes_std)

df_train_features.drop(columns = ['datetime', 'segment'], inplace = True)

df_train_features = convert_dtypes(df_train_features)

  df[column] = df.groupby(['county', 'is_business', 'product_type', 'is_consumption'])[column].fillna(method='ffill')
  df[column] = df.groupby(['county', 'is_business', 'product_type', 'is_consumption'])[column].fillna(method='ffill')
  df[column] = df.groupby(['county', 'is_business', 'product_type', 'is_consumption'])[column].fillna(method='bfill')
  df[column] = df.groupby(['county', 'is_business', 'product_type', 'is_consumption'])[column].fillna(method='bfill')
  df[column] = df.groupby(['county', 'is_business', 'product_type', 'is_consumption'])[column].fillna(method='ffill')
  df[column] = df.groupby(['county', 'is_business', 'product_type', 'is_consumption'])[column].fillna(method='ffill')
  df.loc[:,'date'] = df['datetime'].dt.date
  df[column + '_MA_' + str(window_size) + 'hours'] = df.groupby(columns_groupby)[column].transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())
  df[column + '_MA_' + str(window_size) + 'hours'] = df.groupby(columns_groupby)[colum

# Model

In [16]:
class MonthlyKFold:
    def __init__(self, n_splits=3):
        self.n_splits = n_splits
        
    def split(self, X, y, groups=None):
        dates = 12 * X["year"] + X["month"]
        timesteps = sorted(dates.unique().tolist())
        X = X.reset_index()
        
        for t in timesteps[-self.n_splits:]:
            idx_train = X[dates.values < t].index
            idx_test = X[dates.values == t].index
            
            yield idx_train, idx_test
            
    def get_n_splits(self, X, y, groups=None):
        return self.n_splits

In [17]:
features = df_train_features.columns.tolist()
features.remove('target')
features.remove('is_consumption')

In [18]:
X_0 = df_train_features[df_train_features['is_consumption'] == 0][features]
y_0 = df_train_features[df_train_features['is_consumption'] == 0]['target']
X_1 = df_train_features[df_train_features['is_consumption'] == 1][features]
y_1 = df_train_features[df_train_features['is_consumption'] == 1]['target']

y_diff_0 = df_train_features[df_train_features['is_consumption'] == 0]['target'] - df_train_features[df_train_features['is_consumption'] == 0]['target_2'].fillna(0)
y_diff_1 = df_train_features[df_train_features['is_consumption'] == 1]['target'] - df_train_features[df_train_features['is_consumption'] == 1]['target_2'].fillna(0)

#### Cross Validation

In [19]:
# params = {
#     'reg_alpha':5, 
#     'reg_lambda':3.5, 
#     'n_estimators':3000, 
#     'num_leaves':500, 
#     'learning_rate' : 0.05, 
#     'max_depth' : 12,
#     'objective' : "regression_l1", 
#     'colsample_bytree' : 0.8, 
#     'colsample_bynode' : 0.7   
# }

# model_0 = lgb.LGBMRegressor(**params)
# model_1 = lgb.LGBMRegressor(**params)
# model_0_diff = lgb.LGBMRegressor(**params)
# model_1_diff = lgb.LGBMRegressor(**params)

In [20]:
# cv = MonthlyKFold(1)

# scores_0 = cross_val_score(model_0, X_0, y_0, cv=cv, scoring='neg_mean_absolute_error')
# scores_1 = cross_val_score(model_1, X_1, y_1, cv=cv, scoring='neg_mean_absolute_error')

In [21]:
# print(scores_0)
# #print(scores_0.mean())

# print(scores_1)
# #print(scores_1.mean())

In [22]:
# params_cat = {
#     'iterations':2000,
#     'depth':12, 
#     'learning_rate':0.05, 
#     'loss_function':'RMSE', 
#     'l2_leaf_reg': 3, 
#     'verbose': False
# }

# cat_model_0 = CatBoostRegressor(**params_cat)
# cat_model_1 = CatBoostRegressor(**params_cat)

# scores_0 = cross_val_score(cat_model_0, X_0, y_0, cv=cv, scoring='neg_mean_absolute_error')
# scores_1 = cross_val_score(cat_model_1, X_1, y_1, cv=cv, scoring='neg_mean_absolute_error')

In [23]:
# print(scores_0)
# print(scores_0.mean())

# print(scores_1)
# print(scores_1.mean())

#### Votting Regressor + Bagging

In [24]:
random_states = [10, 20, 30, 40, 50]

lgbm_models_0 = [lgb.LGBMRegressor(reg_alpha=5, reg_lambda=4, n_estimators=3000, num_leaves=500, learning_rate = 0.04, max_depth = 12,objective = "regression_l1", colsample_bytree = 0.85, colsample_bynode = 0.7, device = 'gpu',random_state=rs) for rs in random_states]
lgbm_models_1 = [lgb.LGBMRegressor(reg_alpha=5, reg_lambda=4, n_estimators=3000, num_leaves=500, learning_rate = 0.04, max_depth = 12,objective = "regression_l1", colsample_bytree = 0.85, colsample_bynode = 0.7, device = 'gpu',random_state=rs) for rs in random_states]
lgbm_models_0_dif = [lgb.LGBMRegressor(reg_alpha=5, reg_lambda=4, n_estimators=3000, num_leaves=500, learning_rate = 0.04, max_depth = 12, objective = "regression_l1", colsample_bytree = 0.85, colsample_bynode = 0.7, device = 'gpu',random_state=rs) for rs in random_states]
lgbm_models_1_dif = [lgb.LGBMRegressor(reg_alpha=5, reg_lambda=4, n_estimators=3000, num_leaves=500, learning_rate = 0.04, max_depth = 12, objective = "regression_l1", colsample_bytree = 0.85, colsample_bynode = 0.7, device = 'gpu',random_state=rs) for rs in random_states]

model_0 = VotingRegressor(estimators=[('lgbm_' + str(rs), model) for rs, model in zip(random_states, lgbm_models_0)])
model_1 = VotingRegressor(estimators=[('lgbm_' + str(rs), model) for rs, model in zip(random_states, lgbm_models_1)])
model_0_diff = VotingRegressor(estimators=[('lgbm_' + str(rs), model) for rs, model in zip(random_states, lgbm_models_0_dif)])
model_1_diff = VotingRegressor(estimators=[('lgbm_' + str(rs), model) for rs, model in zip(random_states, lgbm_models_1_dif)])

In [25]:
params_cat = {
    'iterations':2500,
    'depth':12, 
    'learning_rate':0.04, 
    'loss_function':'RMSE', 
    'l2_leaf_reg': 5, 
    'verbose': False,
    'task_type':"GPU"
}

cat_model_0_base = CatBoostRegressor(**params_cat)
cat_model_1_base = CatBoostRegressor(**params_cat)

cat_model_0 = BaggingRegressor(cat_model_0_base, n_estimators=5, random_state=42)
cat_model_1 = BaggingRegressor(cat_model_1_base, n_estimators=5, random_state=42)

#### Fit

In [26]:
model_0.fit(X_0, y_0)
model_1.fit(X_1, y_1)

model_0_diff.fit(X_0, y_diff_0)
model_1_diff.fit(X_1, y_diff_1)

cat_model_0.fit(X_0, y_0)
cat_model_1.fit(X_1, y_1)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


### Feature Importance

In [27]:
# feature_importance0 = model_0.feature_importances_
# feature_importance_df0 = pd.DataFrame({'Feature': features, 'Importance': feature_importance0})
# feature_importance_df0 = feature_importance_df0.sort_values(by='Importance', ascending=False)

# feature_importance1 = model_1.feature_importances_
# feature_importance_df1 = pd.DataFrame({'Feature': features, 'Importance': feature_importance1})
# feature_importance_df1 = feature_importance_df1.sort_values(by='Importance', ascending=False)

# print("Feature Importance 0:")
# print(feature_importance_df0.head(50))

# print("Feature Importance 1:")
# print(feature_importance_df1.head(50))

# Submit API

In [28]:
import enefit

env = enefit.make_env()
iter_test = env.iter_test()

The following code demonstrated usage of API when in each cycle step we simulate new day and we need to send prediciotns before we get the next day (this guranteeres that we don't see targets from future).

Local running of a notebook uses toy example of test data, after notebook submission toy data will be replaced with new unseen data.

In [29]:
for (
    test, 
    revealed_targets, 
    client, 
    historical_weather,
    forecast_weather, 
    electricity_prices, 
    gas_prices, 
    sample_prediction
) in iter_test:
    
    if test.iloc[0]['currently_scored'] == False:
        sample_prediction.loc[:]['target'] = 0
        env.predict(sample_prediction)
    else:
        test = test.rename(columns={"prediction_datetime": "datetime"})

        gas_prices.drop_duplicates(subset = ['forecast_date'], keep='first', inplace = True)
        electricity_prices.drop_duplicates(subset = ['forecast_date'], keep='first', inplace = True)
        client.drop_duplicates(subset = ['product_type', 'county', 'is_business', 'date'], keep='first', inplace = True)

        df_test = pl.from_pandas(test[data_cols[1:]], schema_overrides=schema_data)
        df_client = pl.from_pandas(client[client_cols], schema_overrides=schema_client)
        df_gas_prices = pl.from_pandas(gas_prices[gas_prices_cols], schema_overrides=schema_gas)
        df_electricity_prices = pl.from_pandas(electricity_prices[electricity_prices_cols], schema_overrides=schema_electricity)
        df_new_forecast_weather = pl.from_pandas(forecast_weather[forecast_weather_cols], schema_overrides=schema_forecast)
        df_new_historical_weather = pl.from_pandas(historical_weather[historical_weather_cols], schema_overrides=schema_historical)
        df_new_target = pl.from_pandas(revealed_targets[target_cols], schema_overrides=schema_target)

        df_forecast_weather = pl.concat([df_forecast_weather, df_new_forecast_weather]).unique(['forecast_datetime', 'latitude', 'longitude', 'hours_ahead'])
        df_historical_weather = pl.concat([df_historical_weather, df_new_historical_weather]).unique(['datetime', 'latitude', 'longitude'])
        df_target = pl.concat([df_target, df_new_target]).unique(['datetime', 'county', 'is_business', 'product_type', 'is_consumption'])

        df_target_ma = create_moving_avg_target(df_target, windows_ma_days)

        df_test_features = generate_features(
            df_test, 
            df_client, 
            df_gas_prices, 
            df_electricity_prices, 
            df_forecast_weather, 
            df_historical_weather, 
            df_weather_station_to_county_mapping, 
            df_target,
            df_target_ma
        )

        df_test_features = to_pandas(df_test_features)
        df_test_features = deal_null(df_test_features)
        df_test_features = get_holiday_day_night(df_test_features)
        df_test_features = create_moving_avg_weather(df_test_features, window_sizes_avg, window_sizes_std)
        df_test_features.drop(columns = ['datetime', 'segment'], inplace = True)
        df_test_features = convert_dtypes(df_test_features)

        predictions = np.zeros(len(sample_prediction))

        mask = df_test_features['is_consumption'] != 1
        predictions[mask.values] = 0.4*(model_0.predict(df_test_features.loc[mask,features].values)) + 0.4*(model_0_diff.predict(df_test_features.loc[mask,features].values) + df_test_features.loc[mask,'target_2'].fillna(0)) + 0.2*(cat_model_0.predict(df_test_features.loc[mask,features].values))
        mask = df_test_features['is_consumption'] == 1
        predictions[mask.values] =  0.4*(model_1.predict(df_test_features.loc[mask,features].values)) + 0.4*(model_1_diff.predict(df_test_features.loc[mask,features].values) + df_test_features.loc[mask,'target_2'].fillna(0)) + 0.2*(cat_model_1.predict(df_test_features.loc[mask,features].values))

        sample_prediction['target'] = predictions

        sample_prediction['target'] = sample_prediction['target'].clip(lower=0, upper = 15481)
        sample_prediction = sample_prediction.replace([np.inf, -np.inf], np.nan)
        sample_prediction['target'] = sample_prediction['target'].fillna(0)

        sample_prediction['target'] = sample_prediction['target'].astype(float)
        sample_prediction['row_id'] = sample_prediction['row_id'].astype(int)

        sample_prediction = sample_prediction.groupby('row_id').agg({'target': np.mean}).reset_index()

        env.predict(sample_prediction)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
