In [None]:
import pandas as pd
import numpy as np

import polars as pl
import datetime

import lightgbm as lgb

from sklearn.linear_model import LinearRegression


# Utilities and feature extractors for an initial submission

### Utilities: dataframe formatting for online data

In [None]:
def format_dfs(*batch_inputs):
    """ dataframe formatting for online use

    Converts the pandas frames (test, client) to polars and normalises their
    date columns.

    Parameters
    ----------
    *batch_inputs : pandas.DataFrame
        Exactly two frames, in the order (test, client). `test` needs a
        'prediction_datetime' column and `client` a 'date' column; each may
        hold strings or already-parsed temporal values.

    Returns
    -------
    (test, client) : tuple of polars.DataFrame
        `test` with 'prediction_datetime' as a datetime plus a new
        'date_when_predicting' column (the prediction date minus two days);
        `client` with 'date' as a date.

    Raises
    ------
    ValueError
        If the number of frames passed is not two.
    """
    if len(batch_inputs) != 2:
        raise ValueError(
            "format_dfs expects exactly two frames (test, client), "
            f"got {len(batch_inputs)}"
        )
    test, client = (pl.from_pandas(frame) for frame in batch_inputs)

    # Parse only when the column arrived as text: the .str namespace fails on
    # a column that pandas already handed over as a datetime.
    prediction_datetime = pl.col('prediction_datetime')
    if test.schema['prediction_datetime'] == pl.Utf8:
        prediction_datetime = prediction_datetime.str.to_datetime()

    test = test.with_columns(
        prediction_datetime,
        # the client record used for a prediction is the one dated two days earlier
        (prediction_datetime.dt.date() + datetime.timedelta(days=-2)
         ).alias('date_when_predicting'),
    )

    client_date = pl.col('date')
    if client.schema['date'] == pl.Utf8:
        client_date = client_date.str.to_date()
    else:
        # already temporal: make sure it is a plain date so it matches
        # 'date_when_predicting' when the two frames are joined
        client_date = client_date.cast(pl.Date)
    client = client.with_columns(client_date)

    return test, client



#### Date Note:
We want to predict each target using the data available two days before the prediction date (stored in the `date_when_predicting` column).

In [None]:
# read the raw competition tables
dataLocation = '/kaggle/input/predict-energy-behavior-of-prosumers/'

# training rows containing any missing value are discarded straight after reading
train = pd.read_csv(dataLocation + 'train.csv').dropna(axis='index')
clients = pd.read_csv(dataLocation + 'client.csv')

# format_dfs expects the timestamp column under the name 'prediction_datetime'
train = train.rename(columns={'datetime': 'prediction_datetime'})
train, clients = format_dfs(train, clients)


In [None]:
# make features: targets

def make_targets(train):
    """ Build the regression targets: drop the covariate columns and log(x+1) transform 'target' """
    covariate_columns = ['county', 'is_business', 'product_type', 'prediction_datetime',
                         'data_block_id', 'prediction_unit_id', 'date_when_predicting']
    log_target = np.log1p(pl.col('target'))
    return train.drop(covariate_columns).with_columns(log_target)

def raw_targets(targets):
    """ Invert the log(x+1) transform on the 'target' column, giving raw-scale targets """
    restored_target = np.expm1(pl.col('target'))
    return targets.with_columns(restored_target)


In [None]:
# histogram of the log(x+1) transformed targets
targets = make_targets(train)
targets.to_pandas().plot.hist(y='target', bins=15)

This target distribution, while not entirely normal, is much closer to normal than it was before the log(x+1) transform.

In [None]:
# make deterministic time dependent features
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

def make_date_process(train):
    """ Deterministic calendar terms over the daily range spanned by train['prediction_datetime'].

    The process has a constant, a linear trend (order=1), period-7 seasonal
    indicators and an order-6 CalendarFourier term with freq="A".
    """
    prediction_dates = train['prediction_datetime'].dt.date()
    daily_index = pd.date_range(prediction_dates.min(), prediction_dates.max())
    annual_fourier = CalendarFourier(freq="A", order=6)
    return DeterministicProcess(
        daily_index,
        constant=True,
        order=1,
        seasonal=True,
        period=7,
        additional_terms=[annual_fourier],
    )


In [None]:
def make_regression_features(train, clients, date_process):
    """ input features for the regression

    Parameters
    ----------
    train : polars.DataFrame
        Rows to build features for. Must contain 'county', 'is_business',
        'product_type', 'date_when_predicting', 'prediction_unit_id' and
        'prediction_datetime'. A 'target' column is optional, so frames
        without it (e.g. an online test batch) are accepted too.
    clients : polars.DataFrame
        Client records, matched on county / is_business / product_type and
        on 'date' == 'date_when_predicting'. Must contain 'installed_capacity'.
    date_process : object with a `range(start, stop)` method
        Its result is reset_index()-ed and joined on the prediction date,
        e.g. the DeterministicProcess built by make_date_process.

    Returns
    -------
    polars.DataFrame
        The joined frame minus the bookkeeping columns, with
        log(x+1) 'installed_capacity', the date_process terms, and one
        indicator column per prediction unit id and per time of day.
        Rows of `train` without a matching client record are absent
        (inner join).
    """
    # question: does polars to_dummies result in linear dependencies?
    # NOTE(review): to_dummies is called without drop_first, so each indicator
    # group sums to one; that is collinear with a constant term if
    # date_process supplies one -- confirm whether this matters downstream.
    features = train.join(clients,
                          left_on=['county', 'is_business', 'product_type', 'date_when_predicting'],
                          right_on=['county', 'is_business', 'product_type', 'date'],
                          how='inner')

    # covariates
    unit_ids = features['prediction_unit_id'].to_dummies()
    features = features.with_columns(np.log1p(pl.col('installed_capacity')))

    # seasonal components
    # to add: local holidays
    times = features['prediction_datetime'].dt.time().to_dummies()
    features = features.with_columns(pl.col('prediction_datetime').dt.date().alias('prediction_date'))
    date_features = date_process.range(
                                       features['prediction_date'].min(),
                                       features['prediction_date'].max()
                                      ).reset_index()
    # cast the pandas timestamp index to a date so it matches the join key dtype
    date_features = pl.from_pandas(date_features).with_columns(pl.col('index').dt.date())

    # a left join keeps the row order and count, so the indicator frames built
    # above still line up row-for-row in the horizontal concat below
    features = features.join(date_features, left_on=['prediction_date'], right_on=['index'], how='left')

    # some of these features should be included in the future
    columns_to_drop = ['target', 'data_block_id', 'data_block_id_right', 'prediction_date',
                       'prediction_datetime', 'prediction_unit_id', 'date_when_predicting',
                       'county', 'is_business', 'product_type', 'eic_count'
                      ]
    # drop only what is actually there: DataFrame.drop raises on a missing
    # column, and frames without 'target' must not fail here
    present_columns = set(features.columns)
    features = features.drop([name for name in columns_to_drop if name in present_columns])

    return pl.concat([features, unit_ids, times], how='horizontal')


### Process the training data

In [None]:
### build the training targets and the regression design matrix
targets = make_targets(train)
date_process = make_date_process(train)
features = make_regression_features(train, clients, date_process)

In [None]:
# pair every transformed target with its feature row
trainSet = targets.join(features, on='row_id', how='inner')

def _direction_subset(joined, is_consumption):
    """ rows of `joined` with the given is_consumption flag, as a pandas frame indexed by row_id """
    subset = joined.filter(pl.col('is_consumption') == is_consumption)
    subset = subset.drop(['is_consumption', 'is_consumption_right'])
    return subset.to_pandas().set_index('row_id')

productionSet = _direction_subset(trainSet, 0)
consumptionSet = _direction_subset(trainSet, 1)

In [None]:
# separate the design matrix from the regression target (consumption rows)
consumptionInputs = consumptionSet.drop(columns=['target'])
consumptionTargets = consumptionSet['target']

In [None]:
# separate the design matrix from the regression target (production rows)
productionInputs = productionSet.drop(columns=['target'])
productionTargets = productionSet['target']

### Train the linear regression model

In [None]:
# one linear regression per direction, fitted separately in the cells that follow
consumptionModel, productionModel = LinearRegression(), LinearRegression()

In [None]:
# fit on the log(x+1) consumption targets
consumptionModel.fit(X=consumptionInputs, y=consumptionTargets)

In [None]:
# in-sample score: evaluated on the same rows the model was fitted on
consumptionModel.score(X=consumptionInputs, y=consumptionTargets)

In [None]:
# fit on the log(x+1) production targets
productionModel.fit(X=productionInputs, y=productionTargets)

In [None]:
# in-sample score: evaluated on the same rows the model was fitted on
productionModel.score(X=productionInputs, y=productionTargets)

### Making a submission

In [None]:
# NOTE(review): enefit is imported here rather than in the import cell at the
# top; presumably it is the competition-provided serving API -- confirm.
import enefit
# build the environment, then take its test iterator; the iterator is consumed
# with next() in the following cell
env = enefit.make_env()
iter_test = env.iter_test()

In [None]:
# Pull one batch from iter_test and unpack its eight objects by position.
# next() advances the iterator, so re-running this cell consumes another batch.
(test, revealed_targets, client, weatherHistorical, weatherForecast,
    electricityPrices, gasPrices, sample_prediction) = next(iter_test)

In [None]:
# check which polars dtype the online 'prediction_datetime' column converts to
pl.from_pandas(test).get_column('prediction_datetime').dtype