# Baseline model

Now that I have a good handle on what the linear regression features should look like, I can set up the basic models.

The main goal here is to have a model structure that I can use for the remainder of this competition.

In [10]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.pylabtools import figsize
 
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns

import statsmodels.formula.api as smf

import sklearn

In [11]:
# Execution context switch: 'local' runs against the mock API with data read
# from relative paths; 'kaggle' uses the competition's `enefit` environment.
__context__ = 'local'

import sys


def _add_to_sys_path(location):
    """Append `location` to `sys.path` unless it is already present.

    Keeps this cell idempotent: re-running it does not pile up duplicate
    entries on the import path.
    """
    if location not in sys.path:
        sys.path.append(location)


if __context__ == 'local':
    trainDataLocation = '../../data/train/'
    libraryLocation = '../..'
    # make '..' importable before loading the mock API module
    _add_to_sys_path('..')
    from public_timeseries_testing_util import MockApi
    env = MockApi()

elif __context__ == 'kaggle':
    trainDataLocation = '/kaggle/input/predict-energy-behavior-of-prosumers/'
    libraryLocation = '/kaggle/input/'
    import enefit
    env = enefit.make_env()

else:
    # Fail fast with a clear message; without this branch an unknown context
    # only surfaces below as a NameError on `libraryLocation`.
    raise ValueError(
        f"Unknown __context__ {__context__!r}; expected 'local' or 'kaggle'."
    )

# location from which the `enefittools` package (and its bundled data) is loaded
_add_to_sys_path(libraryLocation)

## Pipeline Assembly

Running the pipeline from scratch

In [3]:
# Read the raw tables from disk, then hand them to `format_dfs` for formatting.
from enefittools.data.format_data import format_dfs, assemble_train_client

train = pd.read_csv(f'{trainDataLocation}train.csv')
client = pd.read_csv(f'{trainDataLocation}client.csv')
weather_forecast = pd.read_csv(f'{trainDataLocation}forecast_weather.csv')
prices_gas = pd.read_csv(f'{trainDataLocation}gas_prices.csv')
prices_electricity = pd.read_csv(f'{trainDataLocation}electricity_prices.csv')
# the solar dataset is read from inside the library location, not the competition data
solar = pd.read_csv(f'{libraryLocation}/enefittools/data/datasets/solar_data.csv')

# With assemble_and_split=True the call yields six objects, the first two being
# the production and consumption targets. Note that `weather_forecast`,
# `prices_*` and `solar` are rebound from the raw frames to the formatted ones.
(target_production, target_consumption, weather_forecast,
 prices_electricity, prices_gas, solar) = format_dfs(
    target=train,
    client=client,
    weather_forecast=weather_forecast,
    gas_prices=prices_gas,
    electricity_prices=prices_electricity,
    solar=solar,
    assemble_and_split=True,
)

# limit resources for local development: keep only prediction unit 0
if __context__ == 'local':
    first_unit_only = pl.col('prediction_unit_id') == 0
    target_production = target_production.filter(first_unit_only)
    target_consumption = target_consumption.filter(first_unit_only)


In [4]:
# Feature extraction: datetime features followed by delayed-target features,
# built once per target type from a shared factory.
from enefittools.features.datetime_features import Datetime_Features
from enefittools.features.autoregressive_features import Delayed_Features
from enefittools.features.target_transformers import Normalize_Target

from enefittools.data.history_holder import Target_History

from sklearn.pipeline import Pipeline


def _regression_features(history):
    """Return the two-step feature pipeline bound to the given target history.

    Both targets use the same steps; only the `history` object passed to
    `Delayed_Features` differs.
    """
    return Pipeline([
        ('time_features', Datetime_Features()),
        ('ar_features', Delayed_Features('target', history=history)),
    ])


# History holders for each target; only the production one is normalized.
# The same objects are updated later as new targets are revealed.
production_history = Target_History(target_production, normalize=True)
consumption_history = Target_History(target_consumption, normalize=False)

regression_features_production = _regression_features(production_history)
regression_features_consumption = _regression_features(consumption_history)


In [5]:
# Linear regression pipelines: formula specs plus the fitted-model wrappers.
from enefittools.models.linear_models import SM_Regression
from enefittools.features.target_transformers import Normalize_Target
from enefittools.models.chaining import Predictions_to_Features

# Column groups referenced by the formulas below.
categorical_cols = ['weekday', 'hour_of_day']
lag_cols = ['target_2d_ago', 'target_7d_ago']
seasonal_cols = (
    ['trend']
    + [f'sin_{k}' for k in range(1, 7)]
    + [f'cos_{k}' for k in range(1, 7)]
)

# every column handed to SM_Regression via `to_drop`
regression_cols = categorical_cols + lag_cols + seasonal_cols
# trend + sin/cos terms, joined for interpolation into the formulas
date_cols = " + ".join(seasonal_cols)

consumption_spec = (
    f'target ~ (C(weekday) + C(hour_of_day)) * ({date_cols})'
    ' + C(weekday)*target_2d_ago + target_7d_ago'
)
production_spec = f'target ~ C(hour_of_day) * ({date_cols}) + target_2d_ago'

# Production: the target is normalized going in and the inverse step is
# applied at the end; the regression is wrapped in Predictions_to_Features.
production_regression = Pipeline([
    ('norm-fwd', Normalize_Target(mode='fwd')),
    ('extraction', regression_features_production),
    ('regression', Predictions_to_Features(
        SM_Regression(production_spec, to_drop=regression_cols)
    )),
    ('norm-inv', Normalize_Target(mode='inv')),
])

# Consumption: no normalization steps and no Predictions_to_Features wrapper.
consumption_regression = Pipeline([
    ('extraction', regression_features_consumption),
    ('regression', SM_Regression(consumption_spec, to_drop=regression_cols)),
])
                                

In [6]:
# Fit the production regression pipeline on `target_production`.
production_regression.fit(target_production)

In [7]:
# Fit the consumption regression pipeline on `target_consumption`.
consumption_regression.fit(target_consumption)

## Run on the test set

In [12]:
from enefittools.data.format_predictions import format_outputs

iter_test = env.iter_test()

# Each iteration: format the batch, feed the newly revealed targets into the
# history holders, predict both targets, then submit.
for batch in iter_test:
    (test, revealed_targets, client, weather_historical, weather_forecast,
     prices_electricity, prices_gas, sample_prediction) = batch

    # mode='test' returns the test rows already split into production and
    # consumption, followed by the formatted auxiliary frames
    (prod_test, consume_test, revealed_targets, weather_forecast,
     prices_electricity, prices_gas, sample_prediction) = format_dfs(
        target=test,
        revealed_targets=revealed_targets,
        client=client,
        weather_forecast=weather_forecast,
        electricity_prices=prices_electricity,
        gas_prices=prices_gas,
        sample_prediction=sample_prediction,
        assemble_and_split=True,
        mode='test',
    )

    # Histories are updated before predicting, so the pipelines see the newly
    # revealed targets.
    is_consumption = pl.col('is_consumption')
    production_history.update_data(revealed_targets.filter(is_consumption == False))
    consumption_history.update_data(revealed_targets.filter(is_consumption == True))

    prod_predictions = production_regression.predict(prod_test)
    consume_predictions = consumption_regression.predict(consume_test)

    prediction = format_outputs(
        [prod_predictions, consume_predictions],
        sample_prediction,
    )

    env.predict(prediction)