# Random Forest corrections to regression

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.pylabtools import figsize
 
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns

import statsmodels.formula.api as smf

import sklearn

In [2]:
__context__ = 'local'

import sys

if __context__ == 'local':
    trainDataLocation = '../data/train/'
    libraryLocation = '..'
    sys.path.append('..')
    from public_timeseries_testing_util import MockApi
    env = MockApi()
    

elif __context__ == 'kaggle':
    trainDataLocation = '/kaggle/input/predict-energy-behavior-of-prosumers/'
    libraryLocation = '/kaggle/input/'
    import enefit
    env = enefit.make_env()

sys.path.append(libraryLocation)

In [3]:
# import and transform data
from enefittools.data.format_data import format_dfs, assemble_train_client, Data_Holder

train = pd.read_csv(trainDataLocation+'train.csv')
client = pd.read_csv(trainDataLocation+'client.csv')
weather_forecast = pd.read_csv(trainDataLocation + 'forecast_weather.csv')
prices_gas = pd.read_csv(trainDataLocation + 'gas_prices.csv')
prices_electricity = pd.read_csv(trainDataLocation + 'electricity_prices.csv')
solar = pd.read_csv(libraryLocation + '/enefittools/data/datasets/solar_data.csv')

target_production, target_consumption, weather_forecast, prices_electricity, prices_gas, solar = \
        format_dfs(target=train, client=client, weather_forecast=weather_forecast,
                   gas_prices=prices_gas, electricity_prices=prices_electricity, solar=solar,
                   assemble_and_split=True
                  )


# limit resources for local development
if __context__ == 'local':
    target_production = target_production.filter(pl.col('prediction_unit_id')== 0)
    target_consumption = target_consumption.filter(pl.col('prediction_unit_id')== 0)



In [4]:
# initialize data holders

data_production = Data_Holder(target_production, weather_forecast, prices_electricity, prices_gas,
                              solar, normalize=True, mode='train')
data_consumption = Data_Holder(target_consumption, weather_forecast, prices_electricity, prices_gas,
                              solar, mode='train')


In [5]:
# feature pipeline
from enefittools.features.datetime_features import Datetime_Features
from enefittools.features.autoregressive_features import Delayed_Features
from enefittools.features.solar_features import Solar_Features

from sklearn.pipeline import Pipeline


regression_features_production = Pipeline([
                                    ('time_features', Datetime_Features()),
                                    ('ar_features', Delayed_Features('target')),
                                    ('solar_features', Solar_Features())
                                ])

regression_features_consumption = Pipeline([
                                    ('time_features', Datetime_Features()),
                                    ('ar_features', Delayed_Features('target'))
                                ])

In [6]:
# linear regression models
from enefittools.models.linear_models import SM_Regression
from enefittools.features.target_transformers import Normalize_Target
from enefittools.models.chaining import Predictions_to_Features

regression_cols = ['weekday', 'hour_of_day', 'target_2d_ago', 'target_7d_ago', 'trend'] + \
                  [f'sin_{i}' for i in range(1,7)] + [f'cos_{i}' for i in range(1,7)]
date_cols = " + ".join(regression_cols[4:])

consumption_spec =f'target ~ (C(weekday) + C(hour_of_day)) * ({date_cols}) + C(weekday)*target_2d_ago + target_7d_ago'
production_spec = f'target ~ solar_elevation + solar_azimuth + C(hour_of_day) * ({date_cols}) + target_2d_ago'


In [7]:
# random forest features
from enefittools.features.price_features import Price_Features
from enefittools.features.weather_features import Simple_Weather_Features


# simple weather mapping
direct_stations = pd.read_csv(
                        trainDataLocation + 'weather_station_to_county_mapping.csv'
                   ).dropna(
                   ).sort_values('county'
                   )
r = direct_stations.iloc[13]
r['county'] = 12
direct_stations.iloc[13] = r
direct_stations = pl.from_pandas(direct_stations.drop_duplicates('county', keep='first'),
                                 schema_overrides={'county': pl.Int8}
                                ).select('longitude', 'latitude', 'county'
                                ).with_columns(pl.col('latitude').round(3),
                                               pl.col('longitude').round(3))



RF_features_production = Pipeline([('price_features', Price_Features()),
                                   ('weather_features',Simple_Weather_Features(direct_stations))
                                  ])

RF_features_consumption = Pipeline([('solar_features', Solar_Features()),
                                    ('price_features', Price_Features()),
                                    ('weather_features',Simple_Weather_Features(direct_stations))
                                   ])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r['county'] = 12


Full pipeline assembly

In [8]:
from enefittools.features.target_transformers import Normalize_Target
from enefittools.models.random_forest_models import RF_Residual
from enefittools.models.chaining import Learn_On_Residuals

from sklearn.ensemble import RandomForestRegressor

production_full = Pipeline([('norm-fwd', Normalize_Target(mode='fwd')),
                                  ('features_lin', regression_features_production),
                                  ('regression', Predictions_to_Features(SM_Regression(production_spec))
                                  ),
                                  ('features_rf', RF_features_production),
                                  ('residual-rf', Predictions_to_Features(
                                                      RF_Residual(n_estimators=100))
                                  ),
                                  ('norm-inv', Normalize_Target(mode='inv'))
                                 ])
                                
consumption_full = Pipeline([('features_lin', regression_features_consumption),
                                 ('regression', Predictions_to_Features(SM_Regression(consumption_spec))
                                 ),
                                 ('features_rf', RF_features_production),
                                 ('residual-rf', RF_Residual(n_estimators=100))
                                 ])
                                

In [9]:
consumption_full.fit(data_consumption)

In [None]:
production_full.fit(data_production.reset())

In [None]:
from enefittools.data.format_predictions import format_outputs

iter_test = env.iter_test()

data_production.mode = 'inference'
data_consumption.mode = 'inference'

for (test, revealed_targets, client, weather_historical, weather_forecast,
    prices_electricity, prices_gas, sample_prediction) in iter_test:
    
    prod_test, consume_test, revealed_targets, weather_forecast, \
    prices_electricity, prices_gas, sample_prediction = \
    format_dfs(target=test, revealed_targets=revealed_targets, client=client,
               weather_forecast=weather_forecast, electricity_prices=prices_electricity,
               gas_prices=prices_gas, sample_prediction=sample_prediction,
              assemble_and_split=True, mode='test')


    # set the data in our data holder
    data_production.set_working_data(prod_test, weather_forecast, prices_electricity, prices_gas, solar)
    data_consumption.set_working_data(consume_test, weather_forecast, prices_electricity, prices_gas, solar)

    # update the historical data
    data_production.update_tracked_data(revealed_targets.filter(pl.col('is_consumption') == False),
                                        normalize=True)
    data_consumption.update_tracked_data(revealed_targets.filter(pl.col('is_consumption') == True))

    
    prod_predictions = production_full.predict(data_production)
    consume_predictions = consumption_full.predict(data_consumption)

    prediction = format_outputs([prod_predictions.features, consume_predictions.features],
                                sample_prediction)
    
    env.predict(prediction)