### load and preprocess

In [11]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from src.utils import inspect_nulls, pop_datetime
from src.preprocessing import (
    to_datetime,
    drop_full_nulls,
    try_drop_shared_nulls,
    drop_full_zero,
)
from src.preparation import train_test_split, make_batches, make_sequences

# Read the raw energy CSV and tag the resulting frame with a readable name.
with open("data/energy_dataset.csv") as file:
    en = pd.read_csv(file, sep=",")
en.name = "energy"

# Create the datetime column from the raw "time" column (project helper).
en = to_datetime(en, "time")

# Drop features that hold only zeros, then features that are entirely null.
en = drop_full_nulls(drop_full_zero(en))

# Null imputation. pandas' "linear" method ignores the index and treats the
# rows as equally spaced, so it is only applied when every pair of consecutive
# timestamps is separated by the same number of seconds.
step_seconds = en["datetime"].diff().dt.total_seconds()
if step_seconds.nunique() == 1:
    en = en.interpolate(method="linear")
else:
    display("timestamps not evenly interpolate... chto delat?")
    # TODO: pick a fallback for unevenly spaced data; one candidate that is
    # currently disabled is try_drop_shared_nulls(en, any_null=True).

# NOTE: promoting "datetime" to the index (set_index) is left disabled here.

# Show the per-column null report after imputation.
display(inspect_nulls(en))

datetime                                       0.0
generation biomass                             0.0
generation fossil brown coal/lignite           0.0
generation fossil coal-derived gas             0.0
generation fossil gas                          0.0
generation fossil hard coal                    0.0
generation fossil oil                          0.0
generation fossil oil shale                    0.0
generation fossil peat                         0.0
generation geothermal                          0.0
generation hydro pumped storage consumption    0.0
generation hydro run-of-river and poundage     0.0
generation hydro water reservoir               0.0
generation marine                              0.0
generation nuclear                             0.0
generation other                               0.0
generation other renewable                     0.0
generation solar                               0.0
generation waste                               0.0
generation wind offshore       

### train/test splitting
At this stage:
* train/test only, without a validation subset
* for successive splits: either each next split contains the previous training subset, or the subsets are non-overlapping (one or the other, not both)
* 80-20 train/test ratio

In [12]:
# Share of the rows that goes to the training subset (80-20 split).
# The ratio is passed by name so that editing it here actually changes the
# split; previously the call repeated the literal 0.8 and ignored this variable.
train_test_split_ratio = 0.8
# normalise=True is forwarded to the project helper as before
# (presumably it scales the features -- confirm in src.preparation).
train_en, test_en = train_test_split(en, train_test_split_ratio, normalise=True)

train size: (28051, 27) test size: (7013, 27)


### batches

In [13]:
# Columns of the training frame that are passed on to the sequence builder.
features = [
    "total load actual",
    "price actual",
]

In [14]:
# Layout of the series once it is cut into sequences:
# | samples_to_drop | observations | offset | targets | observations | offset | targets | ...
#                    <----------sequence 1-----------> <-----------sequence 2----------> ...

# Sequence geometry, following the diagram above. The values are passed to
# make_sequences by name so this cell has a single place to tune them;
# previously the call repeated the literals and ignored these variables.
observations = 100  # samples in the "observations" part of each sequence
offset = 0          # samples between the observations and the targets
targets = 1         # samples in the "targets" part of each sequence
sequences_observations, sequences_targets = make_sequences(
    train_en[features],
    observations=observations,
    offset=offset,
    targets=targets,
)

To have uniform sequences, indexes dropped: [0:74]
Number of sequences: 277


In [None]:
# | sequence 1 | sequence 2 | sequence 3 | sequence 4 | ...
#  <-------------- batch 1 -------------> <------- batch 2 ----

# Observations and targets must be grouped with the same batch size so that
# batch i of one lines up with batch i of the other; keep the size in one
# place instead of repeating the literal in both calls.
n_sequences_per_batch = 3

batches_observations = make_batches(
    sequences_observations, n_sequences_per_batch=n_sequences_per_batch
)
batches_targets = make_batches(
    sequences_targets, n_sequences_per_batch=n_sequences_per_batch
)