# Modelling Libraries - Preprocessing

In [1]:
# Put ../src first on the module search path so that packages living there
# (imported in the next cell) can be found from this notebook.
import sys
sys.path.insert(0, '../src')

In [2]:
import pandas as pd
import numpy as np
import ontime as on
from darts.datasets import EnergyDataset

The `LightGBM` module could not be imported. To enable LightGBM support in Darts, follow the detailed instructions in the installation guide: https://github.com/unit8co/darts/blob/master/INSTALL.md
The `Prophet` module could not be imported. To enable Prophet support in Darts, follow the detailed instructions in the installation guide: https://github.com/unit8co/darts/blob/master/INSTALL.md
  from tqdm.autonotebook import tqdm


---
## Load data

In [3]:
# Load the Darts energy dataset and cast its values to float32 in one go.
ts = EnergyDataset().load().astype(np.float32)

---

## Preprocessing

- [x] Normalize
- [x] Split train, test, val
- [ ] Feature engineering
    - add weather for location
    - add day of the week, month, year, etc.
    - add other relevant features (to be defined)
- [x] Windowing
- [x] Windowing - Split (parts to train as X, parts to predict as y)
- [x] Windowing - to tf.data.Dataset
- [ ] Windowing - to PyTorch DataLoaders

In [16]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from darts.dataprocessing.transformers import Scaler

def normalize(ts: on.TimeSeries, type='minmax', return_transformer=False):
    match type:
        case 'minmax':
            scaler = MinMaxScaler()
        case 'zscore':
            scaler = StandardScaler()
    transformer = Scaler(scaler)
    ts_transformed = transformer.fit_transform(ts)
    if return_transformer:
        return ts_transformed, transformer
    else:
        return ts_transformed

In [17]:
def train_test_split(ts: on.TimeSeries, test_split=None, train_split=None) -> tuple:
    """
    Split a time series into a training part and a test part.

    At most one of ``train_split`` / ``test_split`` may be given. When neither
    is given, ``test_split`` defaults to 0.25.

    :param ts: TimeSeries to split
    :param test_split: float (fraction of the series kept for the test set),
        int (number of trailing points kept for the test set) or pd.Timestamp
        (first time stamp of the test set)
    :param train_split: float, int or pd.Timestamp, forwarded unchanged to
        ``ts.split_after``
    :return: tuple ``(train_set, test_set)``
    :raises ValueError: if both ``train_split`` and ``test_split`` are set, or
        if an integer ``test_split`` is not in ``1 .. len(ts) - 1``
    """

    if train_split is not None and test_split is not None:
        raise ValueError('Only one of those two parameters can be set : train_split, test_split.')

    if train_split is not None:
        train_set, test_set = ts.split_after(train_split)
        return train_set, test_set

    if test_split is None:
        test_split = 0.25

    if isinstance(test_split, pd.Timestamp):
        # `1 - Timestamp` is a TypeError; split so that the test set starts
        # at the given time stamp.
        train_set, test_set = ts.split_before(test_split)
    elif isinstance(test_split, (int, np.integer)):
        # `1 - n` is not a meaningful index; keep the last n points for test.
        if not 0 < test_split < len(ts):
            raise ValueError('An integer test_split must be between 1 and len(ts) - 1.')
        train_set, test_set = ts.split_before(len(ts) - test_split)
    else:
        # Fraction of the series: the train set gets the complementary share.
        train_set, test_set = ts.split_after(1 - test_split)

    return train_set, test_set

In [18]:
def split_by_n(ts, n, drop_last=True):
    """
    Cut a time series into consecutive, non-overlapping chunks of ``n`` points.

    :param ts: TimeSeries to cut; its data is read through ``ts.pd_dataframe()``
    :param n: number of points per chunk, must be positive
    :param drop_last: if True, discard a trailing chunk that has fewer than
        ``n`` points
    :return: list of ``on.TimeSeries`` in the row order of the original data;
        empty if no chunk remains
    :raises ValueError: if ``n`` is not positive
    """
    if n <= 0:
        raise ValueError('n must be a positive integer.')

    # Get DataFrame
    df = ts.pd_dataframe()

    # Slice the DataFrame into consecutive chunks of at most n rows
    splits_df = [df.iloc[start:start + n] for start in range(0, len(df), n)]

    # Only the last chunk can be shorter than n. Comparing against n (rather
    # than against the previous chunk) also works when there are fewer than
    # two chunks.
    if drop_last and splits_df and len(splits_df[-1]) != n:
        splits_df.pop()

    # Change the data structure from DataFrame to TimeSeries
    return list(map(on.TimeSeries.from_dataframe, splits_df))
    

In [19]:
def split_inputs_from_targets(ts_list, input_len, target_len):
    """
    Split every series of a list into an input part and a target part.

    For each series the first ``input_len`` points become the input and the
    last ``target_len`` points become the target. If a series is longer than
    ``input_len + target_len``, the points in between end up in neither part.

    :param ts_list: iterable of TimeSeries, each read through ``pd_dataframe()``
    :param input_len: number of leading points used as input, >= 0
    :param target_len: number of trailing points used as target, >= 0
    :return: tuple ``(input_ts_list, target_ts_list)`` of ``on.TimeSeries`` lists
    :raises ValueError: if a length is negative or a series is shorter than
        ``input_len + target_len``
    """
    if input_len < 0 or target_len < 0:
        raise ValueError('input_len and target_len must not be negative')

    input_series_list = []
    target_series_list = []

    for ts in ts_list:
        df = ts.pd_dataframe()

        # The series must be large enough to hold both parts
        if len(df) < input_len + target_len:
            raise ValueError('input_len + target_len is longer than the total length of the DataFrame')

        # Get the first input_len items
        input_series_list.append(df.iloc[:input_len])

        # Get the last target_len items. An explicit start index is used
        # because `df.iloc[-target_len:]` selects the whole frame when
        # target_len is 0.
        target_series_list.append(df.iloc[len(df) - target_len:])

    input_ts_list = list(map(on.TimeSeries.from_dataframe, input_series_list))
    target_ts_list = list(map(on.TimeSeries.from_dataframe, target_series_list))

    return input_ts_list, target_ts_list

In [20]:
def to_numpy(ts_list):
    """
    Stack the values of several time series into a single NumPy array.

    :param ts_list: iterable of TimeSeries, each read through ``pd_dataframe()``
    :return: ``np.array`` built from the per-series value arrays, in input order
    """
    frames = (series.pd_dataframe() for series in ts_list)
    return np.array([frame.to_numpy() for frame in frames])

### Test with common functions

In [10]:
# Scale the series with normalize's default type ('minmax').
# NOTE(review): the scaler is fitted on the full series, i.e. also on the part
# that is later assigned to the test split. Fit on the train split only if
# test-set leakage matters for the experiment.
# NOTE(review): the warning lines printed below presumably come from columns
# that contain only NaN values; confirm against the dataset.
ts_t = normalize(ts)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


In [11]:
# Split the normalized series; train_split=0.8 is forwarded to ts.split_after.
train, test = train_test_split(ts_t, train_split=0.8)

In [12]:
# Cut both splits into consecutive, non-overlapping windows of 6 points.
# With the default drop_last=True a shorter trailing window is discarded.
train_list = split_by_n(train, 6)
test_list = split_by_n(test, 6)

In [13]:
# Within each 6-point window, the first 4 points are the input (X) and the
# last 2 points are the target (y).
X_train, y_train = split_inputs_from_targets(train_list, 4, 2)
X_test, y_test = split_inputs_from_targets(test_list, 4, 2)

In [14]:
# Convert the four lists of windows to NumPy arrays. The names are rebound
# from lists of series to arrays, so re-running this cell on the already
# converted arrays is expected to fail.
X_train, y_train, X_test, y_test = map(
    to_numpy, (X_train, y_train, X_test, y_test)
)

In [23]:
# Show the shape of each array, one per line, in the order
# X_train, y_train, X_test, y_test.
print(*(arr.shape for arr in (X_train, y_train, X_test, y_test)), sep='\n')

(4675, 4, 28)
(4675, 2, 28)
(1168, 4, 28)
(1168, 2, 28)
