# Pipeline

This section encapsulates the chosen model, ARIMA, in a class named `Pipeline`. The class takes as arguments the forecast horizon length (12 by default), the path to the training dataset, whether the generated predictions should be saved to a CSV file, and the path of that output file. Its `run` method executes the pipeline and returns the predictions together with the fitted model.

In [13]:
import pandas as pd
import numpy as np
from sktime.transformations.series.boxcox import LogTransformer
from sktime.forecasting.compose import TransformedTargetForecaster
from sktime.forecasting.arima import ARIMA

# Silence all warnings for the rest of the session: filterwarnings("ignore")
# installs a blanket "ignore" filter, so warnings raised by any library
# (not only this file) are hidden as well.
import warnings
warnings.filterwarnings("ignore")


class Pipeline:
    """
    Monthly forecasting pipeline: load the training series, fit a
    log-transformed seasonal ARIMA, forecast ``fh`` months ahead and
    optionally export the forecast to CSV.
    """

    def __init__(self, fh:int=12, training_data_path:str="../data/processed_train.csv", export_test:bool=True, test_data_path:str="../data/test.csv"):
        """
        Parameters
        ----------
        fh : int
            Forecast horizon, i.e. number of monthly steps to predict
        training_data_path : str
            Path to the training data (CSV, dates in the first column)
        export_test : bool
            If True, ``run`` writes the forecast to ``test_data_path``
        test_data_path : str
            Path of the CSV file the forecast is written to
        """
        self.fh = fh
        self.training_data_path = training_data_path
        self.export_test = export_test
        self.test_data_path = test_data_path

    def read_and_process_training_data(self, training_data_path="../data/processed_train.csv") -> pd.DataFrame:
        """
        Read the training CSV and return it as a gap-free monthly series.

        The first CSV column is parsed as dates and used as the index. Rows
        containing NaN are dropped, the index is forced onto a month-start
        grid, and months missing from the file are filled with 0.

        If the path does not contain the substring "processed", every 2020
        observation is replaced by the mean of itself and the observation of
        the same calendar month in 2019. 2020 months that have no 2019
        counterpart are left unchanged, and the step is skipped entirely when
        either year is absent.

        Parameters
        ----------
        training_data_path : str or path-like
            Path to the training data

        Returns
        -------
        y : pd.DataFrame
            Time series data, indexed by a monthly ``pd.PeriodIndex``, with
            the same column(s) as the input file
        """
        ts = pd.read_csv(
            training_data_path,  # data path
            index_col=[0],  # index column
            parse_dates=[0],  # parse date column
        )

        # Get training data
        train_ts = ts.dropna()

        # Enforce a regular month-start index; months absent from the file are
        # inserted and filled with 0.
        # NOTE(review): fit_predict_model applies a log transform to this
        # series and log(0) is -inf -- confirm gaps cannot occur in practice
        # or pick a different fill strategy.
        train_ts = train_ts.asfreq("MS").fillna(0)

        # str() so that pathlib.Path inputs work with the substring check
        if "processed" not in str(training_data_path):
            years = train_ts.index.year
            train_ts_2020 = train_ts[years == 2020]
            train_ts_2019 = train_ts[years == 2019]

            if len(train_ts_2020) and len(train_ts_2019):
                # Line 2019 up with 2020 by calendar month instead of by
                # position, so a partial year no longer breaks the averaging
                same_month_2019 = train_ts_2019.copy()
                same_month_2019.index = train_ts_2019.index.month
                same_month_2019 = same_month_2019.reindex(train_ts_2020.index.month)
                same_month_2019.index = train_ts_2020.index

                # Column labels are inherited from the data (no hard-coded
                # 'y'); months lacking a 2019 value keep their 2020 value
                mean_2020_2019 = ((train_ts_2020 + same_month_2019) / 2).fillna(train_ts_2020)

                # Swap the raw 2020 rows for the averaged ones
                train_ts = pd.concat([train_ts[years != 2020], mean_2020_2019]).sort_index()

        y = train_ts.copy()  # copy to avoid any changes to original data
        y.index = pd.PeriodIndex(y.index, freq="M")  # required frequency from sktime

        return y

    def fit_predict_model(self, train_data, fh=12):
        """
        Fit the forecaster on ``train_data`` and predict ``fh`` steps ahead.

        The forecaster is a ``TransformedTargetForecaster`` chaining a
        ``LogTransformer`` with ``ARIMA(order=(0, 1, 1),
        seasonal_order=(0, 1, 0, 12))``.

        Parameters
        ----------
        train_data : pd.DataFrame or pd.Series
            Training series, as returned by ``read_and_process_training_data``
        fh : int
            Number of steps to forecast; steps 1..fh are predicted

        Returns
        -------
        y_pred
            Forecast for steps 1..fh
        model
            The fitted forecaster

        Raises
        ------
        ValueError
            If ``fh`` is smaller than 1
        """
        # Fail fast instead of fitting a model and then predicting nothing
        if fh < 1:
            raise ValueError(f"fh must be a positive integer, got {fh!r}")

        forecaster = TransformedTargetForecaster(
            [
                ("log", LogTransformer()),
                (
                    "forecast",
                    ARIMA(order=(0, 1, 1), seasonal_order=(0, 1, 0, 12)),
                )
            ]
        )

        model = forecaster.fit(train_data)
        y_pred = model.predict(fh=np.arange(1, fh+1))

        return y_pred, model

    def run(self):
        """
        Execute the whole pipeline with the settings given to ``__init__``.

        Returns
        -------
        y_pred
            Forecast whose index has been converted to "01.%m.%y" strings
        model
            The fitted forecaster
        """
        # Read and process training data
        y = self.read_and_process_training_data(self.training_data_path)

        # Fit and predict model
        y_pred, model = self.fit_predict_model(y, self.fh)

        # Format index "01.%m.%y"
        y_pred.index = y_pred.index.strftime("01.%m.%y")

        # Export test data; the index already holds formatted strings, so no
        # date_format is needed when writing
        if self.export_test:
            y_pred.to_csv(self.test_data_path)

        return y_pred, model

    
# Smoke test: run the pipeline end to end with its default settings and show the forecast
if __name__ == "__main__":
    pipeline = Pipeline(
        fh=12,
        training_data_path="../data/processed_train.csv",
        export_test=True,
        test_data_path="../data/test.csv",
    )
    y_pred, model = pipeline.run()
    print(y_pred)

                 y
01.03.21  2.550081
01.04.21  2.393647
01.05.21  2.306598
01.06.21  2.936396
01.07.21  2.428073
01.08.21  2.129764
01.09.21  2.515241
01.10.21  2.118683
01.11.21  2.045107
01.12.21  1.651797
01.01.22  1.975611
01.02.22  2.655872
