In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
from LagsCreator import LagsCreator

In [3]:
# Toy data set: three series ("a", "b", "c"), each holding 20 labelled observations
# ("a1" ... "a20"), laid out as one row per series.
data = np.array([["%s%d" % (prefix, step) for step in range(1, 21)] for prefix in "abc"])

In [4]:
# One row per day, one column per series; the columns carry two levels (group, time-series).
df = pd.DataFrame(
    data.T,
    index = pd.date_range("2018-08-22", periods = 20),
    columns = pd.MultiIndex.from_product([["Group 1"], ["A", "B", "C"]], names = ["Group", "Time-series"]),
)

In [5]:
# Display the toy dataframe: a daily index and the series A, B, C under the single group "Group 1".
df

Group,Group 1,Group 1,Group 1
Time-series,A,B,C
2018-08-22,a1,b1,c1
2018-08-23,a2,b2,c2
2018-08-24,a3,b3,c3
2018-08-25,a4,b4,c4
2018-08-26,a5,b5,c5
2018-08-27,a6,b6,c6
2018-08-28,a7,b7,c7
2018-08-29,a8,b8,c8
2018-08-30,a9,b9,c9
2018-08-31,a10,b10,c10


In [6]:
# Number of lags requested for each time-series (keys are the series names).
# A lag of 0 marks the series as a static feature.
lags_dict = {"A": 3, "B": 4, "C": 0}

In [24]:
from rolling_window import rolling_window
import numpy as np
import pandas as pd

# Python module.
#
#
# Pietro Foini
#
# Year: 2020

class LagsCreator:
    """LagsCreator
    
    This module allows to create training/validation/test lag-features for time-series forecasting. It supports several 
    configurations to get the output into several formats. The starting point for using this module is to have
    a dataframe with two levels on axis 1: the level 0 corresponding to the main group and the level 1 corresponding 
    to the time-series. An advantage of this module is that it is possible to visualize the samples created during the 
    process through an highlighting of the cells of the dataframe.
    
    The windows are built with the 'rolling_window' function imported at the top of the file.
    
    """
    def __init__(self, group, lags_dictionary, target):
        """
        ***Initialization function***
 
        Initialization of the LagsCreator class.
        
        Parameters
        ----------
        group: a pandas dataframe with hierarchical multi-index on axis 1 (more precisely two levels) where the time-series are stored. 
           The dataframe must have as index a single pandas datetime column with an appropriate frequency set. 
        lags_dictionary: a python dictionary containing the lag values corresponding to each time-series (the names of the time-series 
           will be the keys of the dictionary). A lag value of 0 marks the time-series as a static feature.
        target: a python string containing the name of the time-series that you want to predict. The target variable must always be 
           present also into the 'lags_dictionary' parameter.
           
        Raises
        ----------
        ValueError: if 'target' is not a key of 'lags_dictionary'.
           
        """
        # Define the name of the group.
        group_name = group.columns.get_level_values(0).unique()
        # Remove level 0 from the dataframe.
        group = group.droplevel(level = 0, axis = 1)
        
        # Adjust the 'lags_dictionary' parameter.
        if target not in lags_dictionary.keys():
            raise ValueError("The target feature must be always included in the 'lags_dictionary' parameter.")
        # The features that are not specified into 'lags_dictionary' are removed.
        features_to_remove = list(set(list(group.columns)) - set(list(lags_dictionary.keys())))
        group = group.drop(columns = features_to_remove)
        # Define the features (the names of the time-series).
        features = group.columns
        # Define static features (features, i.e. time-series, with lag value set to 0). They are taken from the columns 
        # actually present in the dataframe, so a dictionary key without a matching column cannot break the later steps.
        static_features = [feature for feature in features if lags_dictionary[feature] == 0]
        
        # Define the boolean mask for the creation of feature-lags for each time-series.
        # The reference size of the window (timestep dimension) is the largest lag requested.
        window_size = max(lags_dictionary.values())
        # Create mask based on lags into 'lags_dictionary' over the input samples.
        mask = np.full(shape = (window_size, len(features)), fill_value = False)    
        for i, feature in enumerate(features):
            lags = lags_dictionary[feature]
            # NOTE: for a static feature (lags = 0) the slice '-0:' selects the whole column, so the full window is kept 
            # here; 'to_dataframe' later restricts static features to the last timestep only.
            mask[-lags:, i] = True

        # Define some attributes of the class.
        self.group_name = group_name
        self.group = group
        self.features = features
        self.static_features = static_features
        self.target = target
        self.window_size = window_size
        self.mask = mask
        # Pristine copy of the mask: 'to_dataframe' edits 'self.mask' in place, so 'to_supervised' restores it from here
        # on every call in order to make repeated calls on the same instance independent of each other.
        self._lags_mask = mask.copy()
    
    def to_dataframe(self, X, y):
        """
        ***Sub-function***
 
        This function allows to convert the outputs (input and output samples) into dataframes format.
        It reads the attributes set by 'to_supervised' ('single_step', 'h', 'n_out', 'feature_time') and, when static 
        features exist, it edits 'self.mask' (and 'self.X', if defined) in place so that only the last timestep of the 
        static features is kept.
        
        Parameters
        ----------
        X: the input samples with the column of temporal information of shape (n_samples, timesteps, 1 + n_features), or None.
        y: the output samples with the row of temporal information of shape (n_samples, 2, n_out), or None.
        
        Return
        ----------
        X: the dataframe of the input samples (None if the input 'X' is None).
        y: the dataframe of the output samples (None if the input 'y' is None).

        """
        # Create dataframe of output samples.
        dates = None
        if y is not None:
            # Consider the temporal information.
            dates = y[:, 0, :].flatten()
            # Not consider temporal information.
            values = y[:, 1, :]
            # Create columns values.
            if self.single_step:
                horizons = ["x(t+%d)" % self.h]
            else:
                horizons = ["x(t+%d)" % (i+1) for i in range(self.n_out)]
            # Create multi-index columns.
            columns = pd.MultiIndex.from_product([[self.target], horizons], names = ["Features", "Prediction horizon"])  
            y = pd.DataFrame(values, columns = columns)
            
        if X is None:
            return None, y
        
        # Create dataframe of input samples.
        # Not consider temporal information for the input samples.
        X = X[:, :, 1:]
        # Flatten the lags of each sample over the rows: all the lags of the first feature, then all the lags of the 
        # second one and so on (this also works when there are no samples at all).
        n_samples, n_lags, n_features = X.shape
        X = X.transpose(0, 2, 1).reshape(n_samples, n_features*n_lags)
        # Create columns values.
        lags = ["x(t)" if i == 1 else "x(t-%d)" % (i-1) for i in range(self.window_size, 0, -1)]
        # Create multi-index columns.
        columns = pd.MultiIndex.from_product([self.features, lags], names = ["Features", "Lags"])
        # Create dataframe of input samples; the lags masked out for every sample are entirely nan and are removed.
        X = pd.DataFrame(X, columns = columns).dropna(axis = 1, how = "all")
        
        # Adjust features for static features.
        if len(self.static_features) > 0:
            for feature in self.static_features:
                # Keep only the last timestep of the static feature. The columns are selected explicitly instead of 
                # assigning a narrower sub-frame to 'X[feature]', an assignment that relies on an implicit reindexing.
                keep = [not (name == feature and lag != "x(t)") for (name, lag) in X.columns]
                X = X.loc[:, keep].copy()
                # Replace names for static features.
                X.columns = pd.MultiIndex.from_tuples([(feature, "x") if column == (feature, "x(t)") else column for column in X.columns], 
                                                      names = X.columns.names) 
                # Change the mask regarding the lags to show for the static features.
                self.mask[:-1, self.features.get_loc(feature)] = False
            # Change the stored training samples accordingly (if they have already been created).
            stored = getattr(self, "X", None)
            if stored is not None:
                mask = np.tile(self.mask, (stored.shape[0], 1, 1))
                stored[:, :, 1:] = np.ma.masked_array(stored[:, :, 1:], mask = ~mask, fill_value = 0).filled(np.nan)
        X = X.dropna(axis = 1, how = "all")

        # Add the temporal information to the input samples.
        if self.feature_time:
            columns = pd.MultiIndex.from_tuples([("Day", "x"), ("Month", "x"), ("Year", "x")], names = ["Features", "Lags"])
            if dates is not None:
                days = [date.day for date in dates]
                months = [date.month for date in dates]
                years = [date.year for date in dates]
                # Create feature time: the date of the value to predict.
                dates = pd.DataFrame(np.stack([days, months, years], axis = 1), columns = columns)
            else:
                # For the test sample. The value to predict lies 'h' steps after the last available date, exactly like
                # the training rows that carry the date of their own target.
                date = self.group.index[-1] + self.h*self.group.index.freq
                dates = pd.DataFrame([[date.day, date.month, date.year]], columns = columns)
            # Add to the dataframe.
            X = pd.concat([X, dates], axis = 1)

        return X, y
        
    def to_supervised(self, n_out, single_step = False, h = None, return_dataframe = False, validation = False, 
                      feature_time = False, return_single_level = False, dtype = object):
        """
        ***Main function***
 
        This function allows to create training/validation/test samples to use for time-series forecasting purposes.
        The main output format difference is determined by the 'return_dataframe' parameter. If set, the outputs are 
        rearranged into pandas dataframes otherwise the outputs are returned as numpy arrays.
        
        Parameters
        ----------
        n_out: the maximum forecasting horizon ahead in the future. If 'single_step' is set, the parameter 'n_out' indicates 
           the size of the validation set.
        single_step: if set, each prediction horizon is predicted independently of the others.
        h: the independent forecasting horizon to predict for the 'single_step' mode. If 'single_step = False', the 'h' parameter
           is not taken into account (it can be left to None).
        return_dataframe: the modality to set in order to have the outputs returned as pandas dataframes.
        validation: if you want to create validation points.
        feature_time: if you want to create a feature time to add as feature in the input samples. This parameter can be used 
           only if the 'single_step' and 'return_dataframe' modes are set.
        return_single_level: if 'return_dataframe' is set, this parameter allows to have as output dataframes with a single level on
           axis 1 merging column names levels.
        dtype: the type which the outputs are cast to.
           
        Return
        ----------
        X_train: the training input samples.
        y_train: the training output samples.
        X_val: the validation input samples (None if 'validation = False').
        y_val: the validation output samples (None if 'validation = False').
        X_test: the test input sample.
        
        Raises
        ----------
        ValueError: if the parameters are not consistent with each other.
    
        """
        # Check parameters.
        if single_step and h is None:
            raise ValueError("If 'single_step' is set, you must provide a value for the 'h' parameter.")
        # The comparison is skipped when 'h' is not provided (comparing None with an integer is an error).
        if h is not None and h > n_out:
            raise ValueError("The 'h' parameter must be not greater than 'n_out' parameter.")      
        if feature_time and (not single_step or not return_dataframe):
            raise ValueError("You can use the 'feature_time' only if you are working in the 'single_step' and 'return_dataframe' modes.")
    
        # Define attributes of the class.
        self.n_out = n_out
        self.single_step = single_step
        self.h = h
        self.return_dataframe = return_dataframe
        self.validation = validation
        self.feature_time = feature_time
        # Start from the mask defined by the lags: a previous call could have modified it for the static features.
        self.mask = self._lags_mask.copy()

        # Rolling a no masked window over the dataframe based on the maximum value of the 'lags_dictionary'.
        # Create input samples.
        X = rolling_window(self.group.reset_index().values, self.window_size, axes = 0).swapaxes(1, 2)
        # Add the mask to the input samples based on lags.
        # Add the temporal information to the mask in order to always maintain the temporal information.
        temporal_mask = np.expand_dims(np.array([True]*self.window_size), 1)
        mask = np.concatenate([temporal_mask, self.mask], axis = 1)
        # Expand the mask to all the samples.
        mask = np.tile(mask, (X.shape[0], 1, 1))
        # Define input samples with defined lags (with also temporal information).
        X = np.ma.masked_array(X, mask = ~mask, fill_value = 0).filled(np.nan)
        
        # Create output samples.
        if single_step:
            y = rolling_window(self.group[self.target].reset_index().values[self.window_size + h-1:], 1, axes = 0)
        else:
            y = rolling_window(self.group[self.target].reset_index().values[self.window_size:], n_out, axes = 0)
        
        # Splitting of the input X samples and the output y samples into training/validation/test.
        # Define the test sample input.
        X_test = X[-1:]
        # Define the training and validation samples input and outputs.
        if validation:
            if single_step:
                y_val = y[-n_out:]
                X_val = X[-(n_out+h):][:n_out]
                y = y[:-n_out]
                X = X[:-(n_out+h)]
            else:
                y_val = y[-1:]
                X_val = X[:y.shape[0]][-1:]
                y = y[:-n_out]
                X = X[:-2*n_out]
        else:
            X = X[:y.shape[0]]
            X_val, y_val = None, None
            
        # Samples arrays created until here with also temporal information: X, y, X_val, y_val, X_test.    
        self.X = X
        self.y = y
        self.X_val = X_val # It could be None if 'validation = False'.
        self.y_val = y_val # It could be None if 'validation = False'.
        self.X_test = X_test
        
        # In this last phase, the output format is changed if desired.
        if return_dataframe:
            # Define input and output samples training dataframes.
            X_train, y_train = self.to_dataframe(X, y)
            # Define input and output samples validation dataframes (both None if 'validation = False').
            X_val, y_val = self.to_dataframe(X_val, y_val)
            # Define input samples test dataframes.
            X_test, _ = self.to_dataframe(X_test, None)
            if return_single_level:
                for frame in (X_train, y_train, X_val, y_val, X_test):
                    if frame is not None:
                        frame.columns = frame.columns.map(lambda x: " | ".join([str(i) for i in x]))
        else:
            # Define input samples training arrays removing the temporal information.
            X_train = X[:, :, 1:]
            # Define output samples training arrays removing the temporal information.
            y_train = y[:, 1, :]
            if validation:
                # Define input samples validation arrays removing the temporal information.
                X_val = X_val[:, :, 1:]
                # Define output samples validation arrays removing the temporal information.
                y_val = y_val[:, 1, :] 
            else:
                X_val, y_val = None, None
            # Define input samples test arrays removing the temporal information.
            X_test = X_test[:, :, 1:]
            
        # The validation outputs are None if 'validation = False': they are returned as they are.
        outputs = (X_train, y_train, X_val, y_val, X_test)
        return tuple(None if output is None else output.astype(dtype) for output in outputs)
    
    def highlight_cells(self, x, y):
        """
        ***Sub-function***
 
        This function draws the cells of the dataframe that belong to the lag features for the current input sample x and
        output sample y.
        
        Parameters
        ----------
        x: a single input sample with the column of temporal information of shape (timesteps, 1 + n_features).
        y: a single output sample with the row of temporal information of shape (2, n_out), or None for the test sample.
        
        Return
        ----------
        sample: a pandas Styler of the dataframe with the cells of the current sample highlighted.

        """
        # The dates of the rows covered by the current window.
        dates = x[:, 0]
        # Getting (index, column) pairs for the True elements of the mask (row by row).
        cells_to_color_input = [(dates[row], self.features[col]) for row, col in zip(*np.nonzero(self.mask))]
        # Getting (index, column) pairs for the False elements of the mask: they are only shown for the arrays output.
        cells_to_color_input_extra = []
        if not self.return_dataframe:
            cells_to_color_input_extra = [(dates[row], self.features[col]) for row, col in zip(*np.nonzero(~self.mask))]
        # The dates of the values to predict.
        cells_to_color_output = y[0] if y is not None else []
            
        def draw(_):
            # The function must return a dataframe of CSS strings: the cells without any style stay empty.
            df_styler = pd.DataFrame("", index = self.group.index, columns = self.group.columns)
            if self.validation:
                # The last 'n_out' rows are selected by position (the index is made of dates, not of integers).
                df_styler.iloc[-self.n_out:, df_styler.columns.get_loc(self.target)] = "color: red"
            # Set particular cell colors for the input.
            for location in cells_to_color_input:
                df_styler.loc[location[0], location[1]] = "background-color: RGB(0,131,255)"
            # If the return dataframe is not set, the extra cells of the input filled with nan values are opaque colored.
            for location in cells_to_color_input_extra:
                df_styler.loc[location[0], location[1]] = "background-color: RGBA(0,131,255,0.44)"
            # Set particular cell colors for the output.
            for location in cells_to_color_output:
                df_styler.loc[location, self.target] = "background-color: RGB(255,154,0)"
            return df_styler 
        
        # Highlight the following sample into the dataframe.
        sample = self.group.style.apply(draw, axis = None)
        return sample
        
    def visualization(self, boundaries = True):
        """
        ***Sub-function***
 
        This function allows to visualize the training/validation/test input and output samples created by the process.
        The samples stored by 'to_supervised' are left untouched, so this function can be called several times.
        
        Parameters
        ----------
        boundaries: if you want to visualize only the first two and the last two sample points created. If less than four 
           samples exist, each of them is shown once.
        
        Return
        ----------
        train_dataframes: a list of dataframes that underline each training sample created.
        validation_dataframes: a list of dataframes that underline each validation sample created (None if no validation 
           samples exist).
        test_dataframes: a list of dataframes that underline each test sample created.

        """
        def positions(n_samples):
            # The positions of the samples to draw, in increasing order and without repetitions.
            every = range(n_samples)
            if not boundaries:
                return list(every)
            return sorted(set(every[:2]) | set(every[-2:]))
        
        def draw_all(X, y):
            # The input and the output samples are paired by position.
            return [self.highlight_cells(X[i], y[i]) for i in positions(min(len(X), len(y)))]
        
        # Create dataframes for visualization.
        train_dataframes = draw_all(self.X, self.y)
        if self.X_val is not None:
            validation_dataframes = draw_all(self.X_val, self.y_val)
        else:
            validation_dataframes = None
        test_dataframes = [self.highlight_cells(x, None) for x in self.X_test]
            
        return train_dataframes, validation_dataframes, test_dataframes
    

In [25]:
# Build the lag-feature creator on the toy dataframe, with series "A" as the prediction target.
creator = LagsCreator(group = df, lags_dictionary = lags_dict, target = "A")

In [28]:
# Multi-step mode (all 4 horizons predicted together) with a validation sample,
# returned as dataframes whose two column levels are merged into a single one.
X_train, y_train, X_val, y_val, X_test = creator.to_supervised(
    n_out = 4,
    single_step = False,
    h = 1,
    return_dataframe = True,
    validation = True,
    feature_time = False,
    return_single_level = True,
    dtype = object,
)

In [29]:
# Training inputs: one row per sample, one column per retained lag of each series
# (the static series "C" contributes a single column).
X_train

Unnamed: 0,A | x(t-2),A | x(t-1),A | x(t),B | x(t-3),B | x(t-2),B | x(t-1),B | x(t),C | x
0,a2,a3,a4,b1,b2,b3,b4,c4
1,a3,a4,a5,b2,b3,b4,b5,c5
2,a4,a5,a6,b3,b4,b5,b6,c6
3,a5,a6,a7,b4,b5,b6,b7,c7
4,a6,a7,a8,b5,b6,b7,b8,c8
5,a7,a8,a9,b6,b7,b8,b9,c9
6,a8,a9,a10,b7,b8,b9,b10,c10
7,a9,a10,a11,b8,b9,b10,b11,c11
8,a10,a11,a12,b9,b10,b11,b12,c12


In [10]:
# Training targets: the next 4 values of series "A" for each training sample.
y_train

Unnamed: 0,A | x(t+1),A | x(t+2),A | x(t+3),A | x(t+4)
0,a5,a6,a7,a8
1,a6,a7,a8,a9
2,a7,a8,a9,a10
3,a8,a9,a10,a11
4,a9,a10,a11,a12
5,a10,a11,a12,a13
6,a11,a12,a13,a14
7,a12,a13,a14,a15
8,a13,a14,a15,a16


In [11]:
# Test input: the single sample built from the last window of the dataframe.
X_test

Unnamed: 0,A | x(t-2),A | x(t-1),A | x(t),B | x(t-3),B | x(t-2),B | x(t-1),B | x(t),C | x
0,a18,a19,a20,b17,b18,b19,b20,c20


In [12]:
# Styled dataframes highlighting the cells used by each sample; with boundaries = True only the
# first two and the last two samples of each set are kept.
train, val, test = creator.visualization(boundaries = True)

In [13]:
# Number of training samples kept for visualization.
len(train)

4

In [14]:
# With the boundary selection, position 2 is the second-to-last training sample.
train[2]

Time-series,A,B,C
2018-08-22 00:00:00,a1,b1,c1
2018-08-23 00:00:00,a2,b2,c2
2018-08-24 00:00:00,a3,b3,c3
2018-08-25 00:00:00,a4,b4,c4
2018-08-26 00:00:00,a5,b5,c5
2018-08-27 00:00:00,a6,b6,c6
2018-08-28 00:00:00,a7,b7,c7
2018-08-29 00:00:00,a8,b8,c8
2018-08-30 00:00:00,a9,b9,c9
2018-08-31 00:00:00,a10,b10,c10


In [15]:
# Last validation sample.
val[-1]

Time-series,A,B,C
2018-08-22 00:00:00,a1,b1,c1
2018-08-23 00:00:00,a2,b2,c2
2018-08-24 00:00:00,a3,b3,c3
2018-08-25 00:00:00,a4,b4,c4
2018-08-26 00:00:00,a5,b5,c5
2018-08-27 00:00:00,a6,b6,c6
2018-08-28 00:00:00,a7,b7,c7
2018-08-29 00:00:00,a8,b8,c8
2018-08-30 00:00:00,a9,b9,c9
2018-08-31 00:00:00,a10,b10,c10


In [16]:
# The test sample (no output cells are highlighted since its target is unknown).
test[0]

Time-series,A,B,C
2018-08-22 00:00:00,a1,b1,c1
2018-08-23 00:00:00,a2,b2,c2
2018-08-24 00:00:00,a3,b3,c3
2018-08-25 00:00:00,a4,b4,c4
2018-08-26 00:00:00,a5,b5,c5
2018-08-27 00:00:00,a6,b6,c6
2018-08-28 00:00:00,a7,b7,c7
2018-08-29 00:00:00,a8,b8,c8
2018-08-30 00:00:00,a9,b9,c9
2018-08-31 00:00:00,a10,b10,c10
