# Baseline

In [2]:
from random import sample
from typing import Any, Dict, List, Union, Optional
import numpy as np
import pandas as pd
from itertools import product
from statsmodels.tsa.stattools import adfuller
from time_series_kedro.extras.utils import parallel_groupby

import logging
logger = logging.getLogger(__name__)

def compute_seg_metrics(
    data: pd.DataFrame,
    serie_id: Union[List[str], str],
    serie_target: str,
    serie_freq: str,
    n_jobs: int
):
    """
    Node that computes, for every serie, the metrics used to assess its quality.

    The per-serie work is done by ``_seg_metrics``; the groups are dispatched
    through ``parallel_groupby``.

    Args:
        data: Dataframe with time series.
        serie_id: Column or list of columns that identify series. Not read by
            this function: the grouping key is the ``serie_id`` column.
        serie_target: Target column name.
        serie_freq: Serie frequency.
        n_jobs: Forwarded to ``parallel_groupby``.
    Returns:
        Dataframe with metrics computed to each serie (index reset).
    """
    def _metrics_by_group(data, group_cols, **kwargs):
        # Parameter names are kept as in the callback contract (data, group_cols).
        def _one_serie(serie_data):
            return _seg_metrics(serie_data, **kwargs)

        grouped = data.groupby(group_cols)
        return grouped.apply(_one_serie)

    metrics_by_serie = parallel_groupby(
        data,
        _metrics_by_group,
        ["serie_id",],
        n_jobs=n_jobs,
        serie_target=serie_target,
        serie_freq=serie_freq,
    )
    return metrics_by_serie.reset_index()

def _seg_metrics(
    data: pd.DataFrame,
    serie_target: str,
    serie_freq: str
) -> pd.Series:
    """
    This function compute metrics (Sample Entropy, Coefficient of variation, 
    Serie size, Amount accumulated in the last cycle).

    Args:
        data: Dataframe with time serie.
        serie_target: Target column name.
        serie_freq: Serie frequency.
    Returns:
        Serie metrics.
    """
    ts = data[serie_target].values
    nonzeros = np.nonzero(ts)
    if nonzeros[0].shape[0]:
        first_point = nonzeros[0][0]
        last_point = nonzeros[0][-1]
        len_ts = (last_point - first_point) + 1
        ts = ts[first_point:]
        sample_entropy = _sample_entropy(ts, m=2, r=0.2*np.std(ts)) 
        adf = adfuller(ts)[0]
    else:
        len_ts = 1
        sample_entropy = np.nan
        adf = np.nan
    
    
    mean = ts.mean()
    if mean:
        cv = ts.std()/mean
    else:
        cv = np.nan

    if serie_freq == "D":
        last = 30
    elif serie_freq == "M" or serie_freq == "MS":
        last = 12
    elif serie_freq == "Y":
        last = 1
    elif serie_freq == "h":
        last = 24
    acc_12m = ts[-last:].sum()

    

    
    return pd.Series({
            "sample_entropy": sample_entropy, 
            "cv": cv, 
            "len_ts": len_ts, 
            "acc_12m": acc_12m,
            "adf":adf})

def time_series_segmentation(
    data: pd.DataFrame,
    seg_metrics: pd.DataFrame, 
    serie_id: Union[List[str], str],
    group_divisions: Dict[str, Any],
    sampling: Optional[int] = None,
    random_state: int = 42):
    """
    This node segments the series based on a set of conditions that
    have been defined for the metrics.

    For every metric a threshold is computed by calling the configured
    Series method on the metric column. Each combination of "greater than" /
    "less or equal" across the metrics defines one group, numbered from 1 in
    the order produced by ``itertools.product(["gt", "le"], ...)``. Series
    that match no combination (e.g. a NaN metric) keep group ``0``.

    Args:
        data: Dataframe with time series. Must contain a ``serie_id`` column.
        seg_metrics: Dataframe with metrics computed to each serie. Must
            contain a ``serie_id`` column. It is not modified.
        serie_id: Column or list of columns that identify series. Not read by
            this function: the merge key is the ``serie_id`` column.
        group_divisions: Mapping ``metric -> {"method": str, "args": list}``;
            ``method`` is the name of a Series method and ``args`` its
            positional arguments.
        sampling: If truthy, maximum number of series randomly kept per group.
        random_state: Seed passed to ``np.random.seed`` before sampling.
    Returns:
        Dataframe with segmentation groups in column ``group``.
    """
    metrics = list(group_divisions)

    # Work on a copy so the caller's metrics frame is left untouched.
    seg_metrics = seg_metrics.copy()
    seg_metrics["group"] = 0

    # Thresholds do not depend on the gt/le combination: compute them once.
    thresholds = {
        metric: getattr(seg_metrics[metric], group_divisions[metric]["method"])(
            *group_divisions[metric]["args"]
        )
        for metric in metrics
    }

    for i, group in enumerate(product(["gt", "le"], repeat=len(metrics))):
        # Start from an all-True mask so that an empty ``group_divisions``
        # still yields a valid boolean indexer.
        series_filter = pd.Series(True, index=seg_metrics.index)
        for comp, metric in zip(group, metrics):
            comp_filter = getattr(seg_metrics[metric], comp)(thresholds[metric])
            series_filter = series_filter & comp_filter
        seg_metrics.loc[series_filter, "group"] = i + 1

    seg_metrics = seg_metrics[["serie_id", "group"]]
    data = pd.merge(data, seg_metrics, on="serie_id")

    if sampling:
        # NOTE: this seeds NumPy's global random state.
        np.random.seed(random_state)
        sampled_parts = []
        for group in data.group.unique():
            group_data = data[data.group == group]
            n_series = min(sampling, group_data["serie_id"].nunique())
            chosen = np.random.choice(group_data["serie_id"].unique(), n_series, replace=False)
            sampled_parts.append(group_data[group_data["serie_id"].isin(chosen)])
        # Concatenate once instead of growing a frame inside the loop.
        if sampled_parts:
            data = pd.concat(sampled_parts, ignore_index=True)
        else:
            data = data.iloc[0:0]
        logger.info(f"# Series after sampling: {data['serie_id'].nunique()}")
    return data





def _sample_entropy(
    L: np.array,
    m: int,
    r: int
) -> int:
    """ 
    Calculates Sample Entropy for a given time series. Sample entropy (SampEn)
    is a modification of approximate entropy (ApEn), used for assessing the 
    complexity of time-series signals. For more details please refer to 
    https://www.mdpi.com/1099-4300/21/6/541.
    
    Args:
        L: array_like, time-series signal.
        m: int, embedding dimension.
        r: int, tolerance
    Returns:
        Sample entropy.
    """
    # Initialize parameters
    N = len(L)
    B = 0.0
    A = 0.0
    
    # Split time series and save all templates of length m
    xmi = np.array([L[i : i + m] for i in range(N - m)])
    xmj = np.array([L[i : i + m] for i in range(N - m + 1)])

    # Save all matches minus the self-match, compute B
    B = np.sum([np.sum(np.abs(xmii - xmj).max(axis=1) <= r) - 1 for xmii in xmi])

    # Similar for computing A
    m += 1
    xm = np.array([L[i : i + m] for i in range(N - m + 1)])

    A = np.sum([np.sum(np.abs(xmi - xm).max(axis=1) <= r) - 1 for xmi in xm])

    # Return SampEn
    return -np.log(A / B)

In [4]:
# Time the baseline node on the full dataset and keep its result in `seg_metrics`.
# NOTE(review): `data`, `serie_id`, `serie_target`, `serie_freq` and `n_jobs` are
# loaded by a cell that appears further down in this notebook.
%time seg_metrics = compute_seg_metrics(data, serie_id, serie_target, serie_freq, n_jobs)

CPU times: user 2.62 s, sys: 395 ms, total: 3.02 s
Wall time: 3min 15s


  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2


# Seg Metrics

In [3]:
# Load the prepared dataset and the pipeline parameters from the Kedro catalog.
# NOTE(review): `catalog` is not defined in the visible cells — presumably it is
# injected by the Kedro notebook session; confirm.
data = catalog.load("prepared_data")
serie_target = catalog.load("params:serie_target")
serie_id = catalog.load("params:series_level.columns")
serie_freq = catalog.load("params:serie_freq")
n_jobs = catalog.load("params:n_jobs")

2022-04-03 20:16:43,137 - kedro.io.data_catalog - INFO - Loading data from `prepared_data` (CSVDataSet)...
2022-04-03 20:16:45,910 - kedro.io.data_catalog - INFO - Loading data from `params:serie_target` (MemoryDataSet)...
2022-04-03 20:16:45,911 - kedro.io.data_catalog - INFO - Loading data from `params:series_level.columns` (MemoryDataSet)...
2022-04-03 20:16:45,912 - kedro.io.data_catalog - INFO - Loading data from `params:serie_freq` (MemoryDataSet)...
2022-04-03 20:16:45,913 - kedro.io.data_catalog - INFO - Loading data from `params:n_jobs` (MemoryDataSet)...


In [35]:
def _seg_metrics(
    data: pd.DataFrame,
    serie_target: str,
    serie_freq: str
) -> pd.Series:
    """
    This function compute metrics (Sample Entropy, Coefficient of variation, 
    Serie size, Amount accumulated in the last cycle).

    Args:
        data: Dataframe with time serie.
        serie_target: Target column name.
        serie_freq: Serie frequency.
    Returns:
        Serie metrics.
    """
    
    ts = data[serie_target].values
    """
    nonzeros = np.nonzero(ts)
    if nonzeros[0].shape[0]:
        first_point = nonzeros[0][0]
        last_point = nonzeros[0][-1]
        len_ts = (last_point - first_point) + 1
        ts = ts[first_point:]
        sample_entropy = _sample_entropy(ts, m=2, r=0.2*np.std(ts)) 
        adf = adfuller(ts)[0]
    else:
        len_ts = 1
        sample_entropy = np.nan
        adf = np.nan
    """
    mean = ts.mean()
    std = ts.std()
    if mean:
        cv = std/mean
    else:
        cv = np.nan

    if serie_freq == "D":
        last = 30
    elif serie_freq == "M" or serie_freq == "MS":
        last = 12
    elif serie_freq == "Y":
        last = 1
    elif serie_freq == "h":
        last = 24
    acc_12m = ts[-last:].sum()

    

    
    return pd.Series({
            "std_serie": std,
            "mean_serie": mean,
            "cv": cv, 
            "acc_12m": acc_12m,})

In [40]:
# Draw 500 distinct serie ids at random (no seed is set here, so the subset
# changes between runs) and keep only the rows of those series.
series = np.random.choice(data["serie_id"].unique(), 500, replace=False)
data_sample = data[data.serie_id.isin(series)]
# Preview 5 random rows of the sampled data.
data_sample.sample(5)

Unnamed: 0,serie_id,date,sales,dcoilwtico
1585288,"(4, 'BABY CARE')",2016-05-30,0.0,0.0
1040593,"(7, 'BOOKS')",2015-03-27,0.0,48.83
399547,"(2, 'HOME AND KITCHEN II')",2013-11-11,0.0,95.13
553062,"(27, 'HOME AND KITCHEN I')",2014-03-11,1.0,100.29
1214684,"(40, 'LAWN AND GARDEN')",2015-08-12,0.0,43.22


In [7]:
# Reshape the sample to wide format: one row per date, one column per serie,
# cells holding `sales`.
series_data = data_sample.pivot_table(columns="serie_id", values="sales", index="date")
# Underlying NumPy array of the wide table; this rebinds `series`, which the
# sampling cell uses for the array of sampled serie ids.
series = series_data.values

In [23]:
# Inspect the trailing `last` rows of the wide array, for every serie.
# NOTE(review): in the visible cells `last` is only assigned inside functions;
# presumably it was defined in an interactive cell that is no longer in the
# notebook — confirm before re-running.
series[-last:,:]

array([[ 12.      ,   0.      ,   9.      , ..., 558.      ,  11.      ,
        106.072   ],
       [ 23.      ,   0.      ,  15.      , ..., 455.      ,  11.      ,
        122.981   ],
       [ 16.      ,   0.      ,  12.      , ..., 391.      ,  16.      ,
        108.145004],
       ...,
       [ 14.      ,   0.      ,  35.      , ..., 343.      ,   5.      ,
        112.1     ],
       [ 11.      ,   0.      ,  11.      , ..., 355.      ,   6.      ,
        114.12    ],
       [ 14.      ,   0.      ,   8.      , ..., 373.      ,  10.      ,
        154.553   ]])

In [45]:
def compute_seg_metrics(
    data: pd.DataFrame,
    serie_target: str,
    serie_freq: str,
    date_col: str
) -> pd.DataFrame:
    """
    Vectorised computation of the lightweight metrics of every serie.

    The data is pivoted to a wide table (one row per ``date_col`` value, one
    column per ``serie_id``) and the metrics are computed column-wise.
    NOTE(review): (date, serie) pairs missing from ``data`` become NaN in the
    pivot and propagate to that serie's metrics — confirm this is acceptable
    for series of unequal length.

    Args:
        data: Dataframe with time series. Must contain a ``serie_id`` column.
        serie_target: Target column name.
        serie_freq: Serie frequency. One of ``"D"``, ``"M"``, ``"MS"``,
            ``"Y"`` or ``"h"``.
        date_col: Name of the date column, used as the pivot index.
    Returns:
        Dataframe indexed by ``serie_id`` with columns ``mean_serie``,
        ``std_serie``, ``cv`` and ``acc``.
    Raises:
        ValueError: If ``serie_freq`` is not a supported frequency.
    """
    # Number of trailing rows that make up the "last cycle".
    cycle_lengths = {"D": 30, "M": 12, "MS": 12, "Y": 1, "h": 24}
    if serie_freq not in cycle_lengths:
        # Explicit error instead of an unbound-variable failure further down.
        raise ValueError(
            f"Unsupported serie_freq {serie_freq!r}; "
            f"expected one of {sorted(cycle_lengths)}"
        )
    last = cycle_lengths[serie_freq]

    series_data = data.pivot_table(columns="serie_id", values=serie_target, index=date_col)
    series = series_data.values

    metrics = pd.DataFrame(index=series_data.columns)
    metrics["mean_serie"] = series.mean(axis=0)
    metrics["std_serie"] = series.std(axis=0)
    # Undefined (NaN) when the mean is zero, as in the per-serie `_seg_metrics`.
    metrics["cv"] = (metrics.std_serie/metrics.mean_serie).where(metrics.mean_serie != 0)
    metrics["acc"] = series[-last:,:].sum(axis=0)
    return metrics

In [41]:
def seg_metrics(serie_data):
    """Compute the metrics of one serie with the notebook-level target and frequency."""
    return _seg_metrics(serie_data, serie_target=serie_target, serie_freq=serie_freq)


def group_func(data, group_cols):
    """Group ``data`` by ``group_cols`` and apply ``seg_metrics`` to every group."""
    grouped = data.groupby(group_cols)
    return grouped.apply(seg_metrics)

In [42]:
# Time the plain (single-process) groupby/apply on the full dataset.
%time group_func(data, "serie_id")

CPU times: user 1.06 s, sys: 48.6 ms, total: 1.11 s
Wall time: 1.11 s


Unnamed: 0_level_0,std_serie,mean_serie,cv,acc_12m
serie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(1, 'AUTOMOTIVE')",2.835173,3.523632,0.804617,155.000000
"(1, 'BABY CARE')",0.000000,0.000000,,0.000000
"(1, 'BEAUTY')",1.907383,2.567164,0.742992,110.000000
"(1, 'BEVERAGES')",689.685154,1701.390962,0.405365,69308.000000
"(1, 'BOOKS')",0.546681,0.136816,3.995742,6.000000
...,...,...,...,...
"(9, 'POULTRY')",126.325875,420.950918,0.300096,11893.747960
"(9, 'PREPARED FOODS')",27.080572,77.448568,0.349659,2967.201001
"(9, 'PRODUCE')",796.313504,879.090113,0.905838,45685.159000
"(9, 'SCHOOL AND OFFICE SUPPLIES')",18.527958,5.045605,3.672098,1722.000000


In [46]:
# Time the pivot-based `compute_seg_metrics` on the full dataset.
%time compute_seg_metrics(data, serie_target=serie_target, serie_freq=serie_freq, date_col="date")

CPU times: user 1.09 s, sys: 379 ms, total: 1.47 s
Wall time: 1.47 s


Unnamed: 0_level_0,mean_serie,std_serie,cv,acc
serie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(1, 'AUTOMOTIVE')",3.523632,2.835173,0.804617,155.000000
"(1, 'BABY CARE')",0.000000,0.000000,,0.000000
"(1, 'BEAUTY')",2.567164,1.907383,0.742992,110.000000
"(1, 'BEVERAGES')",1701.390962,689.685154,0.405365,69308.000000
"(1, 'BOOKS')",0.136816,0.546681,3.995742,6.000000
...,...,...,...,...
"(9, 'POULTRY')",420.950918,126.325875,0.300096,11893.747960
"(9, 'PREPARED FOODS')",77.448568,27.080572,0.349659,2967.201001
"(9, 'PRODUCE')",879.090113,796.313504,0.905838,45685.159000
"(9, 'SCHOOL AND OFFICE SUPPLIES')",5.045605,18.527958,3.672098,1722.000000


In [34]:
# Time `parallel_groupby` with 2 workers on the full dataset.
# NOTE(review): the recorded output has the baseline columns (sample_entropy,
# len_ts, adf), so it was presumably produced before `_seg_metrics` was
# redefined — confirm by re-running from a fresh kernel.
%time parallel_groupby(data, group_func, ["serie_id",], n_jobs=2)

CPU times: user 2.36 s, sys: 392 ms, total: 2.75 s
Wall time: 3min 29s


Unnamed: 0_level_0,sample_entropy,cv,len_ts,acc_12m,adf
serie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(1, 'AUTOMOTIVE')",2.085086,0.803767,1205.0,155.000000,-5.243874
"(1, 'BABY CARE')",,,1.0,0.000000,
"(1, 'BEAUTY')",1.762834,0.742126,1205.0,110.000000,-5.976987
"(1, 'BEVERAGES')",0.896215,0.404173,1205.0,69308.000000,-3.000669
"(1, 'BOOKS')",0.743264,1.442526,208.0,6.000000,-3.770140
...,...,...,...,...,...
"(9, 'POULTRY')",1.833846,0.298587,1205.0,11893.747960,-3.771836
"(9, 'PREPARED FOODS')",1.780438,0.348326,1205.0,2967.201001,-1.888849
"(9, 'PRODUCE')",0.431223,0.651947,944.0,45685.159000,-2.813764
"(9, 'SCHOOL AND OFFICE SUPPLIES')",0.197748,3.215219,944.0,1722.000000,-3.402982


  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2


In [19]:
# Time the groupby/apply on the sampled subset of series only.
# NOTE(review): the recorded output has the baseline columns (sample_entropy,
# len_ts, adf), so `seg_metrics` presumably wrapped the baseline `_seg_metrics`
# when this cell last ran — confirm.
%time data_sample.groupby("serie_id").apply(seg_metrics)

CPU times: user 49.7 s, sys: 36.6 s, total: 1min 26s
Wall time: 18.6 s


Unnamed: 0_level_0,sample_entropy,cv,len_ts,acc_12m,adf
serie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(1, 'GROCERY I')",1.540453,0.275982,1205.0,81771.000,-3.855137
"(1, 'HOME AND KITCHEN I')",0.942673,1.342575,944.0,852.000,-11.044915
"(1, 'LADIESWEAR')",0.182606,0.731643,944.0,431.000,-4.752495
"(10, 'FROZEN FOODS')",0.421992,1.985743,1205.0,955.719,-7.965333
"(10, 'HARDWARE')",0.668514,1.559985,1205.0,10.000,-3.739275
...,...,...,...,...,...
"(7, 'BEAUTY')",1.992991,0.649232,1205.0,165.000,-4.186691
"(8, 'BOOKS')",0.292775,1.732313,153.0,0.000,-2.864907
"(8, 'BREAD/BAKERY')",1.512877,0.243872,1205.0,23838.033,-1.513881
"(8, 'HOME APPLIANCES')",0.477071,1.189884,1143.0,14.000,-3.771481


In [25]:
# NOTE(review): `data_sample[[]]` selects no columns, and the recorded output is
# a DataFrameGroupBy repr rather than a transform result — this looks like a
# leftover experiment whose code changed after it last ran; confirm or remove.
%time data_sample[[]].groupby("serie_id").transform(seg_metrics)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe85a63c880>