# 01 - M4 Example

In [1]:
import pandas as pd
import numpy as np
from scalecast import GridGenerator
from scalecast.Forecaster import Forecaster
from scalecast.Pipeline import Transformer, Reverter, Pipeline
from scalecast.util import find_optimal_transformation
from tqdm.notebook import tqdm
from fix_date import fix_date

In [2]:
# Generate scalecast's example hyperparameter grids.
# NOTE(review): presumably these grids are what `tune_test_forecast` reads later — confirm.
GridGenerator.get_example_grids()

# Names of the estimators that are tuned, tested, and forecast for every series.
models = ('lasso', 'ridge', 'elasticnet', 'lightgbm', 'xgboost')

In [3]:
def set_validation_params(f,fcst_horizon):
    """Configure the test/validation split and the tuning metric on `f`.

    Series with more than 100 observations hold out `fcst_horizon` points for
    both the test set and the validation set; shorter series hold out a single
    point for each. The validation metric is always 'rmse'.

    Args:
        f: object exposing `y`, `set_test_length`, `set_validation_length`
            and `set_validation_metric` (a scalecast Forecaster in this notebook).
        fcst_horizon: number of periods to hold out for long series.
    """
    holdout = 1 if len(f.y) <= 100 else fcst_horizon
    f.set_test_length(holdout)
    f.set_validation_length(holdout)
    f.set_validation_metric('rmse')
    
def forecaster(f,models,max_ar=100):
    """Select regressors, evaluate each model, then add combination models.

    Used as the 'Forecast' step of the pipelines in this notebook. Keyword
    arguments given to `Pipeline.fit_predict` are forwarded to this function,
    so `max_ar` can be set per frequency from the call site
    (e.g. `pipeline.fit_predict(f, models=models, max_ar=52)`).

    Args:
        f: Forecaster-like object; it is modified in place.
        models: estimator names passed to `f.tune_test_forecast`.
        max_ar: value passed to `f.auto_Xvar_select(max_ar=...)`. Defaults to
            100, the value that was previously hard-coded, so existing
            two-argument calls behave exactly as before.

    Returns:
        None.
    """
    # choose regressors using an SVR, judged by level test-set RMSE
    f.auto_Xvar_select(
        monitor='LevelTestSetRMSE',
        max_ar = max_ar,
        try_trend = False,
        try_seasonalities = True,
        estimator='svr',
    )
    # tune, test and forecast every requested model on a 20% sample of its grid
    f.tune_test_forecast(
        models,
        limit_grid_size = .2,
    )

    # combination models built on top of the individual models
    f.set_estimator('combo')
    f.manual_forecast(call_me='avg_top2',models='top_2',determine_best_by='LevelTestSetRMSE')
    f.manual_forecast(call_me='avg_top4',models='top_4',determine_best_by='LevelTestSetRMSE')
    f.manual_forecast(call_me='avg_all')

In [4]:
# Per-series metadata, indexed by the first CSV column.
# 'StartingDate' is parsed day-first and then passed through the local
# `fix_date` helper.
# NOTE(review): presumably `fix_date` repairs mis-parsed dates — confirm against fix_date.py.
info = pd.read_csv(
    'm4/M4-info.csv', index_col=0, parse_dates=['StartingDate'], dayfirst=True,
).assign(StartingDate=lambda meta: meta['StartingDate'].apply(fix_date))

[Hourly](#Hourly)  
[Daily](#Daily)  
[Weekly](#Weekly)  
[Monthly](#Monthly)  
[Quarterly](#Quarterly)  
[Yearly](#Yearly)  

## Hourly

In [5]:
# Hourly training series: one row per series, first CSV column as the index.
Hourly = pd.read_csv('m4/train/Hourly-train.csv', index_col=0)

In [6]:
# Hourly hold-out data; only its shape (index/columns) is used in this cell.
Hourly_test = pd.read_csv('m4/test/Hourly-test.csv', index_col=0)

# Empty frame shaped like the test set; copied once per model to hold forecasts.
Hourly_results_template = pd.DataFrame(
    index=Hourly_test.index,
    columns=Hourly_test.columns,
)

# model name -> DataFrame of forecasts (filled by the forecasting loop)
Hourly_results = dict()

In [7]:
# Forecast every hourly series and collect the level forecasts of each model.
for i in tqdm(Hourly.index):
    y = Hourly.loc[i].dropna()
    sd = info.loc[i,'StartingDate']
    fcst_horizon = info.loc[i,'Horizon']
    cd = pd.date_range(
        start = sd,
        freq = 'H',
        periods = len(y),
    )
    f = Forecaster(
        y = y,
        current_dates = cd,
        future_dates = fcst_horizon,
    )
    set_validation_params(f,fcst_horizon=fcst_horizon)
    # search for a transformation (and its reversion) for this series
    transformer, reverter = find_optimal_transformation(
        f,
        monitor = 'TestSetRMSE',
        estimator = 'svr',
    )
    pipeline = Pipeline(
        steps = [
            ('Transform',transformer),
            ('Forecast',forecaster),
            ('Revert',reverter),
        ]
    )
    try:
        f = pipeline.fit_predict(
            f,
            models=models,
        )
    except Exception:
        # `Exception` (not a bare except) so that interrupting the kernel
        # actually stops this long-running loop.
        # Fallback: first-difference the series and run the same forecasting
        # routine as the pipeline (and as the Daily cell's fallback), so every
        # series produces the same set of models, including the combos.
        f.diff()
        forecaster(f,models)
    fcsts = f.export('lvl_fcsts',determine_best_by='LevelTestSetRMSE')
    # forecasts of every evaluated model ...
    series_fcsts = {m: fcsts[m].to_list() for m in f.history.keys()}
    # ... plus 'auto_select', taken from the export's second column
    # NOTE(review): presumably column 1 is the model ranked best by
    # `determine_best_by` — confirm against the scalecast export docs.
    series_fcsts['auto_select'] = fcsts.iloc[:,1].to_list()
    for m, values in series_fcsts.items():
        if m not in Hourly_results:
            Hourly_results[m] = Hourly_results_template.copy()
        Hourly_results[m].loc[i] = values

  0%|          | 0/414 [00:00<?, ?it/s]

In [8]:
# Write one CSV of forecasts per model.
# NOTE(review): inputs are read from 'm4/' while results go to 'M4/' — confirm
# this is intended on case-sensitive filesystems.
for model_name, model_fcsts in Hourly_results.items():
    model_fcsts.to_csv(f'M4/model_results/Hourly/{model_name}.csv')

In [9]:
# Drop the hourly objects before moving on to the next frequency.
del Hourly, Hourly_test, Hourly_results_template, Hourly_results

## Daily

In [10]:
# Daily training series: one row per series, first CSV column as the index.
Daily = pd.read_csv('m4/train/Daily-train.csv', index_col=0)
# Optional: uncomment to work on a random 50-series subset.
#Daily = Daily.sample(50)

In [11]:
# Daily hold-out data; only its shape (index/columns) is used in this cell.
Daily_test = pd.read_csv('m4/test/Daily-test.csv', index_col=0)
# Optional: uncomment to keep only the series present in `Daily`.
#Daily_test = Daily_test.loc[Daily.index]

# Empty frame shaped like the test set; copied once per model to hold forecasts.
Daily_results_template = pd.DataFrame(
    index=Daily_test.index,
    columns=Daily_test.columns,
)

# model name -> DataFrame of forecasts (filled by the forecasting loop)
Daily_results = dict()

In [23]:
# Forecast every daily series and collect the level forecasts of each model.
for i in tqdm(Daily.index):
    y = Daily.loc[i].dropna()
    sd = info.loc[i,'StartingDate']
    fcst_horizon = info.loc[i,'Horizon']
    cd = pd.date_range(
        start = sd,
        freq = 'D',
        periods = len(y),
    )
    f = Forecaster(
        y = y,
        current_dates = cd,
        future_dates = fcst_horizon,
    )
    set_validation_params(f,fcst_horizon=fcst_horizon)
    # search for a transformation (and its reversion) for this series
    transformer, reverter = find_optimal_transformation(
        f,
        monitor = 'TestSetRMSE',
        estimator = 'svr',
    )
    pipeline = Pipeline(
        steps = [
            ('Transform',transformer),
            ('Forecast',forecaster),
            ('Revert',reverter),
        ]
    )
    try:
        f = pipeline.fit_predict(
            f,
            models=models,
        )
    except Exception:
        # `Exception` (not a bare except) so that interrupting the kernel
        # actually stops this long-running loop.
        # Fallback: first-difference the series and rerun the forecasting routine.
        f.diff()
        forecaster(f,models)
    fcsts = f.export('lvl_fcsts',determine_best_by='LevelTestSetMAE')
    # forecasts of every evaluated model ...
    series_fcsts = {m: fcsts[m].to_list() for m in f.history.keys()}
    # ... plus 'auto_select', taken from the export's second column
    # NOTE(review): presumably column 1 is the model ranked best by
    # `determine_best_by` — confirm against the scalecast export docs.
    series_fcsts['auto_select'] = fcsts.iloc[:,1].to_list()
    for m, values in series_fcsts.items():
        if m not in Daily_results:
            Daily_results[m] = Daily_results_template.copy()
        Daily_results[m].loc[i] = values

  0%|          | 0/4227 [00:00<?, ?it/s]

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 70, n_neighbors = 83

In [None]:
# Write one CSV of forecasts per model.
# NOTE(review): inputs are read from 'm4/' while results go to 'M4/' — confirm
# this is intended on case-sensitive filesystems.
for model_name, model_fcsts in Daily_results.items():
    model_fcsts.to_csv(f'M4/model_results/Daily/{model_name}.csv')

In [None]:
# Drop the daily objects before moving on to the next frequency.
del Daily, Daily_test, Daily_results_template, Daily_results

## Weekly

In [37]:
# Weekly training series: one row per series, first CSV column as the index.
Weekly = pd.read_csv('m4/train/Weekly-train.csv', index_col=0)

In [38]:
# Weekly hold-out data; only its shape (index/columns) is used in this cell.
Weekly_test = pd.read_csv('m4/test/Weekly-test.csv', index_col=0)

# Empty frame shaped like the test set; copied once per model to hold forecasts.
Weekly_results_template = pd.DataFrame(
    index=Weekly_test.index,
    columns=Weekly_test.columns,
)

# model name -> DataFrame of forecasts (filled by the forecasting loop)
Weekly_results = dict()

In [39]:
# Forecast every weekly series and collect the level forecasts of each model.
for i in tqdm(Weekly.index):
    y = Weekly.loc[i].dropna()
    sd = info.loc[i,'StartingDate']
    fcst_horizon = info.loc[i,'Horizon']
    cd = pd.date_range(
        start = sd,
        freq = 'W',
        periods = len(y),
    )
    f = Forecaster(
        y = y,
        current_dates = cd,
        future_dates = fcst_horizon,
    )
    set_validation_params(f,fcst_horizon)
    # The original cell unpacked `opt_transform_kwargs`, which is never defined
    # in this notebook and raises NameError on a fresh kernel.
    # NOTE(review): the explicit `monitor`/`estimator` below mirror the Hourly
    # and Daily cells — confirm they match the settings originally intended.
    transformer, reverter = find_optimal_transformation(
        f,
        monitor = 'TestSetRMSE',
        estimator = 'svr',
        lags = fcst_horizon,
        m = 52,
    )
    pipeline = Pipeline(
        steps = [
            ('Transform',transformer),
            ('Forecast',forecaster),
            ('Revert',reverter),
        ]
    )
    # NOTE(review): keyword arguments given to fit_predict are forwarded to the
    # `forecaster` step, so `forecaster` must accept `max_ar`.
    f = pipeline.fit_predict(f,models=models,max_ar=52)
    fcsts = f.export('lvl_fcsts',determine_best_by='LevelTestSetMAE')
    # forecasts of every evaluated model ...
    series_fcsts = {m: fcsts[m].to_list() for m in f.history.keys()}
    # ... plus 'auto_select', taken from the export's second column
    # NOTE(review): presumably column 1 is the model ranked best by
    # `determine_best_by` — confirm against the scalecast export docs.
    series_fcsts['auto_select'] = fcsts.iloc[:,1].to_list()
    for m, values in series_fcsts.items():
        if m not in Weekly_results:
            Weekly_results[m] = Weekly_results_template.copy()
        Weekly_results[m].loc[i] = values

  0%|          | 0/359 [00:00<?, ?it/s]

In [40]:
# Write one CSV of forecasts per model.
# NOTE(review): inputs are read from 'm4/' while results go to 'M4/' — confirm
# this is intended on case-sensitive filesystems.
for model_name, model_fcsts in Weekly_results.items():
    model_fcsts.to_csv(f'M4/model_results/Weekly/{model_name}.csv')

In [41]:
# Drop the weekly objects before moving on to the next frequency.
del Weekly, Weekly_test, Weekly_results_template, Weekly_results

## Monthly

In [None]:
# Monthly training series: one row per series, first CSV column as the index.
Monthly = pd.read_csv('m4/train/Monthly-train.csv', index_col=0)
# Optional: uncomment to work on a random 100-series subset.
#Monthly = Monthly.sample(100)

In [51]:
# Monthly hold-out data; only its shape (index/columns) is used in this cell.
Monthly_test = pd.read_csv('m4/test/Monthly-test.csv', index_col=0)
# Optional: uncomment to keep only the series present in `Monthly`.
#Monthly_test = Monthly_test.loc[Monthly.index]

# Empty frame shaped like the test set; copied once per model to hold forecasts.
Monthly_results_template = pd.DataFrame(
    index=Monthly_test.index,
    columns=Monthly_test.columns,
)

# model name -> DataFrame of forecasts (filled by the forecasting loop)
Monthly_results = dict()

In [52]:
# Forecast each monthly series with a fixed log + first-difference pipeline
# and store the level forecasts of every evaluated model.
for i in tqdm(Monthly.index):
    y = Monthly.loc[i].dropna()
    fcst_horizon = info.loc[i,'Horizon']
    sd = info.loc[i,'StartingDate']
    cd = pd.date_range(start=sd, periods=len(y), freq='M')
    f = Forecaster(y=y, current_dates=cd, future_dates=fcst_horizon)
    set_validation_params(f, fcst_horizon=fcst_horizon)

    # transform: log, then first difference; revert in the opposite order
    transformer = Transformer(
        transformers=[('LogTransform',), ('DiffTransform',1)],
    )
    reverter = Reverter(
        reverters=[('DiffRevert',1), ('LogRevert',)],
        base_transformer=transformer,
    )
    pipeline = Pipeline(
        steps=[
            ('Transform',transformer),
            ('Forecast',forecaster),
            ('Revert',reverter),
        ],
    )
    f = pipeline.fit_predict(f, models=models, max_ar=100)

    fcsts = f.export('lvl_fcsts', determine_best_by='LevelTestSetRMSE')
    # forecasts of every evaluated model, plus 'auto_select' taken from the
    # export's second column
    # NOTE(review): presumably column 1 is the model ranked best by
    # `determine_best_by` — confirm against the scalecast export docs.
    series_fcsts = {m: fcsts[m].to_list() for m in f.history.keys()}
    series_fcsts['auto_select'] = fcsts.iloc[:,1].to_list()
    for m, values in series_fcsts.items():
        if m not in Monthly_results:
            Monthly_results[m] = Monthly_results_template.copy()
        Monthly_results[m].loc[i] = values

  0%|          | 0/48000 [00:00<?, ?it/s]

In [53]:
# Write one CSV of forecasts per model.
# NOTE(review): inputs are read from 'm4/' while results go to 'M4/' — confirm
# this is intended on case-sensitive filesystems.
for model_name, model_fcsts in Monthly_results.items():
    model_fcsts.to_csv(f'M4/model_results/Monthly/{model_name}.csv')

In [54]:
# Drop the monthly objects before moving on to the next frequency.
del Monthly, Monthly_test, Monthly_results_template, Monthly_results

## Quarterly

In [5]:
# Quarterly training series: one row per series, first CSV column as the index.
Quarterly = pd.read_csv('m4/train/Quarterly-train.csv', index_col=0)

In [6]:
# Quarterly hold-out data; only its shape (index/columns) is used in this cell.
Quarterly_test = pd.read_csv('m4/test/Quarterly-test.csv', index_col=0)

# Empty frame shaped like the test set; copied once per model to hold forecasts.
Quarterly_results_template = pd.DataFrame(
    index=Quarterly_test.index,
    columns=Quarterly_test.columns,
)

# model name -> DataFrame of forecasts (filled by the forecasting loop)
Quarterly_results = dict()

In [7]:
# Forecast each quarterly series with a fixed log + first-difference pipeline
# and store the level forecasts of every evaluated model.
for i in tqdm(Quarterly.index):
    y = Quarterly.loc[i].dropna()
    fcst_horizon = info.loc[i,'Horizon']
    sd = info.loc[i,'StartingDate']
    cd = pd.date_range(start=sd, periods=len(y), freq='Q')
    f = Forecaster(y=y, current_dates=cd, future_dates=fcst_horizon)
    set_validation_params(f, fcst_horizon=fcst_horizon)

    # transform: log, then first difference; revert in the opposite order
    transformer = Transformer(
        transformers=[('LogTransform',), ('DiffTransform',1)],
    )
    reverter = Reverter(
        reverters=[('DiffRevert',1), ('LogRevert',)],
        base_transformer=transformer,
    )
    pipeline = Pipeline(
        steps=[
            ('Transform',transformer),
            ('Forecast',forecaster),
            ('Revert',reverter),
        ],
    )
    f = pipeline.fit_predict(f, models=models, max_ar=48)

    fcsts = f.export('lvl_fcsts', determine_best_by='LevelTestSetRMSE')
    # forecasts of every evaluated model, plus 'auto_select' taken from the
    # export's second column
    # NOTE(review): presumably column 1 is the model ranked best by
    # `determine_best_by` — confirm against the scalecast export docs.
    series_fcsts = {m: fcsts[m].to_list() for m in f.history.keys()}
    series_fcsts['auto_select'] = fcsts.iloc[:,1].to_list()
    for m, values in series_fcsts.items():
        if m not in Quarterly_results:
            Quarterly_results[m] = Quarterly_results_template.copy()
        Quarterly_results[m].loc[i] = values

  0%|          | 0/24000 [00:00<?, ?it/s]

In [8]:
# Write one CSV of forecasts per model.
# NOTE(review): inputs are read from 'm4/' while results go to 'M4/' — confirm
# this is intended on case-sensitive filesystems.
for model_name, model_fcsts in Quarterly_results.items():
    model_fcsts.to_csv(f'M4/model_results/Quarterly/{model_name}.csv')

In [9]:
# Drop the quarterly objects before moving on to the next frequency.
del Quarterly, Quarterly_test, Quarterly_results_template, Quarterly_results

## Yearly

In [10]:
# Yearly training series: one row per series, first CSV column as the index.
Yearly = pd.read_csv('m4/train/Yearly-train.csv', index_col=0)

In [11]:
# Yearly hold-out data; only its shape (index/columns) is used in this cell.
Yearly_test = pd.read_csv('m4/test/Yearly-test.csv', index_col=0)

# Empty frame shaped like the test set; copied once per model to hold forecasts.
Yearly_results_template = pd.DataFrame(
    index=Yearly_test.index,
    columns=Yearly_test.columns,
)

# model name -> DataFrame of forecasts (filled by the forecasting loop)
Yearly_results = dict()

In [13]:
# Forecast each yearly series with a fixed log + first-difference pipeline
# and store the level forecasts of every evaluated model.
for i in tqdm(Yearly.index):
    y = Yearly.loc[i].dropna()
    sd = info.loc[i,'StartingDate']
    fcst_horizon = info.loc[i,'Horizon']
    cd = pd.date_range(
        start = sd,
        freq = 'D', # for yearly it doesn't matter and will break if Y
        # NOTE(review): presumably a yearly frequency overflows pandas'
        # timestamp range for long series — confirm.
        periods = len(y),
    )
    f = Forecaster(
        y = y,
        current_dates = cd,
        future_dates = fcst_horizon,
    )
    set_validation_params(f,fcst_horizon=fcst_horizon)
    # transform: log, then first difference; revert in the opposite order
    transformer = Transformer(
        transformers = [
            ('LogTransform',),
            ('DiffTransform',1),
        ]
    )
    reverter = Reverter(
        reverters = [
            ('DiffRevert',1),
            ('LogRevert',),
        ],
        base_transformer = transformer,
    )
    # Build the pipeline from THIS series' transformer/reverter. The original
    # cell never created one here, so `pipeline.fit_predict` ran whatever
    # `pipeline` object an earlier cell had left in the kernel (or raised
    # NameError on a fresh kernel).
    pipeline = Pipeline(
        steps = [
            ('Transform',transformer),
            ('Forecast',forecaster),
            ('Revert',reverter),
        ]
    )
    # NOTE(review): keyword arguments given to fit_predict are forwarded to the
    # `forecaster` step, so `forecaster` must accept `max_ar`.
    f = pipeline.fit_predict(f,models=models,max_ar=20)
    fcsts = f.export('lvl_fcsts',determine_best_by='LevelTestSetRMSE')
    # forecasts of every evaluated model ...
    series_fcsts = {m: fcsts[m].to_list() for m in f.history.keys()}
    # ... plus 'auto_select', taken from the export's second column
    # NOTE(review): presumably column 1 is the model ranked best by
    # `determine_best_by` — confirm against the scalecast export docs.
    series_fcsts['auto_select'] = fcsts.iloc[:,1].to_list()
    for m, values in series_fcsts.items():
        if m not in Yearly_results:
            Yearly_results[m] = Yearly_results_template.copy()
        Yearly_results[m].loc[i] = values

  0%|          | 0/23000 [00:00<?, ?it/s]

In [14]:
# Write one CSV of forecasts per model.
# NOTE(review): inputs are read from 'm4/' while results go to 'M4/' — confirm
# this is intended on case-sensitive filesystems.
for model_name, model_fcsts in Yearly_results.items():
    model_fcsts.to_csv(f'M4/model_results/Yearly/{model_name}.csv')

In [15]:
# Drop the yearly objects now that their results have been written.
del Yearly, Yearly_test, Yearly_results_template, Yearly_results