# Project: Predict Future Sales
#### Notebook 2 of 4: building hierarchical time series and top level forecasting

This notebook does the following:
- Builds a hierarchical time series for each shop
- Forecasts the top-level hierarchical time series (shop total and each item category) using Facebook Prophet


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns


import matplotlib
from matplotlib import pyplot as plt
from textwrap import wrap


# Notebook-wide pandas settings, applied through the option registry.
pd.set_option('display.float_format', '{:,.2f}'.format)  # thousands separator, two decimals
pd.set_option('display.max_columns', None)               # never truncate columns
pd.set_option('display.max_rows', None)                  # never truncate rows
pd.set_option('display.max_colwidth', None)              # show full cell contents
pd.set_option('mode.chained_assignment', None)           # default='warn'

import pickle
import time
from tqdm import tqdm

from itertools import product


from prophet import Prophet
import itertools
from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics

import holidays


import hts.functions
import collections
from hts import HTSRegressor
from hts.hierarchy import HierarchyTree


import logging

# Quiet the 'cmdstanpy' logger: only CRITICAL records pass, nothing is forwarded
# to parent loggers, and a NullHandler absorbs whatever is left.
logger = logging.getLogger('cmdstanpy')
logger.setLevel(logging.CRITICAL)
logger.propagate = False
logger.addHandler(logging.NullHandler())

# define functions

In [1]:
def get_best_hyperparams(data, param_grid=None, cutoffs=None, horizon='60 days'):
    """Grid-search Prophet prior scales with time-series cross validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a date column ``ds`` and a value column ``y``.
    param_grid : dict, optional
        Maps Prophet keyword names to the list of candidate values. Defaults to
        ``changepoint_prior_scale`` in [0.01, 0.2] and
        ``seasonality_prior_scale`` in [0.01, 0.5].
    cutoffs : sequence of date-likes, optional
        Cross-validation cutoffs. Defaults to ['2014-02-1', '2015-06-1'].
    horizon : str, optional
        Forecast horizon evaluated after each cutoff. Defaults to '60 days'.

    Returns
    -------
    dict
        The parameter combination with the lowest RMSE. When ``data`` is empty,
        all zero, or has no non-zero value on or before the earliest cutoff,
        the default ``{'changepoint_prior_scale': 0.2,
        'seasonality_prior_scale': 0.5}`` is returned without fitting anything.
    """
    default_best_params = {'changepoint_prior_scale': 0.2, 'seasonality_prior_scale': 0.5}

    if param_grid is None:
        param_grid = {
            'changepoint_prior_scale': [0.01, 0.2],
            'seasonality_prior_scale': [0.01, 0.5],
        }
    if cutoffs is None:
        cutoffs = ['2014-02-1', '2015-06-1']
    cutoffs = pd.to_datetime(list(cutoffs))
    first_cutoff_date = cutoffs.min()

    # Read the dates from the 'ds' column rather than from the index, so the
    # check works whether or not the caller left a datetime index on the frame.
    nonzero_dates = pd.to_datetime(data.loc[data['y'].ne(0), 'ds'])
    first_nonzero_date = None if nonzero_dates.empty else nonzero_dates.min()

    # Skip tuning when there is nothing to learn from before the first cutoff:
    # 1. no rows / all values are zero
    # 2. the series only becomes non-zero after the earliest cutoff
    if first_nonzero_date is None or first_cutoff_date < first_nonzero_date:
        print(f"first non-zero date: {first_nonzero_date}")
        print("too many zero values, return default best params")
        return dict(default_best_params)

    # Every combination of the candidate values.
    all_params = [dict(zip(param_grid.keys(), combo))
                  for combo in itertools.product(*param_grid.values())]

    # NOTE(review): the models evaluated here use Prophet's default seasonality
    # and no holidays, whereas get_best_preds fits with yearly_seasonality=True
    # and RU holidays -- confirm that tuning on a different configuration is
    # intended.
    rmses = []
    for params in all_params:
        m = Prophet(**params).fit(data)
        df_cv = cross_validation(m, cutoffs=cutoffs, horizon=horizon, parallel="processes")
        # rolling_window=1 aggregates the whole horizon into one metrics row.
        df_p = performance_metrics(df_cv, rolling_window=1)
        rmses.append(df_p['rmse'].values[0])

    best_params = all_params[int(np.argmin(rmses))]
    print(best_params)
    return best_params


In [20]:
def get_best_preds(df_, cat_id):
    """Forecast the month after the last row of one series with a tuned Prophet model.

    Steps:
      1. pick the prior scales with ``get_best_hyperparams``;
      2. fit Prophet with those scales, yearly seasonality and RU holidays;
      3. predict one additional month-start period and return its ``yhat``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame containing a ``ds`` date column and one column per series
        (the caller passes ``df_hier_train``).
    cat_id : str
        Name of the column to forecast (``'total'`` or a category column).

    Returns
    -------
    float
        Point forecast (``yhat``) for the single future period.

    Side effects: prints the last three forecast rows and the returned value,
    and displays the Prophet forecast plot.
    """
    # Prophet expects the columns to be called 'ds' and 'y'.
    df = df_.loc[:, ['ds', cat_id]].rename(columns={cat_id: 'y'})

    best_params = get_best_hyperparams(df)

    m = Prophet(changepoint_prior_scale=best_params['changepoint_prior_scale'],
                seasonality_prior_scale=best_params['seasonality_prior_scale'],
                yearly_seasonality=True)
    m.add_country_holidays(country_name='RU')
    m.fit(df)

    # One extra month-start row appended to the training dates.
    future = m.make_future_dataframe(periods=1, freq='MS')
    forecast = m.predict(future)

    print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(3))
    fig = m.plot(forecast)
    plt.show()
    # This function is called once per shop and category; close the figure
    # explicitly so open figures do not pile up over the run.
    plt.close(fig)

    best_preds = forecast['yhat'].iloc[-1]
    print('best_preds: ', best_preds)
    return best_preds
    

# import data from pickle file

In [5]:
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
# The context manager closes the file handle as soon as the load finishes.
with open('../temp/data2.pkl', 'rb') as f:
    pickle_dict2 = pickle.load(f)
df_basegrid = pickle_dict2['df_basegrid']   # concat of df_train_m and df_test


In [6]:
# Distinct shop ids present in the base grid, ascending, followed by how many there are.
list_shop_id = sorted(df_basegrid['shop_id'].unique())
print(list_shop_id, len(list_shop_id), sep='\n')

[2, 3, 4, 5, 6, 7, 10, 12, 14, 15, 16, 18, 19, 21, 22, 24, 25, 26, 28, 31, 34, 35, 36, 37, 38, 39, 41, 42, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59]
42


# create hierarchical time series for all shops

In [4]:
%%time

for i in tqdm(range(len(list_shop_id))):
    ################ get one shop dataframe
    #################################################################
    df_oneshop = df_basegrid[df_basegrid['shop_id'] == list_shop_id[i]]

    ##change values in the following columns so that their combinations are more readable
    df_oneshop['date_block_num'] =df_oneshop['date_block_num'].astype(str)
    df_oneshop['item_category_id'] =df_oneshop['item_category_id'].astype(str)
    df_oneshop['shop_id'] =df_oneshop['shop_id'].astype(str)
    df_oneshop['item_id'] =df_oneshop['item_id'].astype(str)
    df_oneshop['item_category_id'] = 'c' + df_oneshop['item_category_id']
    df_oneshop['shop_id'] = 's' + df_oneshop['shop_id']
    df_oneshop['item_id'] = 'i' + df_oneshop['item_id']



    ############### create hierachical time series for the shop
    ##############################################################
    level_names = ['item_category_id','item_id']
    hierarchy = [['item_category_id']]
    df_hier, sum_mat, sum_mat_labels = hts.functions.get_hierarchichal_df(df_oneshop,
                                                                          level_names=level_names,
                                                                          hierarchy=hierarchy,
                                                                          date_colname='year_month',
                                                                          val_colname='item_cnt_month')
    #fill all null value with 0
    df_hier = df_hier.fillna(0)
    df_hier['ds']= df_hier.index

    old_column_list = df_hier.columns
    new_column_list = [old_column_list[-1], *old_column_list[:-1]]
    df_hier = df_hier[new_column_list]

    
    ################ get the hier_dict
    #################################################################

    #get the lists
    oneshop_cat_list = sorted(df_oneshop['item_category_id'].unique())
    oneshop_item_list = sorted(df_oneshop['item_id'].unique())

    #dictionary to hold all level nodes
    hier_dict = {}

    #get level 1 nodes: category nodes
    level_1_nodes = [str(cat_id) for cat_id in oneshop_cat_list]

    #print(level_1_nodes)
    hier_dict['total'] = level_1_nodes

    #get level 2 nodes
    df_hier_columns = sorted(df_hier.columns)
    for node in level_1_nodes: 
        temp_level_2_nodes = list(filter(lambda cat_item: f'{node}_' in cat_item, df_hier_columns))
        hier_dict[node] = temp_level_2_nodes

    tree = HierarchyTree.from_nodes(hier_dict, df_hier, root='total')
    #print(tree)

    sum_mat, sum_mat_labels = hts.functions.to_sum_mat(tree)



    ###########save neccessary object to pickle_dict, 
    ############################################################################
    pickle_dict_oneshop = dict()
    pickle_dict_oneshop['df_hier']=df_hier   #concat df_train_m and df_test
    pickle_dict_oneshop['tree']=tree 
    pickle_dict_oneshop['sum_mat']=sum_mat
    pickle_dict_oneshop['sum_mat_labels']=sum_mat_labels
    pickle_dict_oneshop['oneshop_cat_list']=oneshop_cat_list
    pickle_dict_oneshop['oneshop_item_list']=oneshop_item_list

    pickle.dump(pickle_dict_oneshop, open(f'../temp/s{list_shop_id[i]}_hier.pkl', 'wb'))




100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [07:28<00:00, 10.67s/it]

Wall time: 7min 28s





# forecasting all hierarchical time series using prophet

### shop_id 36 has only one month of data, so it is excluded from forecasting

In [7]:
# Shops left out of forecasting. Filtering (instead of list.remove) keeps this
# cell working even when an excluded id is not present in list_shop_id.
shops_excluded = {36}
list_shop_id_filtered = [shop_id for shop_id in list_shop_id if shop_id not in shops_excluded]
print(list_shop_id_filtered)
print(len(list_shop_id_filtered))

[2, 3, 4, 5, 6, 7, 10, 12, 14, 15, 16, 18, 19, 21, 22, 24, 25, 26, 28, 31, 34, 35, 37, 38, 39, 41, 42, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59]
41


### forecasting total sales and each category sales in each shop

In [None]:
%%time
    
for shop_id in tqdm(list_shop_id_filtered):
    
    print(f'\n\nforecasting shop: s{shop_id}')
    
    ####read one shop data from pickle_dict
    #############################################
    pickle_dict_oneshop = pickle.load(open(f'../temp/s{shop_id}_hier.pkl', 'rb'))
    df_hier  = pickle_dict_oneshop['df_hier']
    tree = pickle_dict_oneshop['tree']
    sum_mat = pickle_dict_oneshop['sum_mat']
    sum_mat_labels =pickle_dict_oneshop['sum_mat_labels']
    oneshop_cat_list = pickle_dict_oneshop['oneshop_cat_list']
    oneshop_item_list = pickle_dict_oneshop['oneshop_item_list']


    ################### creat train, validation,and test dataset
    ###########################################################################
    df_hier_train = df_hier.loc[df_hier.index <= '2015-10-01']
    df_hier_test = df_hier.loc[df_hier.index == '2015-11-01']


    
    forecasts = pd.DataFrame(columns = df_hier.columns, index=['fake'])
    
    
    ################### forecasting total sales in this shop
    #################################################################
    print('\ngetting best preds for forecast_total_sales')
    forecasts['total'] = [get_best_preds(df_hier_train, 'total')]

    
  
    ################### forecasting all category sales in this shop
    #################################################################
    #dataframe to hold the forecasts
    for cat in oneshop_cat_list:
        print(f'\ngetting best preds for s{shop_id}_', cat)
        forecasts[cat] = [get_best_preds(df_hier_train, cat)]


    ###########save neccessary object to pickle_dict, 
    ############################################################################
    pickle_dict_oneshop_new = dict()
    pickle_dict_oneshop_new['forecasts']=forecasts
    pickle.dump(pickle_dict_oneshop_new, open(f'../temp/s{shop_id}_forecasts.pkl', 'wb'))

