dependencies </n>
Env :: Forecastinit
Python=3.11.5

Libraries

In [197]:
# General 
import pandas as pd
import numpy as np

# sklearn 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# skforecast 
from skforecast.datasets import load_demo_dataset
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import backtesting_forecaster
from skforecast.model_selection import grid_search_forecaster

# ML Models 
from xgboost import XGBRegressor

Input

In [198]:
Historical_data_org = pd.read_csv('Input/Actual.csv')
TimeKey_org = pd.read_csv('Input/Time.csv')
Forecast_level = ['Version.[Version Name]', 'Channel.[Channel]', 'Account.[Account]',
       'PnL.[PnL]', 'Demand Domain.[Demand Domain]', 'Region.[Region]',
       'Location.[Location]', 'Time.[Planning Month]', 'Item.[Item]']
Time_column = ['Time.[Planning Month]']
Historical_data_column = ['Actual']
#Date should be in format of time key column, which is available in TimeKey file 
Time_key_column_name = "Time.[PlanningMonthKey]"
Time_key_date_format = "%m/%d/%Y %I:%M:%S %p"
Historic_start_date = "M09-20" 
Historic_end_date = "M12-22"
Forecast_start_date = "M01-23"
Forecast_end_date = "M03-23"
drivers = ['driver_Month']
models = ["Random Forest", "XG Boost"]
Hypertunning = False


In [199]:
# lags used in grid search hyper tunning 
# lags_grid = [2,3,6,[1,2,3],[1,2,3,6],[3,6],[1,2,3,6]]
lags_grid = [2,6,[1,2,3]]

# parameters used in grid search hyper tunning 
param_grid = {
    'n_estimators': [50, 100, 200, 500, 1000, 1500],
    'max_depth': [2, 5, 8, 10, 12, 15]
}


Custom Functions

In [200]:
# Data filter based on start and end date, then split in X and y based on provided columns 
# split(historical data frame contains drivers and actual, start date , end date, X columns in list, y column in string )
def split(data,Historic_start_date_key,Historic_end_date_key,Forecast_start_date_key,Forecast_end_date_key,X_cols,y_col):
    data = data[(data['TimeKey']>=Historic_start_date_key)&(data['TimeKey']<=Forecast_end_date_key)]
    data.sort_values('TimeKey',inplace=True)
    data.reset_index(drop=True,inplace=True)
    
    # creating date range this will be only for index related purpose , will not used in forecasting 
    index = pd.Series(pd.date_range(start=Historic_start_date_key,periods=len(data)))
    data['index'] = index
    data.set_index('index',inplace=True)
    data = data.asfreq('D')

    #  train/test 
    train =  data[(data['TimeKey']>=Historic_start_date_key)&(data['TimeKey']<=Historic_end_date_key)]
    test =  data[(data['TimeKey']>Historic_end_date_key)&(data['TimeKey']<=Forecast_end_date_key)]
    
    
    X_train = train[['key','TimeKey']+X_cols]
    y_train = train[['TimeKey']+[y_col]]

    X_test = test[['key','TimeKey']+X_cols]
    y_test = test[['TimeKey']+[y_col]]
    return X_train,y_train,X_test,y_test

In [201]:
# fit the model 
# if Hypertunning is turned on fit will be in grid search else normal fit with default values will return 
def skforecastgridsrchfit(forecaster,y_train_loc,X_train_loc,drivers_loc,param_grid,lags_grid):
    results = grid_search_forecaster(
              forecaster         = forecaster,
              y                  = y_train_loc,
              param_grid         = param_grid,
              lags_grid          = lags_grid,
              steps              = 12,
              refit              = True,
              metric             = 'mean_squared_error',
              initial_train_size = len(y_train)-1,
              fixed_train_size   = False,
              return_best        = True,
              n_jobs             = 'auto',
              verbose            = False,
              show_progress      = True,
              exog               = X_train_loc[drivers_loc]
          )   

In [202]:
# SK Forecast ML Models
# skforecastpredict(models in list, X train data frame, y train data frame, X test data frame ) 
def skforecastpredict(models,Historical_data_column, X_train, y_train, X_test, drivers_local, Hypertunning,param_grid,lags_grid):
    # Predicted output data frame 
    Output = pd.DataFrame()
    # converting data frame to series, as sk forecast y accept series 
    y_train_loc = y_train[Historical_data_column[0]]
    
    # from sklearn.preprocessing import FunctionTransformer
    # transformer_y = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)

    # Random Forest ========================================================================================================================= 
    if "Random Forest" in models:
        randomforestforecaster = ForecasterAutoreg(
                 regressor = RandomForestRegressor(random_state=123),
                 lags      = [2]
             )
        if Hypertunning is True:
            skforecastgridsrchfit(randomforestforecaster,y_train_loc,X_train,drivers_local,param_grid,lags_grid)
        else:
            randomforestforecaster.fit(y= y_train_loc, exog=X_train[drivers_local])
        y_hat_randomforest = randomforestforecaster.predict_interval(steps=3,exog=X_test[drivers_local]).reset_index(drop=True)
        y_hat_randomforest.columns = ['Random Forest Y_hat', 'Random Forest Lower Bound', 'Random Forest Upper Bound']
        Output = pd.concat([Output, y_hat_randomforest], axis=1)
    # xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    
    #XG Boost ================================================================================================================================
    if "XG Boost" in models:
        xgboostForecaster = ForecasterAutoreg(
                regressor= XGBRegressor(random_state = 123),
                lags= [1,2]
            )      
        if Hypertunning is True:
            skforecastgridsrchfit(xgboostForecaster,y_train_loc,X_train,drivers_local,param_grid,lags_grid)
        else:
            xgboostForecaster.fit(y=y_train_loc, exog=X_train[drivers_local])
        y_hat_xgboost = xgboostForecaster.predict_interval(steps=3,exog=X_test[drivers]).reset_index(drop=True)
        y_hat_xgboost.columns = ['XG Boost Y_hat', 'XG Boost Lower Bound', 'XG Boost Upper Bound']
        Output = pd.concat([Output, y_hat_xgboost], axis=1)
    # xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
        
    return Output

Processing Input

In [203]:
Forecast_level_minus_time = Forecast_level.copy()
Forecast_level_minus_time.remove(Time_column[0])
Forecast_level_minus_time

['Version.[Version Name]',
 'Channel.[Channel]',
 'Account.[Account]',
 'PnL.[PnL]',
 'Demand Domain.[Demand Domain]',
 'Region.[Region]',
 'Location.[Location]',
 'Item.[Item]']

Copy Input 

In [204]:
Historical_data = Historical_data_org.copy(deep=True)
TimeKey = TimeKey_org.copy(deep=True)
TimeKey['TimeKey'] = pd.to_datetime(TimeKey[Time_key_column_name], format=Time_key_date_format)

In [205]:
# calculating Historic_end_date_key
Historic_start_date_key = pd.to_datetime(TimeKey[TimeKey[Time_column[0]]==Historic_start_date]['TimeKey'].values[0])
Historic_end_date_key = pd.to_datetime(TimeKey[TimeKey[Time_column[0]]==Historic_end_date]['TimeKey'].values[0])
Forecast_start_date_key = pd.to_datetime(TimeKey[TimeKey[Time_column[0]]==Forecast_start_date]['TimeKey'].values[0])
Forecast_end_date_key = pd.to_datetime(TimeKey[TimeKey[Time_column[0]]==Forecast_end_date]['TimeKey'].values[0])

Processing Copy Input

In [206]:
Historical_data[Historical_data_column] = Historical_data[Historical_data_column].astype(float)


Creating Key in Historical Data

In [207]:
# creating keys 
Historical_data['key'] = Historical_data[Forecast_level_minus_time].astype(str).agg("__ MDJoinner__".join, axis=1)
# dropping columns, which is already present in keys 
Historical_data.drop(Forecast_level_minus_time,axis=1,inplace=True) 
Historical_data.head(2)

Unnamed: 0,Time.[Planning Month],Actual,key
0,M07-20,445.0,CurrentWorkingView__ MDJoinner__B2B__ MDJoinne...
1,M08-20,711.0,CurrentWorkingView__ MDJoinner__B2B__ MDJoinne...


Custom Filter

In [208]:
Historical_data = Historical_data[Historical_data['key']=='CurrentWorkingView__ MDJoinner__B2B__ MDJoinner__AMO__ MDJoinner__DP__ MDJoinner__DP__ MDJoinner__ShipTo1__ MDJoinner__DP__ MDJoinner__Loctite 248 19g Stick']
# Historical_data['key'].values

Merging Time Key in Historical Data

In [209]:
Historical_data = pd.merge(Historical_data,TimeKey[Time_column+["TimeKey"]],on=Time_column,how='left')
Historical_data.head(2)

Unnamed: 0,Time.[Planning Month],Actual,key,TimeKey
0,M07-20,445.0,CurrentWorkingView__ MDJoinner__B2B__ MDJoinne...,2020-07-05
1,M08-20,711.0,CurrentWorkingView__ MDJoinner__B2B__ MDJoinne...,2020-08-02


Adding features

In [210]:
# Creating drivers 
Historical_data['driver_Month'] = Historical_data['TimeKey'].dt.month
Historical_data.head()

Unnamed: 0,Time.[Planning Month],Actual,key,TimeKey,driver_Month
0,M07-20,445.0,CurrentWorkingView__ MDJoinner__B2B__ MDJoinne...,2020-07-05,7
1,M08-20,711.0,CurrentWorkingView__ MDJoinner__B2B__ MDJoinne...,2020-08-02,8
2,M09-20,462.0,CurrentWorkingView__ MDJoinner__B2B__ MDJoinne...,2020-08-30,8
3,M10-20,174.0,CurrentWorkingView__ MDJoinner__B2B__ MDJoinne...,2020-10-04,10
4,M11-20,179.0,CurrentWorkingView__ MDJoinner__B2B__ MDJoinne...,2020-11-01,11


Train and Test Data

In [211]:
X_train,y_train,X_test,y_test = split(Historical_data, Historic_start_date_key,Historic_end_date_key,Forecast_start_date_key,Forecast_end_date_key,drivers,Historical_data_column[0])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.sort_values('TimeKey',inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['index'] = index


Prediction

In [212]:
Hypertunning=True
models = ["Random Forest", "XG Boost"]
rf = skforecastpredict(models, Historical_data_column, X_train, y_train, X_test, drivers, Hypertunning, param_grid, lags_grid)

Number of models compared: 108.


lags grid:   0%|          | 0/3 [00:00<?, ?it/s]

params grid:   0%|          | 0/36 [00:00<?, ?it/s]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1 2 3] 
  Parameters: {'max_depth': 2, 'n_estimators': 200}
  Backtesting metric: 1399.7474066141463

Number of models compared: 108.


lags grid:   0%|          | 0/3 [00:00<?, ?it/s]

params grid:   0%|          | 0/36 [00:00<?, ?it/s]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1 2] 
  Parameters: {'max_depth': 12, 'n_estimators': 50}
  Backtesting metric: 364.36970446351916



In [None]:
rf

Unnamed: 0,Random Forest Y_hat,Random Forest Lower Bound,Random Forest Upper Bound,XG Boost Y_hat,XG Boost Lower Bound,XG Boost Upper Bound
0,220.22,185.468,290.06,286.099701,286.098434,286.100861
1,198.05,163.62,267.89,211.592102,211.590836,211.593262
2,207.58,169.7375,321.936,178.813751,178.812485,178.814911


In [109]:
# Grid search hyperparameters and lags
# ==============================================================================
forecaster = ForecasterAutoreg(
                 regressor = RandomForestRegressor(random_state=123),
                 lags      = 2 # Placeholder, the value will be overwritten
             )

# Lags used as predictors
lags_grid = [2,3,6,[1,2,3],[1,2,3,6],[3,6],[1,2,3,6]]

lags_grid = [2,3]
# Regressor hyperparameters
param_grid = {
    'n_estimators': [50, 100, 200, 500, 1000, 1500],
    'max_depth': [2, 5, 8, 10, 12, 15]
}

forecaster.fit(y=y_train[Historical_data_column[0]],exog=X_train[drivers])
forecaster.predict_interval(steps=3,exog=X_test[drivers])
# lags_grid = [2, 10, [1, 2, 3, 20]]

# Regressor hyperparameters
# param_grid = {
#     'n_estimators': [50, 100],
#     'max_depth': [5, 10, 15]
# }




Unnamed: 0,pred,lower_bound,upper_bound
2020-09-27,245.67,173.06,374.14
2020-09-28,236.78,150.8115,390.203
2020-09-29,235.31,152.7535,463.226


In [131]:
rf
yhat = rf.rename(columns={'Random Forest Y_hat':'pred','XG Boost Y_hat':'pred'})

yhat

Unnamed: 0,pred,Random Forest Lower Bound,Random Forest Upper Bound
0,242.18,171.91925,416.8
1,275.09,206.505,449.71
2,261.68,120.6405,566.3995


In [132]:
y_test.reset_index(inplace=True,drop=True)
y_test


Unnamed: 0,TimeKey,Actual
0,2023-01-01,209.0
1,2023-01-29,223.0
2,2023-02-26,333.0


In [133]:
yhat = pd.merge(yhat[['pred']],y_test,left_index=True,right_index=True,how='outer')
yhat

Unnamed: 0,pred,TimeKey,Actual
0,242.18,2023-01-01,209.0
1,275.09,2023-01-29,223.0
2,261.68,2023-02-26,333.0


In [134]:
yhat['at'] = 'all'
yhat['diff'] = abs(yhat['pred'] - yhat['Actual'])
yhat = yhat.groupby(['at'],as_index=False)[['diff','Actual']].sum()
yhat

Unnamed: 0,at,diff,Actual
0,all,156.59,765.0


In [135]:
print("Accuracy",100-yhat['diff']/yhat['Actual']*100)


Accuracy 0    79.530719
dtype: float64
