In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from mahts import HTSDistributor
from scipy.stats import iqr

import matplotlib.pyplot as plt

***

In [2]:
# Raw hierarchy table; later cells read its state_id, store_id, item_id and id columns.
hierarchy = pd.read_parquet("../input/hierarchy_raw.parquet")

In [3]:
# Load the fitted encoders; later cells use the "state", "store" and "item"
# entries to map encoded ids back to their labels via inverse_transform.
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# were produced by this project.
# The context manager closes the file even when unpickling raises.
with open("../input/encoders.pkl", "rb") as infile:
    encoders = pickle.load(infile)

***
## Approach: level1 -> level2 -> level3 -> level12

In [4]:
# List the Kaggle submission files available on disk.
!ls ../output/kaggle_submissions/

fnu050-num-leaves-76.csv	 m5-first-public-notebook-under-0-50.csv
m5-accuracy-poisson-tweedie.csv  m5-forecast-v2-python.csv
m5-accuracy-tweedie-is-back.csv  m5-forecaster-v2.csv


In [5]:
# Base forecast for the bottom level (level 12): an `id` column plus the
# horizon columns F1..F28, in Kaggle submission layout.
forecast_level12 = pd.read_csv("../output/kaggle_submissions/m5-first-public-notebook-under-0-50.csv")
forecast_level12.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.910026,0.847982,0.85089,0.801789,1.071719,1.296671,1.350886,1.009637,0.970509,...,1.112975,1.385381,1.294539,0.99315,0.869451,0.849285,0.85523,1.022105,1.262562,1.233375
1,FOODS_1_001_CA_2_validation,0.940942,0.954666,0.882539,1.273918,1.290628,1.354433,1.554513,0.920688,0.9263,...,1.208342,1.641095,1.501229,1.032052,0.982766,1.012417,1.052117,1.241158,1.689393,1.424921
2,FOODS_1_001_CA_3_validation,1.091795,1.044925,0.94997,0.91752,0.990185,1.107758,1.223882,1.105829,1.137468,...,1.132724,1.674212,1.766982,1.147176,1.055445,0.980131,0.969123,1.054151,1.391111,1.266828
3,FOODS_1_001_CA_4_validation,0.414818,0.361572,0.355099,0.349345,0.405143,0.451183,0.517519,0.398493,0.426824,...,0.457459,0.482851,0.500463,0.385219,0.363917,0.368164,0.37339,0.41976,0.45792,0.481183
4,FOODS_1_001_TX_1_validation,0.180371,0.179159,0.1707,0.171773,0.172805,0.180151,0.228228,0.476832,0.429286,...,0.393753,0.380485,0.353399,0.295464,0.280412,0.275795,0.27256,0.278792,0.331847,0.318894


In [6]:
# Map the horizon columns F1..F28 to calendar dates.
# Validation window: 2016-04-25 .. 2016-05-22; evaluation window: 2016-05-23 .. 2016-06-19.
date_mapping1 = {f"F{i}": pd.to_datetime("2016-04-24")+pd.DateOffset(days=i) for i in range(1,29)}
date_mapping2 = {f"F{i}": pd.to_datetime("2016-05-22")+pd.DateOffset(days=i) for i in range(1,29)}


def _split_level12(frame, suffix, date_mapping):
    """Select the rows of a submission-shaped frame for one window and reshape them.

    Parameters
    ----------
    frame : DataFrame with an ``id`` column and the horizon columns F1..F28.
    suffix : id suffix that marks the window ("_validation" or "_evaluation").
    date_mapping : dict mapping each horizon column name to its calendar date.

    Returns
    -------
    DataFrame indexed by date with one column per series id (suffix removed).
    """
    # Select by id suffix instead of hard-coded row labels, so the split does
    # not depend on the row order or the row count of the submission file.
    in_window = frame["id"].str.endswith(suffix)
    return (
        frame
        .loc[in_window, :]
        .assign(id=lambda part: part["id"].str.replace(suffix, "", regex=False))
        .rename(date_mapping, axis=1)
        .set_index("id")
        .transpose()
    )


forecast_level12_valid = _split_level12(forecast_level12, "_validation", date_mapping1)
forecast_level12_eval = _split_level12(forecast_level12, "_evaluation", date_mapping2)

In [7]:
# Sanity check of the reshaped frame: rows are forecast dates, columns are series ids.
forecast_level12_valid.tail()

id,FOODS_1_001_CA_1,FOODS_1_001_CA_2,FOODS_1_001_CA_3,FOODS_1_001_CA_4,FOODS_1_001_TX_1,FOODS_1_001_TX_2,FOODS_1_001_TX_3,FOODS_1_001_WI_1,FOODS_1_001_WI_2,FOODS_1_001_WI_3,...,HOUSEHOLD_2_516_CA_1,HOUSEHOLD_2_516_CA_2,HOUSEHOLD_2_516_CA_3,HOUSEHOLD_2_516_CA_4,HOUSEHOLD_2_516_TX_1,HOUSEHOLD_2_516_TX_2,HOUSEHOLD_2_516_TX_3,HOUSEHOLD_2_516_WI_1,HOUSEHOLD_2_516_WI_2,HOUSEHOLD_2_516_WI_3
2016-05-18 00:00:00,0.849285,1.012417,0.980131,0.368164,0.275795,0.428849,0.435424,0.556651,0.381275,0.324671,...,0.189202,0.184461,0.115581,0.114239,0.10588,0.234043,0.141908,0.085014,0.088967,0.103765
2016-05-19 00:00:00,0.85523,1.052117,0.969123,0.37339,0.27256,0.435233,0.440499,0.567618,0.383275,0.319644,...,0.208429,0.186873,0.114525,0.115479,0.108983,0.226027,0.145095,0.08765,0.076782,0.109788
2016-05-20 00:00:00,1.022105,1.241158,1.054151,0.41976,0.278792,0.481264,0.471629,0.703742,0.403016,0.378388,...,0.291932,0.234969,0.131894,0.133653,0.129428,0.280163,0.177965,0.131066,0.101013,0.154863
2016-05-21 00:00:00,1.262562,1.689393,1.391111,0.45792,0.331847,0.547399,0.53926,1.02689,0.440113,0.424405,...,0.356674,0.273579,0.162382,0.161577,0.159163,0.36825,0.207824,0.145694,0.101227,0.16025
2016-05-22 00:00:00,1.233375,1.424921,1.266828,0.481183,0.318894,0.528644,0.506802,0.805103,0.405239,0.364692,...,0.343888,0.268824,0.150913,0.16271,0.143827,0.339974,0.187936,0.139324,0.095689,0.143969


In [8]:
# Top-level ("root") forecast. The CSV is read and date-parsed once, then
# split by date: rows up to 2016-05-22 form the validation window and rows
# from 2016-05-23 onwards form the evaluation window.
_forecast_level1_all = (
    pd.read_csv("../output/forecast_level1.csv", parse_dates=["ds"])
    .rename({"y_pred":"root"}, axis=1)
)

forecast_level1_valid = _forecast_level1_all.query("ds <= '2016-05-22'").set_index("ds")
forecast_level1_eval = _forecast_level1_all.query("ds >= '2016-05-23'").set_index("ds")

In [9]:
# Inspect the last rows of the evaluation-window root forecast.
forecast_level1_eval.tail()

Unnamed: 0_level_0,root
ds,Unnamed: 1_level_1
2016-06-15,41821.755327
2016-06-16,40034.043678
2016-06-17,43855.204804
2016-06-18,49876.376823
2016-06-19,47095.657592


In [10]:
# State-level forecast. The CSV is read and date-parsed once and reused for both windows.
_forecast_level2_all = pd.read_csv("../output/forecast_level2.csv", parse_dates=["ds"])


def _level2_window(frame, date_filter):
    """Return the rows matching `date_filter` as a dates x state table of y_pred.

    `date_filter` is a DataFrame.query expression on the `ds` column. The
    encoded state ids are mapped back to their labels before pivoting.
    """
    return (
        frame
        .query(date_filter)
        .assign(state_id=lambda part: encoders["state"].inverse_transform(part.state_id))
        .pivot(index="ds", columns="state_id", values="y_pred")
    )


forecast_level2_valid = _level2_window(_forecast_level2_all, "ds <= '2016-05-22'")
forecast_level2_eval = _level2_window(_forecast_level2_all, "ds >= '2016-05-23'")

In [11]:
# Inspect the first rows of the evaluation-window state forecast (one column per state).
forecast_level2_eval.head()

state_id,CA,TX,WI
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-05-23,16436.528628,10305.473531,10386.920153
2016-05-24,14835.246738,9336.073186,9902.577344
2016-05-25,14536.126882,9350.38516,9849.840648
2016-05-26,14897.844553,9607.3091,10267.400059
2016-05-27,16542.566114,10359.831406,11796.342535


In [12]:
# Store-level forecast. The CSV is read and date-parsed once and reused for both windows.
_forecast_level3_all = pd.read_csv("../output/forecast_level3.csv", parse_dates=["ds"])


def _level3_window(frame, date_filter):
    """Return the rows matching `date_filter` as a dates x store table of y_pred.

    `date_filter` is a DataFrame.query expression on the `ds` column. The
    encoded store ids are mapped back to their labels before pivoting.
    """
    return (
        frame
        .query(date_filter)
        .assign(store_id=lambda part: encoders["store"].inverse_transform(part.store_id))
        .pivot(index="ds", columns="store_id", values="y_pred")
    )


forecast_level3_valid = _level3_window(_forecast_level3_all, "ds <= '2016-05-22'")
forecast_level3_eval = _level3_window(_forecast_level3_all, "ds >= '2016-05-23'")

In [13]:
# Inspect the first rows of the validation-window store forecast (one column per store).
forecast_level3_valid.head()

store_id,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,TX_3,WI_1,WI_2,WI_3
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-04-25,4091.490967,3834.845789,5917.302713,2589.053279,2986.369223,3658.009162,3670.41669,3214.306487,4328.170338,3242.235332
2016-04-26,3683.369169,3671.67541,5486.626948,2432.476441,2765.738437,3334.837524,3393.874701,3106.612502,4167.746957,3026.673254
2016-04-27,3563.569356,3647.429944,5250.504504,2358.486993,2687.865582,3289.101873,3296.410209,3121.47878,4147.533751,2940.078248
2016-04-28,3603.053812,3689.19284,5180.914969,2359.378619,2659.252982,3297.943273,3280.910317,3181.940656,4173.130817,2980.661975
2016-04-29,4349.159399,4334.66551,5661.639214,2543.332982,2961.314859,3760.779512,3610.539906,3928.80986,4872.960884,3716.46309


***

In [14]:
# Parent -> children mapping of the hierarchy:
# "root" -> states, each state -> its stores, each store -> its series ids.
_states = hierarchy.state_id.unique()
_stores = hierarchy.store_id.unique()

hierarchy_dict = {"root": _states}
hierarchy_dict.update({
    state: hierarchy.loc[hierarchy.state_id == state, "store_id"].unique()
    for state in _states
})
hierarchy_dict.update({
    store: hierarchy.loc[hierarchy.store_id == store, "id"].unique()
    for store in _stores
})

In [15]:
# Wrap the parent -> children mapping in mahts' HTSDistributor; later cells use
# its compute_optimal_combination method and its bottom_nodes attribute.
hts = HTSDistributor(hierarchy_dict)

In [16]:
# Put the validation-window forecasts of every level side by side (one column per node).
_valid_levels = [
    forecast_level1_valid,
    forecast_level2_valid,
    forecast_level3_valid,
    forecast_level12_valid,
]
forecast_valid = pd.concat(_valid_levels, axis=1)

In [17]:
# Put the evaluation-window forecasts of every level side by side (one column per node).
_eval_levels = [
    forecast_level1_eval,
    forecast_level2_eval,
    forecast_level3_eval,
    forecast_level12_eval,
]
forecast_eval = pd.concat(_eval_levels, axis=1)

***
### weights

In [116]:
# Per-state residuals, with the encoded state ids mapped back to their labels.
residuals_level2 = (
    pd.read_parquet("../output/residuals_level2.parquet")
    .assign(state_id=lambda frame: encoders["state"].inverse_transform(frame.state_id))
)
residuals_level2

Unnamed: 0,state_id,residual
0,CA,401.2735
1,TX,379.039934
2,WI,405.244333


In [117]:
# Per-store residuals, with the encoded store ids mapped back to their labels.
residuals_level3 = (
    pd.read_parquet("../output/residuals_level3.parquet")
    .assign(store_id=lambda frame: encoders["store"].inverse_transform(frame.store_id))
)
residuals_level3

Unnamed: 0,store_id,residual
0,CA_1,191.696049
1,CA_2,186.930838
2,CA_3,243.934122
3,CA_4,113.910485
4,TX_1,156.807282
5,TX_2,220.916225
6,TX_3,165.716004
7,WI_1,205.164055
8,WI_2,204.697503
9,WI_3,200.869899


In [118]:
# Per-series residuals: decode item and store ids, then attach them to every
# (store, item) pair of the hierarchy to obtain the series `id`.
_residuals_level12_decoded = (
    pd.read_parquet("../output/residuals_level12.parquet")
    .assign(item_id=lambda frame: encoders["item"].inverse_transform(frame.item_id))
    .assign(store_id=lambda frame: encoders["store"].inverse_transform(frame.store_id))
)
residuals_level12 = (
    hierarchy.loc[:, ["store_id","item_id","id"]]
    .merge(_residuals_level12_decoded, how="left", on=["item_id","store_id"])
    # Some (item_id, store_id) pairs are missing from the residuals file;
    # the left join leaves them as NaN, and they get a residual of 1.
    .assign(residual=lambda frame: frame.residual.fillna(1))
)
residuals_level12

Unnamed: 0,store_id,item_id,id,residual
0,CA_1,HOBBIES_1_001,HOBBIES_1_001_CA_1,0.873202
1,CA_1,HOBBIES_1_002,HOBBIES_1_002_CA_1,0.606440
2,CA_1,HOBBIES_1_003,HOBBIES_1_003_CA_1,0.724929
3,CA_1,HOBBIES_1_004,HOBBIES_1_004_CA_1,1.996709
4,CA_1,HOBBIES_1_005,HOBBIES_1_005_CA_1,1.186268
...,...,...,...,...
30485,WI_3,FOODS_3_823,FOODS_3_823_WI_3,0.785383
30486,WI_3,FOODS_3_824,FOODS_3_824_WI_3,0.592361
30487,WI_3,FOODS_3_825,FOODS_3_825_WI_3,1.175961
30488,WI_3,FOODS_3_826,FOODS_3_826_WI_3,1.150346


In [119]:
def _inverse_square_weights(residual_frame, key_column):
    """Map every value of `key_column` to 1 / residual**2.

    Computed on whole columns instead of row by row with iterrows, which is
    slow on the bottom-level table. A zero residual raises ZeroDivisionError,
    as the row-wise Python division did.
    """
    residuals = residual_frame["residual"]
    if (residuals == 0).any():
        raise ZeroDivisionError(f"zero residual found for column '{key_column}'")
    return dict(zip(residual_frame[key_column], 1. / residuals ** 2))


# Residual of the root-level model.
# NOTE(review): hard-coded value; its origin is not visible in this notebook -- confirm.
_ROOT_RESIDUAL = 882.9388819910898

weights_level12 = _inverse_square_weights(residuals_level12, "id")
weights_level3 = _inverse_square_weights(residuals_level3, "store_id")
weights_level2 = _inverse_square_weights(residuals_level2, "state_id")
weights_level1 = {"root":1./(_ROOT_RESIDUAL**2)}

In [120]:
# One flat node -> weight mapping covering every level; on a duplicated key
# the later (lower) level wins.
weights = {}
for _level_weights in (weights_level1, weights_level2, weights_level3, weights_level12):
    weights.update(_level_weights)

***

In [122]:
# Combine the validation-window forecasts of all levels using the per-node weights.
# NOTE(review): the displayed result carries a fresh integer index rather than
# the dates; the date index is put back in a later cell.
fcst_valid = hts.compute_optimal_combination(forecast_valid, weights=weights)
fcst_valid.head()

Unnamed: 0,root,CA,TX,WI,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,...,FOODS_3_818_WI_3,FOODS_3_819_WI_3,FOODS_3_820_WI_3,FOODS_3_821_WI_3,FOODS_3_822_WI_3,FOODS_3_823_WI_3,FOODS_3_824_WI_3,FOODS_3_825_WI_3,FOODS_3_826_WI_3,FOODS_3_827_WI_3
0,37680.648834,16751.483867,10337.442672,10591.722295,4214.647729,3857.273921,6097.612933,2581.949285,3087.784077,3703.775065,...,1.584383,1.931497,1.533137,0.678676,1.97401,0.280136,0.258698,0.766876,0.898447,0.823534
1,35380.41777,15524.815184,9629.48801,10226.114576,3817.46256,3643.435,5595.225995,2468.691629,2844.93361,3438.605909,...,1.5683,1.638755,1.320771,0.720531,1.656786,0.270925,0.191475,0.645455,0.837931,0.793133
2,34967.808238,15279.389399,9477.471534,10210.947306,3777.412847,3666.886717,5377.606396,2457.483439,2825.866593,3379.482298,...,1.68818,1.582318,1.309771,0.686037,1.577617,0.329461,0.188,0.651402,0.841722,0.716171
3,34994.238769,15261.710056,9442.490915,10290.037797,3794.830072,3678.029032,5337.059192,2451.79176,2790.637209,3382.006123,...,1.659102,1.701679,1.378177,0.649532,1.604032,0.254945,0.187136,0.607735,0.765719,0.700322
4,41330.334292,17510.643614,10767.249057,13052.441621,4548.195116,4509.489584,5810.076744,2642.882171,3149.437568,3914.100041,...,1.694412,1.977987,1.878087,0.799142,2.105389,0.37135,0.220117,0.666762,0.982478,0.725503


In [None]:
# Same combination step, applied to the evaluation-window forecasts.
fcst_eval = hts.compute_optimal_combination(forecast_eval, weights=weights)
fcst_eval.head()

***

In [None]:
def _to_submission_layout(combined, dates, id_suffix):
    """Reshape a combined forecast into Kaggle submission layout.

    Restores `dates` as the index, keeps only the bottom-level nodes, turns
    the series into rows with columns F1..F28, and appends `id_suffix` to
    every series id.
    """
    bottom = combined.set_index(dates).loc[:, hts.bottom_nodes].transpose()
    bottom.columns = [f"F{i}" for i in range(1,29)]
    layout = bottom.reset_index().rename({"index":"id"}, axis=1)
    layout["id"] = layout.id.apply(lambda series_id: series_id + id_suffix)
    return layout


fcst_valid = _to_submission_layout(fcst_valid, forecast_valid.index, "_validation")
fcst_eval = _to_submission_layout(fcst_eval, forecast_eval.index, "_evaluation")

***
### submission

In [None]:
# Sample submission; only its `id` column is used below, to check coverage
# and to fix the row order of the final file.
submission = pd.read_csv("../input/sample_submission.csv")

In [None]:
# Stack the validation rows on top of the evaluation rows with a fresh integer index.
all_predictions = pd.concat([fcst_valid, fcst_eval], ignore_index=True)

In [None]:
# The predictions must cover exactly the ids of the sample submission.
# This is a set comparison, so duplicated ids are not detected here.
assert set(all_predictions.id) == set(submission.id), \
    "there are missing time series predictions."

In [None]:
# Re-order the predictions to follow the sample submission's ids
# (inner join on the only shared column, `id`).
submission = submission.loc[:, ["id"]].merge(all_predictions, on="id", how="inner")

In [None]:
# List the output directory to see which submission files already exist.
!ls ../output

In [112]:
# Write the submission only when the target file does not exist yet, so a
# previously saved result is never overwritten.
file_path = "../output/lgbm_bottom_v41.csv.gz"

if not os.path.isfile(file_path):
    submission.to_csv(file_path, index=False, compression="gzip")
else:
    print("File already exists.")

In [None]:
# Upload the gzipped submission with the Kaggle CLI (the -m message is left empty).
!kaggle competitions submit -c m5-forecasting-accuracy -f ../output/lgbm_bottom_v41.csv.gz -m ""

***

In [114]:
# Root-mean-square difference between the base (level-12) submission and the new submission,
# using weights = 1./std(residual errors)
_horizon_cols = [f"F{i}" for i in range(1,29)]
# Align the base forecast to the new submission's row order before comparing.
base_submission = pd.merge(submission.loc[:, ["id"]], forecast_level12, how="left", on="id")
diff = np.sqrt(np.mean((submission.loc[:, _horizon_cols].values - base_submission.loc[:, _horizon_cols].values)**2))
print(diff)

0.2376194385521546


In [43]:
# Root-mean-square difference between the base (level-12) submission and the new submission
_horizon_cols = [f"F{i}" for i in range(1,29)]
# Align the base forecast to the new submission's row order before comparing.
base_submission = pd.merge(submission.loc[:, ["id"]], forecast_level12, how="left", on="id")
diff = np.sqrt(np.mean((submission.loc[:, _horizon_cols].values - base_submission.loc[:, _horizon_cols].values)**2))
print(diff)

0.38272484979712906


In [87]:
# Root-mean-square difference between the base (level-12) submission and the new submission
_horizon_cols = [f"F{i}" for i in range(1,29)]
# Align the base forecast to the new submission's row order before comparing.
base_submission = pd.merge(submission.loc[:, ["id"]], forecast_level12, how="left", on="id")
diff = np.sqrt(np.mean((submission.loc[:, _horizon_cols].values - base_submission.loc[:, _horizon_cols].values)**2))
print(diff)

0.012083599329737928


***