In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from mahts import HTSDistributor
from scipy.stats import iqr

import matplotlib.pyplot as plt

***

In [2]:
# Hierarchy table of the series; later cells read its state_id, store_id and id columns.
hierarchy = pd.read_parquet(path="../input/hierarchy_raw.parquet")

In [3]:
# Load the fitted encoders (later cells index them by "state" and "store").
# The context manager closes the handle even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load encoder files that were produced by this project.
with open("../input/encoders.pkl", "rb") as infile:
    encoders = pickle.load(infile)

***
## Approach: reconcile level1 (total) -> level2 (state) -> level3 (store) -> level12 (item x store)

In [4]:
!ls ../output/kaggle_submissions/

fnu050-num-leaves-76.csv	 m5-first-public-notebook-under-0-50.csv
m5-accuracy-poisson-tweedie.csv  m5-forecast-v2-python.csv
m5-accuracy-tweedie-is-back.csv  m5-forecaster-v2.csv


In [5]:
# Bottom-level (level 12) base forecasts, taken from an earlier Kaggle submission file.
forecast_level12 = pd.read_csv(
    filepath_or_buffer="../output/kaggle_submissions/m5-first-public-notebook-under-0-50.csv",
)
forecast_level12.head(n=5)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.910026,0.847982,0.85089,0.801789,1.071719,1.296671,1.350886,1.009637,0.970509,...,1.112975,1.385381,1.294539,0.99315,0.869451,0.849285,0.85523,1.022105,1.262562,1.233375
1,FOODS_1_001_CA_2_validation,0.940942,0.954666,0.882539,1.273918,1.290628,1.354433,1.554513,0.920688,0.9263,...,1.208342,1.641095,1.501229,1.032052,0.982766,1.012417,1.052117,1.241158,1.689393,1.424921
2,FOODS_1_001_CA_3_validation,1.091795,1.044925,0.94997,0.91752,0.990185,1.107758,1.223882,1.105829,1.137468,...,1.132724,1.674212,1.766982,1.147176,1.055445,0.980131,0.969123,1.054151,1.391111,1.266828
3,FOODS_1_001_CA_4_validation,0.414818,0.361572,0.355099,0.349345,0.405143,0.451183,0.517519,0.398493,0.426824,...,0.457459,0.482851,0.500463,0.385219,0.363917,0.368164,0.37339,0.41976,0.45792,0.481183
4,FOODS_1_001_TX_1_validation,0.180371,0.179159,0.1707,0.171773,0.172805,0.180151,0.228228,0.476832,0.429286,...,0.393753,0.380485,0.353399,0.295464,0.280412,0.275795,0.27256,0.278792,0.331847,0.318894


In [6]:
def _forecast_dates(last_observed_day):
    """Map the submission columns F1..F28 to the 28 calendar days after `last_observed_day`."""
    anchor = pd.to_datetime(last_observed_day)
    return {f"F{i}": anchor + pd.DateOffset(days=i) for i in range(1, 29)}


def _level12_by_date(forecasts, suffix, date_mapping):
    """Return the rows of `forecasts` whose id ends with `suffix` as a date x series frame.

    Rows are selected by id suffix rather than by row position, so the result
    does not depend on how the submission file happens to be ordered or on a
    hard-coded number of series.

    Parameters
    ----------
    forecasts : DataFrame with an "id" column and the horizon columns F1..F28.
    suffix : "_validation" or "_evaluation"; it is stripped from the ids.
    date_mapping : dict mapping "F1".."F28" to the dates they stand for.
    """
    selected = forecasts.loc[forecasts["id"].str.endswith(suffix)]
    return (
        selected
        .assign(id=selected["id"].str.replace(suffix, "", regex=False))
        .rename(date_mapping, axis=1)
        .set_index("id")
        .transpose()
    )


# Validation horizon: 2016-04-25 .. 2016-05-22; evaluation horizon: 2016-05-23 .. 2016-06-19.
date_mapping1 = _forecast_dates("2016-04-24")
date_mapping2 = _forecast_dates("2016-05-22")

forecast_level12_valid = _level12_by_date(forecast_level12, "_validation", date_mapping1)
forecast_level12_eval = _level12_by_date(forecast_level12, "_evaluation", date_mapping2)

In [7]:
# Preview: the last rows should be the final days of the validation horizon.
forecast_level12_valid.tail(n=5)

id,FOODS_1_001_CA_1,FOODS_1_001_CA_2,FOODS_1_001_CA_3,FOODS_1_001_CA_4,FOODS_1_001_TX_1,FOODS_1_001_TX_2,FOODS_1_001_TX_3,FOODS_1_001_WI_1,FOODS_1_001_WI_2,FOODS_1_001_WI_3,...,HOUSEHOLD_2_516_CA_1,HOUSEHOLD_2_516_CA_2,HOUSEHOLD_2_516_CA_3,HOUSEHOLD_2_516_CA_4,HOUSEHOLD_2_516_TX_1,HOUSEHOLD_2_516_TX_2,HOUSEHOLD_2_516_TX_3,HOUSEHOLD_2_516_WI_1,HOUSEHOLD_2_516_WI_2,HOUSEHOLD_2_516_WI_3
2016-05-18 00:00:00,0.849285,1.012417,0.980131,0.368164,0.275795,0.428849,0.435424,0.556651,0.381275,0.324671,...,0.189202,0.184461,0.115581,0.114239,0.10588,0.234043,0.141908,0.085014,0.088967,0.103765
2016-05-19 00:00:00,0.85523,1.052117,0.969123,0.37339,0.27256,0.435233,0.440499,0.567618,0.383275,0.319644,...,0.208429,0.186873,0.114525,0.115479,0.108983,0.226027,0.145095,0.08765,0.076782,0.109788
2016-05-20 00:00:00,1.022105,1.241158,1.054151,0.41976,0.278792,0.481264,0.471629,0.703742,0.403016,0.378388,...,0.291932,0.234969,0.131894,0.133653,0.129428,0.280163,0.177965,0.131066,0.101013,0.154863
2016-05-21 00:00:00,1.262562,1.689393,1.391111,0.45792,0.331847,0.547399,0.53926,1.02689,0.440113,0.424405,...,0.356674,0.273579,0.162382,0.161577,0.159163,0.36825,0.207824,0.145694,0.101227,0.16025
2016-05-22 00:00:00,1.233375,1.424921,1.266828,0.481183,0.318894,0.528644,0.506802,0.805103,0.405239,0.364692,...,0.343888,0.268824,0.150913,0.16271,0.143827,0.339974,0.187936,0.139324,0.095689,0.143969


In [8]:
# Level-1 (total) forecasts. The CSV is parsed once and then split by date,
# instead of being read from disk separately for each horizon.
forecast_level1_raw = pd.read_csv("../output/forecast_level1.csv", parse_dates=["ds"])

# Validation horizon: everything up to and including 2016-05-22.
forecast_level1_valid = (
    forecast_level1_raw
    .query("ds <= '2016-05-22'")
    .set_index("ds")
    .rename({"y_pred":"root"}, axis=1)
)

# Evaluation horizon: everything from 2016-05-23 onwards.
forecast_level1_eval = (
    forecast_level1_raw
    .query("ds >= '2016-05-23'")
    .set_index("ds")
    .rename({"y_pred":"root"}, axis=1)
)

In [9]:
# Preview the final days of the evaluation horizon for the total series.
forecast_level1_eval.tail(n=5)

Unnamed: 0_level_0,root
ds,Unnamed: 1_level_1
2016-06-15,41821.755327
2016-06-16,40034.043678
2016-06-17,43855.204804
2016-06-18,49876.376823
2016-06-19,47095.657592


In [10]:
# Level-2 (state) forecasts. The CSV is parsed once and the shared
# decode-and-pivot step lives in one helper instead of two copy-pasted pipelines.
forecast_level2_raw = pd.read_csv("../output/forecast_level2.csv", parse_dates=["ds"])


def _pivot_by_state(rows):
    """Decode the encoded state ids and pivot `rows` to a date x state frame of y_pred."""
    return (
        rows
        .assign(state_id = lambda x: encoders["state"].inverse_transform(x.state_id))
        .pivot(index="ds", columns="state_id", values="y_pred")
    )


forecast_level2_valid = _pivot_by_state(forecast_level2_raw.query("ds <= '2016-05-22'"))
forecast_level2_eval = _pivot_by_state(forecast_level2_raw.query("ds >= '2016-05-23'"))

In [11]:
# Preview the first evaluation days: one column per state.
forecast_level2_eval.head(n=5)

state_id,CA,TX,WI
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-05-23,16436.528628,10305.473531,10386.920153
2016-05-24,14835.246738,9336.073186,9902.577344
2016-05-25,14536.126882,9350.38516,9849.840648
2016-05-26,14897.844553,9607.3091,10267.400059
2016-05-27,16542.566114,10359.831406,11796.342535


In [12]:
# Level-3 (store) forecasts. The CSV is parsed once and the shared
# decode-and-pivot step lives in one helper instead of two copy-pasted pipelines.
forecast_level3_raw = pd.read_csv("../output/forecast_level3.csv", parse_dates=["ds"])


def _pivot_by_store(rows):
    """Decode the encoded store ids and pivot `rows` to a date x store frame of y_pred."""
    return (
        rows
        .assign(store_id = lambda x: encoders["store"].inverse_transform(x.store_id))
        .pivot(index="ds", columns="store_id", values="y_pred")
    )


forecast_level3_valid = _pivot_by_store(forecast_level3_raw.query("ds <= '2016-05-22'"))
forecast_level3_eval = _pivot_by_store(forecast_level3_raw.query("ds >= '2016-05-23'"))

In [13]:
# Preview the first validation days: one column per store.
forecast_level3_valid.head(n=5)

store_id,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,TX_3,WI_1,WI_2,WI_3
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-04-25,4091.490967,3834.845789,5917.302713,2589.053279,2986.369223,3658.009162,3670.41669,3214.306487,4328.170338,3242.235332
2016-04-26,3683.369169,3671.67541,5486.626948,2432.476441,2765.738437,3334.837524,3393.874701,3106.612502,4167.746957,3026.673254
2016-04-27,3563.569356,3647.429944,5250.504504,2358.486993,2687.865582,3289.101873,3296.410209,3121.47878,4147.533751,2940.078248
2016-04-28,3603.053812,3689.19284,5180.914969,2359.378619,2659.252982,3297.943273,3280.910317,3181.940656,4173.130817,2980.661975
2016-04-29,4349.159399,4334.66551,5661.639214,2543.332982,2961.314859,3760.779512,3610.539906,3928.80986,4872.960884,3716.46309


***

In [14]:
# Parent -> children mapping of the hierarchy:
#   "root"  -> states
#   state   -> stores in that state
#   store   -> bottom-level series ids sold in that store
state_ids = hierarchy.state_id.unique()
store_ids = hierarchy.store_id.unique()

hierarchy_dict = {"root": hierarchy.state_id.unique()}

hierarchy_dict.update({
    state: hierarchy.loc[hierarchy.state_id == state, "store_id"].unique()
    for state in state_ids
})

hierarchy_dict.update({
    store: hierarchy.loc[hierarchy.store_id == store, "id"].unique()
    for store in store_ids
})

In [15]:
# Reconciliation helper built from the parent -> children mapping in hierarchy_dict.
# NOTE(review): assumes HTSDistributor accepts a {parent: children} mapping as its
# first argument -- confirm against the mahts documentation.
hts = HTSDistributor(hierarchy_dict)

In [16]:
# Base forecasts for the validation horizon, side by side on the date index:
# total, states, stores, then every bottom-level series.
forecast_valid = pd.concat(
    objs=[
        forecast_level1_valid,
        forecast_level2_valid,
        forecast_level3_valid,
        forecast_level12_valid,
    ],
    axis="columns",
)

In [17]:
# Base forecasts for the evaluation horizon, side by side on the date index:
# total, states, stores, then every bottom-level series.
forecast_eval = pd.concat(
    objs=[
        forecast_level1_eval,
        forecast_level2_eval,
        forecast_level3_eval,
        forecast_level12_eval,
    ],
    axis="columns",
)

In [28]:
# Stack both horizons so each series' spread is measured over all forecast days.
forecast_both = pd.concat(objs=[forecast_valid, forecast_eval], ignore_index=True)

# One weight per series: the inverse of log(1 + interquartile range) of its forecasts.
# NOTE(review): a series whose forecasts have zero IQR gets log1p(0) == 0 in the
# denominator, i.e. an infinite weight -- confirm that this cannot occur or is intended.
weights = {}
for ts_id in forecast_both.columns:
    spread = iqr(forecast_both[ts_id].values)
    weights[ts_id] = 1./np.log1p(spread)

***

In [33]:
# Reconcile the validation-horizon base forecasts across the hierarchy, using the
# per-series weights computed from the forecast IQRs.
# The displayed result carries a plain 0..n row index rather than the dates; a later
# cell re-attaches forecast_valid.index.
# NOTE(review): the exact reconciliation method and the meaning of `weights` are
# defined by mahts -- confirm against its documentation.
fcst_valid = hts.compute_optimal_combination(forecast_valid, weights=weights)
fcst_valid.head()

Unnamed: 0,root,CA,TX,WI,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,...,FOODS_3_818_WI_3,FOODS_3_819_WI_3,FOODS_3_820_WI_3,FOODS_3_821_WI_3,FOODS_3_822_WI_3,FOODS_3_823_WI_3,FOODS_3_824_WI_3,FOODS_3_825_WI_3,FOODS_3_826_WI_3,FOODS_3_827_WI_3
0,37733.242844,16647.977139,10389.702314,10695.563391,4152.933536,3890.938627,5987.989387,2616.115589,3021.545297,3689.554659,...,1.577582,1.905588,1.520631,0.673522,1.948341,0.278784,0.257518,0.755953,0.895518,0.820672
1,35175.271316,15337.90363,9567.045348,10270.322339,3703.604884,3679.700743,5509.395193,2445.202809,2793.799281,3367.002673,...,1.55871,1.602579,1.303431,0.713323,1.62193,0.269032,0.189862,0.630345,0.833831,0.78924
2,34141.708821,14795.187101,9296.229642,10050.292079,3559.148864,3626.624946,5242.397746,2367.015545,2703.481897,3299.287918,...,1.666625,1.500461,1.271545,0.66992,1.501947,0.325241,0.184394,0.617995,0.83281,0.70776
3,34147.48447,14769.44668,9218.344093,10159.693697,3587.12257,3655.727529,5165.460909,2361.135673,2659.912938,3292.285034,...,1.639531,1.626471,1.343162,0.634795,1.534768,0.251064,0.183817,0.576918,0.75748,0.692613
4,39921.110611,17003.304854,10370.564521,12547.241236,4378.415486,4362.165482,5691.599421,2571.124466,2979.114706,3773.86465,...,1.669803,1.8828,1.833644,0.780488,2.018633,0.366399,0.215913,0.627818,0.972111,0.715362


In [34]:
# Reconcile the evaluation-horizon base forecasts across the hierarchy, using the
# same per-series weights as for the validation horizon.
# The displayed result carries a plain 0..n row index rather than the dates; a later
# cell re-attaches forecast_eval.index.
# NOTE(review): the exact reconciliation method and the meaning of `weights` are
# defined by mahts -- confirm against its documentation.
fcst_eval = hts.compute_optimal_combination(forecast_eval, weights=weights)
fcst_eval.head()

Unnamed: 0,root,CA,TX,WI,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,...,FOODS_3_818_WI_3,FOODS_3_819_WI_3,FOODS_3_820_WI_3,FOODS_3_821_WI_3,FOODS_3_822_WI_3,FOODS_3_823_WI_3,FOODS_3_824_WI_3,FOODS_3_825_WI_3,FOODS_3_826_WI_3,FOODS_3_827_WI_3
0,38096.116026,16764.571762,10570.597923,10760.946341,4185.56014,3908.508374,6069.539639,2600.963609,3064.835128,3781.837737,...,1.580128,1.915367,1.525209,0.675448,1.957335,0.279291,0.257951,0.759961,0.896592,0.821696
1,34769.795306,15106.02405,9512.480928,10151.290328,3640.120092,3636.544424,5444.263542,2385.095991,2766.836315,3364.681882,...,1.552019,1.576893,1.291403,0.70826,1.598294,0.2677,0.188724,0.619817,0.831008,0.786549
2,34248.714724,14738.599247,9444.682406,10065.433071,3543.041497,3631.401013,5231.780527,2332.37621,2742.136791,3372.439013,...,1.665662,1.496765,1.269815,0.669192,1.498546,0.325049,0.18423,0.616479,0.832404,0.707373
3,35120.60855,15051.487689,9646.114438,10423.006423,3649.490731,3758.826641,5259.917514,2383.252803,2792.358341,3460.954215,...,1.649011,1.662855,1.360203,0.641968,1.568255,0.252952,0.185428,0.591833,0.761479,0.696426
4,39072.649868,16688.302479,10429.41795,11954.929439,4244.116716,4324.554706,5612.272738,2507.35832,2988.655794,3817.850611,...,1.643956,1.783599,1.787182,0.760931,1.927333,0.361253,0.21152,0.587152,0.961208,0.704967


***

In [35]:
def _to_submission_rows(reconciled, dates, suffix):
    """Turn a reconciled (day x node) frame into submission rows: id, F1..F28.

    Parameters
    ----------
    reconciled : DataFrame with one row per forecast day and one column per node.
    dates : index of the matching base-forecast frame, used to label the rows.
    suffix : "_validation" or "_evaluation"; appended to every series id.

    Returns
    -------
    DataFrame with an "id" column followed by the horizon columns F1..F28,
    one row per bottom-level series.
    """
    bottom_level = (
        reconciled
        .set_index(dates)              # returns a new frame; the input is not mutated
        .loc[:, hts.bottom_nodes]      # keep only the bottom-level series
        .transpose()                   # one row per series, one column per day
    )
    # Columns are relabelled purely by position, so the rows of `reconciled`
    # must be in chronological order and span exactly 28 days.
    bottom_level.columns = [f"F{i}" for i in range(1,29)]
    return (
        bottom_level
        .reset_index()
        .rename({"index":"id"}, axis=1)
        .assign(id=lambda frame: frame["id"] + suffix)
    )


# NOTE(review): this cell rebinds fcst_valid / fcst_eval to their reshaped form, so it
# must be run exactly once after the reconciliation cells.
fcst_valid = _to_submission_rows(fcst_valid, forecast_valid.index, "_validation")
fcst_eval = _to_submission_rows(fcst_eval, forecast_eval.index, "_evaluation")

***
### submission

In [36]:
# Sample submission: supplies the expected set and order of row ids.
submission = pd.read_csv(filepath_or_buffer="../input/sample_submission.csv")

In [37]:
# Validation rows followed by evaluation rows, renumbered from 0.
all_predictions = pd.concat(
    objs=[fcst_valid, fcst_eval],
    ignore_index=True,
)

In [38]:
# The predicted ids must be exactly the ids the sample submission asks for.
predicted_ids = set(all_predictions.id)
expected_ids = set(submission.id)
assert predicted_ids == expected_ids, "there are missing time series predictions."

In [39]:
# Attach the predictions to the sample-submission ids; "id" is the only shared
# column, so it is the join key.
submission = submission.loc[:, ["id"]].merge(all_predictions, how="inner", on="id")

In [40]:
!ls ../output

forecast_level1.csv	lgbm_bottom_v18.csv.gz	lgbm_bottom_v30.csv.gz
forecast_level2.csv	lgbm_bottom_v19.csv.gz	lgbm_bottom_v31.csv.gz
forecast_level3.csv	lgbm_bottom_v2.csv.gz	lgbm_bottom_v32.csv.gz
forecast_root.csv	lgbm_bottom_v20.csv.gz	lgbm_bottom_v33.csv.gz
kaggle_submissions	lgbm_bottom_v21.csv.gz	lgbm_bottom_v34.csv.gz
lgbm_bottom_v1.csv.gz	lgbm_bottom_v22.csv.gz	lgbm_bottom_v35.csv.gz
lgbm_bottom_v10.csv.gz	lgbm_bottom_v23.csv.gz	lgbm_bottom_v36.csv.gz
lgbm_bottom_v11.csv.gz	lgbm_bottom_v24.csv.gz	lgbm_bottom_v4.csv.gz
lgbm_bottom_v12.csv.gz	lgbm_bottom_v25.csv.gz	lgbm_bottom_v5.csv.gz
lgbm_bottom_v13.csv.gz	lgbm_bottom_v26.csv.gz	lgbm_bottom_v6.csv.gz
lgbm_bottom_v14.csv.gz	lgbm_bottom_v27.csv.gz	lgbm_bottom_v7.csv.gz
lgbm_bottom_v15.csv.gz	lgbm_bottom_v28.csv.gz	lgbm_bottom_v8.csv.gz
lgbm_bottom_v16.csv.gz	lgbm_bottom_v29.csv.gz	lgbm_bottom_v9.csv.gz
lgbm_bottom_v17.csv.gz	lgbm_bottom_v3.csv.gz


In [41]:
# Write the submission under a versioned name; an existing file is never overwritten.
file_path = "../output/lgbm_bottom_v37.csv.gz"

already_written = os.path.isfile(file_path)
if not already_written:
    submission.to_csv(file_path, index=False, compression="gzip")
else:
    print("File already exists.")

In [42]:
# Upload the file written by the previous cell. The path is interpolated from
# `file_path` so the uploaded file can never drift from the one just written.
!kaggle competitions submit -c m5-forecasting-accuracy -f {file_path} -m ""

100%|██████████████████████████████████████| 14.7M/14.7M [00:03<00:00, 4.17MB/s]
Successfully submitted to M5 Forecasting - Accuracy

***

In [43]:
# Root-mean-square difference between the base (unreconciled) submission and the new one.
horizon_cols = [f"F{i}" for i in range(1,29)]
# Left-merge on the submission ids so the base rows come out in the same order as `submission`.
base_submission = pd.merge(submission.loc[:, ["id"]], forecast_level12, how="left", on="id")
diff = np.sqrt(np.mean((submission.loc[:, horizon_cols].values - base_submission.loc[:, horizon_cols].values)**2))
print(diff)

0.38272484979712906


In [87]:
# Root-mean-square difference between the base (unreconciled) submission and the new one.
horizon_cols = [f"F{i}" for i in range(1,29)]
# Left-merge on the submission ids so the base rows come out in the same order as `submission`.
base_submission = pd.merge(submission.loc[:, ["id"]], forecast_level12, how="left", on="id")
diff = np.sqrt(np.mean((submission.loc[:, horizon_cols].values - base_submission.loc[:, horizon_cols].values)**2))
print(diff)

0.012083599329737928


***