In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import dotenv
import os
import sys
import numpy as np
from tqdm import tqdm

In [2]:
# make modules in the parent directory importable from this notebook
sys.path.append("../")
# locate the .env file and load the variables it defines
dotenv_path = dotenv.find_dotenv()
dotenv.load_dotenv(dotenv_path)

# dataset locations taken from the environment (None when a variable is not set)
CALENDAR_FILE_PATH = os.getenv("CALENDAR_FILE_PATH")
SALES_TRAIN_EVALUATION_FILE_PATH = os.getenv("SALES_TRAIN_EVALUATION_FILE_PATH")
SALES_TRAIN_VALIDATION_FILE_PATH = os.getenv("SALES_TRAIN_VALIDATION_FILE_PATH")
SAMPLE_SUBMISSION_FILE_PATH = os.getenv("SAMPLE_SUBMISSION_FILE_PATH")
SELL_PRICES_FILE_PATH = os.getenv("SELL_PRICES_FILE_PATH")

In [3]:
# load dataset: calendar, sell prices and the validation sales table (read in that order)
cal_df, sell_price_df, df = (
    pd.read_csv(csv_path)
    for csv_path in (
        CALENDAR_FILE_PATH,
        SELL_PRICES_FILE_PATH,
        SALES_TRAIN_VALIDATION_FILE_PATH,
    )
)

# 1. Create the Weights for Evaluation

In [4]:
# preprocess data
# keep only the part after the underscore of the calendar day label and store it as an integer
cal_df['d'] = cal_df['d'].str.split('_').str[1].astype(int)
# build the series id "<item_id>_<store_id>_validation" so prices can be joined to the sales table on `id`
sell_price_df['id'] = sell_price_df['item_id'].str.cat(sell_price_df['store_id'], sep='_') + '_validation'

In [5]:
# weight calculation
# scope: days 1858..1885 (28 days)
# for each of those days, add a column holding units sold that day times the sell price of that day's week

for d in tqdm(range(1858, 1886)):
    # week identifier (`wm_yr_wk`) of day `d` according to the calendar
    wm_yr_wk = cal_df.loc[cal_df['d'] == d, 'wm_yr_wk'].tolist()[0]
    # prices recorded for that week
    week_prices = sell_price_df.loc[sell_price_df['wm_yr_wk'] == wm_yr_wk, ['id', 'sell_price']]
    # inner join on `id`: this relies on every series having a price for the week,
    # otherwise the series without a price would be dropped from `df`
    df = df.merge(week_prices, on=['id'], how='inner')
    # sales value of the day = quantity * sell price
    df[f'unit_sales_{d}'] = df[f'd_{d}'] * df['sell_price']
    # the price column is only needed for this day
    df = df.drop(columns=['sell_price'])

100%|██████████| 28/28 [00:21<00:00,  1.33it/s]


In [6]:
# total sales value per series over the 28 `unit_sales_*` columns
unit_sales_cols = [c for c in df.columns if c.startswith('unit_sales')]
df['sum_unit_sales'] = df[unit_sales_cols].sum(axis=1).to_list()

In [7]:
# share of each series in the total sales value, divided by 12
# (presumably one twelfth per aggregation level: this level plus levels 1-11 used further down)
df['weight'] = df['sum_unit_sales'].div(df['sum_unit_sales'].sum()).div(12)

In [8]:
# drop the helper columns that were only needed to compute `weight`
helper_cols = [c for c in df.columns if c.startswith('unit_sales')] + ['sum_unit_sales']
df = df.drop(columns=helper_cols)

# 2. Calculate Weights for the Higher-Level Series

In [9]:
# Level 1: a single row holding the column-wise sum of every daily `d_*` column
day_columns = [c for c in df.columns if c.startswith('d_')]
agg_df = df[day_columns].sum().to_frame().T
agg_df['level'] = 1
# the single level-1 series receives a weight of 1/12
agg_df['weight'] = 1/12
column_order = agg_df.columns

In [10]:
# aggregation level number -> columns to group the bottom-level series by
level_groupings = dict(zip(
    range(2, 12),
    [
        ["state_id"],
        ["store_id"],
        ["cat_id"],
        ["dept_id"],
        ["state_id", "cat_id"],
        ["state_id", "dept_id"],
        ["store_id", "cat_id"],
        ["store_id", "dept_id"],
        ["item_id"],
        ["item_id", "state_id"],
    ],
))

In [11]:
# Aggregate the bottom-level series for every grouping in `level_groupings`.
# `sum(numeric_only=True)` adds up the daily sales columns and also the `weight`
# column, so each aggregated series carries the summed weight of its members.
# The frames are collected and concatenated once instead of growing `agg_df`
# inside the loop, which would copy the accumulated frame on every iteration.
level_frames = [agg_df]
for level in tqdm(level_groupings):
    temp_df = df.groupby(by=level_groupings[level]).sum(numeric_only=True).reset_index()
    temp_df['level'] = level
    level_frames.append(temp_df)
agg_df = pd.concat(level_frames)
# release the temporaries
del temp_df, level_frames

100%|██████████| 10/10 [00:07<00:00,  1.32it/s]


In [12]:
# sanity check: bottom-level weights plus aggregated-level weights, shown as the cell output
df['weight'].sum() + agg_df['weight'].sum()

1.0

# 3. Multi-Output Regression with ExtraTreesRegressor


In [13]:
h = 28  # default forecast horizon: number of forecast steps the squared errors are averaged over


def rmsse(ground_truth, forecast, train_series, axis=1, n=1885, horizon=None):
    """Root Mean Squared Scaled Error of a forecast.

    The mean squared forecast error is scaled by the mean squared one-step
    difference of the training series and the square root is returned:

        sqrt( (1/horizon * sum((y - y_hat)^2)) / (1/(n-1) * sum((y_t - y_{t-1})^2)) )

    Parameters
    ----------
    ground_truth, forecast : np.ndarray
        Actual and predicted values; time runs along `axis`.
    train_series : np.ndarray
        Training history used for the scale; time runs along `axis`.
    axis : {0, 1}
        Axis that holds the time dimension. With ``axis=1`` all inputs must have
        more than one column.
    n : int or None
        Length of the training history used in the ``1/(n-1)`` factor. The default
        of 1885 is kept for backward compatibility; pass ``None`` to use the actual
        length of `train_series` along `axis`.
    horizon : int or None
        Number of forecast steps used in the ``1/horizon`` factor. ``None`` (default)
        falls back to the module-level `h`.

    Returns
    -------
    np.ndarray or numpy scalar
        One RMSSE value per series (the time axis is reduced).

    Notes
    -----
    A training series without any change yields a zero denominator, so the
    result for that series is ``inf`` or ``nan`` (numpy division semantics).
    """
    assert axis == 0 or axis == 1
    # Exact ndarray type on purpose: for np.matrix, `**` would be a matrix power.
    assert type(ground_truth) is np.ndarray and type(forecast) is np.ndarray and type(train_series) is np.ndarray

    if axis == 1:
        # with axis == 1 the inputs must be matrices with more than one column
        assert ground_truth.shape[1] > 1 and forecast.shape[1] > 1 and train_series.shape[1] > 1

    if n is None:
        # derive the history length from the data instead of the fixed default
        n = train_series.shape[axis]
    if horizon is None:
        horizon = h

    numerator = ((ground_truth - forecast) ** 2).sum(axis=axis)
    # squared one-step changes of the training series along the time axis
    denominator = 1 / (n - 1) * (np.diff(train_series, axis=axis) ** 2).sum(axis=axis)
    return (1 / horizon * numerator / denominator) ** 0.5

In [14]:
# preview: one-hot encode the remaining categorical columns after removing `id`, `item_id` and `weight`
df.drop(columns=['id', 'item_id', 'weight']).pipe(pd.get_dummies)

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,store_id_CA_4,store_id_TX_1,store_id_TX_2,store_id_TX_3,store_id_WI_1,store_id_WI_2,store_id_WI_3,state_id_CA,state_id_TX,state_id_WI
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0,0,2,2,0,3,1,4,1,0,...,0,0,0,0,0,0,1,0,0,1
30486,0,0,0,0,0,5,0,1,1,3,...,0,0,0,0,0,0,1,0,0,1
30487,0,6,0,2,2,4,1,8,5,2,...,0,0,0,0,0,0,1,0,0,1
30488,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [15]:
# first five rows of the sales table
df.head(n=5)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,weight
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,3,0,1,1,1,3,0,1,1,5.258191e-06
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,1,0,0,0,0,8.123278e-07
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,2,1,1,1,0,1,1,1,1.012852e-06
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,5,4,1,0,1,3,7,2,5.591034e-06
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,1,1,2,2,2,4,2.029797e-06


In [16]:
# keep the raw identifier columns and `weight`, and join the one-hot encoded view of the other columns
kept_columns = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id", "weight"]
encoded = pd.get_dummies(df.drop(columns=["id", "item_id", "weight"]))
df = df[kept_columns].join(encoded)

In [17]:
import random
from sklearn.ensemble import ExtraTreesRegressor

In [18]:
# starting values for the search bookkeeping: score, model and training start date
best_s, best_m, best_start_date = 100, None, 1000

In [65]:
# Randomised hyperparameter search (inspired by randomized search CV) over
# 28-day validation folds. Scoring with WRMSSE is not implemented yet: each
# fold currently only fits the model.

# Seeded generator so the sampled configurations are the same on every run.
rng = random.Random(42)

# Columns that must not be used as features:
# - the raw identifier columns,
# - `weight`: it is computed from the sales of d_1858..d_1885, which are exactly
#   the targets of fold cv=1 and lie after the targets of fold cv=2, so using it
#   as a feature leaks target/future information into the model.
non_feature_cols = ['id', 'item_id', 'dept_id', 'state_id', 'store_id', 'cat_id', 'weight']

# Day number of every daily sales column, parsed once instead of on every fold.
day_number = {c: int(c.split('_')[1]) for c in df.columns if c.find('d_') == 0}

for _ in tqdm(range(10)):
    # sample the extra-trees parameters and the start of the training window
    rand_est = rng.randint(20, 50)
    rand_depth = rng.randint(10, 30)
    rand_start_date = rng.randint(1200, 1500)

    print(rand_est, rand_depth, rand_start_date)

    average_wrmsse = []
    # evaluate the sampled configuration on each fold
    for cv in range(1, 3):
        # shift the training window back by 28 days per fold
        train_start = rand_start_date - (28 * cv)
        train_end = 1885 - (28 * cv)

        # create model using the sampled hyperparameters
        regressor = ExtraTreesRegressor(n_estimators=rand_est, max_depth=rand_depth, random_state=42)

        # remove forecast/score columns left over from a previous iteration so they are not duplicated
        drop_cols = [c for c in df.columns if c.find("f_") == 0 or c in ('wrmsse', 'rmsse')]
        df.drop(columns=drop_cols, inplace=True)

        # features: sales inside the training window plus the one-hot columns
        outside_window = [c for c, day in day_number.items() if not (train_start <= day <= train_end)]
        # targets: the 28 days that follow the training window
        target_cols = [c for c, day in day_number.items() if train_end < day <= train_end + 28]
        regressor.fit(
            df.drop(columns=non_feature_cols + outside_window),  # X_train
            df[target_cols],  # y_train
        )
        print('done')
        

  0%|          | 0/10 [00:00<?, ?it/s]

23 18 1408


  0%|          | 0/10 [00:57<?, ?it/s]


KeyboardInterrupt: 

In [63]:
# preview the feature matrix of the most recent fold (uses `train_start` / `train_end` left by the search loop)
outside_window = [
    c for c in df.columns
    if c.startswith('d_') and not (train_start <= int(c.split('_')[1]) <= train_end)
]
df.drop(columns=['id', 'item_id', 'dept_id', 'state_id', 'store_id', 'cat_id'] + outside_window)

Unnamed: 0,weight,d_1361,d_1362,d_1363,d_1364,d_1365,d_1366,d_1367,d_1368,d_1369,...,store_id_CA_4,store_id_TX_1,store_id_TX_2,store_id_TX_3,store_id_WI_1,store_id_WI_2,store_id_WI_3,state_id_CA,state_id_TX,state_id_WI
0,5.258191e-06,1,0,0,1,0,1,1,0,1,...,0,0,0,0,0,0,0,1,0,0
1,8.123278e-07,0,0,0,0,2,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,1.012852e-06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,5.591034e-06,1,1,0,0,0,4,6,1,0,...,0,0,0,0,0,0,0,1,0,0
4,2.029797e-06,0,0,0,1,2,6,1,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,4.697087e-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
30486,0.000000e+00,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
30487,3.167010e-06,0,1,2,0,2,0,2,1,0,...,0,0,0,0,0,0,1,0,0,1
30488,1.018536e-06,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


Stopped here: training is too slow under the available resource constraints.