### Generating Predictions

We load the trained models and the test data from their designated directories and predict sales for the 28-day validation and evaluation windows. Each model covers one slice of the hierarchy (a single store, department, state or category, or a combination such as store and department); the per-slice predictions are concatenated to give the full set of forecasts.

### Ensembling
We ensembled all the model predictions at our disposal, using both simple averages and weighted averages (where each model's weight is the inverse of its error). The best ensemble gives us a WRMSSE of 0.55.

In [1]:
import warnings
import pickle
import random
from itertools import product

import pandas as pd
import numpy as np

# One seed shared by the stdlib and NumPy random number generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

In [2]:
# --- Locations -------------------------------------------------------------
BASE_DIR = "../../"
MODEL_NAME = "lgb"
# Test frames and pickled models are read from the same per-model folder.
TEST_DATA_DIR = f"{BASE_DIR}models/{MODEL_NAME}/"
MODELS_DIR = f"{BASE_DIR}models/{MODEL_NAME}/"
MODEL_FILE_PREFIX = f"{MODEL_NAME}_model_"

# --- Forecast setup --------------------------------------------------------
TARGET_COL = "units_sold"
PRED_LENGTH = 28   # number of days in each forecast window
TRAIN_END = 1941   # boundary day: validation window ends here, evaluation starts after

# --- Hierarchy values ------------------------------------------------------
# Edit these lists to control which unique values of the hierarchical
# columns are iterated over.
CATEGORIES = ["HOBBIES", "HOUSEHOLD", "FOODS"]
STATES = ["CA", "TX", "WI"]
STORES = [
    "CA_1", "CA_2", "CA_3", "CA_4",
    "TX_1", "TX_2", "TX_3",
    "WI_1", "WI_2", "WI_3",
]
DEPTS = [
    "HOBBIES_1", "HOBBIES_2",
    "HOUSEHOLD_1", "HOUSEHOLD_2",
    "FOODS_1", "FOODS_2", "FOODS_3",
]

HIERARCHY_COL = ["state_id", "cat_id"]

In [3]:
# Day columns of the submission layout: "F1" through "F28".
submission_day_cols = ["F" + str(day_number) for day_number in range(1, 29)]

In [8]:
# Every (store, department) pair, with the store varying slowest.
col_list = [(store, dept) for store in STORES for dept in DEPTS]

In [9]:
def get_predictions(col_list=col_list):
    """Predict the validation and evaluation windows for each model in ``col_list``.

    The parts of every ``col_list`` entry are joined with "_" to form a model
    id (for example ``("CA_1", "HOBBIES_1")`` becomes ``"CA_1_HOBBIES_1"``).
    For each model id the matching test frame is read from ``TEST_DATA_DIR``
    and the pickled model from ``MODELS_DIR``; rows whose ``d`` lies in the
    validation window ``(TRAIN_END - PRED_LENGTH, TRAIN_END]`` or in the
    evaluation window ``(TRAIN_END, TRAIN_END + PRED_LENGTH]`` are scored, and
    the predictions are reshaped so that every id is one row with the columns
    listed in ``submission_day_cols``.

    Each prediction is placed in its day column from the value of ``d``
    itself, so the result does not depend on the row order of the test frame.

    Parameters
    ----------
    col_list : iterable of tuple of str
        Hierarchy value combinations identifying the models to run.

    Returns
    -------
    pandas.DataFrame
        One row per id, with column ``id`` followed by ``submission_day_cols``.
        Ids of validation-window rows have "evaluation" replaced by
        "validation". Empty (but with the same columns) when ``col_list`` is
        empty.

    Raises
    ------
    ValueError
        If the ids of a test frame do not have exactly ``PRED_LENGTH`` rows
        each inside the forecast windows, or if an (id, day) pair is repeated.
    """
    submission_parts = []
    for col_id in ("_".join(parts) for parts in col_list):
        print(f"Generating Predicitons for {col_id}")
        test = pd.read_pickle(TEST_DATA_DIR+"test_data_"+col_id+".pkl")

        # Keep only the rows of the validation and evaluation windows. The
        # result is a new frame, so the loaded frame is never mutated in place.
        validation_rows = (test["d"] > TRAIN_END-PRED_LENGTH) & (test["d"] <= TRAIN_END)
        evaluation_rows = (test["d"] > TRAIN_END) & (test["d"] <= TRAIN_END+PRED_LENGTH)
        features = test.loc[validation_rows | evaluation_rows].drop(columns=[TARGET_COL])
        del test

        # SECURITY: pickle.load can execute arbitrary code; only load model
        # files that were produced by a trusted training run.
        with open(MODELS_DIR+MODEL_FILE_PREFIX+col_id+".pkl", "rb") as f:
            model = pickle.load(f)
        predictions = model.predict(features.drop(columns=["id"]))

        # Validation-window rows get "validation" ids. The mask is recomputed
        # on the filtered frame so it shares that frame's index.
        in_validation = features["d"] <= TRAIN_END
        renamed_ids = features["id"].str.replace("evaluation", "validation", regex=False)
        ids = renamed_ids.where(in_validation, features["id"])

        # Map each day to its 1-based position inside its own window; both
        # windows are PRED_LENGTH long, so one modulo covers them. The cast
        # to int64 avoids wrap-around when `d` is stored in a narrow or
        # unsigned integer dtype.
        day_number = features["d"].astype("int64")
        horizon = (day_number - (TRAIN_END - PRED_LENGTH) - 1) % PRED_LENGTH + 1

        long_preds = pd.DataFrame({
            "id": ids.to_numpy(),
            "day": [f"F{step}" for step in horizon],
            TARGET_COL: predictions,
        })

        # Preserve the order in which ids first appear in the test frame.
        id_order = long_preds["id"].unique()
        if len(long_preds) != len(id_order) * PRED_LENGTH:
            raise ValueError(
                f"{col_id}: expected {PRED_LENGTH} rows per id in the forecast "
                f"windows, found {len(long_preds)} rows for {len(id_order)} ids"
            )

        # reformatting the predictions (each id is a row); pivot raises a
        # ValueError if an (id, day) pair occurs more than once.
        submission_subset = (
            long_preds.pivot(index="id", columns="day", values=TARGET_COL)
            .reindex(index=id_order, columns=submission_day_cols)
            .rename_axis(index="id", columns=None)
        )
        submission_parts.append(submission_subset)

    if not submission_parts:
        return pd.DataFrame(columns=["id"] + submission_day_cols)
    # A single concat at the end avoids re-copying the accumulated frame on
    # every iteration and keeps the "id" index name for reset_index().
    return pd.concat(submission_parts).reset_index()

In [10]:
# Run get_predictions with its default model list and keep the forecasts,
# one row per id, for saving in a later cell.
df = get_predictions()

Generating Predicitons for CA_1_HOBBIES_1
Generating Predicitons for CA_1_HOBBIES_2
Generating Predicitons for CA_1_HOUSEHOLD_1
Generating Predicitons for CA_1_HOUSEHOLD_2
Generating Predicitons for CA_1_FOODS_1
Generating Predicitons for CA_1_FOODS_2
Generating Predicitons for CA_1_FOODS_3
Generating Predicitons for CA_2_HOBBIES_1
Generating Predicitons for CA_2_HOBBIES_2
Generating Predicitons for CA_2_HOUSEHOLD_1
Generating Predicitons for CA_2_HOUSEHOLD_2
Generating Predicitons for CA_2_FOODS_1
Generating Predicitons for CA_2_FOODS_2
Generating Predicitons for CA_2_FOODS_3
Generating Predicitons for CA_3_HOBBIES_1
Generating Predicitons for CA_3_HOBBIES_2
Generating Predicitons for CA_3_HOUSEHOLD_1
Generating Predicitons for CA_3_HOUSEHOLD_2
Generating Predicitons for CA_3_FOODS_1
Generating Predicitons for CA_3_FOODS_2
Generating Predicitons for CA_3_FOODS_3
Generating Predicitons for CA_4_HOBBIES_1
Generating Predicitons for CA_4_HOBBIES_2
Generating Predicitons for CA_4_HOUSEHOL

In [21]:
# Save the forecasts as a CSV in the submissions folder, without the index column.
df.to_csv(f"{BASE_DIR}/submissions/store_dept_lgb_pred_v1.csv", index=False)

In [11]:
import pandas as pd
BASE_DIR = "../../"
submission_day_cols = [f'F{day}' for day in range(1,29)]


def load_submission(file_name):
    """Read one CSV from the submissions folder, ordered by id.

    The index is reset after sorting so that row position and index label
    agree. pandas arithmetic aligns frames on index labels, not on row
    position, so sorting alone would not line the files up by id.
    """
    return (
        pd.read_csv(BASE_DIR+"/submissions/"+file_name)
        .sort_values("id")
        .reset_index(drop=True)
    )


## Basic Ensembling
# Every weight is 1, i.e. a simple average. The number in each trailing
# comment is carried over unchanged; presumably it is that model's error
# score, to be used as `1/<error>` for a weighted average (TODO confirm).
xgb = load_submission("store_xgb_pred.csv")
w1 = 1/1  # 0.6211
lgb_store = load_submission("store_lgb_pred_v2.csv")
w2 = 1/1  # 0.62292
lgb_state_cat = load_submission("state_cat_lgb_pred_v1.csv")
w3 = 1/1  # 0.64231
lgb_store_cat = load_submission("store_cat_lgb_pred_v1.csv")
w4 = 1/1  # 0.62917
lgb_store_dept = load_submission("store_dept_lgb_pred_v1.csv")
w5 = 1/1  # 0.52978

# NOTE(review): dept_lgb_pred_v1.csv (noted value 2.13755) used to be read
# into `lgb_dept` here and was then immediately overwritten by the
# store/department predictions, so it never contributed to the ensemble.
# It is left out to keep the result unchanged; add it to `members` with its
# own weight if it should take part.
# `lgb_dept` is kept as an alias so existing references see the same data.
lgb_dept = lgb_store_dept

members = [
    (xgb, w1),
    (lgb_store, w2),
    (lgb_state_cat, w3),
    (lgb_store_cat, w4),
    (lgb_store_dept, w5),
]

# After sorting and resetting the index, identical id columns guarantee that
# row k refers to the same id in every file.
for member_frame, _ in members[1:]:
    if not member_frame["id"].equals(xgb["id"]):
        raise ValueError(
            "Submission files do not contain the same ids; cannot ensemble them row by row."
        )

weighted_sum = sum(member_frame[submission_day_cols] * weight for member_frame, weight in members)
total_weight = sum(weight for _, weight in members)

ensemble = weighted_sum / total_weight
ensemble.insert(0, "id", xgb["id"])

ensemble.to_csv(BASE_DIR+"/submissions/ensemble_basic_v9.csv",index=False)