* https://www.kaggle.com/c/favorita-grocery-sales-forecasting/overview
* https://www.kaggle.com/shixw125/1st-place-lgb-model-public-0-506-private-0-511
* https://www.kaggle.com/c/favorita-grocery-sales-forecasting/discussion/47582

In [1]:
from datetime import date, timedelta
import pathlib
import gc
import time

import pandas as pd
import numpy as np
import feather
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

In [2]:
# Data-source switch: False reads the full train.csv (and rewrites the cached
# 1% sample); True reads only the cached sample, for building the pipeline quickly.
SAMPLE = False

In [3]:
# Root directory that holds the data/ inputs and tmp/ outputs used by this notebook.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
MAIN_PATH = pathlib.Path('/Users/palermopenano/personal/sm-202011/project_2')

# NOTE(review): the prepare_dataset calls in this notebook do not pass
# num_days_preds and rely on that function's default of 16 — keep both in sync.
num_days_preds = 16        # number of days into the future to predict (y values)
num_days = 6               # number of training windows; window i uses the reference date t2017 + 7*i days
# num_days = 1

# Reference dates for the train, validation, and test sets: features come from
# the days before the reference date, labels from the days starting on it.
# NOTE(review): with num_days = 6 the last training window is labelled with
# 2017-07-19..2017-08-03, which overlaps the validation labels
# (2017-07-26..2017-08-10) — confirm this overlap is intended.
t2017 = date(2017, 6, 14)
val2017 = date(2017, 7, 26)
test2017 = date(2017, 8, 16)

# Load data

In [4]:
# Load the training data: the full CSV, or the cached 1% sample when SAMPLE is True
if not SAMPLE:
    df_train = pd.read_csv(
        MAIN_PATH / 'data/train.csv',
        usecols=[1, 2, 3, 4, 5],
        dtype={'onpromotion': bool},
        # Sales are modelled on the log1p scale; zero and negative values map to 0
        converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
        parse_dates=["date"],
        # Row 0 (the header) is kept; the following data rows are skipped.
        # NOTE(review): presumably this offset drops the oldest history — confirm
        # which date row 66458909 corresponds to.
        skiprows=range(1, 66458909))

    # Cache a 1% subsample for fast pipeline development. A fixed random_state
    # makes the sample reproducible across runs, and resetting the index avoids
    # writing the shuffled original row labels to the feather file.
    feather.write_dataframe(
        df_train.sample(frac=.01, random_state=42).reset_index(drop=True),
        MAIN_PATH / 'tmp/train_sample')
else:
    # Load small dataset for building pipeline
    df_train = feather.read_dataframe(MAIN_PATH / 'tmp/train_sample')

df_train.shape

(59038132, 5)

In [5]:
# Summary statistics of the numeric columns (unit_sales is on the log1p scale)
df_train.describe()

Unnamed: 0,store_nbr,item_nbr,unit_sales
count,59038130.0,59038130.0,59038130.0
mean,27.75558,1122211.0,1.707525
std,16.20398,561633.8,0.8723205
min,1.0,96995.0,0.0
25%,13.0,668753.0,1.098612
50%,28.0,1152464.0,1.609438
75%,43.0,1464237.0,2.197225
max,54.0,2127114.0,11.40133


In [6]:
# Read the test rows and key them by (store_nbr, item_nbr, date) so the date
# level can later be unstacked into columns.
df_test = (
    pd.read_csv(
        MAIN_PATH / 'data/test.csv',
        usecols=[0, 1, 2, 3, 4],
        dtype={'onpromotion': bool},
        parse_dates=["date"],
    )
    .set_index(['store_nbr', 'item_nbr', 'date'])
)
df_test.shape

(3370464, 2)

In [7]:
# Item and store metadata, each keyed by its identifier column
items = pd.read_csv(MAIN_PATH / "data/items.csv").set_index("item_nbr")
stores = pd.read_csv(MAIN_PATH / "data/stores.csv").set_index("store_nbr")

items.shape, stores.shape

((4100, 3), (54, 4))

In [8]:
# Keep only the rows dated 2017-01-01 or later. pd.Timestamp is used because the
# pd.datetime alias was deprecated in pandas 1.0 and removed in pandas 2.0.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp(2017, 1, 1)]
# Free the full training frame; only the 2017 slice is used from here on
del df_train
df_2017.shape

(23808261, 5)

# Feature Engineering

## Label encode categorical values

In [9]:
# Replace each categorical column with integer codes. The same encoder object
# is refit for every column, so afterwards `le` only holds the last fit.
le = LabelEncoder()
for _frame, _column in (
    (items, 'family'),
    (stores, 'city'),
    (stores, 'state'),
    (stores, 'type'),
):
    _frame[_column] = le.fit_transform(_frame[_column].values)

## Clean promo variable

Promo variable (bool) by store and item over time

In [10]:
# Wide promotion table for the training period: one row per
# (store_nbr, item_nbr) and one column per date. Combinations with no record
# on a given date are filled with False.
promo_2017_train = (
    df_2017
    .set_index(['store_nbr', 'item_nbr', 'date'])[['onpromotion']]
    .unstack(level=-1)
    .fillna(False)
)
# Keep only the date level of the (onpromotion, date) column index
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

In [11]:
# Same wide layout for the test period: dates become columns
promo_2017_test = (
    df_test[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
# Align rows to the training (store_nbr, item_nbr) pairs; pairs that do not
# appear in the test data are filled with False
promo_2017_test = (
    promo_2017_test
    .reindex(promo_2017_train.index)
    .fillna(False)
)

In [12]:
# Both frames are wide with one column per date (the date index level was
# unstacked into columns) and share the same (store_nbr, item_nbr) rows, so
# concatenating along axis=1 places the test-period date columns after the
# train-period date columns.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train
promo_2017

Unnamed: 0_level_0,date,2017-01-01,2017-01-02,2017-01-03,2017-01-04,2017-01-05,2017-01-06,2017-01-07,2017-01-08,2017-01-09,2017-01-10,...,2017-08-22,2017-08-23,2017-08-24,2017-08-25,2017-08-26,2017-08-27,2017-08-28,2017-08-29,2017-08-30,2017-08-31
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54,2109909,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
54,2110456,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
54,2113343,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
54,2113914,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True


## Sales by store, item and date

In [13]:
# Pivot sales to wide format: one row per (store_nbr, item_nbr) and one column
# per date. Combinations with no record on a given date are filled with 0.
df_2017 = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# Keep only the date level of the (unit_sales, date) column index
df_2017.columns = df_2017.columns.get_level_values(1)

# Repeat the store / item metadata so each has one row per df_2017 row,
# in the same row order as df_2017
stores = stores.reindex(df_2017.index.get_level_values(0))
items = items.reindex(df_2017.index.get_level_values(1))

## Item sales and number of promotions per item over time

In [14]:
# Per-date totals for each item, summed over all stores:
# (log1p-scale) sales ...
df_2017_item = (
    df_2017
    .groupby(level='item_nbr')[df_2017.columns]
    .sum()
)

# ... and the number of stores running a promotion
promo_2017_item = (
    promo_2017
    .groupby(level='item_nbr')[promo_2017.columns]
    .sum()
)

## Total sales per class and store over time

In [15]:
# Total sales by item class and store over time
# Flatten the (store_nbr, item_nbr) index into ordinary columns
df_2017_store_class = df_2017.reset_index()
# Attach each row's item class by position; this relies on `items` having been
# reindexed to the row order of df_2017
df_2017_store_class['class'] = items['class'].values
# Keep the per-row (class, store_nbr) keys before the frame is overwritten by
# its aggregate on the next line
df_2017_store_class_index = df_2017_store_class[['class', 'store_nbr']]
# Sum every date column within each (class, store_nbr) group
df_2017_store_class = df_2017_store_class.groupby(['class', 'store_nbr'])[df_2017.columns].sum()

## Total promo per class and store over time

In [16]:
# Number of promotions by item class and store over time
# Flatten the (store_nbr, item_nbr) index into ordinary columns
df_2017_promo_store_class = promo_2017.reset_index()
# Attach each row's item class by position.
# NOTE(review): assumes promo_2017 has the same row order as the reindexed
# `items` frame (i.e. the same rows as df_2017) — confirm.
df_2017_promo_store_class['class'] = items['class'].values
# Keep the per-row (class, store_nbr) keys before the frame is overwritten by
# its aggregate on the next line
df_2017_promo_store_class_index = df_2017_promo_store_class[['class', 'store_nbr']]
# Sum every date column within each (class, store_nbr) group
df_2017_promo_store_class = df_2017_promo_store_class.groupby(['class', 'store_nbr'])[promo_2017.columns].sum()

# Data prep functions

In [17]:
def get_timespan(df, dt, minus, periods, freq='D'):
    '''Select a run of consecutive date columns from a wide dataframe.

    The window opens `minus` days before `dt` and holds `periods` dates
    spaced `freq` apart.

    Parameters
    ----------
    df : dataframe in wide format, with dates as column labels
    dt : reference date
    minus : number of days before `dt` at which the window starts
    periods : number of dates in the window
    freq : pandas frequency string for the spacing between dates (default daily)

    Returns
    -------
    The columns of `df` labelled by the dates in the window.

    Example
    -------
    With dt = 2017-6-14, minus = 1 and periods = 3 the window is
    pd.date_range(2017-6-13, periods=3, freq='D'), i.e. the columns
    2017-6-13, 2017-6-14 and 2017-6-15 are returned.
    '''
    window_start = dt - timedelta(days=minus)
    window = pd.date_range(start=window_start, periods=periods, freq=freq)
    return df[window]

# Prepare features

In [18]:
def prepare_dataset(df, t2017, is_train=True, name_prefix=None, num_days_preds=16,
                    windows=(3, 7, 14, 30, 60, 140)):
    '''Build per-row statistical features from trailing windows of a wide dataframe.

    For every window length `i` in `windows`, the `i` days immediately before
    `t2017` (the reference date itself is excluded) are summarised per row.

    Parameters
    ----------
    df : dataframe in wide format, one column per date
    t2017 : reference date; features use the days before it, labels start on it
    is_train : when True, also return the label matrix
    name_prefix : optional prefix added to every feature column name
    num_days_preds : number of label days, starting at `t2017`
    windows : trailing window lengths (in days) to summarise

    Returns
    -------
    X : dataframe with one row per row of `df` and, per window, the columns
        diff_<i>_mean, mean_<i>_decay, mean_<i>, median_<i>, min_<i>, max_<i>
        and std_<i>
    y : only when `is_train` is True; array of the `num_days_preds` date
        columns of `df` starting at `t2017`
    '''
    X = {}

    for i in windows:
        tmp = get_timespan(df, t2017, minus=i, periods=i)
        # Weights 0.9**(i-1), ..., 0.9**0: the most recent day has weight 1
        decay_weights = np.power(0.9, np.arange(i)[::-1])

        # Mean day-over-day change within the window
        X['diff_%s_mean' % i] = tmp.diff(axis=1).mean(axis=1).values
        # Decay-weighted sum (not normalised, despite the column name)
        X['mean_%s_decay' % i] = (tmp * decay_weights).sum(axis=1).values
        X['mean_%s' % i] = tmp.mean(axis=1).values
        X['median_%s' % i] = tmp.median(axis=1).values
        X['min_%s' % i] = tmp.min(axis=1).values
        X['max_%s' % i] = tmp.max(axis=1).values
        X['std_%s' % i] = tmp.std(axis=1).values

    X = pd.DataFrame(X)

    # Apply the prefix before any return so that it is honoured for training
    # data as well (it used to be silently ignored when is_train=True)
    if name_prefix is not None:
        X.columns = ['%s_%s' % (name_prefix, c) for c in X.columns]

    if is_train:
        # Labels: the next num_days_preds days, starting on the reference date
        y_date_range = pd.date_range(t2017, periods=num_days_preds)
        y = df[y_date_range].values
        return X, y
    return X

## Build training data

In [19]:
print("Preparing dataset...")

X_l, y_l = [], []

# Item / store metadata is the same for every window, so flatten it once
item_features = items.reset_index()
store_features = stores.reset_index()

# Build one block of training rows per weekly-shifted reference date.
# Iteration i uses the reference date t2017 + 7*i days: features summarise the
# days before that date and labels are the num_days_preds days starting on it.
# Every store/item pair therefore contributes num_days rows, i.e. the training
# set is num_days stacked copies of the store/item grid at different dates.
for i in range(num_days):
    print("------------------------------")
    print(f"days: {i}")

    delta = timedelta(days=7 * i)

    from_date = t2017 + delta
    y_preds_range = pd.date_range(from_date, periods=num_days_preds)
    print(f"Train end date: {from_date}",
          "\nDates to predict: " + " | ".join(y_preds_range.astype(str).to_list()))

    # Store by item level features; pass num_days_preds explicitly so the
    # labels always match the dates printed above
    X_tmp, y_tmp = prepare_dataset(df_2017, from_date, num_days_preds=num_days_preds)

    X_tmp = pd.concat([X_tmp, item_features, store_features], axis=1)

    X_l.append(X_tmp)
    y_l.append(y_tmp)

# Concatenate along rows; ignore_index gives X_train a unique 0..n-1 index
# instead of repeating each window's 0..m-1 labels
X_train = pd.concat(X_l, axis=0, ignore_index=True)
y_train = np.concatenate(y_l, axis=0)

del X_l, y_l

X_train.head()

Preparing dataset...
------------------------------
days: 0
Train end date: 2017-06-14 
Dates to predict: 2017-06-14 | 2017-06-15 | 2017-06-16 | 2017-06-17 | 2017-06-18 | 2017-06-19 | 2017-06-20 | 2017-06-21 | 2017-06-22 | 2017-06-23 | 2017-06-24 | 2017-06-25 | 2017-06-26 | 2017-06-27 | 2017-06-28 | 2017-06-29
------------------------------
days: 1
Train end date: 2017-06-21 
Dates to predict: 2017-06-21 | 2017-06-22 | 2017-06-23 | 2017-06-24 | 2017-06-25 | 2017-06-26 | 2017-06-27 | 2017-06-28 | 2017-06-29 | 2017-06-30 | 2017-07-01 | 2017-07-02 | 2017-07-03 | 2017-07-04 | 2017-07-05 | 2017-07-06
------------------------------
days: 2
Train end date: 2017-06-28 
Dates to predict: 2017-06-28 | 2017-06-29 | 2017-06-30 | 2017-07-01 | 2017-07-02 | 2017-07-03 | 2017-07-04 | 2017-07-05 | 2017-07-06 | 2017-07-07 | 2017-07-08 | 2017-07-09 | 2017-07-10 | 2017-07-11 | 2017-07-12 | 2017-07-13
------------------------------
days: 3
Train end date: 2017-07-05 
Dates to predict: 2017-07-05 | 2017-07-

Unnamed: 0,diff_3_mean,mean_3_decay,mean_3,median_3,min_3,max_3,std_3,diff_7_mean,mean_7_decay,mean_7,...,std_140,item_nbr,family,class,perishable,store_nbr,city,state,type,cluster
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.454774,0.099021,...,0.250516,96995,12,1093,0,1,18,12,3,13
1,0.346574,1.681898,0.597253,0.693147,0.0,1.098612,0.555548,0.115525,3.719249,0.683927,...,0.415002,99197,12,1067,0,1,18,12,3,13
2,0.549306,2.346277,0.828302,1.098612,0.0,1.386294,0.73161,0.067578,4.855349,0.908015,...,0.685576,103520,12,1028,0,1,18,12,3,13
3,-0.202733,2.830688,1.059351,1.098612,0.693147,1.386294,0.348237,-0.115525,5.461501,1.038914,...,0.668745,103665,5,2712,1,1,18,12,3,13
4,0.202733,3.637564,1.329661,1.098612,0.693147,2.197225,0.778203,-0.216547,8.941178,1.784525,...,0.639444,105574,12,1045,0,1,18,12,3,13


## Build validation and test data

In [20]:
# Create validation set
X_val, y_val = prepare_dataset(df_2017, val2017)
X_val = pd.concat([X_val, 
                   items.reset_index(), 
                   stores.reset_index()], axis=1)

In [21]:
# Create test set
X_test = prepare_dataset(df_2017, test2017, is_train=False)
X_test = pd.concat([X_test, 
                    items.reset_index(), 
                    stores.reset_index()], axis=1)

In [22]:
# Optional memory cleanup, currently disabled.
# NOTE(review): X_test2 and X_val2 are not defined anywhere in this notebook, so
# the `del` below would raise NameError if re-enabled as written.
# del X_test2, X_val2, df_2017_item, promo_2017_item, df_2017_store_class, df_2017_promo_store_class, df_2017_store_class_index
# gc.collect()

In [23]:
# Persist the feature matrices, targets and lookup tables under MAIN_PATH/tmp
# in feather format. The target arrays are wrapped in DataFrames before writing.
feather.write_dataframe(X_train, MAIN_PATH / 'tmp/X_train')
feather.write_dataframe(pd.DataFrame(y_train), MAIN_PATH / 'tmp/y_train')
feather.write_dataframe(X_val, MAIN_PATH / 'tmp/X_val')
feather.write_dataframe(pd.DataFrame(y_val), MAIN_PATH / 'tmp/y_val')
feather.write_dataframe(X_test, MAIN_PATH / 'tmp/X_test')
# NOTE(review): `items` is indexed by item_nbr at this point; whether a
# non-default index is written depends on the feather/pyarrow version —
# confirm item_nbr survives the round trip if it is needed later.
feather.write_dataframe(items, MAIN_PATH / 'tmp/items')
# (store_nbr, item_nbr) key of every feature row, in df_2017 row order
feather.write_dataframe(df_2017.index.to_frame(), MAIN_PATH / 'tmp/store_item_idx')
feather.write_dataframe(df_test.reset_index(), MAIN_PATH / 'tmp/df_test')