* https://www.kaggle.com/c/favorita-grocery-sales-forecasting/overview
* https://www.kaggle.com/shixw125/1st-place-lgb-model-public-0-506-private-0-511
* https://www.kaggle.com/c/favorita-grocery-sales-forecasting/discussion/47582

In [25]:
from datetime import date, timedelta
import gc
import os
import pathlib
import time

import feather
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Load data

In [2]:
# Project root that contains the `data/` folder.
# The author's local checkout stays the default; set the FAVORITA_MAIN_PATH
# environment variable to run the notebook from another machine or location.
MAIN_PATH = pathlib.Path(
    os.environ.get(
        'FAVORITA_MAIN_PATH',
        '/Users/palermopenano/personal/sm-202011/project_2',
    )
)

In [3]:
# How the small sample was produced from the full training file (kept for reference):
# df_train = pd.read_csv(
#     MAIN_PATH / 'data/train.csv',
#     usecols=[1,2,3,4,5],
#     dtype={'onpromotion': bool},
#     converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
#     parse_dates=["date"],
#     skiprows=range(1, 66458909))
#
# df_train.sample(frac=.01).to_csv(MAIN_PATH / 'data/train_sample.csv')
#
# NOTE(review): if train_sample.csv was written by the snippet above, its
# unit_sales are already log1p-transformed and the converter below applies
# log1p a second time -- confirm how the sample file was generated.


def log1p_positive_sales(raw_value):
    '''Parse a raw unit_sales field: log1p for positive values, 0 otherwise.'''
    sales = float(raw_value)
    return np.log1p(sales) if sales > 0 else 0


# Load the small sample used while building the pipeline
df_train = pd.read_csv(
    MAIN_PATH / 'data/train_sample.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': log1p_positive_sales},
    parse_dates=["date"],
)

df_train.shape

(590381, 5)

In [4]:
# Test-period rows, indexed by (store, item, date) so they can be pivoted to wide format later
test_read_options = dict(
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
)
df_test = (
    pd.read_csv(MAIN_PATH / 'data/test.csv', **test_read_options)
    .set_index(['store_nbr', 'item_nbr', 'date'])
)
df_test.shape

(3370464, 2)

In [5]:
# Item and store metadata tables, each indexed by its own identifier column
items, stores = [
    pd.read_csv(MAIN_PATH / relative_path).set_index(key_column)
    for relative_path, key_column in (
        ("data/items.csv", "item_nbr"),
        ("data/stores.csv", "store_nbr"),
    )
]

items.shape, stores.shape

((4100, 3), (54, 4))

In [6]:
# Keep only the 2017 observations.
# pd.Timestamp replaces the pd.datetime alias, which was deprecated in
# pandas 1.0 and removed in pandas 2.0.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp(2017, 1, 1)]
del df_train  # frees the full frame; rerunning this cell requires reloading df_train first
df_2017.shape

(238021, 5)

# Feature Engineering

## Label encode categorical values

In [7]:
# Replace each categorical text column with integer codes.
# The same encoder object is refit for every column, so the codes are only
# meaningful within a single column.
le = LabelEncoder()
for frame, column in [
    (items, 'family'),
    (stores, 'city'),
    (stores, 'state'),
    (stores, 'type'),
]:
    frame[column] = le.fit_transform(frame[column].values)

## Clean promo variable

Promo variable (bool) by store and item over time

In [8]:
# Wide promo table for the training period: one row per (store, item) and one
# boolean column per date; missing store/item/date combinations become False.
promo_long = df_2017.set_index(['store_nbr', 'item_nbr', 'date'])[['onpromotion']]
promo_2017_train = promo_long.unstack(level=-1).fillna(False)
del promo_long

# Drop the outer 'onpromotion' column level so the columns are just the dates
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

In [9]:
# Same wide layout for the test period (dates as columns, False where missing)
test_promo_wide = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
test_promo_wide.columns = test_promo_wide.columns.get_level_values(1)

# Align rows to the training (store, item) pairs; pairs absent from the test file become False
promo_2017_test = test_promo_wide.reindex(promo_2017_train.index).fillna(False)
del test_promo_wide

In [10]:
# Both frames are wide with dates as columns (their outer 'onpromotion' column
# level was dropped when they were built) and share the same (store, item) rows,
# so concatenating along the columns extends every row with the test-period dates.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis='columns')
del promo_2017_train
del promo_2017_test
promo_2017

Unnamed: 0_level_0,date,2017-01-01,2017-01-02,2017-01-03,2017-01-04,2017-01-05,2017-01-06,2017-01-07,2017-01-08,2017-01-09,2017-01-10,...,2017-08-22,2017-08-23,2017-08-24,2017-08-25,2017-08-26,2017-08-27,2017-08-28,2017-08-29,2017-08-30,2017-08-31
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105575,False,False,False,False,False,False,False,False,False,False,...,False,False,True,True,True,True,True,True,True,True
1,105577,False,False,False,False,False,False,False,False,False,False,...,False,True,True,True,True,True,True,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54,2088922,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
54,2089339,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
54,2101795,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,True,False,False
54,2103250,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Sales by store, item and date

In [11]:
# Pivot sales to wide format: one row per (store, item), one column per date,
# with 0 for store/item/date combinations that have no record.
sales_long = df_2017.set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
df_2017 = sales_long.unstack(level=-1).fillna(0)
del sales_long

# Drop the outer 'unit_sales' column level so the columns are just the dates
df_2017.columns = df_2017.columns.get_level_values(1)

# Repeat the metadata rows so items and stores line up row-for-row with df_2017
store_level = df_2017.index.get_level_values(0)
item_level = df_2017.index.get_level_values(1)
items = items.reindex(item_level)
stores = stores.reindex(store_level)

## Item sales and num promotion per item over time

In [12]:
# Per-item totals across all stores, one column per date
sales_date_columns = df_2017.columns
df_2017_item = df_2017.groupby('item_nbr')[sales_date_columns].sum()

# Per-item count of stores running a promotion, one column per date
promo_date_columns = promo_2017.columns
promo_2017_item = promo_2017.groupby('item_nbr')[promo_date_columns].sum()

## Total sales per class and store over time

In [13]:
# Tag every (store, item) row with its item class, then total sales per class and store
sales_with_class = df_2017.reset_index().assign(**{'class': items['class'].values})

# One (class, store) pair per row of df_2017, in the same row order
df_2017_store_class_index = sales_with_class[['class', 'store_nbr']]

df_2017_store_class = (
    sales_with_class
    .groupby(['class', 'store_nbr'])[df_2017.columns]
    .sum()
)
del sales_with_class

## Total promo per class and store over time

In [14]:
# Tag every (store, item) promo row with its item class, then count promotions per class and store
promo_with_class = promo_2017.reset_index().assign(**{'class': items['class'].values})

# One (class, store) pair per row of promo_2017, in the same row order
df_2017_promo_store_class_index = promo_with_class[['class', 'store_nbr']]

df_2017_promo_store_class = (
    promo_with_class
    .groupby(['class', 'store_nbr'])[promo_2017.columns]
    .sum()
)
del promo_with_class

# Data prep functions

In [15]:
def get_timespan(df, dt, minus, periods, freq='D'):
    '''Select a run of date columns from a wide-format dataframe.

    The window opens `minus` days before `dt` and contains `periods` column
    labels spaced `freq` apart.

    df is a dataframe in wide format with dates along the columns.

    Example:
        dt = 2017-06-14, minus = 1, periods = 3
        window start = dt - timedelta(days=1) = 2017-06-13
        pd.date_range(window start, periods=3, freq='D')
            ==> DatetimeIndex(['2017-06-13', '2017-06-14', '2017-06-15'])
        so the columns for 13, 14 and 15 June are returned.
    '''
    window_start = dt - timedelta(days=minus)
    wanted_columns = pd.date_range(start=window_start, periods=periods, freq=freq)
    return df[wanted_columns]

# Prepare features

In [16]:
def prepare_dataset(df, t2017, is_train=True, name_prefix=None, num_days_preds=16,
                    windows=(3, 7, 14, 30, 60, 140)):
    '''Build statistical features (and optionally targets) for every row of `df`.

    For each window length `w` in `windows`, the `w` daily columns immediately
    before the reference date `t2017` are summarised per row.

    Parameters
    ----------
    df : wide-format dataframe with one column per date
    t2017 : reference date; feature windows end the day before it and the
        targets start on it
    is_train : when True, also return the targets
    name_prefix : optional prefix prepended to every feature column name
        (now applied in both modes; previously it was ignored when
        is_train=True)
    num_days_preds : number of daily target columns, starting at `t2017`
    windows : look-back window lengths in days

    Returns
    -------
    X, or (X, y) when is_train is True. X has one row per row of `df` with a
    fresh integer index; y is an array of shape (len(df), num_days_preds).
    '''
    X = {}

    for i in windows:
        tmp = get_timespan(df, t2017, minus=i, periods=i)

        # Weights 0.9**(i-1) ... 0.9**0: the most recent day counts the most.
        decay_weights = np.power(0.9, np.arange(i)[::-1])

        X['diff_%s_mean' % i] = tmp.diff(axis=1).mean(axis=1).values
        # Despite the name this is a decay-weighted sum, not a normalised mean.
        X['mean_%s_decay' % i] = (tmp * decay_weights).sum(axis=1).values
        X['mean_%s' % i] = tmp.mean(axis=1).values
        X['median_%s' % i] = tmp.median(axis=1).values
        X['min_%s' % i] = tmp.min(axis=1).values
        X['max_%s' % i] = tmp.max(axis=1).values
        X['std_%s' % i] = tmp.std(axis=1).values

    X = pd.DataFrame(X)

    if name_prefix is not None:
        X.columns = ['%s_%s' % (name_prefix, c) for c in X.columns]

    if is_train:
        # Targets: the next `num_days_preds` days starting at t2017
        y_date_range = pd.date_range(t2017, periods=num_days_preds)
        y = df[y_date_range].values
        return X, y
    return X

## Build training data

In [24]:
print("Preparing dataset...")

t2017 = date(2017, 6, 14)  # first reference date: features use the days before it, targets start on it
num_days_preds = 16        # number of days into the future to predict (y values)

num_days = 6               # number of weekly reference dates stacked into the training set
# num_days = 1

feature_frames, target_blocks = [], []

# Each iteration moves the reference date forward by 7 days and rebuilds the
# features (windows ending the day before the reference date) and the targets
# (the num_days_preds days starting on it). Every store/item pair therefore
# contributes num_days rows to the training set, one per reference date.
# Note: `i` counts weekly steps, although the progress message labels it "days".
for i in range(num_days):
    print("------------------------------")
    print(f"days: {i}")

    from_date = t2017 + timedelta(weeks=i)
    y_preds_range = pd.date_range(from_date, periods=num_days_preds)
    print(f"from: {from_date} | {y_preds_range}")

    # Store-by-item sales features; the horizon is passed explicitly so the
    # targets always have num_days_preds columns, matching the training loop.
    sales_features, sales_targets = prepare_dataset(
        df_2017, from_date, num_days_preds=num_days_preds
    )

    # Append the row-aligned item and store metadata as extra feature columns
    feature_frames.append(
        pd.concat(
            [sales_features, items.reset_index(), stores.reset_index()],
            axis=1,
        )
    )
    target_blocks.append(sales_targets)

# Stack the per-date blocks along the rows
X_train = pd.concat(feature_frames, axis=0)
y_train = np.concatenate(target_blocks, axis=0)

del feature_frames, target_blocks

X_train.head()

Preparing dataset...
------------------------------
days: 0
from: 2017-06-14 | DatetimeIndex(['2017-06-14', '2017-06-15', '2017-06-16', '2017-06-17',
               '2017-06-18', '2017-06-19', '2017-06-20', '2017-06-21',
               '2017-06-22', '2017-06-23', '2017-06-24', '2017-06-25',
               '2017-06-26', '2017-06-27', '2017-06-28', '2017-06-29'],
              dtype='datetime64[ns]', freq='D')
------------------------------
days: 1
from: 2017-06-21 | DatetimeIndex(['2017-06-21', '2017-06-22', '2017-06-23', '2017-06-24',
               '2017-06-25', '2017-06-26', '2017-06-27', '2017-06-28',
               '2017-06-29', '2017-06-30', '2017-07-01', '2017-07-02',
               '2017-07-03', '2017-07-04', '2017-07-05', '2017-07-06'],
              dtype='datetime64[ns]', freq='D')
------------------------------
days: 2
from: 2017-06-28 | DatetimeIndex(['2017-06-28', '2017-06-29', '2017-06-30', '2017-07-01',
               '2017-07-02', '2017-07-03', '2017-07-04', '2017-07-05

Unnamed: 0,diff_3_mean,mean_3_decay,mean_3,median_3,min_3,max_3,std_3,diff_7_mean,mean_7_decay,mean_7,...,std_140,item_nbr,family,class,perishable,store_nbr,city,state,type,cluster
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.044505,96995,12,1093,0,1,18,12,3,13
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.044505,103520,12,1028,0,1,18,12,3,13
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.12551,103665,5,2712,1,1,18,12,3,13
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,105575,12,1045,0,1,18,12,3,13
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.08828,105577,12,1045,0,1,18,12,3,13


## Build validation and test data

In [18]:
# Create validation set
val2017 = date(2017, 7, 26)
X_val, y_val = prepare_dataset(df_2017, val2017)
X_val = pd.concat([X_val, 
                   items.reset_index(), 
                   stores.reset_index()], axis=1)

In [19]:
# Create test set
test2017 = date(2017, 8, 16)
X_test = prepare_dataset(df_2017, test2017, is_train=False)

X_test = pd.concat([X_test, 
                    items.reset_index(), 
                    stores.reset_index()], axis=1)

In [20]:
# del X_test2, X_val2, df_2017_item, promo_2017_item, df_2017_store_class, df_2017_promo_store_class, df_2017_store_class_index
# gc.collect()

# Train model

In [21]:
# LightGBM settings shared by every per-day model
params = dict(
    num_leaves=80,
    objective='regression',
    min_data_in_leaf=200,
    learning_rate=0.02,
    feature_fraction=0.8,
    bagging_fraction=0.7,
    bagging_freq=1,
    metric='l2',
    num_threads=16,
)

# Upper bound on boosting rounds per model
MAX_ROUNDS = 5000

In [22]:
start = time.time()

val_pred = []
test_pred = []
cate_vars = []

# Sample weights: perishable items get 1.25, every other item 1.0.
# X_train stacks num_days copies of the store/item rows, so the training
# weights are tiled num_days times to match. Both are loop-invariant.
weight = items['perishable'] * 0.25 + 1
train_weight = pd.concat([items["perishable"]] * num_days) * 0.25 + 1

# One independent model per forecast day: model i predicts column i of the targets
for i in range(num_days_preds):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)

    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i],
        categorical_feature=cate_vars,
        weight=weight
    )
    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
    # older LightGBM releases; newer releases expect callbacks instead -- confirm
    # against the installed version before upgrading.
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=125, verbose_eval=50
    )

    # Feature importances by total gain, largest first
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))

    # Predict with the early-stopped iteration, or all rounds if early stopping never fired
    best_rounds = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=best_rounds))
    test_pred.append(bst.predict(X_test, num_iteration=best_rounds))


# val_pred is a list of per-day vectors; transpose to (rows, days) to match y_val
val_pred_matrix = np.array(val_pred).transpose()

print("\nValidation mse:",
      mean_squared_error(y_val, val_pred_matrix))

# Weighted RMSE on the log1p scale, averaged over rows (by weight) and over
# the forecast horizon (num_days_preds instead of a hard-coded 16)
err = (y_val - val_pred_matrix)**2
err = err.sum(axis=1) * weight
err = np.sqrt(err.sum() / weight.sum() / num_days_preds)
print("Validation nwrmsle = {}".format(err))
print(f"Time taken: {(time.time() - start) / 60} mins")

    

Step 1




Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.00833986	valid_1's l2: 0.00853434
[100]	training's l2: 0.00830314	valid_1's l2: 0.00853559
[150]	training's l2: 0.00827232	valid_1's l2: 0.00853798
Early stopping, best iteration is:
[67]	training's l2: 0.00832636	valid_1's l2: 0.00853416
item_nbr: 89.95
max_140: 74.80
class: 67.91
mean_140_decay: 52.22
store_nbr: 42.57
std_140: 33.95
cluster: 27.92
mean_140: 27.82
family: 24.63
city: 19.35
max_60: 16.82
mean_60_decay: 15.92
state: 14.72
std_60: 14.00
max_30: 12.42
mean_60: 11.77
type: 11.12
mean_30_decay: 7.23
mean_30: 7.18
mean_14_decay: 6.99
perishable: 5.05
std_30: 4.85
max_14: 4.16
mean_14: 3.28
diff_7_mean: 2.60
diff_3_mean: 2.39
mean_7_decay: 2.25
max_7: 2.11
std_14: 1.52
diff_60_mean: 1.24
mean_3_decay: 1.22
mean_7: 1.01
diff_30_mean: 0.83
std_3: 0.48
diff_14_mean: 0.35
mean_3: 0.23
diff_140_mean: 0.21
std_7: 0.19
max_3: 0.19
median_3: 0.00
min_3: 0.00
median_7: 0.00
min_7: 0.00
median_14: 0.00


KeyboardInterrupt: 

In [None]:
# Scratch cell (not yet executed): display the item metadata frame
items

In [None]:
# Scratch cell (not yet executed): shape of the validation targets
y_val.shape

In [None]:
# Scratch cell (not yet executed): number of per-day validation prediction vectors collected so far
len(val_pred)

# Create submission