In [1]:
from  datetime import datetime, timedelta
import gc
import numpy as np 
import pandas as pd
import lightgbm as lgb

In [2]:
#Column dtypes used when reading calendar.csv (text columns become categoricals, the rest compact numerics)
cal_dtype = dict(
    event_name_1="category",
    event_name_2="category",
    event_type_1="category",
    event_type_2="category",
    weekday="category",
    wm_yr_wk="int16",
    wday="int16",
    month="int16",
    year="int16",
    snap_CA="float32",
    snap_TX="float32",
    snap_WI="float32",
)
#Column dtypes used when reading sell_prices.csv
price_dtype = dict(
    store_id="category",
    item_id="category",
    wm_yr_wk="int16",
    sell_price="float32",
)
pd.set_option("display.max_columns", 50)

h = 28                          #time horizon for prediction
max_lags = 57                   #number of previous values needed for lag and rolling mean calculation
tr_last = 1941                  #last value training set
fday = datetime(2016, 5, 23)    #first value inference set

In [3]:
def _encode_categories(frame, columns):
    """Replace each categorical column of `frame` named in `columns` by int16 codes, in place.

    The codes are shifted so that the smallest code present in a column becomes 0.
    """
    for col in columns:
        frame[col] = frame[col].cat.codes.astype("int16")
        frame[col] -= frame[col].min()


def create_dt(is_train = True, nrows = None, first_day = 1200):
    """Build the long-format sales table (one row per series and day) joined with calendar and prices.

    Parameters
    ----------
    is_train : bool
        True: load the `d_<n>` columns from `first_day` up to `tr_last`.
        False: load from `max(tr_last - max_lags, first_day)` up to `tr_last` and
        append `h` further day columns filled with NaN to hold the predictions.
    nrows : int or None
        Forwarded to `pd.read_csv` for the sales file.
    first_day : int
        Lower bound for the first `d_<n>` column that is loaded.

    Returns
    -------
    pandas.DataFrame
        The melted sales (columns `d` and `sales` plus the id columns), merged with
        the calendar on `d` and with the prices on store_id, item_id, wm_yr_wk.
        Both merges use the default join type of `DataFrame.merge`.

    Relies on the module-level names `price_dtype`, `cal_dtype`, `tr_last`,
    `max_lags` and `h`.
    """
    #Read price dataset and set category columns as int16 to reduce memory
    prices = pd.read_csv("../input/m5-forecasting-accuracy/sell_prices.csv", dtype = price_dtype)
    _encode_categories(prices, [col for col, col_dtype in price_dtype.items() if col_dtype == "category"])

    #Read calendar dataset and set category columns as int16 to reduce memory
    cal = pd.read_csv("../input/m5-forecasting-accuracy/calendar.csv", dtype = cal_dtype)
    cal["date"] = pd.to_datetime(cal["date"])
    _encode_categories(cal, [col for col, col_dtype in cal_dtype.items() if col_dtype == "category"])

    #Training may start as early as day 1; inference never starts before tr_last - max_lags
    start_day = max(1 if is_train else tr_last - max_lags, first_day)

    numcols = [f"d_{day}" for day in range(start_day, tr_last + 1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol: "float32" for numcol in numcols}
    dtype.update({col: "category" for col in catcols if col != "id"})

    #Reading sales dataset and change category columns to int16 ("id" stays a string key)
    dt = pd.read_csv("../input/m5-forecasting-accuracy/sales_train_evaluation.csv",
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)
    _encode_categories(dt, [col for col in catcols if col != "id"])

    #Generate h empty columns for prediction in case of inference set
    if not is_train:
        for day in range(tr_last + 1, tr_last + h + 1):
            dt[f"d_{day}"] = np.nan

    #Melt sales data to get each day sales in rows instead of columns
    dt = pd.melt(dt,
                 id_vars = catcols,
                 value_vars = [col for col in dt.columns if col.startswith("d_")],
                 var_name = "d",
                 value_name = "sales")

    #Merge sales data with calendar data and price data
    dt = dt.merge(cal, on = "d", copy = False)
    #NOTE(review): store_id/item_id are joined on integer codes that were generated
    #independently for the price file and the sales file; this presumably relies on both
    #files containing the same set of stores/items (may not hold when nrows truncates
    #the sales file) - confirm before using nrows.
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

In [4]:
def create_features(dt):
    """Add lag, rolling-mean and calendar features to `dt` in place.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must contain the columns "id", "sales" and a datetime column "date".
        Lags are taken with a per-"id" `shift`, so the rows of each id are
        expected to already be in chronological order.

    Added / updated columns
    -----------------------
    lag_7, lag_28
        "sales" shifted by 7 / 28 rows within each id.
    rmean_<lag>_<win>
        Rolling mean over `win` rows (7 or 28) of the corresponding lag column,
        computed within each id.
    wday, week, month, quarter, year, mday
        Cast to int16 when the column already exists, otherwise derived from "date".

    Returns
    -------
    None
        `dt` is modified in place.
    """
    #Create lags
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt.groupby("id")["sales"].shift(lag)

    #Create rolling means of every lag column
    wins = [7, 28]
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt.groupby("id")[lag_col].transform(lambda x: x.rolling(win).mean())

    #Create date features from date
    date_features = {"wday": "weekday",
                     "week": "weekofyear",
                     "month": "month",
                     "quarter": "quarter",
                     "year": "year",
                     "mday": "day"}

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
            continue
        date_accessor = dt["date"].dt
        if date_feat_func == "weekofyear" and hasattr(date_accessor, "isocalendar"):
            #`.dt.weekofyear` was removed in pandas 2.0; isocalendar().week yields the same ISO week number
            values = date_accessor.isocalendar().week
        else:
            values = getattr(date_accessor, date_feat_func)
        dt[date_feat_name] = values.astype("int16")


In [5]:
%%time
#Build the training table starting at the very first day column (d_1) and show its shape
first_day = 1
df = create_dt(is_train=True, first_day= first_day)
df.shape

CPU times: user 35 s, sys: 14.3 s, total: 49.2 s
Wall time: 49.3 s


(46881677, 22)

In [6]:
%%time
#create_features adds its columns to df in place, so only the shape is displayed afterwards
create_features(df)
df.shape

CPU times: user 3min 10s, sys: 19.3 s, total: 3min 29s
Wall time: 3min 29s


(46881677, 31)

In [7]:
#Drop (in place) every row that has a NaN in any column, then show the remaining shape
df.dropna(inplace = True)
df.shape

(45204727, 31)

In [8]:
#Columns handed to LightGBM as categorical features
cat_feats = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    "event_name_1", "event_name_2", "event_type_1", "event_type_2",
]
#Columns kept out of the feature matrix (includes the target "sales")
useless_cols = [
    "id",
    "date",
    "sales",
    "d",
    "wm_yr_wk",
    "weekday",
]
#Every remaining column, in its original order, is a model feature
train_cols = df.columns[~df.columns.isin(useless_cols)]
y_train = df["sales"]
X_train = df[train_cols]

In [9]:
#Creating dataset for LightGBM model
%%time
np.random.seed(777)
fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds],
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats, free_raw_data=False)

CPU times: user 17.5 s, sys: 2.54 s, total: 20 s
Wall time: 20 s


In [10]:
#Drop the references that are no longer needed, then force a garbage-collection pass to free RAM
del df, X_train, y_train, fake_valid_inds, train_inds
gc.collect()

116

In [11]:
#LightGBM training parameters
params = dict(
    objective="poisson",
    metric="rmse",
    force_row_wise=True,
    learning_rate=0.075,
    sub_row=0.75,
    bagging_freq=1,
    lambda_l2=0.1,
    verbosity=1,
    num_iterations=1200,
    num_leaves=128,
    min_data_in_leaf=100,
)

In [12]:
#Training the model
%%time
m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=20) 



[20]	valid_0's rmse: 3.02293
[40]	valid_0's rmse: 2.57764
[60]	valid_0's rmse: 2.45981
[80]	valid_0's rmse: 2.42675
[100]	valid_0's rmse: 2.41573
[120]	valid_0's rmse: 2.40753
[140]	valid_0's rmse: 2.39997
[160]	valid_0's rmse: 2.39243
[180]	valid_0's rmse: 2.38456
[200]	valid_0's rmse: 2.37745
[220]	valid_0's rmse: 2.37176
[240]	valid_0's rmse: 2.36775
[260]	valid_0's rmse: 2.36338
[280]	valid_0's rmse: 2.35837
[300]	valid_0's rmse: 2.35389
[320]	valid_0's rmse: 2.35021
[340]	valid_0's rmse: 2.34703
[360]	valid_0's rmse: 2.34345
[380]	valid_0's rmse: 2.33953
[400]	valid_0's rmse: 2.33566
[420]	valid_0's rmse: 2.33221
[440]	valid_0's rmse: 2.33019
[460]	valid_0's rmse: 2.32743
[480]	valid_0's rmse: 2.32515
[500]	valid_0's rmse: 2.32259
[520]	valid_0's rmse: 2.32078
[540]	valid_0's rmse: 2.31746
[560]	valid_0's rmse: 2.31534
[580]	valid_0's rmse: 2.31229
[600]	valid_0's rmse: 2.31083
[620]	valid_0's rmse: 2.30893
[640]	valid_0's rmse: 2.30642
[660]	valid_0's rmse: 2.30397
[680]	valid_0'

In [13]:
#Persist the trained booster to the working directory
m_lgb.save_model("model_eval.lgb")

<lightgbm.basic.Booster at 0x7fce61180750>

In [14]:
#Predicting future 28 day sales 1 day at a time
%%time

sub = 0.

te = create_dt(False)
cols = [f"F{i}" for i in range(1,29)]

for tdelta in range(0, 28):
    day = fday + timedelta(days=tdelta)
    print(tdelta, day)
    tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
    create_features(tst)
    tst = tst.loc[tst.date == day , train_cols]
    te.loc[te.date == day, "sales"] = m_lgb.predict(tst) 


#Selecting the rows for prediction days
te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()

#Converting rows to columns for submission
te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
te_sub.fillna(0., inplace = True)
te_sub.sort_values("id", inplace = True)
te_sub.reset_index(drop=True, inplace = True)

sub = te_sub

0 2016-05-23 00:00:00
1 2016-05-24 00:00:00
2 2016-05-25 00:00:00
3 2016-05-26 00:00:00
4 2016-05-27 00:00:00
5 2016-05-28 00:00:00
6 2016-05-29 00:00:00
7 2016-05-30 00:00:00
8 2016-05-31 00:00:00
9 2016-06-01 00:00:00
10 2016-06-02 00:00:00
11 2016-06-03 00:00:00
12 2016-06-04 00:00:00
13 2016-06-05 00:00:00
14 2016-06-06 00:00:00
15 2016-06-07 00:00:00
16 2016-06-08 00:00:00
17 2016-06-09 00:00:00
18 2016-06-10 00:00:00
19 2016-06-11 00:00:00
20 2016-06-12 00:00:00
21 2016-06-13 00:00:00
22 2016-06-14 00:00:00
23 2016-06-15 00:00:00
24 2016-06-16 00:00:00
25 2016-06-17 00:00:00
26 2016-06-18 00:00:00
27 2016-06-19 00:00:00
CPU times: user 45min 20s, sys: 17.6 s, total: 45min 37s
Wall time: 40min 46s


In [15]:
#Sanity check: number of distinct ids, and how many ids end with "validation"
sub["id"].nunique(), sub["id"].str.contains("validation$").sum()

(30490, 0)

In [16]:
#Shape of the submission table: one row per id, the id column plus the F-columns
sub.shape

(30490, 29)

In [17]:
#Eyeball the last 10 rows of the submission table
sub.tail(10)

F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
30480,HOUSEHOLD_2_516_CA_1_evaluation,0.196804,0.190303,0.173647,0.164056,0.271941,0.2817,0.269095,0.217232,0.185091,0.178228,0.190902,0.234152,0.225553,0.223551,0.154217,0.151678,0.154407,0.143056,0.193281,0.225908,0.228067,0.151253,0.142731,0.138127,0.144285,0.186299,0.218811,0.198264
30481,HOUSEHOLD_2_516_CA_2_evaluation,0.170984,0.183647,0.177004,0.191395,0.263777,0.355653,0.296765,0.229875,0.192716,0.183933,0.214338,0.262835,0.369481,0.371549,0.21604,0.211279,0.203438,0.200422,0.284238,0.393404,0.364766,0.230025,0.207105,0.214275,0.219429,0.314289,0.509879,0.434026
30482,HOUSEHOLD_2_516_CA_3_evaluation,0.101907,0.134861,0.138655,0.14172,0.171662,0.199013,0.188777,0.170717,0.126897,0.132792,0.143149,0.16526,0.178,0.172194,0.135351,0.133268,0.149735,0.151663,0.171479,0.196573,0.1853,0.131158,0.137383,0.154405,0.154408,0.184281,0.215506,0.191459
30483,HOUSEHOLD_2_516_CA_4_evaluation,0.132683,0.140113,0.140313,0.146131,0.155917,0.178952,0.142979,0.13407,0.123725,0.1171,0.12684,0.142871,0.148714,0.146358,0.123442,0.107739,0.102802,0.108387,0.123648,0.134441,0.131459,0.111373,0.10988,0.112364,0.114937,0.13722,0.150474,0.12382
30484,HOUSEHOLD_2_516_TX_1_evaluation,0.056705,0.055949,0.055339,0.057925,0.064507,0.076747,0.073188,0.129266,0.1155,0.098968,0.105974,0.115999,0.133248,0.125854,0.102979,0.101214,0.098356,0.127853,0.148077,0.162995,0.157634,0.128846,0.124655,0.127266,0.135483,0.158046,0.179625,0.161194
30485,HOUSEHOLD_2_516_TX_2_evaluation,0.203598,0.195399,0.190857,0.185437,0.234381,0.276635,0.26368,0.227095,0.179311,0.166687,0.182328,0.223855,0.262402,0.24831,0.156005,0.152537,0.150102,0.153469,0.204535,0.234769,0.217157,0.154936,0.150152,0.14149,0.159582,0.205668,0.244096,0.225897
30486,HOUSEHOLD_2_516_TX_3_evaluation,0.153028,0.141534,0.134474,0.138585,0.180521,0.238111,0.266273,0.207849,0.181953,0.167342,0.181495,0.232451,0.264511,0.243136,0.175941,0.185829,0.18119,0.175113,0.19531,0.229091,0.230584,0.170311,0.162635,0.166611,0.171918,0.234438,0.276085,0.269427
30487,HOUSEHOLD_2_516_WI_1_evaluation,0.106169,0.105764,0.107014,0.118542,0.16291,0.189634,0.177679,0.126264,0.110936,0.102149,0.109537,0.147821,0.182315,0.174952,0.120263,0.118249,0.115545,0.122167,0.155525,0.169574,0.157383,0.111372,0.108389,0.111991,0.115209,0.163936,0.194547,0.164577
30488,HOUSEHOLD_2_516_WI_2_evaluation,0.093552,0.084074,0.085365,0.087175,0.107737,0.10659,0.096473,0.100018,0.093646,0.09803,0.10378,0.126461,0.116272,0.111554,0.095708,0.096388,0.092938,0.096977,0.118315,0.117257,0.107049,0.100633,0.095641,0.098039,0.099182,0.121749,0.120471,0.097466
30489,HOUSEHOLD_2_516_WI_3_evaluation,0.103589,0.100018,0.101354,0.110252,0.119836,0.11734,0.109056,0.116963,0.106657,0.097961,0.106139,0.136059,0.13073,0.123103,0.099579,0.098089,0.096784,0.100318,0.126217,0.126557,0.124917,0.100674,0.097253,0.099528,0.10435,0.132766,0.139251,0.118756


In [18]:
#Write the submission file without the positional index column
sub.to_csv("submission_eval.csv",index=False)