In [None]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

> This notebook aims to push the public leaderboard (LB) score below 0.50. The competition has not yet peaked, so there is still clear room for improvement.

This notebook is based on [m5-first-public-notebook-under-0-50](https://www.kaggle.com/kneroma/m5-first-public-notebook-under-0-50) v.6 by @kkiller 


In [None]:
# dtypes passed to pd.read_csv for calendar.csv, grouped by target dtype.
# Columns typed "category" are later replaced by integer codes in create_dt().
CAL_DTYPES = {
    **dict.fromkeys(
        ["event_name_1", "event_name_2", "event_type_1", "event_type_2", "weekday"],
        "category",
    ),
    **dict.fromkeys(["wm_yr_wk", "wday", "month", "year"], "int16"),
    **dict.fromkeys(["snap_CA", "snap_TX", "snap_WI"], "float32"),
}

# dtypes passed to pd.read_csv for sell_prices.csv.
PRICE_DTYPES = {
    **dict.fromkeys(["store_id", "item_id"], "category"),
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}

In [None]:
# Let pandas display up to 50 columns so the wide feature frames are not truncated.
pd.options.display.max_columns = 50

In [None]:
# h: presumably the forecast horizon in days (28) -- TODO confirm intent.
# max_lags: number of look-back days kept around each predicted day at test time.
# tr_last: number of the last "d_<n>" column read from the sales file.
h, max_lags, tr_last = 28, 57, 1913

# Calendar date of the first day that the prediction loop forecasts.
fday = datetime(year=2016, month=4, day=25)
fday

In [None]:
def _encode_categories(frame, cols):
    """Replace each categorical column in `cols` by zero-based int16 codes, in place."""
    for col in cols:
        codes = frame[col].cat.codes.astype("int16")
        # pandas uses code -1 for missing values; subtracting the minimum
        # makes the smallest code 0 in either case.
        frame[col] = codes - codes.min()


def create_dt(is_train = True, nrows = None, first_day = 1200):
    """Load prices, calendar and sales and return one long-format DataFrame.

    Parameters
    ----------
    is_train : bool
        If True, day columns from `first_day` (at least 1) up to `tr_last` are
        loaded. If False, loading starts no earlier than `tr_last - max_lags`
        and `h` extra day columns filled with NaN are appended after `tr_last`
        as placeholders for the days to predict.
    nrows : int or None
        Forwarded to `pd.read_csv` for the sales file.
    first_day : int
        Lower bound for the first "d_<n>" column that is read.

    Returns
    -------
    pandas.DataFrame
        One row per (id, day) with the melted `sales` value, joined to the
        calendar on "d" and to the prices on (store_id, item_id, wm_yr_wk)
        using pandas' default merge behaviour.

    Notes
    -----
    Relies on the notebook-level names `PRICE_DTYPES`, `CAL_DTYPES`,
    `tr_last`, `max_lags` and `h`.

    NOTE(review): `store_id` / `item_id` codes are derived independently for
    the price file and the sales file. With `nrows` set, the sales file may
    see fewer distinct values and the codes may no longer line up for the
    price merge -- confirm before using `nrows`.
    """
    prices = pd.read_csv("../input/m5-forecasting-accuracy/sell_prices.csv", dtype = PRICE_DTYPES)
    _encode_categories(
        prices, [col for col, col_dtype in PRICE_DTYPES.items() if col_dtype == "category"])

    cal = pd.read_csv("../input/m5-forecasting-accuracy/calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    _encode_categories(
        cal, [col for col, col_dtype in CAL_DTYPES.items() if col_dtype == "category"])

    start_day = max(1 if is_train else tr_last - max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day, tr_last + 1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol: "float32" for numcol in numcols}
    # "id" stays a plain string column; it is the grouping key later on.
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv("../input/m5-forecasting-accuracy/sales_train_validation.csv",
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)
    _encode_categories(dt, [col for col in catcols if col != "id"])

    if not is_train:
        # Placeholder columns for the h days that will be predicted.
        for day in range(tr_last + 1, tr_last + h + 1):
            dt[f"d_{day}"] = np.nan

    dt = pd.melt(dt,
                 id_vars = catcols,
                 value_vars = [col for col in dt.columns if col.startswith("d_")],
                 var_name = "d",
                 value_name = "sales")

    dt = dt.merge(cal, on = "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

In [None]:
def create_fea(dt):
    """Add lag, rolling-mean and calendar features to `dt` in place.

    Expects columns "id", "sales" and "date" (datetime64). Rows are assumed to
    be ordered by date within each id, since lags use positional `shift`.

    Columns written:
      * lag_7, lag_28: `sales` shifted by 7 / 28 rows within each id.
      * rmean_<lag>_<win>: rolling mean over `win` rows of the lag column,
        computed within each id.
      * wday, week, month, quarter, year, mday: int16 calendar parts. A column
        that already exists is only cast to int16; otherwise it is derived
        from `date`.

    Returns None; `dt` is modified.
    """
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id", "sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = (
                dt[["id", lag_col]]
                .groupby("id")[lag_col]
                .transform(lambda x: x.rolling(win).mean())
            )

    # Feature name -> attribute of the `.dt` accessor. "week" is handled
    # separately below because `.dt.weekofyear` was removed in pandas 2.0.
    # (is_month_end / is_month_start were left disabled in the original.)
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        elif date_feat_func == "weekofyear":
            # ISO week number; same values the removed `weekofyear` returned.
            dt[date_feat_name] = dt["date"].dt.isocalendar().week.astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [None]:
# First "d_<n>" day loaded for training; passed to create_dt(first_day=...).
FIRST_DAY = 350  # Set to 1 to load the full history -- high risk of running out of memory!

In [None]:
%%time

# Build the long-format training table, starting at day FIRST_DAY.
df = create_dt(is_train=True, first_day= FIRST_DAY)
df.shape

In [None]:
df.head()

In [None]:
df.info()

In [None]:
%%time

# Adds the lag / rolling-mean / calendar feature columns to df in place.
create_fea(df)
df.shape

In [None]:
df.info()

In [None]:
df.head()

In [None]:
# Drop every row that contains a NaN. The shifted lag columns and their rolling
# means are NaN for the first rows of each series, so those rows are removed here.
# Note: this mutates df, so the df.head()/df.info() outputs above become stale.
df.dropna(inplace = True)
df.shape

In [None]:
# Integer-coded categorical columns (item hierarchy first, then calendar events).
cat_feats = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    "event_name_1", "event_name_2", "event_type_1", "event_type_2",
]
# Identifier, target and raw calendar-key columns that are not fed to the model.
useless_cols = ["id", "date", "sales", "d", "wm_yr_wk", "weekday"]

# Every remaining column of df, in df's column order, is a model feature.
train_cols = df.columns[[name not in useless_cols for name in df.columns]]
X_train = df[train_cols]
y_train = df["sales"]

In [None]:
%%time

np.random.seed(777)

# Draw a random row sample without replacement. The size is capped at the number
# of available rows so the cell does not raise ValueError on a reduced data set
# (e.g. a late FIRST_DAY); with at least 2,000,000 rows the draw is unchanged.
n_sample = min(2_000_000, len(X_train))
fake_valid_inds = np.random.choice(X_train.index.values, n_sample, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)

# NOTE(review): the model is fitted on the sampled rows only (`fake_valid_inds`);
# `train_inds` is computed but not used by any live cell. Presumably this is a
# deliberate subsample to keep the stacked fit tractable -- confirm.
# This is a plain random sample; no time-series train/validation split is applied.
# The cell overwrites X_train / y_train, so re-running it without re-running the
# previous cell samples from the already-reduced data.
X_train = X_train.loc[fake_valid_inds]
y_train = y_train.loc[fake_valid_inds]

# Same rows in the same order as X_train / y_train (kept under the original names).
X_traine = X_train.copy()
y_traine = y_train.copy()

In [None]:
#del df, X_train, y_train, fake_valid_inds,train_inds ; gc.collect()

In [None]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor

# Hyperparameters shared by all four LightGBM base learners; they differ only
# in objective and number of boosting iterations.
_LGBM_COMMON = dict(
    metric='rmse',
    learning_rate=0.075,
    sub_row=0.75,
    bagging_freq=1,
    lambda_l2=0.1,
    verbosity=1,
    num_leaves=128,
    min_data_in_leaf=100,
)


def _make_lgbm(objective, num_iterations):
    """Return an LGBMRegressor with the shared settings plus the given objective and iteration count."""
    return LGBMRegressor(objective=objective, num_iterations=num_iterations, **_LGBM_COMMON)


lightgbm1 = _make_lgbm('poisson', 2000)
lightgbm2 = _make_lgbm('tweedie', 2000)
lightgbm3 = _make_lgbm('poisson', 1200)
lightgbm4 = _make_lgbm('tweedie', 1500)

# NOTE(review): `xgboost` is defined but not part of the stack below -- confirm
# whether it should be included or removed.
xgboost = XGBRegressor(objective='count:poisson',
                       learning_rate=0.075,
                       n_estimators=1000,
                       min_child_weight=50)

# Stack the four LightGBM models; lightgbm1 is also passed as the meta-regressor,
# which additionally sees the original features (use_features_in_secondary=True).
stackReg = StackingCVRegressor(regressors=(lightgbm1, lightgbm2, lightgbm3, lightgbm4),
                               meta_regressor=lightgbm1,
                               use_features_in_secondary=True,
                               random_state=42)

In [None]:
# def stacked_ensemble(X_train,y_train):
#     estimators = [
#     ('rfr', RandomForestRegressor(n_estimators=150)),
#     ('adb', AdaBoostRegressor(n_estimators=150)),
#     ('etr',ExtraTreesRegressor(n_estimators=150)),
#     ('gbr',GradientBoostingRegressor(n_estimators=150)),
#     ('bar',BaggingRegressor(n_estimators=150))
#     ]
#     reg = StackingRegressor(estimators=estimators,
#                         final_estimator=XGBRegressor(n_estimators=300,learning_rate=0.05))
#     reg.fit(X_train,y_train)
#     return(reg)

In [None]:
#my_model = stacked_ensemble(X_train,y_train)
import warnings
warnings.filterwarnings("default")

In [None]:
%%time
# Fit the stacked regressor on the sampled training rows; m_lgb is the object
# used for prediction below (despite the name, it is the stack, not a single model).
m_lgb= stackReg.fit(X_train, y_train)
                   

# Prediction stage
(updated vs original)

In [None]:
def create_lag_features_for_test(dt, day):
    """Fill the lag and rolling-mean features for the rows of a single `day`, in place.

    Only rows with `date == day` are written, which is faster than recomputing
    the features for the whole frame.

      * lag_<k>: `sales` of the rows dated `day - k` days, assigned positionally
        to the rows of `day` (k in 7, 28).
      * rmean_<k>_<w>: per-id mean of `sales` over the dates in
        (day - k - w, day - k], aligned to the ids of the `day` rows (w in 7, 28).
    """
    offsets = (7, 28)
    spans = (7, 28)

    # Rows being filled and their ids; neither changes while columns are added.
    today = dt["date"] == day
    today_ids = dt.loc[today, "id"]

    for offset in offsets:
        source_rows = dt["date"] == day - timedelta(days=offset)
        dt.loc[today, f"lag_{offset}"] = dt.loc[source_rows, "sales"].values  # !!! main

    for span in spans:
        for offset in offsets:
            newest = day - timedelta(days=offset)
            oldest_exclusive = day - timedelta(days=offset + span)
            in_window = (dt["date"] <= newest) & (dt["date"] > oldest_exclusive)
            mean_by_id = dt.loc[in_window].groupby("id")["sales"].mean()
            dt.loc[today, f"rmean_{offset}_{span}"] = mean_by_id.reindex(today_ids).values

In [None]:
def create_date_features_for_test(dt):
    """Add the int16 calendar features to `dt` in place.

    Uses the same feature table as the date-feature part of `create_fea()`.
    A column that already exists is only cast to int16; otherwise it is
    derived from the datetime column "date". Returns None.
    """
    # Feature name -> attribute of the `.dt` accessor. "week" is handled
    # separately below because `.dt.weekofyear` was removed in pandas 2.0.
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        elif date_feat_func == "weekofyear":
            # ISO week number; same values the removed `weekofyear` returned.
            dt[date_feat_name] = dt["date"].dt.isocalendar().week.astype("int16")
        else:
            dt[date_feat_name] = getattr(
                dt["date"].dt, date_feat_func).astype("int16")

In [None]:
%%time

# Multipliers applied to the raw predictions; one full forecast is produced per
# alpha and the forecasts are blended with equal weights.
alphas = [1.028, 1.023, 1.018]
weights = [1/len(alphas)]*len(alphas)  # equal weights

te0 = create_dt(False)  # master copy of the test frame, reused for every alpha
create_date_features_for_test(te0)

# Submission columns F1..F<h>, one per forecast day.
cols = [f"F{i}" for i in range(1, h + 1)]

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):
    te = te0.copy()

    # Predict one day at a time: each day's prediction is written back into
    # `te` so that later days can use it through their lag features.
    for tdelta in range(h):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day.date())
        tst = te[(te.date >= day - timedelta(days=max_lags))
                 & (te.date <= day)].copy()
        # create_fea(tst) would also be correct here but is much slower.
        create_lag_features_for_test(tst, day)
        tst = tst.loc[tst.date == day, train_cols]
        te.loc[te.date == day, "sales"] = \
            alpha * m_lgb.predict(tst)  # magic multiplier by kyakovlev

    # Reshape the predicted days from long format to one row per id (F1..F<h>).
    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount() + 1]
    te_sub = te_sub.set_index(["id", "F"]).unstack()["sales"][cols].reset_index()
    te_sub = te_sub.fillna(0.).sort_values("id").reset_index(drop=True)

    # The per-alpha file is written before any blending weight is applied.
    te_sub.to_csv(f"submission_{icount}.csv", index=False)

    if icount == 0:
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)
    
    
    
    

In [None]:
sub.head(10)

In [None]:
sub.id.nunique(), sub["id"].str.contains("validation$").sum()
# sub1.id.nunique(), sub1["id"].str.contains("validation$").sum()

In [None]:
sub.shape
# sub1.shape

In [None]:
# Duplicate the validation rows under "evaluation" ids and write both sets to one file.
sub2 = sub.copy()
# "validation$" is a regular expression anchored at the end of the id.
# regex=True is required: from pandas 2.0 the default is regex=False, which would
# look for the literal text "validation$" and leave every id unchanged.
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
# Re-running this cell appends the evaluation rows again, because `sub` is overwritten.
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submissionp.csv",index=False)

In [None]:
# sub3 = sub1.copy()
# sub3["id"] = sub3["id"].str.replace("validation$", "evaluation")
# sub1 = pd.concat([sub1, sub3], axis=0, sort=False)
# sub.to_csv("submissiont.csv",index=False)

In [None]:
# poisson = sub.sort_values(by = 'id').reset_index(drop = True)
# tweedie = sub1.sort_values(by = 'id').reset_index(drop = True)
# sub5 = poisson.copy()

# for i in sub5.columns :
#     if i != 'id' :
#         sub5[i] = 0.5*poisson[i] + 0.5*tweedie[i]
        
# sub5.to_csv('submissionavg.csv', index = False)