In [0]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [0]:
# Root folder (on the mounted Google Drive) that the notebook reads from and
# writes to; later cells append "data/", "models/" and "submissions/" to it.
DIRECTORY = "/content/drive/My Drive/Colab Notebooks/lsda/HW1/" #Mahdi

# Colab-only: mount Google Drive so that DIRECTORY is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# dtypes passed to pd.read_csv for calendar.csv, grouped by target dtype.
# Columns typed "category" are later replaced by their integer codes.
CAL_DTYPES = {
    **dict.fromkeys(
        ["event_name_1", "event_name_2", "event_type_1", "event_type_2", "weekday"],
        "category",
    ),
    **dict.fromkeys(["wm_yr_wk", "wday", "month", "year"], "int16"),
    **dict.fromkeys(["snap_CA", "snap_TX", "snap_WI"], "float32"),
}

# dtypes passed to pd.read_csv for sell_prices.csv.
PRICE_DTYPES = {
    **dict.fromkeys(["store_id", "item_id"], "category"),
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}

In [0]:
# Let pandas show up to 50 columns before truncating a displayed frame.
pd.options.display.max_columns = 50

In [0]:
# --- Run configuration -------------------------------------------------------
h = 28                      # forecast horizon in days
LAG = [7, 28, 182, 364]     # sales lags (in days) used as features
WINDOW = LAG                # rolling-mean window sizes applied to every lag

# Days of history needed to compute every feature for a single day.
# Derived from LAG/WINDOW so it cannot drift when those lists are edited.
max_lags = max(LAG) + max(WINDOW) + 1 # max_lag + max_window + 1
tr_last = 1913-28 #CHANGE  (last "d_" column used for training)
fday = datetime(2016,4, 25) - timedelta(days=28) #CHANGE  (first forecast date)
FIRST_DAY = 1               # first "d_" column loaded for training
VERSION = "base_model_364" # remove last 28 days

# LightGBM parameters. The original literal listed "metric" twice; Python keeps
# only the last value for a repeated key, so the single entry below carries the
# value that was actually in effect.
params = {
        "objective" : "poisson",
        "metric": ["rmse"],
        "force_row_wise" : True,
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,           # LightGBM alias of bagging_fraction
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
#         "nthread" : 4
    'verbosity': 1,
    'num_iterations' : 1200,
    'num_leaves': 128,
    "min_data_in_leaf": 100,
}

In [0]:
def create_dt(is_train = True, nrows = None, first_day = 1200, tr_last=tr_last):
    """Load prices, calendar and sales and return one long-format frame.

    Reads ``sell_prices.csv``, ``calendar.csv`` and
    ``sales_train_validation.csv`` from ``DIRECTORY + "data/"``. Columns typed
    as "category" are replaced by int16 codes. The daily ``d_*`` sales columns
    are melted into rows (``d``, ``sales``) and inner-merged with the calendar
    (on ``d``) and the prices (on ``store_id``, ``item_id``, ``wm_yr_wk``).

    Parameters
    ----------
    is_train : bool
        If True the earliest candidate start day is 1, otherwise it is
        ``tr_last - max_lags``; the later of that and ``first_day`` is used.
        If False, 28 all-NaN day columns following ``tr_last`` are appended as
        placeholders for the days to be predicted.
    nrows : int or None
        Number of rows read from the sales file (None reads all of them).
    first_day : int
        Lower bound for the first ``d_`` column that is loaded.
    tr_last : int
        Last ``d_`` column that is loaded. Note that the default is the value
        the module-level ``tr_last`` had when this ``def`` was executed.

    Returns
    -------
    pandas.DataFrame
        One row per (id, day) that has a calendar entry and a price.
    """
    prices = pd.read_csv(DIRECTORY + "data/sell_prices.csv", dtype = PRICE_DTYPES)
    # Remember how each categorical price column was encoded (label set and the
    # shift applied to its codes) so the sales frame can reuse the exact same
    # encoding for the columns it is joined on.
    key_encoding = {}
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            labels = prices[col].cat.categories
            codes = prices[col].cat.codes.astype("int16")
            shift = codes.min()
            prices[col] = codes - shift
            key_encoding[col] = (labels, shift)

    cal = pd.read_csv(DIRECTORY + "data/calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal[col] = cal[col].cat.codes.astype("int16")
            # Missing values get code -1; shifting by the minimum makes codes start at 0.
            cal[col] -= cal[col].min()

    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    sales_dtypes = {numcol:"float32" for numcol in numcols}
    sales_dtypes.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv(DIRECTORY + "data/sales_train_validation.csv",
                     nrows = nrows, usecols = catcols + numcols, dtype = sales_dtypes)

    for col in catcols:
        if col == "id":
            continue
        if col in key_encoding:
            # Join key: encode against the label set of the price table. Encoding
            # the sales labels on their own only matches the price codes when both
            # files contain exactly the same labels, which breaks as soon as
            # ``nrows`` restricts the sales file to a subset of items/stores.
            labels, shift = key_encoding[col]
            codes = dt[col].cat.set_categories(labels).cat.codes.astype("int16")
            dt[col] = codes - shift
        else:
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()

    if not is_train:
        # Placeholder columns for the 28 days that will be predicted.
        for day in range(tr_last+1, tr_last+ 28 +1):
            dt[f"d_{day}"] = np.nan

    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

In [0]:
def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    For every lag in ``LAG`` a column ``lag_<lag>`` holds the ``sales`` value
    shifted by that many rows within each ``id``. For every window in
    ``WINDOW`` and every lag, ``rmean_<lag>_<win>`` holds the rolling mean of
    ``lag_<lag>`` over ``win`` rows within each ``id``. Both are row-based, so
    they assume the rows of each ``id`` are consecutive days in date order.

    Calendar features (``wday``, ``week``, ``month``, ``quarter``, ``year``,
    ``mday``) are cast to int16 when the column already exists and are
    otherwise derived from the ``date`` column.

    Parameters
    ----------
    dt : pandas.DataFrame
        Needs the columns ``id``, ``sales`` and ``date`` (datetime).

    Returns
    -------
    None
        ``dt`` is modified in place.
    """
    lags = LAG
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id","sales"]].groupby("id")["sales"].shift(lag)

    for win in WINDOW:
        for lag, lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    # feature name -> attribute of the ``.dt`` accessor it is derived from
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    dates = dt["date"].dt
    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        elif date_feat_func == "weekofyear" and hasattr(dates, "isocalendar"):
            # ``.dt.weekofyear`` was removed in pandas 2.0; ``isocalendar().week``
            # yields the same ISO week number. Older pandas versions without
            # ``isocalendar`` fall through to the attribute lookup below.
            dt[date_feat_name] = dates.isocalendar().week.astype("int16")
        else:
            dt[date_feat_name] = getattr(dates, date_feat_func).astype("int16")

In [0]:
# def create_lag_features_for_test(dt, day):
#     # create lag feaures just for single day (faster)
#     print(dt)
#     lags = [7, 28, 364]
#     lag_cols = [f"lag_{lag}" for lag in lags]
#     for lag, lag_col in zip(lags, lag_cols):
#         print(dt.loc[dt.date == day, lag_col])
#         dt.loc[dt.date == day, lag_col] = dt.loc[dt.date ==day-timedelta(days=lag), 'sales'].values  # !!! main
#         print(dt.loc[dt.date ==day-timedelta(days=lag), 'sales'])

#     windows = [7, 28, 364]
#     for window in windows:
#         for lag in lags:
#             df_window = dt[(dt.date <= day-timedelta(days=lag)) & (dt.date > day-timedelta(days=lag+window))]
#             df_window_grouped = df_window.groupby("id").agg({'sales':'mean'}).reindex(dt.loc[dt.date==day,'id'])
#             dt.loc[dt.date == day,f"rmean_{lag}_{window}"] = df_window_grouped.sales.values  

# def create_date_features_for_test(dt):
#     # copy of the code from `create_dt()` above
#     date_features = {
#         "wday": "weekday",
#         "week": "weekofyear",
#         "month": "month",
#         "quarter": "quarter",
#         "year": "year",
#         "mday": "day",
#     }

#     for date_feat_name, date_feat_func in date_features.items():
#         if date_feat_name in dt.columns:
#             dt[date_feat_name] = dt[date_feat_name].astype("int16")
#         else:
#             dt[date_feat_name] = getattr(
#                 dt["date"].dt, date_feat_func).astype("int16")

In [0]:
# Build the long-format training frame, loading day columns from FIRST_DAY on.
df = create_dt(is_train=True, first_day= FIRST_DAY)

In [0]:
# create_fea adds the feature columns to df in place (it returns nothing);
# the bare `df` then displays the resulting frame.
create_fea(df)
df

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,lag_91,...,rmean_182_7,rmean_364_7,rmean_7_28,rmean_28_28,rmean_91_28,rmean_182_28,rmean_364_28,rmean_7_91,rmean_28_91,rmean_91_91,rmean_182_91,rmean_364_91,rmean_7_182,rmean_28_182,rmean_91_182,rmean_182_182,rmean_364_182,rmean_7_364,rmean_28_364,rmean_91_364,rmean_182_364,rmean_364_364,week,quarter,mday
0,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_1,12.0,2011-01-29,11101,2,1,1,2011,0,0,0,0,0.0,0.0,0.0,0.46,,,,...,,,,,,,,,,,,,,,,,,,,,,,4,1,29
1,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_2,15.0,2011-01-30,11101,3,2,1,2011,0,0,0,0,0.0,0.0,0.0,0.46,,,,...,,,,,,,,,,,,,,,,,,,,,,,4,1,30
2,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_3,0.0,2011-01-31,11101,1,3,1,2011,0,0,0,0,0.0,0.0,0.0,0.46,,,,...,,,,,,,,,,,,,,,,,,,,,,,5,1,31
3,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_4,0.0,2011-02-01,11101,5,4,2,2011,0,0,0,0,1.0,1.0,0.0,0.46,,,,...,,,,,,,,,,,,,,,,,,,,,,,5,1,1
4,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_5,0.0,2011-02-02,11101,6,5,2,2011,0,0,0,0,1.0,0.0,1.0,0.46,,,,...,,,,,,,,,,,,,,,,,,,,,,,5,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45174232,FOODS_3_825_WI_3_validation,3046,6,9,2,2,d_1885,1.0,2016-03-27,11609,3,2,3,2016,5,1,0,0,0.0,0.0,0.0,3.98,0.0,0.0,0.0,...,0.000000,0.857143,1.178571,0.642857,0.785714,0.178571,0.892857,0.846154,0.747253,0.373626,0.384615,1.054945,0.604396,0.467033,0.379121,0.686813,1.043956,0.662088,0.631868,0.700549,0.865385,0.958791,12,1,27
45174233,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1884,2.0,2016-03-26,11609,2,1,3,2016,0,0,0,0,0.0,0.0,0.0,1.28,1.0,0.0,1.0,...,2.000000,0.000000,0.821429,1.000000,1.392857,1.892857,0.142857,1.186813,1.241758,1.670330,1.186813,0.120879,1.445055,1.532967,1.428571,0.593407,0.197802,0.980769,0.928571,0.744505,0.395604,0.423077,12,1,26
45174234,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1885,4.0,2016-03-27,11609,3,2,3,2016,5,1,0,0,0.0,0.0,0.0,1.28,1.0,0.0,2.0,...,2.000000,0.000000,0.857143,0.892857,1.392857,1.821429,0.142857,1.186813,1.219780,1.659341,1.219780,0.120879,1.434066,1.505494,1.439560,0.609890,0.197802,0.983516,0.928571,0.750000,0.403846,0.423077,12,1,27
45174235,FOODS_3_827_WI_3_validation,3048,6,9,2,2,d_1884,0.0,2016-03-26,11609,2,1,3,2016,0,0,0,0,0.0,0.0,0.0,1.00,1.0,4.0,3.0,...,3.571429,1.571429,1.607143,1.607143,1.678571,2.035714,0.821429,1.538462,1.549451,1.714286,1.692308,1.681319,1.697802,1.686813,1.703297,1.901099,1.445055,1.760989,1.700549,1.799451,1.673077,1.640110,12,1,26


In [0]:
# Column dtypes and row count of the feature frame before NaN rows are dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45174237 entries, 0 to 45174236
Data columns (total 55 columns):
 #   Column         Dtype         
---  ------         -----         
 0   id             object        
 1   item_id        int16         
 2   dept_id        int16         
 3   store_id       int16         
 4   cat_id         int16         
 5   state_id       int16         
 6   d              object        
 7   sales          float32       
 8   date           datetime64[ns]
 9   wm_yr_wk       int16         
 10  weekday        int16         
 11  wday           int16         
 12  month          int16         
 13  year           int16         
 14  event_name_1   int16         
 15  event_type_1   int16         
 16  event_name_2   int16         
 17  event_type_2   int16         
 18  snap_CA        float32       
 19  snap_TX        float32       
 20  snap_WI        float32       
 21  sell_price     float32       
 22  lag_7          float32       
 23  lag_2

In [0]:
# Drop, in place, every row that has a NaN in any column, then show the
# remaining (rows, columns) shape.
df.dropna(inplace = True)
df.shape

(23602466, 55)

In [0]:
# Display the frame again.
df

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,lag_91,...,rmean_182_7,rmean_364_7,rmean_7_28,rmean_28_28,rmean_91_28,rmean_182_28,rmean_364_28,rmean_7_91,rmean_28_91,rmean_91_91,rmean_182_91,rmean_364_91,rmean_7_182,rmean_28_182,rmean_91_182,rmean_182_182,rmean_364_182,rmean_7_364,rmean_28_364,rmean_91_364,rmean_182_364,rmean_364_364,week,quarter,mday
12729751,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_728,0.0,2013-01-25,11252,0,7,1,2013,0,0,0,0,0.0,0.0,0.0,0.46,0.0,0.0,0.0,...,4.285714,11.000000,0.000000,0.000000,0.000000,5.000000,9.607142,0.000000,0.000000,0.000000,6.681319,10.296703,0.164835,0.769231,3.340659,6.049450,8.906593,3.236264,3.763736,5.598901,7.478022,6.824176,4,1,25
12729758,HOBBIES_1_009_CA_1_validation,8,0,0,0,0,d_728,0.0,2013-01-25,11252,0,7,1,2013,0,0,0,0,0.0,0.0,0.0,1.77,1.0,1.0,0.0,...,1.571429,0.714286,1.571429,1.750000,2.214286,1.178571,1.821429,1.747253,1.791209,1.736264,1.758242,1.868132,1.675824,1.604396,1.747253,1.692308,1.582418,1.667582,1.697802,1.747253,1.637363,1.821429,4,1,25
12729765,HOBBIES_1_010_CA_1_validation,9,0,0,0,0,d_728,1.0,2013-01-25,11252,0,7,1,2013,0,0,0,0,0.0,0.0,0.0,2.97,0.0,0.0,0.0,...,0.857143,1.000000,0.678571,0.750000,1.071429,1.107143,0.714286,0.747253,0.835165,1.087912,0.901099,0.747253,0.917582,0.967033,0.994505,0.868132,0.824176,0.895604,0.887363,0.892857,0.846154,0.543956,4,1,25
12729779,HOBBIES_1_012_CA_1_validation,11,0,0,0,0,d_728,0.0,2013-01-25,11252,0,7,1,2013,0,0,0,0,0.0,0.0,0.0,6.52,0.0,2.0,0.0,...,0.000000,0.285714,0.607143,0.535714,0.678571,0.000000,0.464286,0.494505,0.527473,0.395604,0.296703,0.626374,0.417582,0.357143,0.346154,0.412088,0.549451,0.420330,0.420330,0.461538,0.480769,0.552198,4,1,25
12729800,HOBBIES_1_015_CA_1_validation,14,0,0,0,0,d_728,0.0,2013-01-25,11252,0,7,1,2013,0,0,0,0,0.0,0.0,0.0,0.72,1.0,1.0,5.0,...,4.571429,7.714286,3.071429,3.928571,3.750000,4.464286,5.821429,4.164835,4.263736,4.274725,4.923077,7.912088,4.252747,4.379121,4.598901,6.032967,7.445055,5.203297,5.310440,6.063187,6.739011,6.722528,4,1,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45174232,FOODS_3_825_WI_3_validation,3046,6,9,2,2,d_1885,1.0,2016-03-27,11609,3,2,3,2016,5,1,0,0,0.0,0.0,0.0,3.98,0.0,0.0,0.0,...,0.000000,0.857143,1.178571,0.642857,0.785714,0.178571,0.892857,0.846154,0.747253,0.373626,0.384615,1.054945,0.604396,0.467033,0.379121,0.686813,1.043956,0.662088,0.631868,0.700549,0.865385,0.958791,12,1,27
45174233,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1884,2.0,2016-03-26,11609,2,1,3,2016,0,0,0,0,0.0,0.0,0.0,1.28,1.0,0.0,1.0,...,2.000000,0.000000,0.821429,1.000000,1.392857,1.892857,0.142857,1.186813,1.241758,1.670330,1.186813,0.120879,1.445055,1.532967,1.428571,0.593407,0.197802,0.980769,0.928571,0.744505,0.395604,0.423077,12,1,26
45174234,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1885,4.0,2016-03-27,11609,3,2,3,2016,5,1,0,0,0.0,0.0,0.0,1.28,1.0,0.0,2.0,...,2.000000,0.000000,0.857143,0.892857,1.392857,1.821429,0.142857,1.186813,1.219780,1.659341,1.219780,0.120879,1.434066,1.505494,1.439560,0.609890,0.197802,0.983516,0.928571,0.750000,0.403846,0.423077,12,1,27
45174235,FOODS_3_827_WI_3_validation,3048,6,9,2,2,d_1884,0.0,2016-03-26,11609,2,1,3,2016,0,0,0,0,0.0,0.0,0.0,1.00,1.0,4.0,3.0,...,3.571429,1.571429,1.607143,1.607143,1.678571,2.035714,0.821429,1.538462,1.549451,1.714286,1.692308,1.681319,1.697802,1.686813,1.703297,1.901099,1.445055,1.760989,1.700549,1.799451,1.673077,1.640110,12,1,26


In [0]:
# Column dtypes and row count of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23602466 entries, 12729751 to 45174236
Data columns (total 55 columns):
 #   Column         Dtype         
---  ------         -----         
 0   id             object        
 1   item_id        int16         
 2   dept_id        int16         
 3   store_id       int16         
 4   cat_id         int16         
 5   state_id       int16         
 6   d              object        
 7   sales          float32       
 8   date           datetime64[ns]
 9   wm_yr_wk       int16         
 10  weekday        int16         
 11  wday           int16         
 12  month          int16         
 13  year           int16         
 14  event_name_1   int16         
 15  event_type_1   int16         
 16  event_name_2   int16         
 17  event_type_2   int16         
 18  snap_CA        float32       
 19  snap_TX        float32       
 20  snap_WI        float32       
 21  sell_price     float32       
 22  lag_7          float32       
 23

In [0]:
# Columns LightGBM should treat as categorical: the identifier codes first,
# then the calendar event codes.
id_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id']
event_feats = ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
cat_feats = id_feats + event_feats

# Columns that are never fed to the model (row identifiers, the target itself
# and calendar columns excluded from the feature set).
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday"]

# Everything that is not excluded is a feature; `sales` is the target.
is_feature = ~df.columns.isin(useless_cols)
train_cols = df.columns[is_feature]
X_train, y_train = df[train_cols], df["sales"]

In [0]:
# Fix NumPy's global RNG so the row sample below is the same on every run.
np.random.seed(42)

# Draw 2,000,000 distinct index labels as a validation sample; every remaining
# label is used for training.
fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
# Wrap both subsets as LightGBM datasets.
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)# This is a random sample, we're not gonna apply any time series train-test-split tricks here!

In [0]:
# Drop the names of the large intermediates (the Dataset objects stay) and run
# the garbage collector; the displayed number is gc.collect()'s return value.
del df, X_train, y_train, fake_valid_inds, train_inds ; gc.collect()

0

In [0]:
%%time
m_lgb = lgb.train(params, train_data, valid_sets=[fake_valid_data], verbose_eval=20) 



In [0]:
#2.30653 DEFAULT NO MAGIC NUMBER

In [0]:
# Persist the trained booster under DIRECTORY/models/, tagged with VERSION.
m_lgb.save_model(DIRECTORY + "models/model_v" + str(VERSION) + ".lgb")

In [0]:
# # # #load model
# m_lgb = lgb.Booster(model_file=DIRECTORY + "models/model_v" + str(VERSION) + ".lgb")

In [0]:
# Plot the feature importances of the trained model (at most 50 features) and
# save the figure next to the model file.
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (18.0, 4)
%matplotlib inline 

# The explicit figsize here overrides the rcParams default set above.
fig, ax = plt.subplots(figsize=(12,8))
lgb.plot_importance(m_lgb, max_num_features=50, height=0.8, ax=ax)
ax.grid(False)
plt.title("LightGBM - Feature Importance: " + str(VERSION), fontsize=15)
plt.savefig(DIRECTORY + "models/model_v" + str(VERSION) + ".png")
#plt.show()
#plt.show()

In [0]:
%%time
te = create_dt(False)
#create_date_features_for_test(te)
cols = [f"F{i}" for i in range(1,29)]


for tdelta in range(0, 28):
    day = fday + timedelta(days=tdelta)
    print(tdelta, day)
    tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
    create_fea(tst) #old, slow
    #create_lag_features_for_test(tst, day)  # faster  optimized
    tst = tst.loc[tst.date == day , train_cols]
    te.loc[te.date == day, "sales"] = m_lgb.predict(tst)



te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
te_sub.fillna(0., inplace = True)
te_sub.sort_values("id", inplace = True)
te_sub.reset_index(drop=True, inplace = True)

In [0]:
# Write the forecast table to DIRECTORY/submissions/, tagged with VERSION.
te_sub.to_csv(DIRECTORY + "submissions/submission_v" + str(VERSION) + ".csv",index=False)

In [0]:
# Display the forecast table.
te_sub

In [0]:
#last_28 = pd.read_csv(DIRECTORY + "data/sales_last28.csv") #BACKUP TRUE DATASET

def create_test():
  """
  Build the hold-out set in submission format.

  Loads days 1913 - 27 through 1913 (the 28 days that follow the last
  training day, 1913 - 28) via create_dt and pivots them into one row
  per id with the columns F1..F28. Missing cells are filled with 0 and
  the rows are sorted by id.
  """
  horizon_cols = [f"F{i}" for i in range(1,29)]

  actuals = create_dt(is_train=True, first_day= 1913-27, tr_last=1913)

  # Number each id's rows 1, 2, ... in their existing order -> "F1", "F2", ...
  day_rank = actuals.groupby("id")["id"].cumcount() + 1
  actuals["F"] = day_rank.map("F{}".format)

  wide = actuals.set_index(["id", "F" ]).unstack()["sales"][horizon_cols].reset_index()
  return wide.fillna(0.).sort_values("id").reset_index(drop=True)

def is_testset_correct(test_set):
  """
  Run only if a check is needed (loads the sales file, so it costs memory).

  Compares a test set in submission format (columns id, F1..F28) with
  days 1913 - 27 .. 1913 of sales_train_validation.csv. Both tables are
  sorted by id before comparing, so the row order of test_set does not
  matter.

  Returns the sum of the absolute cell-wise differences, which is 0 only
  for a perfect match. (A sum of signed differences would let errors of
  opposite sign cancel and report 0 for a wrong test set.)

  Raises ValueError if the two tables do not contain the same ids.
  """
  dcols = [f"d_{i}" for i in range(1913-27, 1914)]
  fcols = [f"F{i}" for i in range(1, 29)]

  # Load only the columns that are compared.
  true_sales = pd.read_csv(DIRECTORY + "data/sales_train_validation.csv",
                           usecols = ["id"] + dcols)
  true_sales = true_sales.sort_values("id")
  test_set = test_set[["id"] + fcols].sort_values("id")

  if test_set["id"].tolist() != true_sales["id"].tolist():
    raise ValueError("test_set ids do not match the ids in sales_train_validation.csv")

  diff = (test_set[fcols].to_numpy(dtype = "float64")
          - true_sales[dcols].to_numpy(dtype = "float64"))
  return np.abs(diff).sum()
 
#print(is_testset_correct(create_test()))


def get_rmse(predictions):
  """
  Given predictions in submission format (id column first, then F1..F28),
  return their RMSE against the hold-out set built by create_test (the
  last 28 days of sales_train_validation.csv in the same format).

  Rows are compared by position, so `predictions` must be ordered like
  the output of create_test. The shapes of both tables are printed.
  """
  # Drop the leading id column of both tables; only the values are compared.
  test_data = create_test().iloc[:,1:]
  print(test_data.shape)
  predictions = predictions.iloc[:,1:]
  print(predictions.shape)
  # scikit-learn deprecated mean_squared_error(squared=False) in 1.4 and
  # removed it in 1.6 in favour of root_mean_squared_error. Prefer the new
  # function and keep the old call only for versions that lack it.
  try:
    from sklearn.metrics import root_mean_squared_error
  except ImportError:
    return mean_squared_error(test_data, predictions, squared=False)
  return root_mean_squared_error(test_data, predictions)
 
# RMSE of the forecast table against the hold-out set built by create_test.
get_rmse(te_sub)
