In [0]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

In [2]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive

drive.mount('/content/drive')

# Project root on Drive; data/, models/ and submissions/ paths are built from it.
DIRECTORY = "/content/drive/My Drive/Colab Notebooks/lsda/HW1/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


> This notebook aims to push the public LB under 0.50. Certainly, the competition is not yet at its peak and there clearly remains room for improvement.

# Credits

* [First R notebook](https://www.kaggle.com/kailex/m5-forecaster-v2)
* [Python translation](https://www.kaggle.com/kneroma/m5-forecast-v2-python)

# Changes (AUTHOR)
* v5 : try to optimise the LGBM params (see the LGBM params section below for the changes)
* v4 : delete df and X_train before the training step --> allows a larger training sample without memory issues

# Changes (us)
1. Default no magic number (version 7) -> 0.55648

In [0]:
# dtypes used when reading calendar.csv: text columns as pandas categoricals,
# calendar keys as int16 and the SNAP flags as float32.
CAL_DTYPES = {
    **{name: "category" for name in ("event_name_1", "event_name_2",
                                     "event_type_1", "event_type_2", "weekday")},
    **{name: "int16" for name in ("wm_yr_wk", "wday", "month", "year")},
    **{name: "float32" for name in ("snap_CA", "snap_TX", "snap_WI")},
}
# dtypes used when reading sell_prices.csv.
PRICE_DTYPES = dict(
    store_id="category",
    item_id="category",
    wm_yr_wk="int16",
    sell_price="float32",
)

In [0]:
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)

In [5]:
# Forecast horizon, in days.
h = 28
# Days of history sliced in front of each forecast day (see the prediction loop).
max_lags = 57
# Index of the last d_<n> column read from the sales file.
tr_last = 1913 + h  # CHANGED
# First calendar date to forecast.
fday = datetime(2016, 4, 25) + timedelta(days=h)  # CHANGED
fday

datetime.datetime(2016, 5, 23, 0, 0)

In [0]:
def _categories_to_codes(frame, columns):
    """Replace each categorical column in `columns` by int16 category codes, in place.

    The codes are shifted so that the smallest code in the column becomes 0.
    """
    for col in columns:
        frame[col] = frame[col].cat.codes.astype("int16")
        frame[col] -= frame[col].min()


def create_dt(is_train = True, nrows = None, first_day = 1200, horizon = 28):
    """Load sales, calendar and prices and return them merged in long format.

    Parameters
    ----------
    is_train : bool
        True  -> load day columns from max(1, first_day) up to tr_last.
        False -> load at least the last max_lags days before tr_last and append
                 `horizon` empty (NaN) future day columns to be filled with predictions.
    nrows : int or None
        Forwarded to pd.read_csv for the sales file (number of series to read).
    first_day : int
        Lower bound for the first d_<n> column that is loaded.
    horizon : int
        Number of future day columns appended when is_train is False
        (defaults to 28, the value that was previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        One row per (series id, day) with the melted "sales" value plus the
        calendar and price columns. Both merges use pandas' default join, so rows
        without a matching calendar day or price are not kept.

    Reads the module-level DIRECTORY, PRICE_DTYPES, CAL_DTYPES, tr_last and max_lags.
    """
    prices = pd.read_csv(DIRECTORY + "data/sell_prices.csv", dtype = PRICE_DTYPES)
    _categories_to_codes(prices, [col for col, col_dtype in PRICE_DTYPES.items() if col_dtype == "category"])

    cal = pd.read_csv(DIRECTORY + "data/calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    _categories_to_codes(cal, [col for col, col_dtype in CAL_DTYPES.items() if col_dtype == "category"])

    # For prediction only the recent history is needed, unless first_day is even later.
    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols}
    # "id" stays a plain string column; every other key column is read as a categorical.
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv(DIRECTORY + "data/sales_train_evaluation.csv",
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)

    _categories_to_codes(dt, [col for col in catcols if col != "id"])

    if not is_train:
        # Placeholder columns for the days to forecast; filled in by the prediction loop.
        for day in range(tr_last+1, tr_last + horizon + 1):
            dt[f"d_{day}"] = np.nan

    # Wide (one column per day) -> long (one row per series and day).
    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

In [0]:
def create_fea(dt):
    """Add lag, rolling-mean and calendar features to `dt` in place.

    `dt` must contain the columns "id", "sales" and "date" (datetime). Rows are
    expected to be in chronological order within each id, because lags are taken
    with a positional shift per id.

    Columns added:
      * lag_7, lag_28               - sales shifted by 7 / 28 rows within each id
      * rmean_<lag>_<win>           - rolling mean over `win` rows of lag_<lag>,
                                      for lag in (7, 28) and win in (7, 28)
      * wday, week, month, quarter, year, mday as int16. A column that already
        exists is only cast to int16; a missing one is derived from "date".

    Returns None; the frame is modified in place.
    """
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id","sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    # feature name -> attribute of the .dt accessor used to derive it from "date"
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
#         "ime": "is_month_end",
#         "ims": "is_month_start",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        elif date_feat_func == "weekofyear" and hasattr(dt["date"].dt, "isocalendar"):
            # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
            # isocalendar().week yields the same ISO week number.
            dt[date_feat_name] = dt["date"].dt.isocalendar().week.astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [0]:
# First day index (the N in "d_N") loaded into the training frame; passed to create_dt as first_day.
FIRST_DAY = 350 # Set it to 1 to load the full history --> high risk of running out of memory!

In [9]:
# Build the long-format training frame, starting at day FIRST_DAY, and show its size.
df = create_dt(is_train=True, first_day= FIRST_DAY)
df.shape

(41571939, 22)

In [10]:
# Preview the merged training frame (pandas truncates the display of large frames).
df

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,3.97
1,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,4.34
2,HOBBIES_1_005_CA_1_evaluation,4,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,2.48
3,HOBBIES_1_008_CA_1_evaluation,7,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,0.50
4,HOBBIES_1_009_CA_1_evaluation,8,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,1.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41571934,FOODS_3_825_WI_3_evaluation,3046,6,9,2,2,d_1941,2.0,2016-05-22,11617,3,2,5,2016,0,0,0,0,0.0,0.0,0.0,3.98
41571935,FOODS_3_826_WI_3_evaluation,3047,6,9,2,2,d_1940,1.0,2016-05-21,11617,2,1,5,2016,0,0,0,0,0.0,0.0,0.0,1.28
41571936,FOODS_3_826_WI_3_evaluation,3047,6,9,2,2,d_1941,0.0,2016-05-22,11617,3,2,5,2016,0,0,0,0,0.0,0.0,0.0,1.28
41571937,FOODS_3_827_WI_3_evaluation,3048,6,9,2,2,d_1940,5.0,2016-05-21,11617,2,1,5,2016,0,0,0,0,0.0,0.0,0.0,1.00


In [11]:
# Inspect column dtypes of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41571939 entries, 0 to 41571938
Data columns (total 22 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            object        
 1   item_id       int16         
 2   dept_id       int16         
 3   store_id      int16         
 4   cat_id        int16         
 5   state_id      int16         
 6   d             object        
 7   sales         float32       
 8   date          datetime64[ns]
 9   wm_yr_wk      int16         
 10  weekday       int16         
 11  wday          int16         
 12  month         int16         
 13  year          int16         
 14  event_name_1  int16         
 15  event_type_1  int16         
 16  event_name_2  int16         
 17  event_type_2  int16         
 18  snap_CA       float32       
 19  snap_TX       float32       
 20  snap_WI       float32       
 21  sell_price    float32       
dtypes: datetime64[ns](1), float32(5), int16(14), object(2)
memory us

In [12]:
# Drop every row that contains a missing value, then show the resulting size.
# NOTE(review): create_fea(df) is not called on the training frame in the cells shown,
# so df has no lag/rolling columns at this point and the recorded shape is the same as
# before this call — confirm that training without those features is intended.
df.dropna(inplace = True)
df.shape

(41571939, 22)

In [13]:
# Preview the frame again after dropna.
df

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,3.97
1,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,4.34
2,HOBBIES_1_005_CA_1_evaluation,4,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,2.48
3,HOBBIES_1_008_CA_1_evaluation,7,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,0.50
4,HOBBIES_1_009_CA_1_evaluation,8,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,1.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41571934,FOODS_3_825_WI_3_evaluation,3046,6,9,2,2,d_1941,2.0,2016-05-22,11617,3,2,5,2016,0,0,0,0,0.0,0.0,0.0,3.98
41571935,FOODS_3_826_WI_3_evaluation,3047,6,9,2,2,d_1940,1.0,2016-05-21,11617,2,1,5,2016,0,0,0,0,0.0,0.0,0.0,1.28
41571936,FOODS_3_826_WI_3_evaluation,3047,6,9,2,2,d_1941,0.0,2016-05-22,11617,3,2,5,2016,0,0,0,0,0.0,0.0,0.0,1.28
41571937,FOODS_3_827_WI_3_evaluation,3048,6,9,2,2,d_1940,5.0,2016-05-21,11617,2,1,5,2016,0,0,0,0,0.0,0.0,0.0,1.00


In [14]:
# Re-check column dtypes after dropna.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41571939 entries, 0 to 41571938
Data columns (total 22 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            object        
 1   item_id       int16         
 2   dept_id       int16         
 3   store_id      int16         
 4   cat_id        int16         
 5   state_id      int16         
 6   d             object        
 7   sales         float32       
 8   date          datetime64[ns]
 9   wm_yr_wk      int16         
 10  weekday       int16         
 11  wday          int16         
 12  month         int16         
 13  year          int16         
 14  event_name_1  int16         
 15  event_type_1  int16         
 16  event_name_2  int16         
 17  event_type_2  int16         
 18  snap_CA       float32       
 19  snap_TX       float32       
 20  snap_WI       float32       
 21  sell_price    float32       
dtypes: datetime64[ns](1), float32(5), int16(14), object(2)
memory us

In [0]:
# Integer-coded columns handed to LightGBM as categorical features.
cat_feats = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    "event_name_1", "event_name_2", "event_type_1", "event_type_2",
]
# Identifier, date keys and the target itself are kept out of the feature matrix.
useless_cols = ["id", "date", "sales", "d", "wm_yr_wk", "weekday"]
train_cols = df.columns[~df.columns.isin(useless_cols)]
# Feature matrix and target.
X_train, y_train = df[train_cols], df["sales"]

In [0]:
# train_data = lgb.Dataset(X_train, label = y_train, categorical_feature=cat_feats, free_raw_data=False)
# fake_valid_inds = np.random.choice(len(X_train), 1000000, replace = False)
# fake_valid_data = lgb.Dataset(X_train.iloc[fake_valid_inds], label = y_train.iloc[fake_valid_inds],categorical_feature=cat_feats,
#                              free_raw_data=False)   # This is just a subsample of the training set, not a real validation set !

In [17]:
%%time
# Split the rows at random into a 2,000,000-row "fake" validation sample and a training
# set made of all remaining rows, then wrap both as LightGBM datasets.

np.random.seed(42)  # fixed seed so the random split below is repeatable

fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
# Everything that was not sampled for validation is used for training.
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)# This is a random sample of rows, not a time-based hold-out: no time series train-test split is applied here!

CPU times: user 15.9 s, sys: 33.5 ms, total: 15.9 s
Wall time: 15.9 s


In [18]:
# Release the full frame and the index arrays before training to reduce memory use.
# train_cols and cat_feats are kept on purpose: the prediction loop still needs train_cols.
del df, X_train, y_train, fake_valid_inds,train_inds ; gc.collect()

0

In [0]:
# LightGBM training parameters.
# The original literal listed the "metric" key twice ("rmse" and ["rmse"]); Python
# silently keeps the last value, so only the effective entry is kept here.
params = {
    "objective": "poisson",
    "metric": ["rmse"],
    "force_row_wise": True,
    "learning_rate": 0.075,
#     "sub_feature": 0.8,
    "sub_row": 0.75,        # LightGBM alias of bagging_fraction
    "bagging_freq": 1,      # re-sample the rows at every iteration
    "lambda_l2": 0.1,
#     "nthread": 4
    "verbosity": 1,
    "num_iterations": 1200,
    "num_leaves": 128,
    "min_data_in_leaf": 100,
}


# Alternative (tweedie) configuration kept for reference:
# params = {
#                     'boosting_type': 'gbdt',
#                     'objective': 'tweedie',
#                     'tweedie_variance_power': 1.1,
#                     'metric': 'rmse',
#                     'subsample': 0.5,
#                     'subsample_freq': 1,
#                     'learning_rate': 0.03,
#                     'num_leaves': 2**11-1,
#                     'min_data_in_leaf': 2**12-1,
#                     'feature_fraction': 0.5,
#                     'max_bin': 100,
#                     'n_estimators': 2,
#                     'boost_from_average': False,
#                     'verbose': -1,
#                 }

# Tag embedded in the names of the saved model and submission files.
VERSION = "magic_number_eval"

In [20]:
# Train the booster with `params`, reporting the metric on the random validation sample every 20 rounds.
# NOTE(review): the verbose_eval argument was removed from lgb.train in LightGBM 4.0
# (replaced by callbacks=[lgb.log_evaluation(20)]) — confirm against the installed version.
m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=20) 



[20]	valid_0's rmse: 3.27916
[40]	valid_0's rmse: 3.01399
[60]	valid_0's rmse: 2.92826
[80]	valid_0's rmse: 2.8832
[100]	valid_0's rmse: 2.83933
[120]	valid_0's rmse: 2.8133
[140]	valid_0's rmse: 2.77712
[160]	valid_0's rmse: 2.74773
[180]	valid_0's rmse: 2.73301
[200]	valid_0's rmse: 2.7088
[220]	valid_0's rmse: 2.68784
[240]	valid_0's rmse: 2.67171
[260]	valid_0's rmse: 2.65477
[280]	valid_0's rmse: 2.6448
[300]	valid_0's rmse: 2.63229
[320]	valid_0's rmse: 2.62001
[340]	valid_0's rmse: 2.60503
[360]	valid_0's rmse: 2.59386
[380]	valid_0's rmse: 2.57924
[400]	valid_0's rmse: 2.57082
[420]	valid_0's rmse: 2.56097
[440]	valid_0's rmse: 2.55786
[460]	valid_0's rmse: 2.55156
[480]	valid_0's rmse: 2.54358
[500]	valid_0's rmse: 2.5381
[520]	valid_0's rmse: 2.53023
[540]	valid_0's rmse: 2.52291
[560]	valid_0's rmse: 2.51687
[580]	valid_0's rmse: 2.51322
[600]	valid_0's rmse: 2.50876
[620]	valid_0's rmse: 2.50467
[640]	valid_0's rmse: 2.50311
[660]	valid_0's rmse: 2.50097
[680]	valid_0's rms

In [21]:
# Persist the trained booster under models/ on Drive; the file name embeds VERSION.
m_lgb.save_model(DIRECTORY + "models/model_v" + str(VERSION) + ".lgb")

<lightgbm.basic.Booster at 0x7f3fab1fa160>

In [22]:
# Multipliers applied to the raw model output (the "magic multiplier" trick). One complete
# day-by-day forecast is produced per alpha and the runs are averaged with equal weights.
alphas = [1.028, 1.023, 1.018]#changed
weights = [1/len(alphas)]*len(alphas)
sub = 0.
# Submission column names F1..F<h>, one per forecast day.
cols = [f"F{i}" for i in range(1, h + 1)]

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    # Rebuilt on every pass because te["sales"] is overwritten with predictions below.
    te = create_dt(False)

    for tdelta in range(0, h):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        # Slice the last max_lags days up to `day`; create_fea computes its features on this window.
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day , train_cols]
        # Prediction step: the scaled forecast is written back into te, so the windows
        # sliced for later days contain it.
        te.loc[te.date == day, "sales"] = alpha*m_lgb.predict(tst) # magic multiplier by kyakovlev

    # Long -> wide: one row per id with columns F1..F<h>, ordered by id.
    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    #te_sub.to_csv(f"submission_{icount}.csv",index=False)
    # Accumulate the weighted average; rows line up because every te_sub is sorted by id
    # and re-indexed from 0.
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


sub2 = sub.copy()
# regex=True is passed explicitly: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would never match "validation$".
# NOTE(review): the ids read from sales_train_evaluation.csv already end in "_evaluation"
# (the sanity-check cell further down reports 0 ids matching "validation$" and 60980 rows
# for 30490 distinct ids), so this replace changes nothing and the concat writes every id
# twice — confirm whether that is intended.
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv(DIRECTORY + "submissions/submission_v" + str(VERSION) + ".csv",index=False)

0 2016-05-23 00:00:00
1 2016-05-24 00:00:00
2 2016-05-25 00:00:00
3 2016-05-26 00:00:00
4 2016-05-27 00:00:00
5 2016-05-28 00:00:00
6 2016-05-29 00:00:00
7 2016-05-30 00:00:00
8 2016-05-31 00:00:00
9 2016-06-01 00:00:00
10 2016-06-02 00:00:00
11 2016-06-03 00:00:00
12 2016-06-04 00:00:00
13 2016-06-05 00:00:00
14 2016-06-06 00:00:00
15 2016-06-07 00:00:00
16 2016-06-08 00:00:00
17 2016-06-09 00:00:00
18 2016-06-10 00:00:00
19 2016-06-11 00:00:00
20 2016-06-12 00:00:00
21 2016-06-13 00:00:00
22 2016-06-14 00:00:00
23 2016-06-15 00:00:00
24 2016-06-16 00:00:00
25 2016-06-17 00:00:00
26 2016-06-18 00:00:00
27 2016-06-19 00:00:00
0 1.028 0.3333333333333333
0 2016-05-23 00:00:00
1 2016-05-24 00:00:00
2 2016-05-25 00:00:00
3 2016-05-26 00:00:00
4 2016-05-27 00:00:00
5 2016-05-28 00:00:00
6 2016-05-29 00:00:00
7 2016-05-30 00:00:00
8 2016-05-31 00:00:00
9 2016-06-01 00:00:00
10 2016-06-02 00:00:00
11 2016-06-03 00:00:00
12 2016-06-04 00:00:00
13 2016-06-05 00:00:00
14 2016-06-06 00:00:00
15 2

In [0]:
# Evaluation-period forecasts: the first 30490 rows of `sub`.
# (Named eval_rows rather than `eval` so the Python builtin eval is not shadowed.)
eval_rows = sub.iloc[:30490]
# Validation-period forecasts come from an earlier submission file.
# NOTE(review): assumes the first 30490 rows of submission_v4.csv are the validation rows — confirm.
valid = pd.read_csv(DIRECTORY + "submissions/submission_v4.csv")
valid = valid.iloc[:30490]
# Stack validation rows on top of evaluation rows and write the combined submission.
magic_submssion_both = pd.concat([valid, eval_rows], axis=0, sort=False)
magic_submssion_both.to_csv(DIRECTORY + "submissions/submission_v" + "magic_submssion_both3" + ".csv",index=False)

In [53]:
# Preview the top of the combined submission.
# NOTE(review): this slice takes 30491 rows, one more than the 30490 rows taken from
# `valid` in the previous cell — confirm whether [:30490] was meant.
magic_submssion_both.iloc[:30491]

F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.854313,0.814471,0.805627,0.780377,1.070533,1.198954,1.184004,0.951208,0.925478,0.916661,0.925250,1.059795,1.303851,1.099792,0.953310,0.897313,0.916248,0.907666,1.050847,1.317036,1.226898,0.924066,0.849146,0.830094,0.843233,1.024028,1.269667,1.244264
1,FOODS_1_001_CA_2_validation,0.948229,1.042148,0.951847,1.080279,1.204486,1.257308,1.409113,0.949901,0.944817,0.916222,0.914493,1.117217,1.434148,1.157543,1.024767,1.000518,1.010625,1.025981,1.220921,1.573305,1.317167,1.031072,0.968689,0.967473,1.024615,1.231622,1.566847,1.345562
2,FOODS_1_001_CA_3_validation,1.076222,1.030540,0.926158,0.911666,0.991928,1.196637,1.187551,1.052254,1.073073,0.932195,0.959263,1.032812,1.396085,1.251486,1.078489,1.058220,0.970175,1.012115,1.042555,1.515576,1.443205,1.079198,1.025432,0.932216,0.923486,1.016119,1.319582,1.217043
3,FOODS_1_001_CA_4_validation,0.391966,0.362796,0.364131,0.368898,0.443854,0.441799,0.517706,0.406772,0.423831,0.405297,0.389249,0.395015,0.431004,0.378880,0.364333,0.373084,0.384750,0.402299,0.436298,0.467810,0.498522,0.379515,0.368319,0.376223,0.381262,0.437772,0.465171,0.487544
4,FOODS_1_001_TX_1_validation,0.181781,0.180437,0.191644,0.193718,0.167594,0.178529,0.224174,0.490582,0.449993,0.432530,0.441261,0.475322,0.509656,0.409111,0.407222,0.401526,0.353430,0.344968,0.386384,0.386349,0.365303,0.298422,0.288390,0.277458,0.276637,0.289957,0.344379,0.322675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30486,HOUSEHOLD_2_516_TX_3_validation,0.169858,0.160619,0.168124,0.158487,0.187403,0.210085,0.166691,0.128192,0.123370,0.114832,0.123424,0.142975,0.174671,0.138567,0.133132,0.123239,0.130042,0.132795,0.155133,0.178387,0.172457,0.145150,0.136855,0.136275,0.142032,0.163697,0.187260,0.180530
30487,HOUSEHOLD_2_516_WI_1_validation,0.091766,0.088918,0.089304,0.092001,0.102377,0.109160,0.096984,0.092616,0.089521,0.087444,0.096959,0.121730,0.142786,0.104451,0.096125,0.089439,0.091441,0.095213,0.124595,0.139350,0.134034,0.098086,0.093877,0.094359,0.098273,0.127460,0.142491,0.134527
30488,HOUSEHOLD_2_516_WI_2_validation,0.046579,0.045611,0.043815,0.094880,0.108058,0.110906,0.103965,0.095354,0.091514,0.084924,0.110171,0.122999,0.121233,0.095542,0.105289,0.097431,0.102118,0.105566,0.126751,0.132693,0.124450,0.108200,0.102050,0.100567,0.091422,0.108892,0.112999,0.106608
30489,HOUSEHOLD_2_516_WI_3_validation,0.059618,0.058095,0.055458,0.057433,0.070422,0.064527,0.065155,0.104833,0.100583,0.095054,0.095151,0.118375,0.127516,0.099467,0.115386,0.108305,0.108320,0.112650,0.141197,0.151200,0.143210,0.118592,0.111566,0.114135,0.122210,0.152812,0.160938,0.152241


In [55]:
# Reload a copy of the combined submission from Drive for inspection
# (this file name carries a " (1)" suffix, unlike the one written above).
magic = pd.read_csv(DIRECTORY + "submissions/submission_vmagic_submssion_both3 (1).csv")
magic


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.854313,0.814471,0.805627,0.780377,1.070533,1.198954,1.184004,0.951208,0.925478,0.916661,0.925250,1.059795,1.303851,1.099792,0.953310,0.897313,0.916248,0.907666,1.050847,1.317036,1.226898,0.924066,0.849146,0.830094,0.843233,1.024028,1.269667,1.244264
1,FOODS_1_001_CA_2_validation,0.948229,1.042148,0.951847,1.080279,1.204486,1.257308,1.409113,0.949901,0.944817,0.916222,0.914493,1.117217,1.434148,1.157543,1.024767,1.000518,1.010625,1.025981,1.220921,1.573305,1.317167,1.031072,0.968689,0.967473,1.024615,1.231622,1.566847,1.345562
2,FOODS_1_001_CA_3_validation,1.076222,1.030540,0.926158,0.911666,0.991928,1.196637,1.187551,1.052254,1.073073,0.932195,0.959263,1.032812,1.396085,1.251486,1.078489,1.058220,0.970175,1.012115,1.042555,1.515576,1.443205,1.079198,1.025432,0.932216,0.923486,1.016119,1.319582,1.217043
3,FOODS_1_001_CA_4_validation,0.391966,0.362796,0.364131,0.368898,0.443854,0.441799,0.517706,0.406772,0.423831,0.405297,0.389249,0.395015,0.431004,0.378880,0.364333,0.373084,0.384750,0.402299,0.436298,0.467810,0.498522,0.379515,0.368319,0.376223,0.381262,0.437772,0.465171,0.487544
4,FOODS_1_001_TX_1_validation,0.181781,0.180437,0.191644,0.193718,0.167594,0.178529,0.224174,0.490582,0.449993,0.432530,0.441261,0.475322,0.509656,0.409111,0.407222,0.401526,0.353430,0.344968,0.386384,0.386349,0.365303,0.298422,0.288390,0.277458,0.276637,0.289957,0.344379,0.322675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,HOUSEHOLD_2_516_TX_2_evaluation,0.208441,0.196869,0.198319,0.202518,0.253482,0.328260,0.312636,0.227683,0.196869,0.203677,0.202631,0.258101,0.329738,0.318351,0.213488,0.199988,0.203786,0.207304,0.256685,0.330662,0.316288,0.209118,0.207615,0.201525,0.210066,0.262879,0.338830,0.294116
60976,HOUSEHOLD_2_516_TX_3_evaluation,0.135977,0.129519,0.131561,0.133062,0.160017,0.203319,0.195958,0.146048,0.129519,0.136065,0.129703,0.165196,0.205222,0.199589,0.140223,0.131171,0.136018,0.137139,0.163818,0.206728,0.199748,0.138188,0.136818,0.135505,0.138221,0.166954,0.210298,0.178030
60977,HOUSEHOLD_2_516_WI_1_evaluation,0.092529,0.093530,0.094395,0.098342,0.130710,0.165889,0.132538,0.100096,0.093530,0.097804,0.091511,0.135429,0.166405,0.137568,0.097325,0.093524,0.098902,0.101694,0.132304,0.172732,0.139542,0.095676,0.102456,0.100215,0.101333,0.135223,0.170036,0.120759
60978,HOUSEHOLD_2_516_WI_2_evaluation,0.073243,0.077421,0.078137,0.079719,0.096864,0.112834,0.092397,0.080467,0.077421,0.083274,0.073973,0.096332,0.117813,0.094900,0.075840,0.081160,0.080357,0.080588,0.101033,0.114585,0.095779,0.076999,0.079378,0.080788,0.081641,0.099236,0.114948,0.083920


In [56]:
# Load an earlier submission for comparison.
# NOTE(review): presumably the version-7 run without magic multipliers, judging by the names — verify.
no_magic = pd.read_csv(DIRECTORY + "submissions/submissionv7 (1).csv")
no_magic

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.939726,0.850084,0.820145,0.768457,1.035075,1.216329,1.138530,0.960439,0.951691,0.937198,0.913792,1.038766,1.283936,1.167014,0.958759,0.889473,0.953201,0.956776,1.080745,1.394617,1.281298,0.945312,0.819376,0.801762,0.831189,0.980068,1.235459,1.196918
1,FOODS_1_001_CA_2_validation,1.065755,1.075433,0.974453,1.009992,1.114876,1.278345,1.367633,0.892753,0.942971,0.909506,0.894733,1.100992,1.462416,1.253041,0.966029,0.917625,0.953466,0.974164,1.150974,1.505388,1.488653,1.025413,0.967871,0.963037,1.038513,1.203212,1.671207,1.378352
2,FOODS_1_001_CA_3_validation,1.161203,1.109648,0.989184,0.937442,1.058255,1.295776,1.506602,1.084531,1.104753,0.984926,1.103826,1.076847,1.405898,1.332807,1.137419,1.068058,1.024964,1.053812,1.126057,1.602989,1.701825,1.124139,1.048320,0.962478,0.946072,1.031898,1.334200,1.322655
3,FOODS_1_001_CA_4_validation,0.403851,0.367348,0.348875,0.342480,0.433861,0.436922,0.532028,0.396154,0.411024,0.396980,0.373810,0.400469,0.437936,0.395224,0.363869,0.368507,0.385564,0.387010,0.439794,0.471519,0.471509,0.372813,0.351480,0.362181,0.367240,0.419002,0.448769,0.470088
4,FOODS_1_001_TX_1_validation,0.191279,0.186788,0.188098,0.190280,0.179746,0.169661,0.221444,0.472035,0.455647,0.445950,0.448696,0.505242,0.492031,0.402419,0.390688,0.428201,0.357010,0.349191,0.402437,0.380240,0.373323,0.290769,0.278141,0.264831,0.266419,0.295989,0.327746,0.318174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,HOUSEHOLD_2_516_TX_2_evaluation,0.230797,0.213216,0.215462,0.215581,0.300500,0.366120,0.340956,0.236703,0.226624,0.225882,0.250956,0.325068,0.336318,0.302493,0.238211,0.229805,0.222968,0.231825,0.269434,0.338886,0.338241,0.240876,0.225619,0.235151,0.225221,0.273217,0.343932,0.329755
60976,HOUSEHOLD_2_516_TX_3_evaluation,0.155437,0.143145,0.148176,0.141931,0.175548,0.196728,0.154953,0.122863,0.115530,0.106284,0.118616,0.141143,0.168262,0.137309,0.128159,0.119509,0.118671,0.126850,0.154556,0.168371,0.163848,0.138865,0.130368,0.131446,0.140827,0.169050,0.187608,0.181662
60977,HOUSEHOLD_2_516_WI_1_evaluation,0.085963,0.079347,0.079054,0.084168,0.098020,0.112173,0.100139,0.085737,0.082647,0.080829,0.087441,0.114473,0.138584,0.106104,0.087402,0.081365,0.082533,0.086164,0.114295,0.129911,0.124442,0.087081,0.089525,0.090111,0.093919,0.121796,0.138237,0.134329
60978,HOUSEHOLD_2_516_WI_2_evaluation,0.039023,0.037167,0.036018,0.078877,0.101052,0.100860,0.098599,0.088681,0.082962,0.078292,0.094108,0.120053,0.116401,0.090873,0.095308,0.088400,0.084678,0.088911,0.119047,0.116973,0.110745,0.094210,0.086741,0.087550,0.085274,0.107177,0.108939,0.108637


In [58]:
# Load another saved submission for side-by-side inspection.
# NOTE(review): "364" in the name presumably refers to a 364-day lag/window variant — confirm.
model_364 = pd.read_csv(DIRECTORY + "submissions/submission_vvalidation_364kaggle.csv")
model_364

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.843833,0.802040,0.793556,0.801595,1.004529,1.179631,1.102649,0.936743,0.925613,0.896021,0.877344,0.985851,1.240734,1.058779,0.889607,0.808945,0.805237,0.820636,0.930280,1.225136,1.089149,0.864598,0.777519,0.772946,0.760629,0.900748,1.139330,1.151380
1,FOODS_1_001_CA_2_validation,0.995239,1.046045,0.963179,1.124182,1.262426,1.372729,1.478084,0.872548,0.945528,0.907317,0.898176,1.019746,1.390103,1.251277,0.975033,0.950194,1.050849,0.984804,1.165706,1.546107,1.602401,0.990499,0.922950,0.931412,0.802665,0.887474,1.379222,1.482774
2,FOODS_1_001_CA_3_validation,0.971854,0.930272,0.811160,0.827058,0.923581,1.158174,1.162923,0.967649,1.036144,0.876539,0.928624,1.021109,1.502222,1.368150,1.021456,1.002789,0.956370,1.006532,1.088499,1.412352,1.519428,1.027419,0.982478,0.823215,0.865855,0.977729,1.233483,1.282721
3,FOODS_1_001_CA_4_validation,0.403613,0.344519,0.343573,0.345222,0.369192,0.422671,0.432421,0.330165,0.396849,0.359125,0.327736,0.366866,0.436794,0.346740,0.327817,0.323158,0.335873,0.337749,0.357502,0.442832,0.392858,0.343169,0.367807,0.366825,0.380499,0.420793,0.521612,0.395255
4,FOODS_1_001_TX_1_validation,0.204047,0.199068,0.192254,0.199363,0.181528,0.209848,0.213428,0.337218,0.384235,0.364905,0.355843,0.401530,0.469052,0.420271,0.404720,0.396197,0.408079,0.401723,0.435913,0.474641,0.422201,0.334558,0.334235,0.337848,0.358716,0.422273,0.559917,0.532455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,HOUSEHOLD_2_516_TX_2_evaluation,0.274729,0.266292,0.249143,0.250334,0.308790,0.390951,0.373615,0.278487,0.265797,0.270317,0.320636,0.319500,0.406353,0.317862,0.255200,0.251535,0.247623,0.253157,0.286518,0.377477,0.341254,0.260272,0.251089,0.254259,0.260804,0.301537,0.383866,0.350116
60976,HOUSEHOLD_2_516_TX_3_evaluation,0.159418,0.156176,0.166303,0.160680,0.182495,0.218347,0.200884,0.150350,0.148349,0.145022,0.160433,0.165351,0.242239,0.185326,0.154613,0.148994,0.146705,0.147932,0.171515,0.232410,0.207438,0.155282,0.159101,0.161178,0.166970,0.195895,0.248846,0.224749
60977,HOUSEHOLD_2_516_WI_1_evaluation,0.092531,0.088851,0.089820,0.091012,0.111189,0.114127,0.098622,0.083525,0.080554,0.081761,0.091200,0.111972,0.121956,0.091231,0.075699,0.073005,0.075166,0.077203,0.107497,0.105593,0.100946,0.076101,0.074197,0.083997,0.086393,0.107712,0.110507,0.107603
60978,HOUSEHOLD_2_516_WI_2_evaluation,0.045906,0.047634,0.048806,0.076833,0.100673,0.103568,0.094833,0.075813,0.078945,0.077474,0.079443,0.091231,0.100899,0.079793,0.065469,0.068071,0.068850,0.068706,0.087020,0.092517,0.088216,0.066168,0.067475,0.069923,0.069422,0.088443,0.100364,0.098118


In [0]:

# te = create_dt(False)
# cols = [f"F{i}" for i in range(1,29)]

# for tdelta in range(0, 28):
#     day = fday + timedelta(days=tdelta)
#     print(tdelta, day)
#     tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
#     create_fea(tst)
#     tst = tst.loc[tst.date == day , train_cols]
#     te.loc[te.date == day, "sales"] = m_lgb.predict(tst)



# te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
# te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
# te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
# te_sub.fillna(0., inplace = True)
# te_sub.sort_values("id", inplace = True)
# te_sub.reset_index(drop=True, inplace = True)



#te_sub.to_csv("submission_v" + str(VERSION) + ".csv",index=False)

In [0]:
# te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
# te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
# te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
# te_sub.fillna(0., inplace = True)
# te_sub.sort_values("id", inplace = True)
# te_sub.reset_index(drop=True, inplace = True)


In [0]:
# VERSION = 7
# sub = te_sub
# sub2 = sub.copy()
# sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
# sub = pd.concat([sub, sub2], axis=0, sort=False)
# sub.to_csv(DIRECTORY + "submissions/submission_v" + str(VERSION) + ".csv",index=False)

In [0]:

#te_sub.to_csv(DIRECTORY + "submissions/submission_v" + str(VERSION) + ".csv",index=False)

In [27]:
# Sanity check on the submission ids: number of distinct ids, and how many ids end in "validation".
sub.id.nunique(), sub["id"].str.contains("validation$").sum()

(30490, 0)

In [28]:
# Rows and columns of the submission frame (compare the row count with the distinct-id count above).
sub.shape

(60980, 29)

In [0]:
# !wc -l submission_0.csv
# !head submission_0.csv

In [30]:
# !wc -l /content/drive/My\ Drive/Colab\ Notebooks/lsda/HW1/submissions/submission_v3.csv
# !head /content/drive/My\ Drive/Colab\ Notebooks/lsda/HW1/submissions/submission_v3.csv
# List the submission files currently stored on Drive.
!ls /content/drive/My\ Drive/Colab\ Notebooks/lsda/HW1/submissions/

submission_v1500days_split.csv		    submissionv8.csv
submission_v1500days_split_streamlined.csv  submission_vBOOSTER_model_v7.csv
submission_v3.csv			    submission_vd1_lag364_win364.csv
submission_v4.csv			    submission_vmagic_number_eval.csv
submission_v6.csv			    submission_vwinlag_182.csv
submissionv7.csv


In [0]:
# x = pd.read_csv("/content/drive/My Drive/Colab Notebooks/lsda/HW1/submissions/submission_v6.csv")
# y = pd.read_csv("/content/drive/My Drive/Colab Notebooks/lsda/HW1/submissions/submissionv7.csv")
# x.iloc[:,1:]-y.iloc[:,1:]