In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys, gc, time, warnings, pickle, psutil, random
from google.colab import drive
from IPython.core.interactiveshell import InteractiveShell
import plotly.offline
import plotly.express as px
import gc
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 100
InteractiveShell.ast_node_interactivity = "all"

from  datetime import datetime, timedelta
import lightgbm as lgb
from lightgbm.callback import early_stopping

warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive inside the Colab VM at /content/gdrive (Colab-only step).
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Work from the project folder on Drive: later cells use relative paths
# (e.g. DATA, submission_dir) that are resolved against this directory.
os.chdir('/content/gdrive/MyDrive/msba6421_predictive/Amy_group_project/m5-forecasting-accuracy')

In [None]:
# Run a full garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

0

# Training By store_id - cat_id

In [None]:
def reduce_mem_usage(df, verbose=False, allow_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are touched; every other column (object, category, int8, unsigned, ...)
    is left as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    verbose : bool, default False
        When True, print the resulting memory footprint and the reduction.
    allow_float16 : bool, default True
        When True (the historical behaviour) float columns whose values fit
        the float16 range are stored as float16. The decision is made on the
        value *range* only: float16 keeps roughly three significant decimal
        digits, so the downcast can round values. Pass False to use float32
        as the narrowest float dtype.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly [-128, 127] fits int8.
            # An empty column yields NaN for min/max, every comparison is False,
            # and the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # All-NaN or infinite columns match no candidate and fall back to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# --- Paths -------------------------------------------------------------------
# DATA and the *_dir values are relative and resolve against the working directory.
DATA = '1202_data.pkl'
dir_ = '/content/gdrive/MyDrive/msba6421_predictive/Amy_group_project/m5-forecasting-accuracy'
model_dir = 'model/'
log_dir = 'log/'
submission_dir = 'submission/'

# --- Modelling scope ---------------------------------------------------------
STORES = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
CATS = ['HOBBIES', 'HOUSEHOLD', 'FOODS']
TARGET = 'sold'

# Lag offsets, one model is trained per offset: [4, 8, 12, 16, 20, 24, 28]
STEPS = list(np.arange(4, 30, 4))

# --- Day boundaries (values of the 'd' column) -------------------------------
HORIZON = 28
TRAIN_START = 1
TRAIN_END = 1941 - HORIZON        # 1913
FIRST_VAL_DAY = TRAIN_END + 1     # 1914
LAST_VAL_DAY = 1941
FIRST_PRED_DAY = 1941 + 1         # 1942
VAL_START = 1942 - HORIZON        # 1914, same day as FIRST_VAL_DAY
VAL_END = 1941
# Width of the window predicted by each per-step model equals the first step.
VAL_DAYS = TEST_DAYS = STEPS[0]

# --- Data --------------------------------------------------------------------
# NOTE(review): read_pickle executes arbitrary code from the file; only load trusted pickles.
BASE = reduce_mem_usage(pd.read_pickle(DATA))

# Columns that are never fed to the model: identifiers, calendar keys and the target.
remove_feature = [
    'id', 'state_id', 'store_id', 'item_id', 'dept_id', 'cat_id',
    'date', 'wm_yr_wk', 'd', 'sold',
]

# Every column whose name contains one of: lag, rm_, std_, max_, ema_
lags_col = BASE.columns[BASE.columns.str.contains('lag|rm_|std_|max_|ema_')].tolist()

# Categorical candidates minus the removed identifier columns
# (only the event_* columns remain after the set difference).
cat_var = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
cat_var = list(set(cat_var) - set(remove_feature))

In [None]:
# Make training data

def prepare_data(store, state):
    """Return the memory-reduced rows of ``BASE`` for one store / category pair.

    Parameters
    ----------
    store : value compared against ``BASE['store_id']``.
    state : value compared against ``BASE['cat_id']`` -- despite its name this
        is a category id, not a state id.

    Returns
    -------
    pandas.DataFrame
        The matching rows after a pass through ``reduce_mem_usage``.
    """
    in_store = BASE['store_id'] == store
    in_category = BASE['cat_id'] == state
    return reduce_mem_usage(BASE[in_store & in_category])

In [None]:
# Model params
lgb_params = dict(
    # Objective / evaluation
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    # Row subsampling
    subsample=0.5,
    subsample_freq=1,
    # Tree shape and learning rate (255 == 2**8 - 1)
    learning_rate=0.015,
    num_leaves=255,
    min_data_in_leaf=255,
    feature_fraction=0.5,
    max_bin=100,
    # Upper bound on boosting rounds; the training cell adds early stopping
    n_estimators=1000,
    boost_from_average=False,
    # Logging and reproducibility
    verbose=-1,
    seed=1995,
)

## Start training + prediction

In [None]:
# Checkpoint file: the accumulated predictions are rewritten after every trained
# model, so an interrupted session loses at most the model in progress.
pred_path = f'{submission_dir}before_ensemble/me_with_steps_store_cat_preds_temp_3_1.pkl'
os.makedirs(os.path.dirname(pred_path), exist_ok=True)

# Resume from an existing checkpoint when there is one; otherwise start empty
# so the cell also works on a fresh run.
if os.path.exists(pred_path):
    predictions = pd.read_pickle(pred_path)
else:
    predictions = pd.DataFrame(columns=['id', 'd', 'sales'])

for store in STORES:
    for state in CATS:  # NOTE: `state` holds a category id (an element of CATS)
        for step in STEPS:
            print(store, state, f'step={step}', 'start')
            grid_df = prepare_data(store, state)

            # Push every lag / rolling feature back by `step` rows within each id.
            # NOTE(review): assumes rows are ordered by day within each id, so that
            # a row shift equals a day shift -- confirm against how BASE was built.
            grid_df[lags_col] = grid_df.groupby(['id'], observed=False)[lags_col].shift(step)
            model_var = grid_df.columns[~grid_df.columns.isin(remove_feature)]
            # Rows whose shifted features are NaN are kept; no NaN filtering is applied.

            # This model predicts the VAL_DAYS-day window that ends `step` days
            # after the last observed day.
            pred_start = FIRST_PRED_DAY + step - VAL_DAYS
            pred_end = FIRST_PRED_DAY + step - 1

            tr_mask = (grid_df['d'] >= TRAIN_START) & (grid_df['d'] <= TRAIN_END)
            vl_mask = (grid_df['d'] >= VAL_START) & (grid_df['d'] <= VAL_END)
            pr_mask = (grid_df['d'] >= pred_start) & (grid_df['d'] <= pred_end)

            # Single .loc selection per split: avoids materialising a full-width
            # intermediate copy (grid_df[mask]) before picking the columns.
            trainX = grid_df.loc[tr_mask, model_var]
            trainY = grid_df.loc[tr_mask, TARGET]
            valX = grid_df.loc[vl_mask, model_var]
            valY = grid_df.loc[vl_mask, TARGET]
            testX = grid_df.loc[pr_mask, model_var]
            print(f'Train shape: {trainX.shape}. Val shape: {valX.shape}. Test shape: {testX.shape}')

            # Train
            lgbm = lgb.LGBMRegressor(**lgb_params)
            callbacks = [early_stopping(stopping_rounds=50, first_metric_only=False)]

            lgbm.fit(trainX, trainY,
                     eval_set=[(valX, valY)],
                     eval_metric='rmse',
                     callbacks=callbacks)

            # Predict with the best iteration found by early stopping
            yhat = lgbm.predict(testX, num_iteration=lgbm.best_iteration_)
            preds = grid_df.loc[pr_mask, ['id', 'd']].assign(sales=yhat)

            # Append, keeping only the newest forecast for each (id, d) so that
            # re-running or resuming the cell does not duplicate rows.
            if predictions.empty:
                predictions = preds
            else:
                predictions = pd.concat([predictions, preds], axis=0)
            predictions = predictions.drop_duplicates(subset=['id', 'd'], keep='last')
            predictions.to_pickle(pred_path)

            del grid_df, trainX, trainY, valX, valY, testX, lgbm, tr_mask, vl_mask, pr_mask, preds, yhat
            gc.collect()

CA_1 HOBBIES start
Train shape: (872938, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[319]	valid_0's rmse: 2.15749


<function gc.collect(generation=2)>

CA_1 HOBBIES start
Train shape: (872938, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[269]	valid_0's rmse: 2.16038


<function gc.collect(generation=2)>

CA_1 HOBBIES start
Train shape: (872938, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[269]	valid_0's rmse: 2.17088


<function gc.collect(generation=2)>

CA_1 HOBBIES start
Train shape: (872938, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[294]	valid_0's rmse: 2.17158


<function gc.collect(generation=2)>

CA_1 HOBBIES start
Train shape: (872938, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[268]	valid_0's rmse: 2.17596


<function gc.collect(generation=2)>

CA_1 HOBBIES start
Train shape: (872938, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[320]	valid_0's rmse: 2.17482


<function gc.collect(generation=2)>

CA_1 HOBBIES start
Train shape: (872938, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[246]	valid_0's rmse: 2.17676


<function gc.collect(generation=2)>

CA_1 HOUSEHOLD start
Train shape: (1609952, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[230]	valid_0's rmse: 1.23915


<function gc.collect(generation=2)>

CA_1 HOUSEHOLD start
Train shape: (1609952, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[204]	valid_0's rmse: 1.26244


<function gc.collect(generation=2)>

CA_1 HOUSEHOLD start
Train shape: (1609952, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[185]	valid_0's rmse: 1.27598


<function gc.collect(generation=2)>

CA_1 HOUSEHOLD start
Train shape: (1609952, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[167]	valid_0's rmse: 1.28657


<function gc.collect(generation=2)>

CA_1 HOUSEHOLD start
Train shape: (1609952, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[164]	valid_0's rmse: 1.29753


<function gc.collect(generation=2)>

CA_1 HOUSEHOLD start
Train shape: (1609952, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[154]	valid_0's rmse: 1.30578


<function gc.collect(generation=2)>

CA_1 HOUSEHOLD start
Train shape: (1609952, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[154]	valid_0's rmse: 1.31063


<function gc.collect(generation=2)>

CA_1 FOODS start
Train shape: (2220005, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[471]	valid_0's rmse: 2.42175


<function gc.collect(generation=2)>

CA_1 FOODS start
Train shape: (2220005, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[759]	valid_0's rmse: 2.4872


<function gc.collect(generation=2)>

CA_1 FOODS start
Train shape: (2220005, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[651]	valid_0's rmse: 2.56847


<function gc.collect(generation=2)>

CA_1 FOODS start
Train shape: (2220005, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[690]	valid_0's rmse: 2.62898


<function gc.collect(generation=2)>

CA_1 FOODS start
Train shape: (2220005, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[669]	valid_0's rmse: 2.66602


<function gc.collect(generation=2)>

CA_1 FOODS start
Train shape: (2220005, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[463]	valid_0's rmse: 2.69141


<function gc.collect(generation=2)>

CA_1 FOODS start
Train shape: (2220005, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[464]	valid_0's rmse: 2.70392


<function gc.collect(generation=2)>

CA_2 HOBBIES start
Train shape: (870012, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[360]	valid_0's rmse: 1.69006


<function gc.collect(generation=2)>

CA_2 HOBBIES start
Train shape: (870012, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[451]	valid_0's rmse: 1.69729


<function gc.collect(generation=2)>

CA_2 HOBBIES start
Train shape: (870012, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[435]	valid_0's rmse: 1.69701


<function gc.collect(generation=2)>

CA_2 HOBBIES start
Train shape: (870012, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[376]	valid_0's rmse: 1.70953


<function gc.collect(generation=2)>

CA_2 HOBBIES start
Train shape: (870012, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[302]	valid_0's rmse: 1.7125


<function gc.collect(generation=2)>

CA_2 HOBBIES start
Train shape: (870012, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[294]	valid_0's rmse: 1.709


<function gc.collect(generation=2)>

CA_2 HOBBIES start
Train shape: (870012, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[411]	valid_0's rmse: 1.70729


<function gc.collect(generation=2)>

CA_2 HOUSEHOLD start
Train shape: (1598521, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[445]	valid_0's rmse: 1.38657


<function gc.collect(generation=2)>

CA_2 HOUSEHOLD start
Train shape: (1598521, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[445]	valid_0's rmse: 1.41042


<function gc.collect(generation=2)>

CA_2 HOUSEHOLD start
Train shape: (1598521, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[492]	valid_0's rmse: 1.41478


<function gc.collect(generation=2)>

CA_2 HOUSEHOLD start
Train shape: (1598521, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[630]	valid_0's rmse: 1.42494


<function gc.collect(generation=2)>

CA_2 HOUSEHOLD start
Train shape: (1598521, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[802]	valid_0's rmse: 1.43103


<function gc.collect(generation=2)>

CA_2 HOUSEHOLD start
Train shape: (1598521, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[573]	valid_0's rmse: 1.43945


<function gc.collect(generation=2)>

CA_2 HOUSEHOLD start
Train shape: (1598521, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[654]	valid_0's rmse: 1.44398


<function gc.collect(generation=2)>

CA_2 FOODS start
Train shape: (1807243, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[707]	valid_0's rmse: 2.28723


<function gc.collect(generation=2)>

CA_2 FOODS start
Train shape: (1807243, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[622]	valid_0's rmse: 2.33302


<function gc.collect(generation=2)>

CA_2 FOODS start
Train shape: (1807243, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[573]	valid_0's rmse: 2.36955


<function gc.collect(generation=2)>

CA_2 FOODS start
Train shape: (1807243, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[683]	valid_0's rmse: 2.38979


<function gc.collect(generation=2)>

CA_2 FOODS start
Train shape: (1807243, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[518]	valid_0's rmse: 2.41541


<function gc.collect(generation=2)>

CA_2 FOODS start
Train shape: (1807243, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[625]	valid_0's rmse: 2.42469


<function gc.collect(generation=2)>

CA_2 FOODS start
Train shape: (1807243, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[590]	valid_0's rmse: 2.45043


<function gc.collect(generation=2)>

CA_3 HOBBIES start
Train shape: (869676, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[192]	valid_0's rmse: 2.17201


<function gc.collect(generation=2)>

CA_3 HOBBIES start
Train shape: (869676, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[190]	valid_0's rmse: 2.1669


<function gc.collect(generation=2)>

CA_3 HOBBIES start
Train shape: (869676, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[192]	valid_0's rmse: 2.16138


<function gc.collect(generation=2)>

CA_3 HOBBIES start
Train shape: (869676, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[185]	valid_0's rmse: 2.16526


<function gc.collect(generation=2)>

CA_3 HOBBIES start
Train shape: (869676, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[172]	valid_0's rmse: 2.18041


<function gc.collect(generation=2)>

CA_3 HOBBIES start
Train shape: (869676, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[198]	valid_0's rmse: 2.16681


<function gc.collect(generation=2)>

CA_3 HOBBIES start
Train shape: (869676, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[192]	valid_0's rmse: 2.16342


<function gc.collect(generation=2)>

CA_3 HOUSEHOLD start
Train shape: (1584465, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[212]	valid_0's rmse: 1.98505


<function gc.collect(generation=2)>

CA_3 HOUSEHOLD start
Train shape: (1584465, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[198]	valid_0's rmse: 2.0361


<function gc.collect(generation=2)>

CA_3 HOUSEHOLD start
Train shape: (1584465, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[178]	valid_0's rmse: 2.06299


<function gc.collect(generation=2)>

CA_3 HOUSEHOLD start
Train shape: (1584465, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[169]	valid_0's rmse: 2.0895


<function gc.collect(generation=2)>

CA_3 HOUSEHOLD start
Train shape: (1584465, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[167]	valid_0's rmse: 2.10159


<function gc.collect(generation=2)>

CA_3 HOUSEHOLD start
Train shape: (1584465, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[166]	valid_0's rmse: 2.11138


<function gc.collect(generation=2)>

CA_3 HOUSEHOLD start
Train shape: (1584465, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[168]	valid_0's rmse: 2.11675


<function gc.collect(generation=2)>

CA_3 FOODS start
Train shape: (2217800, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[553]	valid_0's rmse: 2.74297


<function gc.collect(generation=2)>

CA_3 FOODS start
Train shape: (2217800, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[375]	valid_0's rmse: 2.86879


<function gc.collect(generation=2)>

CA_3 FOODS start
Train shape: (2217800, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[403]	valid_0's rmse: 2.94677


<function gc.collect(generation=2)>

CA_3 FOODS start
Train shape: (2217800, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[280]	valid_0's rmse: 2.97374


<function gc.collect(generation=2)>

CA_3 FOODS start
Train shape: (2217800, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[335]	valid_0's rmse: 2.99751


<function gc.collect(generation=2)>

CA_3 FOODS start
Train shape: (2217800, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[261]	valid_0's rmse: 3.00048


<function gc.collect(generation=2)>

CA_3 FOODS start
Train shape: (2217800, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[591]	valid_0's rmse: 2.99529


<function gc.collect(generation=2)>

CA_4 HOBBIES start
Train shape: (816672, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[326]	valid_0's rmse: 1.6206


<function gc.collect(generation=2)>

CA_4 HOBBIES start
Train shape: (816672, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[270]	valid_0's rmse: 1.62233


<function gc.collect(generation=2)>

CA_4 HOBBIES start
Train shape: (816672, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[329]	valid_0's rmse: 1.62385


<function gc.collect(generation=2)>

CA_4 HOBBIES start
Train shape: (816672, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[345]	valid_0's rmse: 1.62837


<function gc.collect(generation=2)>

CA_4 HOBBIES start
Train shape: (816672, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[297]	valid_0's rmse: 1.62862


<function gc.collect(generation=2)>

CA_4 HOBBIES start
Train shape: (816672, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[352]	valid_0's rmse: 1.63258


<function gc.collect(generation=2)>

CA_4 HOBBIES start
Train shape: (816672, 74). Val shape: (15820, 74). Test shape: (2260, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[245]	valid_0's rmse: 1.63709


<function gc.collect(generation=2)>

CA_4 HOUSEHOLD start
Train shape: (1572159, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[384]	valid_0's rmse: 0.885976


<function gc.collect(generation=2)>

CA_4 HOUSEHOLD start
Train shape: (1572159, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[359]	valid_0's rmse: 0.887277


<function gc.collect(generation=2)>

CA_4 HOUSEHOLD start
Train shape: (1572159, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[464]	valid_0's rmse: 0.887876


<function gc.collect(generation=2)>

CA_4 HOUSEHOLD start
Train shape: (1572159, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[387]	valid_0's rmse: 0.892134


<function gc.collect(generation=2)>

CA_4 HOUSEHOLD start
Train shape: (1572159, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[415]	valid_0's rmse: 0.89451


<function gc.collect(generation=2)>

CA_4 HOUSEHOLD start
Train shape: (1572159, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[467]	valid_0's rmse: 0.896189


<function gc.collect(generation=2)>

CA_4 HOUSEHOLD start
Train shape: (1572159, 74). Val shape: (29316, 74). Test shape: (4188, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[437]	valid_0's rmse: 0.896798


<function gc.collect(generation=2)>

CA_4 FOODS start
Train shape: (2178355, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[532]	valid_0's rmse: 1.59176


<function gc.collect(generation=2)>

CA_4 FOODS start
Train shape: (2178355, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[457]	valid_0's rmse: 1.61495


<function gc.collect(generation=2)>

CA_4 FOODS start
Train shape: (2178355, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[391]	valid_0's rmse: 1.6244


<function gc.collect(generation=2)>

CA_4 FOODS start
Train shape: (2178355, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[468]	valid_0's rmse: 1.63319


<function gc.collect(generation=2)>

CA_4 FOODS start
Train shape: (2178355, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[447]	valid_0's rmse: 1.64422


<function gc.collect(generation=2)>

CA_4 FOODS start
Train shape: (2178355, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[600]	valid_0's rmse: 1.64799


<function gc.collect(generation=2)>

CA_4 FOODS start
Train shape: (2178355, 74). Val shape: (40236, 74). Test shape: (5748, 74)
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[616]	valid_0's rmse: 1.65547


<function gc.collect(generation=2)>

In [None]:
# Final write of the accumulated predictions to the same checkpoint path the
# training loop saves to.
predictions.to_pickle(f'{submission_dir}before_ensemble/me_with_steps_store_cat_preds_temp_3_1.pkl')

In [None]:
# Display the accumulated predictions (columns: id, d, sales).
predictions

Unnamed: 0,id,d,sales
46881677,HOBBIES_1_001_CA_1_evaluation,1942,0.922659
46881678,HOBBIES_1_002_CA_1_evaluation,1942,0.273641
46881679,HOBBIES_1_003_CA_1_evaluation,1942,0.440581
46881680,HOBBIES_1_004_CA_1_evaluation,1942,1.569604
46881681,HOBBIES_1_005_CA_1_evaluation,1942,1.216340
...,...,...,...
47717098,FOODS_3_823_CA_4_evaluation,1969,0.862182
47717099,FOODS_3_824_CA_4_evaluation,1969,0.538956
47717100,FOODS_3_825_CA_4_evaluation,1969,0.782921
47717101,FOODS_3_826_CA_4_evaluation,1969,2.007966
