In [1]:
import numpy as np
import pandas as pd

import lightgbm as lgb

import gc
from datetime import datetime

from sklearn.model_selection import train_test_split

from data import optimize_df, features

## Data Loading

In [2]:
DATADIR = '../kaggle/input/m5-forecasting-accuracy/'

In [3]:
start = datetime.now()

calendar = pd.read_csv(f'{DATADIR}calendar.csv')
prices = pd.read_csv(f'{DATADIR}sell_prices.csv')
sales = pd.read_csv(f'{DATADIR}sales_train_validation.csv')

print('Time:', datetime.now() - start)

Time: 0:00:06.617656


In [4]:
start = datetime.now()

calendar, prices, sales = optimize_df(calendar, prices, sales, days=365, verbose=True)

print('Time:', datetime.now() - start)

Mem. usage decreased to  0.06 Mb (63.4% reduction)
Mem. usage decreased to 45.67 Mb (65.0% reduction)
Mem. usage decreased to 16.49 Mb (80.7% reduction)
Time: 0:00:11.499733


In [5]:
start = datetime.now()

id_cols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
df = pd.melt(sales,
             id_vars = id_cols,
             var_name = 'd',
             value_name = 'sales')

df = df.merge(calendar, on = 'd', copy = False)
df = df.merge(prices, on = ['store_id', 'item_id', 'wm_yr_wk'], copy = False)

print('Time:', datetime.now() - start)

Time: 0:00:08.251601


## Feature Engineering

In [6]:
start = datetime.now()

features(df)

print('Time:', datetime.now() - start)

Time: 0:01:31.645398


In [9]:
print('Before:', df.shape)
df.dropna(inplace = True)
print('After:', df.shape)

(11096813, 31)


(9419863, 31)

In [55]:
cat_features = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id'] + \
               ['event_name_1', 'event_name_2', 'event_type_1', 'event_type_2']
    
drop_cols = ["id", "date", "sales", "d", "wm_yr_wk", "weekday"]
keep_cols = df.columns[~df.columns.isin(drop_cols)]

X = df[keep_cols]
y = df["sales"]

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [57]:
train = lgb.Dataset(X_train, 
                    label=y_train, 
                    categorical_feature=cat_features)

val = lgb.Dataset(X_test, 
                  label=y_test,
                  categorical_feature=cat_features)

## Model

In [58]:
# All parameters can be found here:
# https://lightgbm.readthedocs.io/en/latest/Parameters.html

# From: https://www.kaggle.com/kneroma/m5-first-public-notebook-under-0-50
params = {
          "objective" : "poisson",
          "metric" :"rmse",
          "force_row_wise" : True,
          "learning_rate" : 0.075,
          "sub_row" : 0.75,
          "bagging_freq" : 1,
          "lambda_l2" : 0.1,
          'num_leaves': 128,
          "min_data_in_leaf": 100,
         }

num_rounds = 100

In [59]:
model = lgb.train(params, 
                  train, 
                  num_boost_round=num_rounds,
                  valid_sets=[train, val], 
                  valid_names=['train', 'validation'],
                  categorical_feature=train.categorical_feature,
                  early_stopping_rounds=10,
                  verbose_eval=int(num_rounds/10))

Training until validation scores don't improve for 10 rounds
[10]	train's rmse: 2.84826	validation's rmse: 2.85563
[20]	train's rmse: 2.47718	validation's rmse: 2.48983
[30]	train's rmse: 2.28129	validation's rmse: 2.29859
[40]	train's rmse: 2.1799	validation's rmse: 2.20052
[50]	train's rmse: 2.12613	validation's rmse: 2.15018
[60]	train's rmse: 2.09775	validation's rmse: 2.12453
[70]	train's rmse: 2.08076	validation's rmse: 2.10989
[80]	train's rmse: 2.06946	validation's rmse: 2.10095
[90]	train's rmse: 2.06042	validation's rmse: 2.09412
[100]	train's rmse: 2.0543	validation's rmse: 2.08979
Did not meet early stopping. Best iteration is:
[100]	train's rmse: 2.0543	validation's rmse: 2.08979


## Submission