In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/m5-preloaded-data/sales_train_val.csv
/kaggle/input/m5-forecasting-accuracy/calendar.csv
/kaggle/input/m5-forecasting-accuracy/sell_prices.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv
/kaggle/input/m5-forecasting-accuracy/sample_submission.csv


In [2]:
# import libraries
import gc

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

import time
import math

from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

from sklearn import preprocessing, metrics

import warnings
warnings.filterwarnings('ignore')

In [3]:
import subprocess
import sys
# for uninstalled packages, use:
def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

In [4]:
## Memory optimization

# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified by @Vopani

# to support timestamp type, categorical type and to add option to use float16
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [5]:
# specify path to raw data
path_data = '/kaggle/input/m5-forecasting-accuracy/'

In [6]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

## First data inspection

In [None]:
sell_prices = pd.read_csv(path_data + 'sell_prices.csv')
calendar = pd.read_csv(path_data + 'calendar.csv')
sales_train_val = pd.read_csv(path_data + 'sales_train_validation.csv')
submission = pd.read_csv(path_data + 'sample_submission.csv')

In [None]:
submission.head()

In [None]:
submission.tail()

In [None]:
submission.shape

In [None]:
sell_prices.head()

In [None]:
sell_prices.info()

In [None]:
calendar.head()

In [None]:
calendar.info()

In [None]:
calendar.tail()

In [None]:
calendar.shape

In [None]:
calendar[['event_name_1','event_type_1','event_name_2','event_type_2']].nunique()

In [None]:
sales_train_val.head()

In [None]:
sales_train_val.cat_id.unique().tolist()

In [None]:
sales_train_val.shape

In [None]:
# count number of zero and nonzero elements for each feature
nonzero_total = sales_train_val.drop(columns=['id','item_id','dept_id','cat_id','store_id','state_id'], axis=1).astype(bool).sum(axis=0).sort_values(ascending = False)
nonzero_perc = nonzero_total/sales_train_val.shape[0]
nonzero = pd.concat([nonzero_total, nonzero_perc], axis=1, keys=['Total', 'Percent'])

# display featues with most and fewest nonzero elements
print('Nonzero elements by feature: ')
nonzero.head(10).append(nonzero.tail(10))

In [None]:
del sell_prices, calendar, sales_train_val, submission, nonzero
gc.collect()

## Transform data

In [7]:
def read_and_transform(start_day):

    dtypes_calendar={"event_name_1": "category", "event_name_2": "category", "event_type_1": "category", 
             "event_type_2": "category", "weekday": "category", 'wm_yr_wk': 'int16', "wday": "int16",
            "month": "int16", "year": "int16", "snap_CA": "float32", 'snap_TX': 'float32', 'snap_WI': 'float32' }
    dtypes_sell_prices = {"store_id": "category", "item_id": "category", "wm_yr_wk": "int16","sell_price":"float32" }

    # transform categorical variables into int16 format since it saves memory

    sell_prices = pd.read_csv(path_data + 'sell_prices.csv', dtype = dtypes_sell_prices)

    for col, col_dtype in dtypes_sell_prices.items():
        if col_dtype == "category":
            sell_prices[col] = sell_prices[col].cat.codes.astype("int16")
            sell_prices[col] -= sell_prices[col].min()

    calendar = pd.read_csv(path_data + 'calendar.csv', dtype = dtypes_calendar)

    calendar["date"] = pd.to_datetime(calendar["date"])
    for col, col_dtype in dtypes_calendar.items():
        if col_dtype == "category":
            calendar[col] = calendar[col].cat.codes.astype("int16")
            calendar[col] -= calendar[col].min()

    # start from given day

    numcols = [f"d_{day}" for day in range(start_day,1914)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols} 
    dtype.update({col: "category" for col in catcols if col != "id"})
    sales_train_val = pd.read_csv(path_data + 'sales_train_validation.csv', usecols = catcols + numcols, dtype = dtype)

    # transform categorical columsn to integer to save memory
    for col in catcols:
        if col != "id":
            sales_train_val[col] = sales_train_val[col].cat.codes.astype("int16")
            sales_train_val[col] -= sales_train_val[col].min()

    print('###### Transforming into melted and merged data format...')

    ### melt sales dataframe

    id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    sales_melt = sales_train_val.melt(
        id_vars=id_columns, value_vars = sales_train_val.drop(id_columns, axis=1).columns, 
        var_name='d', 
        value_name='sales'
    )
    sales_melt = reduce_mem_usage(sales_melt)

    # get product table
    product_infos = sales_train_val[id_columns].drop_duplicates()

    del sales_train_val

    ### melt submission dataframe

    submission = pd.read_csv(path_data + 'sample_submission.csv')

    sub_cols = submission.drop(['id'], axis=1).columns

    submission_melt = submission.melt(
        id_vars = ['id'],
        value_vars = sub_cols, 
        var_name = 'd',
        value_name = 'sales')

    del submission

    ### convert submission df to appropiate day format
    submission_melt['d'] = submission_melt['d'].str.replace('F','')
    submission_melt['d'] = pd.to_numeric(submission_melt['d'], errors='coerce')

    submission_melt.loc[submission_melt["id"].str.contains("validation"), 'd'] += 1913
    submission_melt.loc[submission_melt["id"].str.contains("evaluation"), 'd'] += 1941

    submission_melt = submission_melt.applymap(str)
    submission_melt['d'] = 'd_'+ submission_melt['d'].astype(str)

    submission_melt.sales = submission_melt.sales.astype('float32')

    submission_melt=reduce_mem_usage(submission_melt)

    ### split up into training, validation and test data set
    # - submission consisting of:
    #   *sales from day 1914-1941 (used for the leaderbord)
    #   *sales fro day 1942-1970 (used for final score)

    # merge product infos on submission file

    # temporarily separate test dataframes
    df_submission1 = submission_melt[submission_melt["id"].str.contains("validation")]
    df_submission2 = submission_melt[submission_melt["id"].str.contains("evaluation")]

    del submission_melt

    # merge with product table
    # to do that we have to temporarily rename values in the id column
    df_submission2["id"] = df_submission2["id"].str.replace("_evaluation", "_validation")
    df_submission1 = df_submission1.merge(product_infos, how="left", on="id")
    df_submission2 = df_submission2.merge(product_infos, how="left", on="id")
    df_submission2["id"] = df_submission2["id"].str.replace("_validation", "_evaluation")
    df_submission1['part'] = 'public_leaderboard'
    df_submission2['part'] = 'private_leaderboard'

    # for the moment only use public leaderboard data
    #df_submission = pd.concat([df_submission1, df_submission2], axis=0)
    df_submission = df_submission1.copy()

    df_submission=reduce_mem_usage(df_submission)

    del product_infos, df_submission1, df_submission2
    gc.collect()

    ### Merge calendar data
    # drop time features (own ones will be added)
    calendar = calendar.drop(["weekday", "wday", "month", "year"], axis=1)
    df_train_val = sales_melt.merge(calendar, how="left", on="d")
    del sales_melt
    df_submission = df_submission.merge(calendar, how="left", on="d")

    del calendar

    # Merge sell price data
    # CHANGE BACK TO LEFT JOIN
    #df_train_val = df_train_val.merge(sell_prices, on=["store_id", "item_id", "wm_yr_wk"], how="left")
    #df_submission = df_submission.merge(sell_prices, on=["store_id", "item_id", "wm_yr_wk"], how="left")
    df_train_val = df_train_val.merge(sell_prices, on=["store_id", "item_id", "wm_yr_wk"])
    df_submission = df_submission.merge(sell_prices, on=["store_id", "item_id", "wm_yr_wk"])

    del sell_prices

    df_train_val['part'] = 'train_val'
    df_submission = df_submission[df_train_val.columns]

    ### Merge trains and submission dfs
    df_train_val_test = pd.concat([df_train_val, df_submission], axis=0)

    del df_train_val, df_submission

    ### add time-features
    df_train_val_test['date'] = pd.to_datetime(df_train_val_test.date, format="%Y-%m-%d %H:%M:%S")
    df_train_val_test['year'] = df_train_val_test.date.dt.year
    df_train_val_test['month'] = df_train_val_test.date.dt.month
    df_train_val_test['day'] = df_train_val_test.date.dt.day
    df_train_val_test['weekday'] = df_train_val_test.date.dt.weekday
    df_train_val_test['hour'] = df_train_val_test.date.dt.hour

    df_train_val_test = reduce_mem_usage(df_train_val_test)

    gc.collect()
    
    return df_train_val_test

start_day = 800
df_train_val_test = read_and_transform(start_day)

###### Transforming into melted and merged data format...
Memory usage of dataframe is 971.77 MB
Memory usage after optimization is: 455.02 MB
Decreased by 53.2%
Memory usage of dataframe is 32.57 MB
Memory usage after optimization is: 17.62 MB
Decreased by 45.9%
Memory usage of dataframe is 31.76 MB
Memory usage after optimization is: 19.40 MB
Decreased by 38.9%
Memory usage of dataframe is 3397.87 MB
Memory usage after optimization is: 1823.23 MB
Decreased by 46.3%


In [8]:
df_train_val_test.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,part,year,month,day,weekday,hour
0,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_800,0.0,2013-04-07,11311,0,0,0,0,1.0,1.0,0.0,3.97,train_val,2013,4,7,6,0
1,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_801,0.0,2013-04-08,11311,0,0,0,0,1.0,0.0,1.0,3.97,train_val,2013,4,8,0,0
2,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_802,0.0,2013-04-09,11311,0,0,0,0,1.0,1.0,1.0,3.97,train_val,2013,4,9,1,0
3,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_803,0.0,2013-04-10,11311,0,0,0,0,1.0,0.0,0.0,3.97,train_val,2013,4,10,2,0
4,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_804,0.0,2013-04-11,11311,0,0,0,0,0.0,1.0,1.0,3.97,train_val,2013,4,11,3,0


In [9]:
df_train_val_test.shape

(32376116, 24)

In [10]:
df_train_val_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32376116 entries, 0 to 853719
Data columns (total 24 columns):
id              category
item_id         int16
dept_id         int8
cat_id          int8
store_id        int8
state_id        int8
d               category
sales           float32
date            datetime64[ns]
wm_yr_wk        int16
event_name_1    int8
event_type_1    int8
event_name_2    int8
event_type_2    int8
snap_CA         float32
snap_TX         float32
snap_WI         float32
sell_price      float32
part            category
year            int16
month           int8
day             int8
weekday         int8
hour            int8
dtypes: category(3), datetime64[ns](1), float32(5), int16(3), int8(12)
memory usage: 1.8 GB


In [11]:
# check if dates are correct:
# public leaderboard data should begin with d_1914 and 2016-04-25
df_train_val_test[df_train_val_test.part=='public_leaderboard'].head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,part,year,month,day,weekday,hour
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_1914,0.0,2016-04-25,11613,0,0,0,0,0.0,0.0,0.0,8.38,public_leaderboard,2016,4,25,0,0
1,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_1915,0.0,2016-04-26,11613,0,0,0,0,0.0,0.0,0.0,8.38,public_leaderboard,2016,4,26,1,0
2,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_1916,0.0,2016-04-27,11613,0,0,0,0,0.0,0.0,0.0,8.38,public_leaderboard,2016,4,27,2,0
3,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_1917,0.0,2016-04-28,11613,0,0,0,0,0.0,0.0,0.0,8.38,public_leaderboard,2016,4,28,3,0
4,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_1918,0.0,2016-04-29,11613,0,0,0,0,0.0,0.0,0.0,8.38,public_leaderboard,2016,4,29,4,0


## EDA

In [None]:
d_cols = [col for col in sales_train_val.columns if 'd_' in col]

In [None]:
mean_per_cat = sales_train_val.groupby('cat_id').agg('mean').T.reset_index().drop('index', axis=1)
ax = mean_per_cat.plot(figsize=(15, 7))
ax.set_xlabel("day", fontsize=15)
ax.set_ylabel("avg sells", fontsize=15)
ax.tick_params(labelsize=15)

In [None]:
prop_nonzero = sales_train_val[d_cols].astype(bool).sum(axis=0).reset_index().drop('index', axis=1)/sales_train_val.shape[0]
ax = prop_nonzero.plot(figsize=(15, 7))
ax.set_xlabel("day", fontsize=15)
ax.set_ylabel("Proportion at least 1 sale", fontsize=15)
ax.tick_params(labelsize=15)
ax.get_legend().remove()

In [None]:
fig, axes = plt.subplots(1,3,figsize=(30,7))

df_train_val_test.groupby(['cat_id']).agg({'sales': 'mean'}).plot(kind='bar', rot=0, ax=axes[0])
axes[0].xaxis.set_label_text('')
axes[0].set_xticklabels(['Food','Hobbies','Household'], fontsize=15)
axes[0].set_ylabel('Mean sell price', fontsize=20)
axes[0].tick_params(labelsize=20)
axes[0].get_legend().remove()

df_train_val_test.groupby(['dept_id']).agg({'sales': 'mean'}).plot(kind='bar', rot=45, ax=axes[1])
axes[1].xaxis.set_label_text('')
axes[1].set_xlabel('Department', fontsize=20)
axes[1].set_ylabel('Mean sell price', fontsize=20)
axes[1].tick_params(labelsize=17,rotation=0)
axes[1].get_legend().remove()

df_train_val_test.groupby(['store_id']).agg({'sales': 'mean'}).plot(kind='bar', rot=45, ax=axes[2])
axes[2].xaxis.set_label_text('')
axes[2].set_xlabel('Store', fontsize=20)
axes[2].set_ylabel('Mean sell price', fontsize=20)
axes[2].tick_params(labelsize=20,rotation=0)
axes[2].get_legend().remove()

In [None]:
# there are few changes in sell price
price_range_per_id = df_train_val_test.groupby('id')['sell_price'].agg(np.ptp)
plt.figure(figsize=(7,3))
_ = plt.hist(price_range_per_id, bins=100)
_ =plt.xlabel('max-min sell price per id', fontsize=20)

In [None]:
agg = df_train_val_test.groupby(['cat_id']).agg({'sales': 'mean'})
agg

## Label-encoding

In [19]:
df_train_val_test.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,part,year,month,day,weekday,hour,item_sales_per_day,item_sales_per_store,item_sales_per_store_day,revenue,total_revenue_per_store,total_revenue_per_day
0,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_800,0.0,2013-04-07,11311,0,0,0,0,1.0,1.0,0.0,3.97,train_val,2013,4,7,6,0,5.0,319.0,0.0,0.0,15528686.0,4469083.5
1,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_801,0.0,2013-04-08,11311,0,0,0,0,1.0,0.0,1.0,3.97,train_val,2013,4,8,0,0,1.0,319.0,0.0,0.0,15528686.0,4445660.0
2,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_802,0.0,2013-04-09,11311,0,0,0,0,1.0,1.0,1.0,3.97,train_val,2013,4,9,1,0,2.0,319.0,0.0,0.0,15528686.0,4557555.0
3,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_803,0.0,2013-04-10,11311,0,0,0,0,1.0,0.0,0.0,3.97,train_val,2013,4,10,2,0,2.0,319.0,0.0,0.0,15528686.0,4301710.5
4,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_804,0.0,2013-04-11,11311,0,0,0,0,0.0,1.0,1.0,3.97,train_val,2013,4,11,3,0,3.0,319.0,0.0,0.0,15528686.0,4401590.0


In [20]:
df_train_val_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32376116 entries, 0 to 32376115
Data columns (total 30 columns):
id                          category
item_id                     int16
dept_id                     int8
cat_id                      int8
store_id                    int8
state_id                    int8
d                           category
sales                       float32
date                        datetime64[ns]
wm_yr_wk                    int16
event_name_1                int8
event_type_1                int8
event_name_2                int8
event_type_2                int8
snap_CA                     float32
snap_TX                     float32
snap_WI                     float32
sell_price                  float32
part                        category
year                        int16
month                       int8
day                         int8
weekday                     int8
hour                        int8
item_sales_per_day          float32
item_sales_per_st

In [21]:
cat_col = ['item_id','dept_id','cat_id','store_id','state_id','event_name_1','event_type_1','event_name_2','event_type_2']
nan_features = ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]

df_train_val_test[nan_features] = df_train_val_test[nan_features].astype(object)

for col in cat_col:

    if df_train_val_test[col].isnull().values.any():
        df_train_val_test[col].fillna('unknown', inplace=True)

    le = LabelEncoder()
    df_train_val_test[col] = le.fit_transform(df_train_val_test[col])

df_train_val_test[cat_col] = df_train_val_test[cat_col].astype('category')

In [22]:
df_train_val_test.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,part,year,month,day,weekday,hour,item_sales_per_day,item_sales_per_store,item_sales_per_store_day,revenue,total_revenue_per_store,total_revenue_per_day
0,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_800,0.0,2013-04-07,11311,0,0,0,0,1.0,1.0,0.0,3.97,train_val,2013,4,7,6,0,5.0,319.0,0.0,0.0,15528686.0,4469083.5
1,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_801,0.0,2013-04-08,11311,0,0,0,0,1.0,0.0,1.0,3.97,train_val,2013,4,8,0,0,1.0,319.0,0.0,0.0,15528686.0,4445660.0
2,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_802,0.0,2013-04-09,11311,0,0,0,0,1.0,1.0,1.0,3.97,train_val,2013,4,9,1,0,2.0,319.0,0.0,0.0,15528686.0,4557555.0
3,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_803,0.0,2013-04-10,11311,0,0,0,0,1.0,0.0,0.0,3.97,train_val,2013,4,10,2,0,2.0,319.0,0.0,0.0,15528686.0,4301710.5
4,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_804,0.0,2013-04-11,11311,0,0,0,0,0.0,1.0,1.0,3.97,train_val,2013,4,11,3,0,3.0,319.0,0.0,0.0,15528686.0,4401590.0


In [23]:
df_train_val_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32376116 entries, 0 to 32376115
Data columns (total 30 columns):
id                          category
item_id                     category
dept_id                     category
cat_id                      category
store_id                    category
state_id                    category
d                           category
sales                       float32
date                        datetime64[ns]
wm_yr_wk                    int16
event_name_1                category
event_type_1                category
event_name_2                category
event_type_2                category
snap_CA                     float32
snap_TX                     float32
snap_WI                     float32
sell_price                  float32
part                        category
year                        int16
month                       int8
day                         int8
weekday                     int8
hour                        int8
item_sales_per_day

## Feature-engineering

In [12]:
# total items sold per day
item_sells_day = df_train_val_test.groupby(['item_id','date']).agg({'sales':'sum'})
item_sells_day.columns = ['item_sales_per_day']
item_sells_day.reset_index(inplace=True)

df_train_val_test = pd.merge(df_train_val_test, item_sells_day, on=['item_id','date'], how='left')
df_train_val_test['item_sales_per_day'] = df_train_val_test['item_sales_per_day'].astype(np.float32)

del item_sells_day
gc.collect()

0

In [13]:
# total items sold per store
item_sells_store = df_train_val_test.groupby(['item_id','store_id']).agg({'sales':'sum'})
item_sells_store.columns = ['item_sales_per_store']
item_sells_store.reset_index(inplace=True)

df_train_val_test = pd.merge(df_train_val_test, item_sells_store, on=['item_id','store_id'], how='left')
del item_sells_store
df_train_val_test['item_sales_per_store'] = df_train_val_test['item_sales_per_store'].astype(np.float32)

gc.collect()

0

In [14]:
# total items sold per store and day
item_sells_store_day = df_train_val_test.groupby(['item_id','store_id','date']).agg({'sales':'sum'})
item_sells_store_day.columns = ['item_sales_per_store_day']
item_sells_store_day.reset_index(inplace=True)

df_train_val_test = pd.merge(df_train_val_test, item_sells_store_day, on=['item_id','store_id','date'], how='left')
del item_sells_store_day
df_train_val_test['item_sales_per_store_day'] = df_train_val_test['item_sales_per_store_day'].astype(np.float32)

gc.collect()

0

In [15]:
df_train_val_test['revenue'] = df_train_val_test['sell_price'] *  df_train_val_test['sales']

In [16]:
# revenue per store
revenue_per_store = df_train_val_test.groupby('store_id').agg({'revenue':'sum'})
revenue_per_store.columns = ['total_revenue_per_store']
revenue_per_store.reset_index(inplace=True)
df_train_val_test = pd.merge(df_train_val_test, revenue_per_store, on=['store_id'], how='left')
del revenue_per_store
df_train_val_test['total_revenue_per_store'] = df_train_val_test['total_revenue_per_store'].astype(np.float32)

gc.collect()

13

In [18]:
# revenue per day
revenue_per_day = df_train_val_test.groupby('day').agg({'revenue':'sum'})
revenue_per_day.columns = ['total_revenue_per_day']
revenue_per_day.reset_index(inplace=True)
df_train_val_test = pd.merge(df_train_val_test, revenue_per_day, on=['day'], how='left')
del revenue_per_day
df_train_val_test['total_revenue_per_day'] = df_train_val_test['total_revenue_per_day'].astype(np.float32)

gc.collect()

0

In [None]:
'''
# revenue per store + day - crashes
revenue_per_store_day = df_train_val_test.groupby(['store_id','date']).agg({'revenue':'sum'})
revenue_per_store_day.columns = ['total_revenue_per_store_day']
revenue_per_store_day.reset_index(inplace=True)

df_train_val_test = pd.merge(df_train_val_test, revenue_per_store_day, on=['store_id'], how='left')
del revenue_per_store_day
df_train_val_test['total_revenue_per_store_day'] = df_train_val_test['total_revenue_per_store_day'].astype(np.float32)
'''

In [None]:
df_train_val_test.head()

In [None]:
%whos DataFrame

## Train model

In [None]:
'''
### split into training and validation set with 80% training and 20% validation
# i.e. 80% of days into training and 20% of days into validation set
ndays_train_val = df_train_val_test[df_train_val_test.part=='train_val'].date.nunique()
days_train = round(ndays_train_val*0.8)
days_val = ndays_train_val-days_train
print('Using {} days for training and {} days for validation, {} days in total'.format(days_train,days_val,ndays_train_val))
last_day_train = str(df_train_val_test.date.unique()[days_train-1])
'''

In [None]:
'''
x_train = df_train_val_test[df_train_val_test['date'] <= last_day_train]
x_val = df_train_val_test[(df_train_val_test['date'] > last_day_train) & (df_train_val_test['date'] < str(df_train_val_test[df_train_val_test.part=='public_leaderboard'].date[0]))]
y_train = x_train['sales']
y_val = x_val['sales']
x_train.drop(['d', 'date', 'id', 'part', 'sales'], axis=1, inplace=True)
x_val.drop(['d', 'date', 'id', 'part', 'sales'], axis=1, inplace=True)
test = df_train_val_test[df_train_val_test.part.str.contains("leaderboard")]
x_test = test.drop(['d', 'date', 'id', 'part', 'sales'], axis=1)
'''

In [None]:
#x_train.head()

In [None]:
#x_test.head()

In [None]:
### lgb model
'''

#del df_train_val_test
gc.collect()

# define random hyperparammeters
params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'n_jobs': -1,
    'seed': 236,
    'learning_rate': 0.1,
    'bagging_fraction': 0.75,
    'bagging_freq': 10, 
    'colsample_bytree': 0.75}

train_set = lgb.Dataset(x_train, y_train)
val_set = lgb.Dataset(x_val, y_val)

#del x_train, y_train

model = lgb.train(params, train_set, num_boost_round = 2500, early_stopping_rounds = 50, valid_sets = [train_set, val_set], verbose_eval = 100)
val_pred = model.predict(x_val)
val_score = np.sqrt(metrics.mean_squared_error(val_pred, y_val))
print(f'Our val rmse score is {val_score}')
y_test_pred = model.predict(x_test)

# Plot feature importance
lgb.plot_importance(model)
'''

In [None]:
'''
### create submission file
test = df_train_val_test[df_train_val_test.part=='public_leaderboard']
test['sales'] = y_test_pred
predictions = test[['id', 'date', 'demand']]
predictions = pd.pivot(predictions, index = 'id', columns = 'date', values = 'demand').reset_index()
predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

evaluation_rows = [row for row in submission['id'] if 'evaluation' in row] 
evaluation = submission[submission['id'].isin(evaluation_rows)]

validation = submission[['id']].merge(predictions, on = 'id')
final = pd.concat([validation, evaluation])
final.to_csv('submission.csv', index = False)
'''