#### Load modules

In [None]:
import gc
import multiprocessing as mp

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

# Show up to 100 columns when displaying dataframes. The fully qualified option
# name is required: the short pattern 'max_columns' can match more than one
# option (e.g. 'styler.render.max_columns'), which makes pandas raise OptionError.
pd.set_option('display.max_columns', 100)

#### Define helper function

In [None]:
def encode(df, column):
    """
    Label-encode one column of a dataframe using the shared `encoders` registry.

    The first time a column name is seen, a new LabelEncoder is fitted on the
    column and stored in the registry. Later calls with the same column name
    reuse the stored encoder without refitting it.
    """
    encoder = encoders.get(column)
    if encoder is None:
        # First encounter: fit a fresh encoder and remember it for later reuse.
        encoder = LabelEncoder()
        encoders[column] = encoder
        return encoder.fit_transform(df[column])
    # Already known: apply the existing mapping.
    return encoder.transform(df[column])

def decode(df, column):
    """
    Map the encoded values of a column back to their original labels.

    Looks up the encoder stored for `column` in the shared `encoders` registry
    and applies its `inverse_transform` to `df[column]`.

    Raises:
        KeyError: if no encoder has been registered for `column`.
    """
    try:
        encoder = encoders[column]
    except KeyError:
        # Raising a plain string is itself a TypeError in Python 3, so raise a
        # real exception that carries the intended message.
        raise KeyError(
            f"Can't find an appropriate decoder instance. (column: {column!r})"
        ) from None
    return encoder.inverse_transform(df[column])


# Shared registry of fitted encoders, keyed by column name.
encoders = {}

In [None]:
def remove_d(x):
    """
    Strip every 'd_' occurrence from the given string and return the rest as an integer.
    This function is used during data preprocessing.
    """
    return int(x.replace('d_', ''))

In [None]:
def _smallest_fitting_dtype(candidates, info, c_min, c_max):
    """Return the first dtype in `candidates` whose range contains [c_min, c_max], else None."""
    for candidate in candidates:
        limits = info(candidate)
        # Inclusive bounds: a value equal to the type's limit still fits.
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.

    - Object/string columns are converted to 'category'.
    - Signed and unsigned integer columns are downcast to the smallest integer
      type of the same signedness that holds the column's min and max.
    - Float columns are downcast to float16 or float32 when the value range
      fits, otherwise stored as float64. Only the range is checked, so the
      narrower float types may lose precision.
    - Every other column (bool, datetime, categorical, extension dtypes) is
      left untouched.

    The dataframe is modified in place and also returned.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns have a meaningful min/max
        # range to downcast on; skip everything else instead of guessing.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            target = _smallest_fitting_dtype(float_types, np.finfo, c_min, c_max)
            # Values outside the float32 range (or all-NaN columns) fall back to float64.
            df[col] = df[col].astype(np.float64 if target is None else target)
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            target = _smallest_fitting_dtype(candidates, np.iinfo, c_min, c_max)
            # No candidate fits only when min/max are undefined (empty column).
            if target is not None:
                df[col] = df[col].astype(target)

    return df

#### Preprocess

(1) sales_train_validation.csv

In [None]:
%%time

# Load file.
df_sales = pd.read_csv('./sales_train_validation.csv')

# Create arbitrary values for validation periods.
for d in range(1914, 1942):
    df_sales[f'd_{d}'] = 0

# Encode categorical features.
for column in ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']:
    df_sales[column] = encode(df_sales, column)

# Unpivot 'd' data.
id_vars = [column for column in df_sales if 'd_' not in column]
value_vars = [column for column in df_sales if 'd_' in column]
df_sales = df_sales.melt(id_vars=id_vars, value_vars=value_vars, var_name='d', value_name='sales')

# Remove 'd_' in multiprocessing manner.
with mp.Pool(6) as p:
    df_sales['d'] = p.map(remove_d, df_sales['d'])

# Reduce memory usage.
df_sales = reduce_mem_usage(df_sales)

# Sort dataframe w.r.t 'id' and 'd'.
df_sales = df_sales.sort_values(['id', 'd'], ignore_index=True)

df_sales

(2) calendar.csv

In [None]:
%%time

# Load file.
df_calendar = pd.read_csv('./calendar.csv')

# Create 'day' column.
df_calendar['day'] = pd.DatetimeIndex(df_calendar['date']).day

# Drop redundant columns.
df_calendar.drop(columns=['date', 'weekday'], inplace=True)

# Encode categorical features.
df_calendar = df_calendar.fillna('')
for column in ['wm_yr_wk', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']:
    df_calendar[column] = encode(df_calendar, column)

# Remove 'd_' from 'd' column.
df_calendar['d'] = df_calendar['d'].apply(remove_d)

# Reduce memory usage.
df_calendar = reduce_mem_usage(df_calendar)
df_calendar

(3) sell_prices.csv

In [None]:
%%time

# Load file.
df_prices = pd.read_csv('./sell_prices.csv')

# Encode categorical features.
for column in ['store_id', 'item_id', 'wm_yr_wk']:
    df_prices[column] = encode(df_prices, column)
    
# Reduce memory usage.
df_prices = reduce_mem_usage(df_prices)
df_prices

#### Feature engineering

In [None]:
def feature_eng():
    """
    Build the model feature table from the global sales, calendar and price dataframes.

    Steps:
      1. Left-join `df_sales` with `df_calendar` on 'd', then with `df_prices`
         on ['store_id', 'item_id', 'wm_yr_wk'].
      2. Add lagged sales ('sales_lag7', 'sales_lag28', 'sales_lag365').
      3. Add moving averages of every lag column over 7, 28 and 365 rows
         ('<lag column>_avg<window>').
      4. Downcast dtypes with `reduce_mem_usage`.

    Lags and moving averages are computed separately for each 'id', so values
    from one series never flow into the features of another. Rows that lack
    enough history within their own series are NaN.

    Assumes the rows of `df_sales` are ordered by day within each 'id'; the
    lags are row-based, not date-based.

    Returns:
        The merged dataframe with the additional feature columns.
    """
    # Merge dataframes.
    df = pd.merge(df_sales, df_calendar, how='left', on='d')
    df = pd.merge(df, df_prices, how='left', on=['store_id', 'item_id', 'wm_yr_wk'])

    windows = [7, 28, 365]

    # Lagged features, shifted within each series.
    sales_by_id = df.groupby('id', sort=False)['sales']
    lag_columns = []
    for window in windows:
        lag_column = f'sales_lag{window}'
        df[lag_column] = sales_by_id.shift(window)
        lag_columns.append(lag_column)

    # Moving average features, rolled within each series.
    rows_by_id = df.groupby('id', sort=False)
    for window in windows:
        for column in lag_columns:
            rolled = rows_by_id[column].rolling(window).mean()
            # Drop the group level so the result aligns with df's own index.
            df[f'{column}_avg{window}'] = rolled.reset_index(level=0, drop=True)

    # Reduce memory usage.
    df = reduce_mem_usage(df)

    return df

In [None]:
%%time

# Feature engineering.
df = feature_eng()
df = df.dropna()
where = df['d'] <= 1913
df = df.loc[where]
df

#### Train model

(1) Build the training dataset and train the model.

In [None]:
# Columns LightGBM should treat as categorical.
categorical_feature = [
    'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id',
    'wday',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
]
# Columns excluded from the feature matrix ('sales' is the target).
unused_feature = ['id', 'd', 'wm_yr_wk', 'sales']

# Wrap features and target in a LightGBM dataset.
train_data = lgb.Dataset(
    df.drop(columns=unused_feature),
    label=df['sales'],
    categorical_feature=categorical_feature,
)

# Report RMSE during training.
params = {'metric': 'rmse'}

# Fit 100 boosting rounds; the metric is evaluated on the training data itself.
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data],
)

#### Submission

(1) Iterative inference.

In [None]:
# Multiplier applied to every prediction before it is written back.
magic = 1.03

# Iterate over prediction periods, one day at a time: the prediction for day d
# is stored in df_sales so that later days can use it through the lag features.
for d in range(1914, 1942):

    # Keep only the last 730 days of history before d; the longest feature
    # (365-row average of the 365-row lag) looks back that far.
    where = df_sales['d'] >= (d - 730)
    df_sales = df_sales[where].copy()

    # Predictions are real-valued. Make 'sales' a float column up front rather
    # than relying on pandas to upcast an integer column during assignment,
    # which newer pandas versions deprecate.
    if df_sales['sales'].dtype.kind != 'f':
        df_sales['sales'] = df_sales['sales'].astype(np.float64)

    # Feature engineering.
    df = feature_eng()

    # Predict day d and write the scaled predictions back into df_sales.
    # Both selections keep the row order of df_sales, so the values line up.
    where = df['d'] == d
    predict = model.predict(df[where].drop(columns=unused_feature))
    where = df_sales['d'] == d
    df_sales.loc[where, 'sales'] = predict * magic
    print(f'The inference for d_{d} is done.')

(2) A brief look at the results to build intuition.

In [None]:
# Draw the sales series of encoded ids 600..699, one figure per id.
for i in range(600, 700):
    series = df_sales.loc[df_sales['id'] == i, 'sales']
    series.plot(figsize=[30, 2], use_index=False)
    plt.show()

(3) Save the submission file.

In [None]:
# Load indices and columns from sample submission file.
df_submission = pd.read_csv('./sample_submission.csv')
indices = df_submission['id']
columns = df_submission.columns

# Take the predicted days (d > 1913) and restore the original 'id' labels.
where = df_sales['d'] > 1913
df_submission = df_sales.loc[where, ['id', 'sales', 'd']].copy()
df_submission['id'] = decode(df_submission, 'id')

# Relabel days 1914..1941 as 'F1'..'F28' in one vectorised step. This builds a
# new string column instead of writing strings into the integer 'd' column.
df_submission['d'] = 'F' + (df_submission['d'].astype(int) - 1913).astype(str)

# One row per id, one column per forecast day.
df_submission = df_submission.pivot(index='id', columns='d', values='sales')

# Align with the sample submission: same row order and same column order.
# Ids without predictions get NaN here.
df_submission = pd.merge(indices, df_submission, how='left', on='id')
df_submission = df_submission[columns]
df_submission

In [None]:
# Fill missing values with 0 and write the submission without the index column.
submission_filled = df_submission.fillna(0)
submission_filled.to_csv('submission.csv', index=False)