#### Load modules

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

# Show up to 100 columns when a DataFrame is displayed. The fully qualified
# option name is required: the bare 'max_columns' pattern matches more than one
# option on recent pandas versions and is rejected as ambiguous.
pd.set_option('display.max_columns', 100)

#### Define helper functions

In [None]:
def encode(df, column):
    """Label-encode ``df[column]``, keeping one encoder per column name.

    The first call for a given ``column`` fits a new ``LabelEncoder`` on the
    data and stores it in the module-level ``encoders`` dictionary. Later calls
    with the same ``column`` (possibly on a different dataframe) reuse the
    stored encoder through ``transform`` so that the same label always maps to
    the same integer code.

    Args:
        df: Dataframe (or any mapping) holding the values under ``column``.
        column: Name of the column to encode; also the key in ``encoders``.

    Returns:
        The encoded values as returned by the encoder.
    """
    if column in encoders:
        return encoders[column].transform(df[column])

    # Fit first and register afterwards, so that a failed fit (or a missing
    # column) never leaves an unfitted encoder behind for later calls.
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(df[column])
    encoders[column] = encoder
    return encoded

def decode(df, column):
    """Map the encoded values in ``df[column]`` back to their original labels.

    Uses the encoder stored under ``column`` in the module-level ``encoders``
    dictionary.

    Args:
        df: Dataframe (or any mapping) holding the encoded values.
        column: Name of the column to decode; also the key in ``encoders``.

    Returns:
        The decoded values as returned by the encoder's ``inverse_transform``.

    Raises:
        KeyError: If no encoder has been registered for ``column``.
    """
    if column not in encoders:
        raise KeyError(
            "Can't find an appropriate decoder instance for column {!r}.".format(column)
        )
    return encoders[column].inverse_transform(df[column])


# Registry of encoders shared by encode() and decode(), keyed by column name.
encoders = {}

In [None]:
def reduce_mem_usage(df, verbose=False):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * Signed and unsigned integer columns are cast to the narrowest integer
      type of the same signedness that holds both their minimum and maximum.
    * Float columns are cast to ``float16`` or ``float32`` when their range
      lies strictly inside that type's finite range.
    * ``object`` columns are converted to ``category``.
    * Every other dtype (bool, datetime, categorical, pandas extension
      dtypes, ...) is left untouched.

    A column is never cast to a wider type than it already has.

    Args:
        df: Dataframe to shrink. It is modified in place.
        verbose: If true, print the memory usage before and after.

    Returns:
        The same ``df`` object, for convenient reassignment.
    """
    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns have a meaningful numeric
        # range to downcast by; anything else is skipped rather than being
        # pushed through the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        target = None
        if col_type.kind == 'f':
            # NOTE: float16 keeps only about three significant decimal digits,
            # so this downcast trades precision for memory.
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min < c_min and c_max < limits.max:
                    target = candidate
                    break
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the extreme values are representable too.
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break

        # Empty or all-NaN columns yield NaN bounds, match no candidate and
        # therefore keep their dtype.
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage reduced from {:.2f} MB to {:.2f} MB'.format(start_mem, end_mem))

    return df

#### Preprocess

(1) sales_train_validation.csv

In [None]:
# Load files.
df_sales = pd.read_csv('./sales_train_validation.csv')

# Encode categorical features. The fitted encoders stay in `encoders`, so the
# same column names are encoded consistently in later cells.
for column in ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']:
    df_sales[column] = encode(df_sales, column)

# Unpivot 'd' data: every 'd_<n>' column becomes rows with the column label in
# 'd' and the cell value in 'sales'. Day columns are recognised by their 'd_'
# prefix; a substring test would also match an identifier column that merely
# contains 'd_' somewhere in its name.
id_vars = [column for column in df_sales if not column.startswith('d_')]
value_vars = [column for column in df_sales if column.startswith('d_')]
df_sales = df_sales.melt(id_vars=id_vars, value_vars=value_vars, var_name='d', value_name='sales')

# Remove 'd_' from 'd' column. The label -> number conversion is computed once
# per distinct day label and applied with a dictionary lookup, instead of
# running a Python lambda once per melted row.
day_numbers = {name: int(name[len('d_'):]) for name in value_vars}
df_sales['d'] = df_sales['d'].map(day_numbers).astype('int')

# Reduce memory usage.
df_sales = reduce_mem_usage(df_sales)
df_sales

(2) calendar.csv

In [None]:
# Read the calendar table.
df_calendar = pd.read_csv('./calendar.csv')

# Derive the day of the month from the 'date' column.
df_calendar['day'] = pd.DatetimeIndex(df_calendar['date']).day

# 'date' and 'weekday' are not used after this point.
df_calendar = df_calendar.drop(columns=['date', 'weekday'])

# Replace missing values with empty strings, then label-encode the
# categorical columns (the encoders are kept in `encoders` for reuse).
df_calendar = df_calendar.fillna('')
for column in ['wm_yr_wk', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']:
    encoded = encode(df_calendar, column)
    df_calendar[column] = encoded

# Turn the 'd_<n>' labels into integer day numbers.
day_labels = df_calendar['d'].str.replace('d_', '', regex=False)
df_calendar['d'] = day_labels.astype('int')

# Shrink the column dtypes and display the result.
df_calendar = reduce_mem_usage(df_calendar)
df_calendar

(3) sell_prices.csv

In [None]:
# Read the price table.
df_prices = pd.read_csv('./sell_prices.csv')

# Label-encode the key columns. encode() reuses any encoder already stored
# under the same column name in `encoders`.
for column in ['store_id', 'item_id', 'wm_yr_wk']:
    encoded = encode(df_prices, column)
    df_prices[column] = encoded

# Shrink the column dtypes and display the result.
df_prices = reduce_mem_usage(df_prices)
df_prices

(4) merge all

In [None]:
# Attach the calendar columns to every sales row (joined on the day number),
# then the price columns (joined on store, item and week).
df = df_sales.merge(df_calendar, how='left', on='d')
df = df.merge(df_prices, how='left', on=['store_id', 'item_id', 'wm_yr_wk'])

# The source frames are no longer needed once merged.
del df_sales, df_calendar, df_prices

# 'wm_yr_wk' was only needed as a join key.
df = df.drop(columns='wm_yr_wk')

# Final column order.
columns = [
    # Item related features.
    'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id', 'id',
    # Date related features.
    'year', 'month', 'day', 'wday', 'd',
    # Event and snap related features.
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
    # Sales related features.
    'sell_price', 'sales',
]
df = df[columns]

# Order the rows by series id, then by day.
df = df.sort_values(by=['id', 'd'])

# Persist the merged frame to disk and display it.
df.to_pickle('data.pickle')
df

#### Train model

In [None]:
# Target column and the feature columns LightGBM should treat as categorical.
label = 'sales'
categorical_feature = [
    'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id', 'id',
    'wday',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
]

# Boolean row masks: days up to and including 1183 are used for training,
# all later days for validation.
train = df['d'].le(1183)
valid = df['d'].gt(1183)

# Build the LightGBM datasets: features are every column except the label.
train_data = lgb.Dataset(
    data=df.loc[train].drop(columns=label),
    label=df.loc[train, label],
    categorical_feature=categorical_feature,
)
valid_data = lgb.Dataset(
    data=df.loc[valid].drop(columns=label),
    label=df.loc[valid, label],
    categorical_feature=categorical_feature,
)

In [None]:
# Train with LightGBM defaults, reporting mean absolute error on both the
# training and the validation set.
params = dict(metric='mae')
model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[train_data, valid_data],
)