In [None]:
## Forecast Sales

In [None]:
import time
# Wall-clock start of the notebook; the final cell subtracts it from
# time.time() to report the total runtime in minutes.
notebookstart= time.time()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

# Modeling
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

print("Data Load Stage")
# Tip: append .sample(1000) to the training read for a quick smoke run.
training = pd.read_csv("../input/train.csv", parse_dates=["date"])
testing = pd.read_csv("../input/test.csv", parse_dates=["date"])

# Keep the test ids aside; they become the index of the submission frame.
testdex = testing["id"]

# Stack the training rows and the test rows (minus the id column) into one
# frame so the calendar features are derived once for both.
df = pd.concat(
    [training, testing.drop(columns="id")],
    axis=0,
    sort=True,
)

In [None]:
def prepare_time_features(df):
    """Add calendar features derived from the ``date`` column.

    The frame is modified in place (and also returned): ``date`` is converted
    to datetime and the columns ``week``, ``month``, ``day_of_year``,
    ``week_of_year``, ``weekday``, ``quarter`` and ``day_of_month`` are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column that is either already datetime or a
        string in ``%Y-%m-%d`` format.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the feature columns added.
    """
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
    dates = df['date'].dt

    # ``Series.dt.week`` / ``Series.dt.weekofyear`` were deprecated in
    # pandas 1.1 and removed in 2.0; ``isocalendar().week`` is the supported
    # replacement and yields the same ISO week number.
    iso_week = dates.isocalendar().week
    # isocalendar() returns a nullable UInt32 column. Cast back to the plain
    # numpy dtype the old accessors produced: int64 when every date is valid,
    # float64 (with NaN) when some dates are missing.
    iso_week = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    # ``week`` and ``week_of_year`` hold the same value; both are kept so the
    # column set stays identical for downstream code.
    df['week'] = iso_week
    df['month'] = dates.month
    df['day_of_year'] = dates.dayofyear
    df['week_of_year'] = iso_week
    df["weekday"] = dates.weekday
    df["quarter"] = dates.quarter
    df["day_of_month"] = dates.day

    return df

# Derive the calendar features once on the combined frame.
df = prepare_time_features(df)

# Rows dated before 2018-01-01 are used for training; rows from that date
# onwards are the ones to forecast.
cutoff = pd.to_datetime('2018-01-01')

train = df.loc[df.date < cutoff]
print("Train Shape: ", train.shape)

test_df = df.loc[df.date >= cutoff]
print("Test Shape: ", test_df.shape)

In [None]:
# Time Aggregate Features
def percentile(n):
    """Build a named aggregation function computing the ``n``-th percentile.

    The returned callable takes one array-like argument and forwards it to
    ``np.percentile``. Its ``__name__`` is set to ``percentile_<n>`` so the
    aggregated output can be told apart from other percentiles.
    """
    def percentile_(values):
        result = np.percentile(values, q=n)
        return result

    label = 'percentile_%s' % n
    percentile_.__name__ = label
    return percentile_

# Build
def time_agg(train, test_df, vars_to_agg, vars_be_agg):
    """Attach train-derived group statistics of ``vars_be_agg`` to both frames.

    For each entry of ``vars_to_agg`` (a single column name or a list of
    column names) the rows of ``train`` are grouped by that key and the sum,
    mean, std, skew, 80th percentile and 20th percentile of ``vars_be_agg``
    are computed. The resulting columns are named
    ``fare_by_<key columns joined with "_">_<statistic>`` and left-merged onto
    both ``train`` and ``test_df``. The statistics are always computed from
    ``train``; ``test_df`` only receives them.

    Returns
    -------
    tuple
        ``(train, test_df)`` with the aggregate columns added.
    """
    stats = ["sum", "mean", "std", "skew", percentile(80), percentile(20)]

    for group_key in vars_to_agg:
        # A bare column name is treated like a one-element key list for the
        # column prefix and the merge keys.
        key_cols = group_key if isinstance(group_key, list) else [group_key]
        prefix = "fare_by_" + "_".join(key_cols) + "_"

        grouped_stats = train.groupby(group_key)[vars_be_agg].agg(stats)
        grouped_stats.columns = pd.Index(
            [prefix + str(stat) for stat in grouped_stats.columns.tolist()]
        )

        train = pd.merge(train, grouped_stats, on=key_cols, how="left")
        test_df = pd.merge(test_df, grouped_stats, on=key_cols, how="left")

    return train, test_df

In [None]:
%%time
# Time Aggregate Features
# Each entry is one grouping key: a single column or a list of columns.
# NOTE(review): statistics are computed on train only and train holds dates
# before 2018-01-01 while test_df holds later dates, so the "date" key never
# matches a test row and its aggregate columns will be all-NaN in test_df.
train, test_df = time_agg(
    train,
    test_df,
    vars_to_agg=[
        "item",
        "store",
        "date",
        ["week", "item", "store"],
        ["month", "item", "store"],
        ["day_of_year", "item", "store"],
        ["weekday", "item", "store"],
        ["quarter", "item", "store"],
        ["day_of_month", "item", "store"],
        ["item", "store", "weekday", "month"],
    ],
    vars_be_agg="sales",
)

In [None]:
# Preview the first training rows after the aggregate columns were merged in.
train.head()

In [None]:
# Pull out the target first, then reduce both frames to the same feature set.
y = train["sales"].copy()

# "date" and "sales" are not model inputs; train is cut down to whatever
# columns remain in the test frame, in the same order.
test_df.drop(columns=["date", "sales"], inplace=True)
train = train.loc[:, test_df.columns]

columns_match = (train.columns == test_df.columns).all()
print("Does Train feature equal test feature?: ", columns_match)

# Shapes are reused later to size the prediction arrays.
trainshape, testshape = train.shape, test_df.shape

In [None]:
# LightGBM dataset wrapping the training features and the sales target.
# free_raw_data=False keeps the raw frame attached, which the training loop
# reads back through dtrain.data when predicting on validation rows.
dtrain = lgb.Dataset(
    data=train,
    label=y,
    free_raw_data=False,
)

In [None]:
print("Light Gradient Boosting Regressor: ")
# Note: early stopping monitors "mape" on the validation fold, while the
# per-fold score printed below is RMSE.
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': "mape"
                }

folds = KFold(n_splits=5, shuffle=True, random_state=1)
# Test predictions are averaged over the folds; oof_preds receives each
# training row's prediction from the fold in which it was held out.
fold_preds = np.zeros(testshape[0])
oof_preds = np.zeros(trainshape[0])
# The dataset must be constructed before .subset() can be taken from it.
dtrain.construct()

# Fit 5 Folds
modelstart = time.time()
for trn_idx, val_idx in folds.split(train):
    # The ``early_stopping_rounds`` / ``verbose_eval`` keyword arguments of
    # lgb.train were removed in LightGBM 4.0; the callback API (LightGBM 3.3+)
    # is the supported equivalent. Callbacks are created per fold so no
    # early-stopping state is carried over from the previous fold.
    fold_callbacks = [
        lgb.early_stopping(stopping_rounds=125),
        lgb.log_evaluation(period=500),
    ]
    clf = lgb.train(
        params=lgbm_params,
        train_set=dtrain.subset(trn_idx),
        valid_sets=[dtrain.subset(val_idx)],
        # categorical_feature = ["item","store"],
        num_boost_round=5000,
        callbacks=fold_callbacks,
    )
    oof_preds[val_idx] = clf.predict(dtrain.data.iloc[val_idx])
    fold_preds += clf.predict(test_df) / folds.n_splits
    print("RMSE: ", mean_squared_error(y.iloc[val_idx], oof_preds[val_idx]) ** .5)
print("Model Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))

In [None]:
# Submission frame: the fold-averaged predictions, indexed by the test ids.
lgsub = pd.DataFrame(data=fold_preds, index=testdex, columns=["sales"])
lgsub.to_csv("date.csv", header=True, index=True)

elapsed_minutes = (time.time() - notebookstart) / 60
print("Notebook Runtime: %0.2f Minutes" % elapsed_minutes)
lgsub.head()