In [1]:
import os
import pathlib
import time
from datetime import date, timedelta

import feather
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [2]:
# Project root that holds the `tmp/` directory with the intermediate feather
# files read below and the CSVs written at the end. The original absolute
# path stays as the default; set PROJECT_2_PATH to run the notebook from a
# different checkout or machine without editing this cell.
MAIN_PATH = pathlib.Path(
    os.environ.get(
        'PROJECT_2_PATH',
        '/Users/palermopenano/personal/sm-202011/project_2',
    )
)
num_days_preds = 16        # number of days into the future to predict (y values)
# Number of times the per-item weights are tiled to build the training
# weights; presumably the number of stacked training windows in X_train --
# TODO confirm against the notebook that builds tmp/X_train.
num_days = 6

# First day of the 16-day validation and test prediction windows; used as the
# start of the date ranges that label the prediction columns.
val2017 = date(2017, 7, 26)
test2017 = date(2017, 8, 16)

# Load data

In [3]:
# Feature / target matrices and the item table, all stored as feather files
# under MAIN_PATH. They are read in the order listed and unpacked one-to-one.
frame_paths = (
    'tmp/X_train',
    'tmp/y_train',
    'tmp/X_val',
    'tmp/y_val',
    'tmp/X_test',
    'tmp/items',
)
X_train, y_train, X_val, y_val, X_test, items = (
    feather.read_dataframe(MAIN_PATH / rel_path) for rel_path in frame_paths
)

# Only the (store_nbr, item_nbr) MultiIndex of this frame is kept; it is used
# later as the row index of the prediction frames.
store_item_frame = feather.read_dataframe(MAIN_PATH / 'tmp/store_item_idx')
store_item_idx = store_item_frame.set_index(['store_nbr', 'item_nbr']).index

# Test rows keyed by (store_nbr, item_nbr, date) so predictions can be joined
# onto them when the submission is built.
df_test_raw = feather.read_dataframe(MAIN_PATH / 'tmp/df_test')
df_test = df_test_raw.set_index(['store_nbr', 'item_nbr', 'date'])

In [4]:
# LightGBM settings shared by every per-horizon model trained below.
params = dict(
    num_leaves=80,
    objective='regression',
    min_data_in_leaf=200,
    learning_rate=0.02,
    feature_fraction=0.8,    # fraction of columns sampled per tree
    bagging_fraction=0.7,    # fraction of rows sampled ...
    bagging_freq=1,          # ... re-drawn every iteration
    metric='l2',
    num_threads=16,
)
# Upper bound on boosting rounds per model; the training cell also applies
# early stopping, so fewer rounds may actually be used.
MAX_ROUNDS = 5000

# Note: an earlier quick-iteration setup used these same params with
# MAX_ROUNDS = 100.

In [5]:
# Train one LightGBM regressor per forecast day (column i of y_train / y_val),
# collect validation and test predictions for each, then report validation
# metrics and the wall-clock time.
start = time.time()

val_pred = []    # one prediction vector per forecast day (validation rows)
test_pred = []   # one prediction vector per forecast day (test rows)
cate_vars = []   # no feature is declared categorical

# Per-row sample weights: 1 + 0.25 * perishable, i.e. 1.25 for rows flagged
# perishable == 1 and 1.0 otherwise. They do not depend on the forecast day,
# so they are built once instead of on every loop iteration.
weight = items["perishable"] * 0.25 + 1
# assumes X_train is `num_days` stacked copies of the item rows in the same
# order as `items` -- TODO confirm against the notebook that builds X_train
train_weight = pd.concat([weight] * num_days)
assert len(train_weight) == len(X_train), (
    "training weights (%d) do not line up with X_train rows (%d)"
    % (len(train_weight), len(X_train))
)

for i in range(num_days_preds):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)

    dtrain = lgb.Dataset(
        X_train, label=y_train.iloc[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    dval = lgb.Dataset(
        X_val, label=y_val.iloc[:, i],
        categorical_feature=cate_vars,
        weight=weight
    )
    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword
    # arguments were removed from `lgb.train` in LightGBM 4.0; on newer
    # versions pass `callbacks=[lgb.early_stopping(125), lgb.log_evaluation(50)]`
    # instead. Left unchanged here because the installed version is unknown.
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=125, verbose_eval=50
    )

    # Print every feature with its gain-based importance, largest first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))

    # Predict with the early-stopped iteration count; fall back to the full
    # budget when best_iteration is falsy (0 / None).
    best_iter = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=best_iter))
    test_pred.append(bst.predict(X_test, num_iteration=best_iter))


# (rows, forecast days) matrix of validation predictions, built once and
# reused by both metrics below.
val_pred_arr = np.array(val_pred).transpose()

print("\nValidation mse:",
      mean_squared_error(y_val, val_pred_arr))
# Weighted RMSE over all rows and forecast days: squared errors are summed per
# row, scaled by the row weight, and normalised by total weight times the
# number of forecast days (previously a hard-coded 16).
err = (y_val - val_pred_arr)**2
err = err.sum(axis=1) * weight
err = np.sqrt(err.sum() / weight.sum() / num_days_preds)
print("Validation nwrmsle = {}".format(err))
print(f"Time taken: {(time.time() - start) / 60} mins")

Step 1




Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.442124	valid_1's l2: 0.430471
[100]	training's l2: 0.347089	valid_1's l2: 0.341405
[150]	training's l2: 0.330914	valid_1's l2: 0.32782
[200]	training's l2: 0.326337	valid_1's l2: 0.324897
[250]	training's l2: 0.32388	valid_1's l2: 0.32378
[300]	training's l2: 0.322069	valid_1's l2: 0.323041
[350]	training's l2: 0.320645	valid_1's l2: 0.32266
[400]	training's l2: 0.319401	valid_1's l2: 0.322362
[450]	training's l2: 0.3183	valid_1's l2: 0.322166
[500]	training's l2: 0.317293	valid_1's l2: 0.321972
[550]	training's l2: 0.316336	valid_1's l2: 0.321839
[600]	training's l2: 0.315449	valid_1's l2: 0.321734
[650]	training's l2: 0.314624	valid_1's l2: 0.32163
[700]	training's l2: 0.313789	valid_1's l2: 0.321565
[750]	training's l2: 0.312993	valid_1's l2: 0.321505
[800]	training's l2: 0.312212	valid_1's l2: 0.321471
[850]	training's l2: 0.311464	valid_1's l2: 0.321436
[900]	training's l2: 0.310733	valid_1's l2: 0

Step 4
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.501254	valid_1's l2: 0.503674
[100]	training's l2: 0.396351	valid_1's l2: 0.404328
[150]	training's l2: 0.378911	valid_1's l2: 0.389404
[200]	training's l2: 0.373759	valid_1's l2: 0.385998
[250]	training's l2: 0.370714	valid_1's l2: 0.384298
[300]	training's l2: 0.368482	valid_1's l2: 0.383277
[350]	training's l2: 0.366624	valid_1's l2: 0.382605
[400]	training's l2: 0.365017	valid_1's l2: 0.382215
[450]	training's l2: 0.3636	valid_1's l2: 0.382011
[500]	training's l2: 0.362355	valid_1's l2: 0.381782
[550]	training's l2: 0.361159	valid_1's l2: 0.381667
[600]	training's l2: 0.360021	valid_1's l2: 0.381556
[650]	training's l2: 0.358996	valid_1's l2: 0.381434
[700]	training's l2: 0.358024	valid_1's l2: 0.381368
[750]	training's l2: 0.357079	valid_1's l2: 0.381253
[800]	training's l2: 0.356116	valid_1's l2: 0.381218
[850]	training's l2: 0.355234	valid_1's l2: 0.381193
[900]	training's l2: 0.354362	val

Step 7
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.480694	valid_1's l2: 0.591817
[100]	training's l2: 0.395906	valid_1's l2: 0.482705
[150]	training's l2: 0.381176	valid_1's l2: 0.460151
[200]	training's l2: 0.376795	valid_1's l2: 0.453862
[250]	training's l2: 0.374244	valid_1's l2: 0.451758
[300]	training's l2: 0.372318	valid_1's l2: 0.45077
[350]	training's l2: 0.370662	valid_1's l2: 0.450241
[400]	training's l2: 0.369234	valid_1's l2: 0.44988
[450]	training's l2: 0.367983	valid_1's l2: 0.449758
[500]	training's l2: 0.36685	valid_1's l2: 0.449633
[550]	training's l2: 0.365754	valid_1's l2: 0.449473
[600]	training's l2: 0.364756	valid_1's l2: 0.449446
[650]	training's l2: 0.363781	valid_1's l2: 0.449415
[700]	training's l2: 0.362829	valid_1's l2: 0.449372
[750]	training's l2: 0.361865	valid_1's l2: 0.449265
[800]	training's l2: 0.360982	valid_1's l2: 0.449245
[850]	training's l2: 0.360115	valid_1's l2: 0.449191
[900]	training's l2: 0.359246	vali

[600]	training's l2: 0.374316	valid_1's l2: 0.407362
[650]	training's l2: 0.37311	valid_1's l2: 0.407224
[700]	training's l2: 0.371935	valid_1's l2: 0.407006
[750]	training's l2: 0.370851	valid_1's l2: 0.406914
[800]	training's l2: 0.369767	valid_1's l2: 0.406752
[850]	training's l2: 0.368778	valid_1's l2: 0.406653
[900]	training's l2: 0.367787	valid_1's l2: 0.406536
[950]	training's l2: 0.366817	valid_1's l2: 0.40641
[1000]	training's l2: 0.365909	valid_1's l2: 0.406319
[1050]	training's l2: 0.36501	valid_1's l2: 0.406238
[1100]	training's l2: 0.364138	valid_1's l2: 0.406171
[1150]	training's l2: 0.363284	valid_1's l2: 0.406142
[1200]	training's l2: 0.36244	valid_1's l2: 0.406098
[1250]	training's l2: 0.3616	valid_1's l2: 0.406045
[1300]	training's l2: 0.360828	valid_1's l2: 0.405994
[1350]	training's l2: 0.360039	valid_1's l2: 0.405999
[1400]	training's l2: 0.359227	valid_1's l2: 0.405981
[1450]	training's l2: 0.358453	valid_1's l2: 0.405979
[1500]	training's l2: 0.357703	valid_1's l

[1600]	training's l2: 0.377983	valid_1's l2: 0.41371
[1650]	training's l2: 0.377175	valid_1's l2: 0.413636
[1700]	training's l2: 0.37644	valid_1's l2: 0.413581
[1750]	training's l2: 0.375665	valid_1's l2: 0.413505
[1800]	training's l2: 0.374904	valid_1's l2: 0.413488
[1850]	training's l2: 0.374176	valid_1's l2: 0.413462
[1900]	training's l2: 0.373452	valid_1's l2: 0.413401
[1950]	training's l2: 0.372735	valid_1's l2: 0.413401
[2000]	training's l2: 0.371998	valid_1's l2: 0.413358
[2050]	training's l2: 0.371311	valid_1's l2: 0.413285
[2100]	training's l2: 0.370574	valid_1's l2: 0.413213
[2150]	training's l2: 0.3699	valid_1's l2: 0.413183
[2200]	training's l2: 0.369197	valid_1's l2: 0.413135
[2250]	training's l2: 0.368492	valid_1's l2: 0.413091
[2300]	training's l2: 0.36783	valid_1's l2: 0.413074
[2350]	training's l2: 0.367188	valid_1's l2: 0.413068
[2400]	training's l2: 0.366508	valid_1's l2: 0.41304
[2450]	training's l2: 0.365859	valid_1's l2: 0.413024
[2500]	training's l2: 0.365185	val

Step 15
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.513123	valid_1's l2: 0.490369
[100]	training's l2: 0.425386	valid_1's l2: 0.411806
[150]	training's l2: 0.40922	valid_1's l2: 0.400443
[200]	training's l2: 0.403791	valid_1's l2: 0.39799
[250]	training's l2: 0.400584	valid_1's l2: 0.396881
[300]	training's l2: 0.397976	valid_1's l2: 0.396148
[350]	training's l2: 0.395804	valid_1's l2: 0.395657
[400]	training's l2: 0.394033	valid_1's l2: 0.395391
[450]	training's l2: 0.392486	valid_1's l2: 0.395196
[500]	training's l2: 0.391014	valid_1's l2: 0.395036
[550]	training's l2: 0.38964	valid_1's l2: 0.394884
[600]	training's l2: 0.388427	valid_1's l2: 0.394762
[650]	training's l2: 0.38717	valid_1's l2: 0.394647
[700]	training's l2: 0.385982	valid_1's l2: 0.394505
[750]	training's l2: 0.384908	valid_1's l2: 0.39445
[800]	training's l2: 0.383859	valid_1's l2: 0.394398
[850]	training's l2: 0.382848	valid_1's l2: 0.394352
[900]	training's l2: 0.381834	valid

# Save predictions of unit_sales from validation set

In [6]:
# Reshape the per-day validation predictions into a long
# (store_nbr, item_nbr, date) -> unit_sales frame and write it to CSV.
#
# The predictions get their own name: rebinding `y_val` here would replace the
# ground-truth validation labels and make the training cell fail on re-run.
val_pred_matrix = np.array(val_pred).transpose()
df_preds = (
    pd.DataFrame(val_pred_matrix, index=store_item_idx,
                 columns=pd.date_range(val2017, periods=num_days_preds)).
    stack().
    to_frame("unit_sales").
    rename_axis(['store_nbr', 'item_nbr', 'date'])
)

# Convert back to original units (quantity of items); expm1 presumably undoes
# a log1p transform applied to the targets upstream -- TODO confirm. The
# result is clipped to the range [0, 1000].
df_preds['unit_sales'] = np.clip(np.expm1(df_preds['unit_sales']), 0, 1000)
df_preds.reset_index().to_csv(MAIN_PATH / 'tmp/lgb_cv.csv', index=False)

# Create submission

In [7]:
# Reshape the per-day test predictions into a long
# (store_nbr, item_nbr, date) -> unit_sales frame for the submission join.
y_test = np.array(test_pred).transpose()
df_preds = (
    pd.DataFrame(y_test, index=store_item_idx,
                 # one column per forecast day; tied to num_days_preds so the
                 # column count always matches the number of trained models
                 columns=pd.date_range(test2017, periods=num_days_preds)).
    stack().
    to_frame('unit_sales').
    rename_axis(['store_nbr', 'item_nbr', 'date'])
)

In [8]:
# Attach predictions to the test ids. Test rows without a prediction get 0,
# which expm1 maps to 0 unit sales. Values are then converted with expm1 and
# clipped to the range [0, 1000].
submission = (
    df_test[['id']]
    .join(df_preds, how='left')
    .fillna(0)
    .assign(
        unit_sales=lambda frame: np.clip(np.expm1(frame["unit_sales"]), 0, 1000)
    )
)
# index=False (the documented boolean form) drops the
# (store_nbr, item_nbr, date) index so only `id` and `unit_sales` are written.
submission.to_csv(MAIN_PATH / 'tmp/lgb_sub.csv', float_format='%.4f', index=False)