In [1]:
import os
import pathlib
import time

import pandas as pd
import numpy as np
import feather
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

In [2]:
# Project root directory. Defaults to the original author's local checkout;
# set the PROJECT_2_DIR environment variable to run the notebook elsewhere
# without editing this cell.
MAIN_PATH = pathlib.Path(
    os.environ.get('PROJECT_2_DIR', '/Users/palermopenano/personal/sm-202011/project_2')
)
num_days_preds = 16        # number of days into the future to predict (y values)
# Number of times the per-item weights are repeated to build the training
# weights in the training cell. Presumably the number of stacked training
# windows in X_train -- TODO confirm against the feature-building notebook.
num_days = 6

# Load data

In [3]:
# Read the six feather files under MAIN_PATH, in this order, and bind each
# resulting DataFrame to its notebook-level name.
X_train, y_train, X_val, y_val, X_test, items = (
    feather.read_dataframe(MAIN_PATH / relative_path)
    for relative_path in (
        'tmp/X_train',
        'tmp/y_train',
        'tmp/X_val',
        'tmp/y_val',
        'tmp/X_test',
        'tmp/items',
    )
)

In [4]:
# LightGBM parameters passed unchanged to every lgb.train call in the loop.
params = dict(
    num_leaves=80,
    objective='regression',
    min_data_in_leaf=200,
    learning_rate=0.02,
    feature_fraction=0.8,
    bagging_fraction=0.7,
    bagging_freq=1,
    metric='l2',
    num_threads=16,
)

# Cap on boosting rounds (used as num_boost_round and as the fallback
# num_iteration at prediction time when no best iteration is recorded).
MAX_ROUNDS = 5000

In [5]:
# Train one LightGBM regressor per forecast day (column i of y_train / y_val)
# and collect its predictions on the validation and test feature matrices.
start = time.time()  # wall-clock start; elapsed minutes are reported after the loop

val_pred = []   # one validation prediction vector per forecast day
test_pred = []  # one test prediction vector per forecast day
cate_vars = []  # no feature is declared categorical to LightGBM

EARLY_STOPPING_ROUNDS = 125  # stop when the validation metric has not improved for this many rounds
LOG_EVAL_PERIOD = 50         # log the metrics every this many boosting rounds

# Row weights are 1 + 0.25 * perishable, i.e. 1.25 for perishable items and
# 1.0 for the rest (assuming `perishable` is a 0/1 flag -- TODO confirm).
# They do not depend on the forecast day, so they are built once here instead
# of on every iteration. The training weights repeat the per-item weights
# `num_days` times to match the rows of X_train.
train_weight = pd.concat([items["perishable"]] * num_days) * 0.25 + 1
val_weight = items["perishable"] * 0.25 + 1


def _early_stopping_kwargs():
    """Return the lgb.train keyword arguments for early stopping and logging.

    LightGBM 4.0 removed the `early_stopping_rounds` / `verbose_eval`
    arguments of `lgb.train` in favour of callbacks, while releases before
    3.3 have no `lgb.log_evaluation`. The available API is picked at run
    time so the cell works on both. Fresh callback objects are returned on
    every call because older early-stopping callbacks keep state between
    training runs.
    """
    if hasattr(lgb, "log_evaluation"):
        return {
            "callbacks": [
                lgb.early_stopping(EARLY_STOPPING_ROUNDS),
                lgb.log_evaluation(LOG_EVAL_PERIOD),
            ]
        }
    return {
        "early_stopping_rounds": EARLY_STOPPING_ROUNDS,
        "verbose_eval": LOG_EVAL_PERIOD,
    }


for i in range(num_days_preds):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)

    dtrain = lgb.Dataset(
        X_train, label=y_train.iloc[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    dval = lgb.Dataset(
        X_val, label=y_val.iloc[:, i],
        categorical_feature=cate_vars,
        weight=val_weight
    )
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], **_early_stopping_kwargs()
    )

    # Print every feature with its gain importance, largest first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))

    # Predict with the best iteration found by early stopping; fall back to
    # MAX_ROUNDS when no best iteration was recorded.
    best_iter = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=best_iter))
    test_pred.append(bst.predict(X_test, num_iteration=best_iter))

# Stack the per-day prediction vectors once into a (rows, forecast days)
# matrix and reuse it for both metrics below.
val_pred_matrix = np.array(val_pred).transpose()

print("\nValidation mse:", 
      mean_squared_error(y_val, val_pred_matrix))

# Weighted RMSE over all forecast days: each row's summed squared error is
# scaled by its weight (1 + 0.25 * perishable), then normalised by the total
# weight and by the number of forecast days.
weight = items['perishable'] * 0.25 + 1
# Work on positional arrays so rows are matched by position, the same way
# the weights were consumed by LightGBM, rather than by pandas index labels.
sq_err = (np.asarray(y_val) - val_pred_matrix) ** 2
row_err = sq_err.sum(axis=1) * weight.values
# Divide by the actual number of predicted days rather than a literal 16, so
# the metric stays correct if num_days_preds changes.
err = np.sqrt(row_err.sum() / weight.sum() / val_pred_matrix.shape[1])
print("Validation nwrmsle = {}".format(err))
print(f"Time taken: {(time.time() - start) / 60} mins")

    

Step 1




Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.008467	valid_1's l2: 0.00853278
[100]	training's l2: 0.00844278	valid_1's l2: 0.00853224
[150]	training's l2: 0.00842253	valid_1's l2: 0.00853325
[200]	training's l2: 0.00840475	valid_1's l2: 0.00853439
Early stopping, best iteration is:
[75]	training's l2: 0.00845423	valid_1's l2: 0.00853175
item_nbr: 126.59
class: 117.31
max_140: 104.36
mean_140_decay: 78.99
store_nbr: 62.70
mean_140: 57.11
std_140: 52.79
cluster: 33.44
family: 22.81
mean_60_decay: 22.55
mean_60: 20.93
city: 20.26
std_60: 18.97
max_60: 18.00
max_30: 17.61
type: 17.16
mean_30_decay: 13.70
state: 11.38
std_30: 10.51
perishable: 10.29
max_14: 9.78
mean_30: 7.08
mean_14_decay: 6.85
std_14: 5.57
mean_14: 5.12
mean_7_decay: 4.38
mean_7: 3.48
diff_30_mean: 3.36
std_7: 2.74
mean_3_decay: 2.66
diff_7_mean: 2.26
diff_3_mean: 2.17
max_7: 1.73
diff_60_mean: 1.47
diff_140_mean: 1.34
std_3: 1.14
diff_14_mean: 0.80
mean_3: 0.63
max_3: 0.30
median_3:

Step 8
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.00857858	valid_1's l2: 0.00979821
[100]	training's l2: 0.00855536	valid_1's l2: 0.0097962
[150]	training's l2: 0.0085349	valid_1's l2: 0.00979659
[200]	training's l2: 0.0085174	valid_1's l2: 0.00979848
Early stopping, best iteration is:
[100]	training's l2: 0.00855536	valid_1's l2: 0.0097962
max_140: 160.94
item_nbr: 134.66
class: 128.02
store_nbr: 77.75
mean_140_decay: 73.32
std_140: 71.03
mean_140: 59.96
cluster: 49.56
mean_60_decay: 34.08
family: 31.73
city: 26.82
std_60: 25.46
type: 24.08
max_60: 24.00
mean_60: 23.54
max_30: 22.55
state: 19.92
mean_30_decay: 18.30
mean_30: 17.13
std_30: 14.65
perishable: 12.57
mean_14_decay: 9.83
max_14: 7.92
mean_14: 6.27
std_14: 5.75
mean_7_decay: 4.97
std_7: 4.69
mean_3_decay: 4.35
max_7: 3.62
mean_7: 2.86
diff_30_mean: 2.63
diff_14_mean: 1.79
diff_7_mean: 1.75
mean_3: 1.68
diff_60_mean: 1.68
diff_3_mean: 1.41
diff_140_mean: 1.17
max_3: 1.13
std_3: 1.13
me

Step 15
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.00882676	valid_1's l2: 0.00860131
[100]	training's l2: 0.00880205	valid_1's l2: 0.00860178
[150]	training's l2: 0.0087815	valid_1's l2: 0.00860338
Early stopping, best iteration is:
[65]	training's l2: 0.00881905	valid_1's l2: 0.00860089
max_140: 150.22
item_nbr: 113.88
class: 95.58
store_nbr: 57.32
mean_140_decay: 51.81
mean_140: 46.18
std_140: 42.57
cluster: 39.97
family: 25.81
city: 18.88
mean_60_decay: 18.57
type: 18.45
max_60: 17.14
mean_60: 16.84
std_60: 15.91
perishable: 12.47
state: 12.28
mean_30_decay: 11.29
max_30: 10.14
max_14: 7.83
mean_30: 7.80
std_30: 6.82
mean_14_decay: 6.34
std_14: 5.46
diff_140_mean: 4.65
mean_14: 4.17
diff_60_mean: 2.49
mean_7_decay: 1.95
mean_7: 1.85
mean_3_decay: 1.49
max_7: 1.37
diff_30_mean: 1.27
diff_3_mean: 1.21
diff_14_mean: 1.11
mean_3: 0.94
diff_7_mean: 0.81
std_7: 0.77
std_3: 0.57
max_3: 0.18
median_3: 0.00
min_3: 0.00
median_7: 0.00
min_7: 0.00
media