In [1]:
"""Train LGB models."""
import sys
import itertools

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error as mae
from sklearn.pipeline import make_pipeline, make_union

from mllib.transformers import (
    DateLagN,
    ExpandingCount,
    ExpandingMean,
    ExpandingSum,
    FunctionTransfomer,
    LagN,
)
from src.constants import (
    TARGETS,
    awards_artifact,
    event_artifact,
    player_twitter_artifact,
    rosters_artifact,
    scores1_mean_artifact,
    scores2_mean_artifact,
    scores3_mean_artifact,
    scores4_mean_artifact,
    scores5_mean_artifact,
    targets_artifact,
    transactions_artifact,
)
from src.feature_gen1 import get_feature_pipeline1
from src.pipelines.artifacts import DataLoader, MapToCol, ParseJsonField



In [2]:
TRAIN_FILE = "data/train_updated.csv"
PLAYERS_FILE = "data/players.csv"
VAL_START_DATE = 20210601
DEVICE = "gpu"
device = DEVICE
artifacts_path = "data/artifacts/v01"
SAVE_FEATURES = True
LOAD_FEATURES = False
TRAIN_SEASON_ONLY = True
SEED1 = 786
SEED2 = 20201102

In [3]:
# raw_data = pd.read_csv(TRAIN_FILE)
# tr = raw_data.loc[raw_data.date < VAL_START_DATE]
# val = raw_data.loc[raw_data.date >= VAL_START_DATE]
# print(raw_data.shape, val.shape)

# roster_2021 = pd.read_csv(PLAYERS_FILE)
# roster_2021 = roster_2021.loc[roster_2021.playerForTestSetAndFuturePreds == True]
# target_enc = ParseJsonField(
#     date_field="date", data_field="nextDayPlayerEngagement", use_cols=TARGETS+['playerId']
# )
# tr_index = target_enc.transform(tr).reset_index(drop=False)
# tr_index = tr_index.loc[tr_index.playerId.astype(str).isin(roster_2021.playerId.astype(str))]
# del tr

# vl_index = target_enc.transform(val).reset_index(drop=False)
# vl_index = vl_index.loc[vl_index.playerId.isin(roster_2021.playerId.astype(str))]
# del raw_data, val
# tr_index.to_csv("data/tr_index_smallv01.csv", index=False)
# vl_index.to_csv("data/vl_index_smallv01.csv", index=False)

tr_index = pd.read_csv("data/tr_index_smallv01.csv")
vl_index = pd.read_csv("data/vl_index_smallv01.csv")
print(tr_index.shape, vl_index.shape)

(1480189, 7) (0, 7)


In [4]:
feature_pipeline_tr1, feature_pipeline_te1 = get_feature_pipeline1(artifacts_path, 'gpu', [7, 30, 150, 1500], [10, 30, 150], [30, 150])

In [8]:
if not LOAD_FEATURES:
    X_tr = feature_pipeline_tr1.transform(tr_index)
    X_vl = feature_pipeline_te1.transform(vl_index)
else:
    X_tr = np.load("data/X_tr_v201_f1.npy")
    X_vl = np.load("data/X_vl_v201_f1.npy")

y_tr = tr_index[TARGETS].values
y_vl = vl_index[TARGETS].values
print(X_tr.shape, X_vl.shape)

if SAVE_FEATURES:
    np.save("data/X_tr_v201_f1.npy", X_tr)
    np.save("data/X_vl_v201_f1.npy", X_vl)

if TRAIN_SEASON_ONLY:
    cond = X_tr[:, -1] > 0
    X_tr = X_tr[cond]
    y_tr = y_tr[cond]

    cond = X_vl[:, -1] > 0
    X_vl = X_vl[cond]
    y_vl = y_vl[cond]
    print(X_tr.shape, X_vl.shape, y_tr.shape, y_vl.shape)

AttributeError: 'Float32' object has no attribute 'astype'

In [None]:
X_tr.dtype, X_vl.dtype

In [18]:
tr1 = lgb.Dataset(X_tr, y_tr[:, 0])
tr2 = lgb.Dataset(X_tr, y_tr[:, 1])
tr3 = lgb.Dataset(X_tr, y_tr[:, 2])
tr4 = lgb.Dataset(X_tr, y_tr[:, 3])

vl1 = lgb.Dataset(X_vl, y_vl[:, 0], reference=tr1)
vl2 = lgb.Dataset(X_vl, y_vl[:, 1], reference=tr2)
vl3 = lgb.Dataset(X_vl, y_vl[:, 2], reference=tr3)
vl4 = lgb.Dataset(X_vl, y_vl[:, 3], reference=tr4)

params1 = {
    "n_estimators": 5000,
    "learning_rate": 0.02,
    "num_leaves": 255,
    "max_depth": -1,
    "min_data_in_leaf": 20,
    "colsample_bytree": 0.5,
    "subsample": 0.95,
    "bagging_freq": 1,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "extra_trees": False,
    "max_bin": 127,
    # 'device': 'gpu',
    # 'gpu_use_dp': False,
    # 'gpu_device_id': 0,
    "boost_from_average": True,
    "reg_sqrt": True,
    "objective": "mae",
    "metric": "mae",
    "verbose": -1,
    "seed": SEED1,
    "min_data_per_group": 10,
    "cat_l2": 10,
    "cat_smooth": 10,
    "num_threads": 16,
}

bst1 = lgb.train(params1, tr1, valid_sets=[vl1], early_stopping_rounds=200, verbose_eval=50)
pred21 = bst1.predict(X_vl)
print(mae(y_vl[:, 0], pred21))

bst2 = lgb.train(params1, tr2, valid_sets=[vl2], early_stopping_rounds=200, verbose_eval=50)
pred22 = bst2.predict(X_vl)
print(mae(y_vl[:, 1], pred22))

bst3 = lgb.train(params1, tr3, valid_sets=[vl3], early_stopping_rounds=200, verbose_eval=50)
pred23 = bst3.predict(X_vl)
print(mae(y_vl[:, 2], pred23))

bst4 = lgb.train(params1, tr4, valid_sets=[vl4], early_stopping_rounds=200, verbose_eval=50)
pred24 = bst4.predict(X_vl)
print(mae(y_vl[:, 3], pred24))

preds_2 = np.vstack((pred21, pred22, pred23, pred24)).T
print(f"Overall score for params 1 -> f{mae(y_vl, preds_2):6.4f}")
bst1.save_model(f"artifacts/bst1_train_v401_1.pkl")
bst2.save_model(f"artifacts/bst2_train_v401_1.pkl")
bst3.save_model(f"artifacts/bst3_train_v401_1.pkl")
bst4.save_model(f"artifacts/bst4_train_v401_1.pkl")

np.save(f"data/lgb_t1_logv401_skip10_1.npy", pred21)
np.save(f"data/lgb_t2_logv401_skip10_1.npy", pred22)
np.save(f"data/lgb_t3_logv401_skip10_1.npy", pred23)
np.save(f"data/lgb_t4_logv401_skip10_1.npy", pred24)




Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.07487
[100]	valid_0's l1: 1.01235
[150]	valid_0's l1: 0.985468
[200]	valid_0's l1: 0.974196
[250]	valid_0's l1: 0.968816
[300]	valid_0's l1: 0.965693
[350]	valid_0's l1: 0.963697
[400]	valid_0's l1: 0.962266
[450]	valid_0's l1: 0.961656
[500]	valid_0's l1: 0.960889
[550]	valid_0's l1: 0.960395
[600]	valid_0's l1: 0.959606
[650]	valid_0's l1: 0.959097
[700]	valid_0's l1: 0.958492
[750]	valid_0's l1: 0.958239
[800]	valid_0's l1: 0.957871
[850]	valid_0's l1: 0.957111
[900]	valid_0's l1: 0.956579
[950]	valid_0's l1: 0.956108
[1000]	valid_0's l1: 0.955916
[1050]	valid_0's l1: 0.955652
[1100]	valid_0's l1: 0.955083
[1150]	valid_0's l1: 0.954481
[1200]	valid_0's l1: 0.954007
[1250]	valid_0's l1: 0.953305
[1300]	valid_0's l1: 0.952728
[1350]	valid_0's l1: 0.952604
[1400]	valid_0's l1: 0.952452
[1450]	valid_0's l1: 0.952227
[1500]	valid_0's l1: 0.951612
[1550]	valid_0's l1: 0.951386
[1600]	valid_0's l1: 0.95111
[



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.69986
[100]	valid_0's l1: 1.5985
[150]	valid_0's l1: 1.57745
[200]	valid_0's l1: 1.57024
[250]	valid_0's l1: 1.5677
[300]	valid_0's l1: 1.56532
[350]	valid_0's l1: 1.5655
[400]	valid_0's l1: 1.56555
[450]	valid_0's l1: 1.56384
[500]	valid_0's l1: 1.56091
[550]	valid_0's l1: 1.55958
[600]	valid_0's l1: 1.55788
[650]	valid_0's l1: 1.55739
[700]	valid_0's l1: 1.55681
[750]	valid_0's l1: 1.55675
[800]	valid_0's l1: 1.55594
[850]	valid_0's l1: 1.5556
[900]	valid_0's l1: 1.55532
[950]	valid_0's l1: 1.55544
[1000]	valid_0's l1: 1.55503
[1050]	valid_0's l1: 1.55497
[1100]	valid_0's l1: 1.55449
[1150]	valid_0's l1: 1.55412
[1200]	valid_0's l1: 1.55381
[1250]	valid_0's l1: 1.55397
[1300]	valid_0's l1: 1.5542
[1350]	valid_0's l1: 1.55422
[1400]	valid_0's l1: 1.55436
Early stopping, best iteration is:
[1231]	valid_0's l1: 1.55363
1.5536346769866458




Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.78456
[100]	valid_0's l1: 0.774273
[150]	valid_0's l1: 0.767349
[200]	valid_0's l1: 0.76346
[250]	valid_0's l1: 0.76135
[300]	valid_0's l1: 0.759807
[350]	valid_0's l1: 0.758392
[400]	valid_0's l1: 0.757779
[450]	valid_0's l1: 0.757289
[500]	valid_0's l1: 0.756872
[550]	valid_0's l1: 0.756003
[600]	valid_0's l1: 0.75541
[650]	valid_0's l1: 0.754564
[700]	valid_0's l1: 0.753721
[750]	valid_0's l1: 0.753178
[800]	valid_0's l1: 0.752988
[850]	valid_0's l1: 0.752826
[900]	valid_0's l1: 0.752464
[950]	valid_0's l1: 0.752216
[1000]	valid_0's l1: 0.751949
[1050]	valid_0's l1: 0.75169
[1100]	valid_0's l1: 0.751457
[1150]	valid_0's l1: 0.751029
[1200]	valid_0's l1: 0.750631
[1250]	valid_0's l1: 0.750258
[1300]	valid_0's l1: 0.749995
[1350]	valid_0's l1: 0.749896
[1400]	valid_0's l1: 0.749693
[1450]	valid_0's l1: 0.749308
[1500]	valid_0's l1: 0.749071
[1550]	valid_0's l1: 0.748836
[1600]	valid_0's l1: 0.748582
[16



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.31946
[100]	valid_0's l1: 1.2404
[150]	valid_0's l1: 1.21784
[200]	valid_0's l1: 1.20979
[250]	valid_0's l1: 1.20659
[300]	valid_0's l1: 1.20324
[350]	valid_0's l1: 1.20222
[400]	valid_0's l1: 1.19981
[450]	valid_0's l1: 1.19853
[500]	valid_0's l1: 1.1972
[550]	valid_0's l1: 1.19705
[600]	valid_0's l1: 1.19548
[650]	valid_0's l1: 1.19499
[700]	valid_0's l1: 1.19341
[750]	valid_0's l1: 1.19204
[800]	valid_0's l1: 1.19147
[850]	valid_0's l1: 1.1909
[900]	valid_0's l1: 1.19015
[950]	valid_0's l1: 1.18934
[1000]	valid_0's l1: 1.18862
[1050]	valid_0's l1: 1.18765
[1100]	valid_0's l1: 1.18722
[1150]	valid_0's l1: 1.18676
[1200]	valid_0's l1: 1.1863
[1250]	valid_0's l1: 1.18596
[1300]	valid_0's l1: 1.18543
[1350]	valid_0's l1: 1.18534
[1400]	valid_0's l1: 1.18505
[1450]	valid_0's l1: 1.18485
[1500]	valid_0's l1: 1.18451
[1550]	valid_0's l1: 1.18424
[1600]	valid_0's l1: 1.18407
[1650]	valid_0's l1: 1.18351
[1700

NameError: name 'i' is not defined

In [19]:
print(f"Overall score for params 1 -> f{mae(y_vl, preds_2):6.4f}")
bst1.save_model(f"artifacts/bst1_train_v401_1.pkl")
bst2.save_model(f"artifacts/bst2_train_v401_1.pkl")
bst3.save_model(f"artifacts/bst3_train_v401_1.pkl")
bst4.save_model(f"artifacts/bst4_train_v401_1.pkl")

np.save(f"data/lgb_t1_logv401_skip10_1.npy", pred21)
np.save(f"data/lgb_t2_logv401_skip10_1.npy", pred22)
np.save(f"data/lgb_t3_logv401_skip10_1.npy", pred23)
np.save(f"data/lgb_t4_logv401_skip10_1.npy", pred24)


Overall score for params 1 -> f1.1056


In [20]:
feature_pipeline_tr2, feature_pipeline_te2 = get_feature_pipeline1(artifacts_path, 'gpu', [7, 30, 90, 500], [7, 21, 90], [21, 90])

In [21]:
if not LOAD_FEATURES:
    X_tr = feature_pipeline_tr2.transform(tr_index)
    X_vl = feature_pipeline_te2.transform(vl_index)
else:
    X_tr = np.load("data/X_tr_v201_f2.npy")
    X_vl = np.load("data/X_vl_v201_f2.npy")

y_tr = tr_index[TARGETS].values
y_vl = vl_index[TARGETS].values
print(X_tr.shape, X_vl.shape)

if SAVE_FEATURES:
    np.save("data/X_tr_v201_f2.npy", X_tr)
    np.save("data/X_vl_v201_f2.npy", X_vl)

if TRAIN_SEASON_ONLY:
    cond = X_tr[:, -1] > 0
    X_tr = X_tr[cond]
    y_tr = y_tr[cond]

    cond = X_vl[:, -1] > 0
    X_vl = X_vl[cond]
    y_vl = y_vl[cond]
    print(X_tr.shape, X_vl.shape, y_tr.shape, y_vl.shape)

(1480189, 639) (55789, 639)
(1003015, 639) (55789, 639) (1003015, 4) (55789, 4)


In [23]:
tr1 = lgb.Dataset(X_tr, y_tr[:, 0])
tr2 = lgb.Dataset(X_tr, y_tr[:, 1])
tr3 = lgb.Dataset(X_tr, y_tr[:, 2])
tr4 = lgb.Dataset(X_tr, y_tr[:, 3])

vl1 = lgb.Dataset(X_vl, y_vl[:, 0], reference=tr1)
vl2 = lgb.Dataset(X_vl, y_vl[:, 1], reference=tr2)
vl3 = lgb.Dataset(X_vl, y_vl[:, 2], reference=tr3)
vl4 = lgb.Dataset(X_vl, y_vl[:, 3], reference=tr4)

params2 = {
    "n_estimators": 5000,
    "learning_rate": 0.02,
    "num_leaves": 350,
    "max_depth": -1,
    "min_data_in_leaf": 20,
    "colsample_bytree": 0.4,
    "subsample": 0.95,
    "bagging_freq": 1,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "extra_trees": False,
    "max_bin": 127,
    # 'device': 'gpu',
    # 'gpu_use_dp': False,
    # 'gpu_device_id': 0,
    "boost_from_average": True,
    "reg_sqrt": True,
    "objective": "mae",
    "metric": "mae",
    "verbose": -1,
    "seed": SEED2,
    "min_data_per_group": 10,
    "cat_l2": 10,
    "cat_smooth": 10,
    "num_threads": 16,
}

bst1 = lgb.train(params1, tr1, valid_sets=[vl1], early_stopping_rounds=200, verbose_eval=50)
pred21 = bst1.predict(X_vl)
print(mae(y_vl[:, 0], pred21))

bst2 = lgb.train(params1, tr2, valid_sets=[vl2], early_stopping_rounds=200, verbose_eval=50)
pred22 = bst2.predict(X_vl)
print(mae(y_vl[:, 1], pred22))

bst3 = lgb.train(params1, tr3, valid_sets=[vl3], early_stopping_rounds=200, verbose_eval=50)
pred23 = bst3.predict(X_vl)
print(mae(y_vl[:, 2], pred23))

bst4 = lgb.train(params1, tr4, valid_sets=[vl4], early_stopping_rounds=200, verbose_eval=50)
pred24 = bst4.predict(X_vl)
print(mae(y_vl[:, 3], pred24))

preds_2 = np.vstack((pred21, pred22, pred23, pred24)).T
print(f"Overall score for params 2 -> f{mae(y_vl, preds_2):6.4f}")
bst1.save_model(f"artifacts/bst1_train_v401_2.pkl")
bst2.save_model(f"artifacts/bst2_train_v401_2.pkl")
bst3.save_model(f"artifacts/bst3_train_v401_2.pkl")
bst4.save_model(f"artifacts/bst4_train_v401_2.pkl")

np.save(f"data/lgb_t1_logv401_skip10_2.npy", pred21)
np.save(f"data/lgb_t2_logv401_skip10_2.npy", pred22)
np.save(f"data/lgb_t3_logv401_skip10_2.npy", pred23)
np.save(f"data/lgb_t4_logv401_skip10_2.npy", pred24)

Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.073
[100]	valid_0's l1: 1.01197
[150]	valid_0's l1: 0.986288
[200]	valid_0's l1: 0.975706
[250]	valid_0's l1: 0.969712
[300]	valid_0's l1: 0.966439
[350]	valid_0's l1: 0.96416
[400]	valid_0's l1: 0.962984
[450]	valid_0's l1: 0.962318
[500]	valid_0's l1: 0.961924
[550]	valid_0's l1: 0.9613
[600]	valid_0's l1: 0.960799
[650]	valid_0's l1: 0.960227
[700]	valid_0's l1: 0.959621
[750]	valid_0's l1: 0.958656
[800]	valid_0's l1: 0.957962
[850]	valid_0's l1: 0.957535
[900]	valid_0's l1: 0.956893
[950]	valid_0's l1: 0.956452
[1000]	valid_0's l1: 0.956138
[1050]	valid_0's l1: 0.955416
[1100]	valid_0's l1: 0.954939
[1150]	valid_0's l1: 0.954534
[1200]	valid_0's l1: 0.954195
[1250]	valid_0's l1: 0.953908
[1300]	valid_0's l1: 0.953623
[1350]	valid_0's l1: 0.953172
[1400]	valid_0's l1: 0.952936
[1450]	valid_0's l1: 0.952591
[1500]	valid_0's l1: 0.952407
[1550]	valid_0's l1: 0.952185
[1600]	valid_0's l1: 0.951983
[1650



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.7038
[100]	valid_0's l1: 1.61004
[150]	valid_0's l1: 1.59706
[200]	valid_0's l1: 1.59061
[250]	valid_0's l1: 1.58475
[300]	valid_0's l1: 1.5817
[350]	valid_0's l1: 1.58097
[400]	valid_0's l1: 1.58106
[450]	valid_0's l1: 1.58029
[500]	valid_0's l1: 1.57992
[550]	valid_0's l1: 1.57788
[600]	valid_0's l1: 1.57732
[650]	valid_0's l1: 1.57681
[700]	valid_0's l1: 1.57585
[750]	valid_0's l1: 1.57534
[800]	valid_0's l1: 1.5747
[850]	valid_0's l1: 1.57399
[900]	valid_0's l1: 1.57359
[950]	valid_0's l1: 1.574
[1000]	valid_0's l1: 1.57439
[1050]	valid_0's l1: 1.5743
[1100]	valid_0's l1: 1.57421
Early stopping, best iteration is:
[923]	valid_0's l1: 1.57342
1.5734240430768256




Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.784252
[100]	valid_0's l1: 0.773714
[150]	valid_0's l1: 0.766666
[200]	valid_0's l1: 0.763188
[250]	valid_0's l1: 0.760998
[300]	valid_0's l1: 0.759731
[350]	valid_0's l1: 0.759193
[400]	valid_0's l1: 0.758372
[450]	valid_0's l1: 0.757682
[500]	valid_0's l1: 0.756843
[550]	valid_0's l1: 0.756363
[600]	valid_0's l1: 0.755876
[650]	valid_0's l1: 0.755224
[700]	valid_0's l1: 0.754775
[750]	valid_0's l1: 0.753972
[800]	valid_0's l1: 0.753654
[850]	valid_0's l1: 0.75325
[900]	valid_0's l1: 0.752742
[950]	valid_0's l1: 0.752276
[1000]	valid_0's l1: 0.751969
[1050]	valid_0's l1: 0.751698
[1100]	valid_0's l1: 0.751377
[1150]	valid_0's l1: 0.750963
[1200]	valid_0's l1: 0.750722
[1250]	valid_0's l1: 0.750532
[1300]	valid_0's l1: 0.750214
[1350]	valid_0's l1: 0.749744
[1400]	valid_0's l1: 0.749639
[1450]	valid_0's l1: 0.749556
[1500]	valid_0's l1: 0.749231
[1550]	valid_0's l1: 0.749075
[1600]	valid_0's l1: 0.748595



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.31778
[100]	valid_0's l1: 1.2399
[150]	valid_0's l1: 1.21991
[200]	valid_0's l1: 1.21239
[250]	valid_0's l1: 1.20659
[300]	valid_0's l1: 1.20191
[350]	valid_0's l1: 1.19858
[400]	valid_0's l1: 1.1947
[450]	valid_0's l1: 1.19236
[500]	valid_0's l1: 1.18998
[550]	valid_0's l1: 1.18874
[600]	valid_0's l1: 1.18739
[650]	valid_0's l1: 1.18617
[700]	valid_0's l1: 1.18525
[750]	valid_0's l1: 1.18413
[800]	valid_0's l1: 1.18226
[850]	valid_0's l1: 1.18139
[900]	valid_0's l1: 1.17967
[950]	valid_0's l1: 1.1794
[1000]	valid_0's l1: 1.1783
[1050]	valid_0's l1: 1.17694
[1100]	valid_0's l1: 1.17675
[1150]	valid_0's l1: 1.17611
[1200]	valid_0's l1: 1.17556
[1250]	valid_0's l1: 1.17539
[1300]	valid_0's l1: 1.17466
[1350]	valid_0's l1: 1.17426
[1400]	valid_0's l1: 1.17389
[1450]	valid_0's l1: 1.17401
[1500]	valid_0's l1: 1.17389
[1550]	valid_0's l1: 1.17395
[1600]	valid_0's l1: 1.17375
[1650]	valid_0's l1: 1.17356
[1700

In [24]:
pred211 = np.load("data/lgb_t1_logv401_skip10_1.npy")
pred221 = np.load("data/lgb_t2_logv401_skip10_1.npy")
pred231 = np.load("data/lgb_t3_logv401_skip10_1.npy")
pred241 = np.load("data/lgb_t4_logv401_skip10_1.npy")


In [25]:
preds_21 = np.vstack((pred211, pred221, pred231, pred241)).T
print(mae(y_vl, preds_21))   # 1.1122 (skip 15), 1.1082 (skip 10), 1.1181 (skip 3); 1.1105 (skip 7); 1.1085 (skip 12)


1.1055802542299105


In [26]:
print(mae(y_vl, preds_21*0.5 + preds_2*0.5))

1.1050459474643652
