In [1]:
"""Train LGB models."""
import sys
import itertools

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error as mae
from sklearn.pipeline import make_pipeline, make_union

from mllib.transformers import (
    DateLagN,
    ExpandingCount,
    ExpandingMean,
    ExpandingSum,
    FunctionTransfomer,
    LagN,
)
from src.constants import (
    TARGETS,
    awards_artifact,
    event_artifact,
    player_twitter_artifact,
    rosters_artifact,
    scores1_mean_artifact,
    scores2_mean_artifact,
    scores3_mean_artifact,
    scores4_mean_artifact,
    scores5_mean_artifact,
    targets_artifact,
    transactions_artifact,
)
from src.feature_gen1 import get_feature_pipeline1
from src.pipelines.artifacts import DataLoader, MapToCol, ParseJsonField



In [2]:
# --- Input locations ---------------------------------------------------------
TRAIN_FILE = "data/train_updated.csv"
PLAYERS_FILE = "data/players.csv"
artifacts_path = "data/artifacts/v02"

# Date cut-off (YYYYMMDD integer) referenced by the commented-out
# index-building recipe to split rows into train (< cut-off) and
# validation (>= cut-off).
VAL_START_DATE = 20210601

# --- Compute device ----------------------------------------------------------
# Both spellings are kept because each is defined by the original config.
DEVICE = device = "gpu"

# --- Feature caching / filtering switches ------------------------------------
# LOAD_FEATURES: read cached .npy feature matrices instead of recomputing.
# SAVE_FEATURES: write the feature matrices back to the .npy cache.
SAVE_FEATURES, LOAD_FEATURES = False, True
# TRAIN_SEASON_ONLY: keep only rows whose last feature column is > 0.
TRAIN_SEASON_ONLY = True

# --- Random seeds (one per parameter set) ------------------------------------
SEED1, SEED2 = 786, 20201102

In [3]:
# Provenance (kept for reference, not executed): recipe that derives the
# per-player train/validation index frames from the raw training CSV.
# NOTE(review): the recipe's (doubly commented) export lines write
# *_smallv01.csv, while the live code below reads *_smallv02.csv — confirm
# which recipe produced the v02 files.
#
# raw_data = pd.read_csv(TRAIN_FILE)
# tr = raw_data.loc[raw_data.date < VAL_START_DATE]
# val = raw_data.loc[raw_data.date >= VAL_START_DATE]
# print(raw_data.shape, val.shape)

# roster_2021 = pd.read_csv(PLAYERS_FILE)
# roster_2021 = roster_2021.loc[roster_2021.playerForTestSetAndFuturePreds == True]
# target_enc = ParseJsonField(
#     date_field="date", data_field="nextDayPlayerEngagement", use_cols=TARGETS+['playerId']
# )
# tr_index = target_enc.transform(tr).reset_index(drop=False)
# tr_index = tr_index.loc[tr_index.playerId.isin(roster_2021.playerId.astype(str))]
# del tr

# vl_index = target_enc.transform(val).reset_index(drop=False)
# vl_index = vl_index.loc[vl_index.playerId.isin(roster_2021.playerId.astype(str))]
# del raw_data, val
# # tr_index.to_csv("data/tr_index_smallv01.csv", index=False)
# # vl_index.to_csv("data/vl_index_smallv01.csv", index=False)

# Load the pre-built train and validation index frames from disk.
tr_index, vl_index = map(
    pd.read_csv,
    ("data/tr_index_smallv02.csv", "data/vl_index_smallv02.csv"),
)
print(tr_index.shape, vl_index.shape)

(1515799, 6) (20179, 6)


In [4]:
# Build the train/test feature pipelines for feature set 1.
# The device is taken from the config cell's DEVICE constant rather than a
# hard-coded 'gpu' literal, so changing DEVICE actually takes effect here.
# NOTE(review): the three integer lists are presumably look-back window sizes;
# their exact meaning is defined by get_feature_pipeline1 — confirm there.
feature_pipeline_tr1, feature_pipeline_te1 = get_feature_pipeline1(
    artifacts_path,
    DEVICE,
    [7, 30, 150, 1500],
    [10, 30, 150],
    [30, 150],
)

In [5]:
# Materialise the feature matrices for feature set 1: either load the cached
# .npy copies or recompute them through the feature pipelines.
if LOAD_FEATURES:
    X_tr = np.load("data/X_tr_v202_f1.npy")
    X_vl = np.load("data/X_vl_v202_f1.npy")
else:
    X_tr = feature_pipeline_tr1.transform(tr_index)
    X_vl = feature_pipeline_te1.transform(vl_index)

# Targets come straight from the index frames, one column per entry of TARGETS.
y_tr = tr_index[TARGETS].values
y_vl = vl_index[TARGETS].values
print(X_tr.shape, X_vl.shape)

# Features and targets are paired purely by row position, so a stale cache
# that no longer matches the index frames must fail loudly here.
assert X_tr.shape[0] == y_tr.shape[0], "X_tr rows do not match tr_index rows"
assert X_vl.shape[0] == y_vl.shape[0], "X_vl rows do not match vl_index rows"

# Only write the cache when the features were freshly computed; re-saving
# arrays that were just loaded from the same files is pointless I/O.
if SAVE_FEATURES and not LOAD_FEATURES:
    np.save("data/X_tr_v202_f1.npy", X_tr)
    np.save("data/X_vl_v202_f1.npy", X_vl)

if TRAIN_SEASON_ONLY:
    # Keep only rows whose last feature column is positive.
    # NOTE(review): presumably that column is an in-season indicator — confirm
    # against get_feature_pipeline1.
    cond = X_tr[:, -1] > 0
    X_tr = X_tr[cond]
    y_tr = y_tr[cond]

    cond = X_vl[:, -1] > 0
    X_vl = X_vl[cond]
    y_vl = y_vl[cond]
    print(X_tr.shape, X_vl.shape, y_tr.shape, y_vl.shape)

(1515799, 639) (20179, 639)
(1038625, 639) (20179, 639) (1038625, 4) (20179, 4)


In [6]:
# Train one LightGBM regressor per target column on feature set 1 (SEED1),
# score each on the validation split, then persist models and predictions.
params1 = {
    "n_estimators": 5000,
    "learning_rate": 0.02,
    "num_leaves": 255,
    "max_depth": -1,
    "min_data_in_leaf": 20,
    "colsample_bytree": 0.5,
    "subsample": 0.95,
    "bagging_freq": 1,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "extra_trees": False,
    "max_bin": 127,
    # 'device': 'gpu',
    # 'gpu_use_dp': False,
    # 'gpu_device_id': 0,
    "boost_from_average": True,
    "reg_sqrt": True,
    "objective": "mae",
    "metric": "mae",
    "verbose": -1,
    "seed": SEED1,
    "min_data_per_group": 10,
    "cat_l2": 10,
    "cat_smooth": 10,
    "num_threads": 16,
}

# One (train, validation) Dataset pair per target column; each validation set
# references its own training set.
train_sets = [lgb.Dataset(X_tr, y_tr[:, k]) for k in range(y_tr.shape[1])]
valid_sets = [
    lgb.Dataset(X_vl, y_vl[:, k], reference=train_sets[k])
    for k in range(y_vl.shape[1])
]
# The numbered names are kept for any cell that refers to them; the unpacking
# also fails fast if there are not exactly four target columns.
tr1, tr2, tr3, tr4 = train_sets
vl1, vl2, vl3, vl4 = valid_sets

# NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
# were removed from lgb.train in LightGBM 4.0 (callbacks replace them) —
# confirm the pinned LightGBM version before upgrading.
boosters, val_preds = [], []
for k, (train_set, valid_set) in enumerate(zip(train_sets, valid_sets)):
    booster = lgb.train(
        params1,
        train_set,
        valid_sets=[valid_set],
        early_stopping_rounds=200,
        verbose_eval=50,
    )
    # In the recorded output the MAE printed here equals the logged
    # best-iteration score, i.e. predict() uses the early-stopped iteration.
    pred = booster.predict(X_vl)
    print(mae(y_vl[:, k], pred))
    boosters.append(booster)
    val_preds.append(pred)

bst1, bst2, bst3, bst4 = boosters
pred21, pred22, pred23, pred24 = val_preds

# Stack per-target predictions into an (n_rows, n_targets) matrix.
preds_2 = np.vstack(val_preds).T
# Fixed: a stray "f" inside the f-string used to print "-> f0.8407".
print(f"Overall score for params 1 -> {mae(y_vl, preds_2):6.4f}")

# NOTE(review): Booster.save_model writes LightGBM's own model file format,
# not a pickle, despite the ".pkl" extension; the file names are left
# unchanged so existing loaders keep working.
for k, booster in enumerate(boosters, start=1):
    booster.save_model(f"artifacts/bst{k}_train_v402_1.pkl")

for k, pred in enumerate(val_preds, start=1):
    np.save(f"data/lgb_t{k}_logv402_skip10_1.npy", pred)




Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.98144
[100]	valid_0's l1: 0.934452
[150]	valid_0's l1: 0.914542
[200]	valid_0's l1: 0.906327
[250]	valid_0's l1: 0.901175
[300]	valid_0's l1: 0.898819
[350]	valid_0's l1: 0.897438
[400]	valid_0's l1: 0.896347
[450]	valid_0's l1: 0.895816
[500]	valid_0's l1: 0.895316
[550]	valid_0's l1: 0.894586
[600]	valid_0's l1: 0.894079
[650]	valid_0's l1: 0.893728
[700]	valid_0's l1: 0.893256
[750]	valid_0's l1: 0.892866
[800]	valid_0's l1: 0.892351
[850]	valid_0's l1: 0.892137
[900]	valid_0's l1: 0.891846
[950]	valid_0's l1: 0.891024
[1000]	valid_0's l1: 0.890554
[1050]	valid_0's l1: 0.889689
[1100]	valid_0's l1: 0.889225
[1150]	valid_0's l1: 0.888799
[1200]	valid_0's l1: 0.888485
[1250]	valid_0's l1: 0.888232
[1300]	valid_0's l1: 0.88801
[1350]	valid_0's l1: 0.887803
[1400]	valid_0's l1: 0.887651
[1450]	valid_0's l1: 0.887503
[1500]	valid_0's l1: 0.887223
[1550]	valid_0's l1: 0.886833
[1600]	valid_0's l1: 0.886713




Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.24908
[100]	valid_0's l1: 1.20185
[150]	valid_0's l1: 1.21088
[200]	valid_0's l1: 1.2154
[250]	valid_0's l1: 1.22261
Early stopping, best iteration is:
[96]	valid_0's l1: 1.20091
1.2009122612932897




Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.709156
[100]	valid_0's l1: 0.698221
[150]	valid_0's l1: 0.691918
[200]	valid_0's l1: 0.688072
[250]	valid_0's l1: 0.685668
[300]	valid_0's l1: 0.684178
[350]	valid_0's l1: 0.682848
[400]	valid_0's l1: 0.681783
[450]	valid_0's l1: 0.681374
[500]	valid_0's l1: 0.68084
[550]	valid_0's l1: 0.679515
[600]	valid_0's l1: 0.678711
[650]	valid_0's l1: 0.678069
[700]	valid_0's l1: 0.677599
[750]	valid_0's l1: 0.677379
[800]	valid_0's l1: 0.676913
[850]	valid_0's l1: 0.676881
[900]	valid_0's l1: 0.676816
[950]	valid_0's l1: 0.676736
[1000]	valid_0's l1: 0.676579
[1050]	valid_0's l1: 0.676275
[1100]	valid_0's l1: 0.675957
[1150]	valid_0's l1: 0.675404
[1200]	valid_0's l1: 0.675158
[1250]	valid_0's l1: 0.674715
[1300]	valid_0's l1: 0.674462
[1350]	valid_0's l1: 0.674453
[1400]	valid_0's l1: 0.674249
[1450]	valid_0's l1: 0.674115
[1500]	valid_0's l1: 0.673998
[1550]	valid_0's l1: 0.674004
[1600]	valid_0's l1: 0.673842



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.608192
[100]	valid_0's l1: 0.638607
[150]	valid_0's l1: 0.675789
[200]	valid_0's l1: 0.696082
[250]	valid_0's l1: 0.710729
Early stopping, best iteration is:
[56]	valid_0's l1: 0.605743
0.6057425671425748
Overall score for params 1 -> f0.8407


In [8]:
# Drop the feature-set-1 matrices before the second feature set is loaded.
del X_tr
del X_vl

In [4]:
# Build the train/test feature pipelines for feature set 2.
# The device is taken from the config cell's DEVICE constant rather than a
# hard-coded 'gpu' literal, so changing DEVICE actually takes effect here.
# NOTE(review): the three integer lists are presumably look-back window sizes;
# their exact meaning is defined by get_feature_pipeline1 — confirm there.
feature_pipeline_tr2, feature_pipeline_te2 = get_feature_pipeline1(
    artifacts_path,
    DEVICE,
    [7, 30, 90, 500],
    [7, 21, 90],
    [21, 90],
)

In [5]:
# Materialise the feature matrices for feature set 2: either load the cached
# .npy copies or recompute them through the feature pipelines.
if LOAD_FEATURES:
    X_tr = np.load("data/X_tr_v202_f2.npy")
    X_vl = np.load("data/X_vl_v202_f2.npy")
else:
    X_tr = feature_pipeline_tr2.transform(tr_index)
    X_vl = feature_pipeline_te2.transform(vl_index)

# Targets come straight from the index frames, one column per entry of TARGETS.
y_tr = tr_index[TARGETS].values
y_vl = vl_index[TARGETS].values
print(X_tr.shape, X_vl.shape)

# Features and targets are paired purely by row position, so a stale cache
# that no longer matches the index frames must fail loudly here.
assert X_tr.shape[0] == y_tr.shape[0], "X_tr rows do not match tr_index rows"
assert X_vl.shape[0] == y_vl.shape[0], "X_vl rows do not match vl_index rows"

# Only write the cache when the features were freshly computed; re-saving
# arrays that were just loaded from the same files is pointless I/O.
if SAVE_FEATURES and not LOAD_FEATURES:
    np.save("data/X_tr_v202_f2.npy", X_tr)
    np.save("data/X_vl_v202_f2.npy", X_vl)

if TRAIN_SEASON_ONLY:
    # Keep only rows whose last feature column is positive.
    # NOTE(review): presumably that column is an in-season indicator — confirm
    # against get_feature_pipeline1.
    cond = X_tr[:, -1] > 0
    X_tr = X_tr[cond]
    y_tr = y_tr[cond]

    cond = X_vl[:, -1] > 0
    X_vl = X_vl[cond]
    y_vl = y_vl[cond]
    print(X_tr.shape, X_vl.shape, y_tr.shape, y_vl.shape)

(1515799, 639) (20179, 639)
(1038625, 639) (20179, 639) (1038625, 4) (20179, 4)


In [7]:
# Cast both feature matrices to single precision before building Datasets.
X_tr, X_vl = X_tr.astype(np.float32), X_vl.astype(np.float32)

In [10]:
# Train one LightGBM regressor per target column on feature set 2 (SEED2),
# score each on the validation split, then persist models and predictions.
# NOTE: the dict keeps the name `params1` although it is the second parameter
# set (colsample_bytree 0.4, seed SEED2); the name is left unchanged in case
# other cells refer to it.
params1 = {
    "n_estimators": 5000,
    "learning_rate": 0.02,
    "num_leaves": 255,
    "max_depth": -1,
    "min_data_in_leaf": 20,
    "colsample_bytree": 0.4,
    "subsample": 0.95,
    "bagging_freq": 1,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "extra_trees": False,
    "max_bin": 127,
#     'device': 'gpu',
#     'gpu_use_dp': False,
#     'gpu_device_id': 0,
    "boost_from_average": True,
    "reg_sqrt": True,
    "objective": "mae",
    "metric": "mae",
    "verbose": -1,
    "seed": SEED2,
    "min_data_per_group": 10,
    "cat_l2": 10,
    "cat_smooth": 10,
    "num_threads": 16,
}

# One (train, validation) Dataset pair per target column; each validation set
# references its own training set.
train_sets = [lgb.Dataset(X_tr, y_tr[:, k]) for k in range(y_tr.shape[1])]
valid_sets = [
    lgb.Dataset(X_vl, y_vl[:, k], reference=train_sets[k])
    for k in range(y_vl.shape[1])
]
# The numbered names are kept for any cell that refers to them; the unpacking
# also fails fast if there are not exactly four target columns.
tr1, tr2, tr3, tr4 = train_sets
vl1, vl2, vl3, vl4 = valid_sets

# NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
# were removed from lgb.train in LightGBM 4.0 (callbacks replace them) —
# confirm the pinned LightGBM version before upgrading.
boosters, val_preds = [], []
for k, (train_set, valid_set) in enumerate(zip(train_sets, valid_sets)):
    booster = lgb.train(
        params1,
        train_set,
        valid_sets=[valid_set],
        early_stopping_rounds=200,
        verbose_eval=50,
    )
    # In the recorded output the MAE printed here equals the logged
    # best-iteration score, i.e. predict() uses the early-stopped iteration.
    pred = booster.predict(X_vl)
    print(mae(y_vl[:, k], pred))
    boosters.append(booster)
    val_preds.append(pred)

bst1, bst2, bst3, bst4 = boosters
pred21, pred22, pred23, pred24 = val_preds

# Stack per-target predictions into an (n_rows, n_targets) matrix.
preds_2 = np.vstack(val_preds).T
# Fixed: a stray "f" inside the f-string used to print "-> f0.8360".
print(f"Overall score for params 2 -> {mae(y_vl, preds_2):6.4f}")

# NOTE(review): Booster.save_model writes LightGBM's own model file format,
# not a pickle, despite the ".pkl" extension; the file names are left
# unchanged so existing loaders keep working.
for k, booster in enumerate(boosters, start=1):
    booster.save_model(f"artifacts/bst{k}_train_v402_2.pkl")

for k, pred in enumerate(val_preds, start=1):
    np.save(f"data/lgb_t{k}_logv402_skip10_2.npy", pred)

Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.983373
[100]	valid_0's l1: 0.934525
[150]	valid_0's l1: 0.915653
[200]	valid_0's l1: 0.908637
[250]	valid_0's l1: 0.903885
[300]	valid_0's l1: 0.901091
[350]	valid_0's l1: 0.899427
[400]	valid_0's l1: 0.898176
[450]	valid_0's l1: 0.897821
[500]	valid_0's l1: 0.89714
[550]	valid_0's l1: 0.896778
[600]	valid_0's l1: 0.89623
[650]	valid_0's l1: 0.895597
[700]	valid_0's l1: 0.895021
[750]	valid_0's l1: 0.893831
[800]	valid_0's l1: 0.893006
[850]	valid_0's l1: 0.892819
[900]	valid_0's l1: 0.89213
[950]	valid_0's l1: 0.891513
[1000]	valid_0's l1: 0.891239
[1050]	valid_0's l1: 0.89072
[1100]	valid_0's l1: 0.890075
[1150]	valid_0's l1: 0.889427
[1200]	valid_0's l1: 0.889048
[1250]	valid_0's l1: 0.888425
[1300]	valid_0's l1: 0.888265
[1350]	valid_0's l1: 0.887853
[1400]	valid_0's l1: 0.887697
[1450]	valid_0's l1: 0.887461
[1500]	valid_0's l1: 0.887388
[1550]	valid_0's l1: 0.887214
[1600]	valid_0's l1: 0.887251
[1



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.24508
[100]	valid_0's l1: 1.19117
[150]	valid_0's l1: 1.19423
[200]	valid_0's l1: 1.2044
[250]	valid_0's l1: 1.20672
[300]	valid_0's l1: 1.20579
Early stopping, best iteration is:
[109]	valid_0's l1: 1.18944
1.1894360589400466




Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.709659
[100]	valid_0's l1: 0.698058
[150]	valid_0's l1: 0.691565
[200]	valid_0's l1: 0.686994
[250]	valid_0's l1: 0.685063
[300]	valid_0's l1: 0.683261
[350]	valid_0's l1: 0.682382
[400]	valid_0's l1: 0.68101
[450]	valid_0's l1: 0.680063
[500]	valid_0's l1: 0.679181
[550]	valid_0's l1: 0.678809
[600]	valid_0's l1: 0.678298
[650]	valid_0's l1: 0.677742
[700]	valid_0's l1: 0.677177
[750]	valid_0's l1: 0.676375
[800]	valid_0's l1: 0.675099
[850]	valid_0's l1: 0.674524
[900]	valid_0's l1: 0.673971
[950]	valid_0's l1: 0.673561
[1000]	valid_0's l1: 0.673071
[1050]	valid_0's l1: 0.672514
[1100]	valid_0's l1: 0.672381
[1150]	valid_0's l1: 0.671952
[1200]	valid_0's l1: 0.671662
[1250]	valid_0's l1: 0.670964
[1300]	valid_0's l1: 0.670783
[1350]	valid_0's l1: 0.6706
[1400]	valid_0's l1: 0.670222
[1450]	valid_0's l1: 0.669862
[1500]	valid_0's l1: 0.669796
[1550]	valid_0's l1: 0.66973
[1600]	valid_0's l1: 0.669548
[1



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.611744
[100]	valid_0's l1: 0.635798
[150]	valid_0's l1: 0.671099
[200]	valid_0's l1: 0.691093
[250]	valid_0's l1: 0.703045
Early stopping, best iteration is:
[62]	valid_0's l1: 0.608073
0.608073393505657
Overall score for params 2 -> f0.8360
