In [1]:
"""Train LGB models."""
import sys
import itertools

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error as mae
from sklearn.pipeline import make_pipeline, make_union

from mllib.transformers import (
    DateLagN,
    ExpandingCount,
    ExpandingMean,
    ExpandingSum,
    FunctionTransfomer,
    LagN,
)
from src.constants import (
    TARGETS,
    awards_artifact,
    event_artifact,
    player_twitter_artifact,
    rosters_artifact,
    scores1_mean_artifact,
    scores2_mean_artifact,
    scores3_mean_artifact,
    scores4_mean_artifact,
    scores5_mean_artifact,
    targets_artifact,
    transactions_artifact,
)
from src.feature_gen1 import get_feature_pipeline1
from src.pipelines.artifacts import DataLoader, MapToCol, ParseJsonField



In [2]:
# Input files: raw training data and the player roster.
TRAIN_FILE = "data/train_updated.csv"
PLAYERS_FILE = "data/players.csv"
# Rows with date >= VAL_START_DATE form the validation split; earlier rows are training data.
VAL_START_DATE = 20210701
# NOTE(review): DEVICE / device are not read by any cell in the reviewed part of this
# notebook (the feature-pipeline calls pass the literal 'gpu') — confirm they are still needed.
DEVICE = "gpu"
device = DEVICE
# Passed as the first argument to get_feature_pipeline1.
artifacts_path = "data/artifacts/v02"
# When True, the feature matrices are written to data/X_*_v202_f*.npy.
SAVE_FEATURES = True
# When True, feature matrices are read from those .npy files instead of being recomputed.
LOAD_FEATURES = False
# When True, only rows whose last feature column is > 0 are kept for training/validation.
TRAIN_SEASON_ONLY = True
# LightGBM "seed" values for the first and second model sets respectively.
SEED1 = 786
SEED2 = 20201102

In [4]:
# Split the raw file chronologically at VAL_START_DATE, expand the JSON target field
# into one row per (date, player), and keep only players flagged in the roster file.
raw_data = pd.read_csv(TRAIN_FILE)
tr = raw_data[raw_data["date"] < VAL_START_DATE]
val = raw_data[raw_data["date"] >= VAL_START_DATE]
print(raw_data.shape, val.shape)

roster_2021 = pd.read_csv(PLAYERS_FILE)
roster_2021 = roster_2021[roster_2021["playerForTestSetAndFuturePreds"] == True]
target_enc = ParseJsonField(
    date_field="date",
    data_field="nextDayPlayerEngagement",
    use_cols=TARGETS + ['playerId'],
)

# Training index: parsed targets restricted to roster players.
tr_index = target_enc.transform(tr).reset_index(drop=False)
tr_index = tr_index[tr_index["playerId"].isin(roster_2021["playerId"])]
del tr

# Validation index: same treatment for the held-out period.
vl_index = target_enc.transform(val).reset_index(drop=False)
vl_index = vl_index[vl_index["playerId"].isin(roster_2021["playerId"])]
del raw_data, val

# Persist both index frames, then reload them from disk.
# NOTE(review): the immediate reload presumably normalises dtypes/index via the CSV
# round trip — confirm before removing it.
tr_index.to_csv("data/tr_index_smallv02.csv", index=False)
vl_index.to_csv("data/vl_index_smallv02.csv", index=False)

tr_index = pd.read_csv("data/tr_index_smallv02.csv")
vl_index = pd.read_csv("data/vl_index_smallv02.csv")
print(tr_index.shape, vl_index.shape)

(1294, 12) (17, 12)
Parsing nextDayPlayerEngagement
Parsing nextDayPlayerEngagement
(1515799, 7) (20179, 7)


In [106]:
# Build the train-time and test-time feature pipelines for feature set 1.
# NOTE(review): the three integer lists presumably configure window lengths inside
# get_feature_pipeline1 — confirm against src.feature_gen1.
feature_pipeline_tr1, feature_pipeline_te1 = get_feature_pipeline1(
    artifacts_path,
    'gpu',
    [7, 30, 150, 1500],
    [10, 30, 150],
    [30, 150],
)

In [8]:
# Build (or load from cache) the feature matrices for feature set 1, extract the
# target arrays, and optionally keep only rows whose last feature column is > 0.
if not LOAD_FEATURES:
    X_tr = feature_pipeline_tr1.transform(tr_index)
    X_vl = feature_pipeline_te1.transform(vl_index)
else:
    X_tr = np.load("data/X_tr_v202_f1.npy")
    X_vl = np.load("data/X_vl_v202_f1.npy")

y_tr = tr_index[TARGETS].values
y_vl = vl_index[TARGETS].values
print(X_tr.shape, X_vl.shape)

# A stale cache would silently misalign features and targets; fail early instead.
if len(X_tr) != len(y_tr) or len(X_vl) != len(y_vl):
    raise ValueError(
        "Feature/target row counts differ: "
        f"train {len(X_tr)} vs {len(y_tr)}, valid {len(X_vl)} vs {len(y_vl)}"
    )

# Only persist freshly computed features: re-writing arrays that were just loaded
# from these same files is pure redundant I/O.
if SAVE_FEATURES and not LOAD_FEATURES:
    np.save("data/X_tr_v202_f1.npy", X_tr)
    np.save("data/X_vl_v202_f1.npy", X_vl)

if TRAIN_SEASON_ONLY:
    # The last feature column acts as the filter flag: keep rows where it is positive.
    cond = X_tr[:, -1] > 0
    X_tr = X_tr[cond]
    y_tr = y_tr[cond]

    cond = X_vl[:, -1] > 0
    X_vl = X_vl[cond]
    y_vl = y_vl[cond]
    print(X_tr.shape, X_vl.shape, y_tr.shape, y_vl.shape)

(1515799, 639) (20179, 639)
(1038625, 639) (20179, 639) (1038625, 4) (20179, 4)


In [9]:
# Train one LightGBM regressor per target column on feature set 1, report the
# validation MAE per target and overall, then persist the models and the
# validation predictions.
num_targets = 4  # one model per target column (columns 0-3 of y_tr / y_vl)

train_sets = [lgb.Dataset(X_tr, y_tr[:, k]) for k in range(num_targets)]
valid_sets = [
    lgb.Dataset(X_vl, y_vl[:, k], reference=train_sets[k]) for k in range(num_targets)
]
# Keep the individual names available for any cell that refers to them.
tr1, tr2, tr3, tr4 = train_sets
vl1, vl2, vl3, vl4 = valid_sets

params1 = {
    "n_estimators": 5000,
    "learning_rate": 0.02,
    "num_leaves": 255,
    "max_depth": -1,
    "min_data_in_leaf": 20,
    "colsample_bytree": 0.5,
    "subsample": 0.95,
    "bagging_freq": 1,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "extra_trees": False,
    "max_bin": 127,
    # GPU training is currently disabled; re-enable by uncommenting:
    # 'device': 'gpu',
    # 'gpu_use_dp': False,
    # 'gpu_device_id': 0,
    "boost_from_average": True,
    "reg_sqrt": True,
    "objective": "mae",
    "metric": "mae",
    "verbose": -1,
    "seed": SEED1,
    "min_data_per_group": 10,
    "cat_l2": 10,
    "cat_smooth": 10,
    "num_threads": 16,
}

boosters = []
val_preds = []
for k in range(num_targets):
    booster = lgb.train(
        params1,
        train_sets[k],
        valid_sets=[valid_sets[k]],
        early_stopping_rounds=200,
        verbose_eval=50,
    )
    pred = booster.predict(X_vl)
    print(mae(y_vl[:, k], pred))
    boosters.append(booster)
    val_preds.append(pred)

bst1, bst2, bst3, bst4 = boosters
pred21, pred22, pred23, pred24 = val_preds

# Columns of preds_2 follow the target order of y_vl.
preds_2 = np.vstack(val_preds).T
# The original message had a stray "f" inside the f-string, printing e.g. "-> f0.8870".
print(f"Overall score for params 1 -> {mae(y_vl, preds_2):6.4f}")

# NOTE: the ".pkl" suffix is kept because get_preds() loads these exact paths via
# lgb.Booster(model_file=...); the files are written by Booster.save_model, not pickle.
for k, booster in enumerate(boosters, start=1):
    booster.save_model(f"artifacts/bst{k}_train_v402_1.pkl")

for k, pred in enumerate(val_preds, start=1):
    np.save(f"data/lgb_t{k}_logv402_skip10_1.npy", pred)


Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.98144
[100]	valid_0's l1: 0.934452
[150]	valid_0's l1: 0.914542
[200]	valid_0's l1: 0.906327
[250]	valid_0's l1: 0.901175
[300]	valid_0's l1: 0.898819
[350]	valid_0's l1: 0.897438
[400]	valid_0's l1: 0.896347
[450]	valid_0's l1: 0.895816
[500]	valid_0's l1: 0.895316
[550]	valid_0's l1: 0.894586
[600]	valid_0's l1: 0.894079
[650]	valid_0's l1: 0.893728
[700]	valid_0's l1: 0.893256
[750]	valid_0's l1: 0.892866
[800]	valid_0's l1: 0.892351
[850]	valid_0's l1: 0.892137
[900]	valid_0's l1: 0.891846
[950]	valid_0's l1: 0.891024
[1000]	valid_0's l1: 0.890554
[1050]	valid_0's l1: 0.889689
[1100]	valid_0's l1: 0.889225
[1150]	valid_0's l1: 0.888799
[1200]	valid_0's l1: 0.888485
[1250]	valid_0's l1: 0.888232
[1300]	valid_0's l1: 0.88801
[1350]	valid_0's l1: 0.887803
[1400]	valid_0's l1: 0.887651
[1450]	valid_0's l1: 0.887503
[1500]	valid_0's l1: 0.887223
[1550]	valid_0's l1: 0.886833
[1600]	valid_0's l1: 0.886713


In [10]:
# Drop the current feature matrices so their memory can be reclaimed.
del X_tr, X_vl

In [118]:
# Build the train-time and test-time feature pipelines for feature set 2.
# NOTE(review): the three integer lists presumably configure window lengths inside
# get_feature_pipeline1 — confirm against src.feature_gen1.
feature_pipeline_tr2, feature_pipeline_te2 = get_feature_pipeline1(
    artifacts_path,
    'gpu',
    [7, 30, 90, 500],
    [7, 21, 90],
    [21, 90],
)

In [12]:
# Build (or load from cache) the feature matrices for feature set 2, extract the
# target arrays, and optionally keep only rows whose last feature column is > 0.
if not LOAD_FEATURES:
    X_tr = feature_pipeline_tr2.transform(tr_index)
    X_vl = feature_pipeline_te2.transform(vl_index)
else:
    X_tr = np.load("data/X_tr_v202_f2.npy")
    X_vl = np.load("data/X_vl_v202_f2.npy")

y_tr = tr_index[TARGETS].values
y_vl = vl_index[TARGETS].values
print(X_tr.shape, X_vl.shape)

# A stale cache would silently misalign features and targets; fail early instead.
if len(X_tr) != len(y_tr) or len(X_vl) != len(y_vl):
    raise ValueError(
        "Feature/target row counts differ: "
        f"train {len(X_tr)} vs {len(y_tr)}, valid {len(X_vl)} vs {len(y_vl)}"
    )

# Only persist freshly computed features: re-writing arrays that were just loaded
# from these same files is pure redundant I/O.
if SAVE_FEATURES and not LOAD_FEATURES:
    np.save("data/X_tr_v202_f2.npy", X_tr)
    np.save("data/X_vl_v202_f2.npy", X_vl)

if TRAIN_SEASON_ONLY:
    # The last feature column acts as the filter flag: keep rows where it is positive.
    cond = X_tr[:, -1] > 0
    X_tr = X_tr[cond]
    y_tr = y_tr[cond]

    cond = X_vl[:, -1] > 0
    X_vl = X_vl[cond]
    y_vl = y_vl[cond]
    print(X_tr.shape, X_vl.shape, y_tr.shape, y_vl.shape)

(1515799, 639) (20179, 639)
(1038625, 639) (20179, 639) (1038625, 4) (20179, 4)


In [13]:
# Train one LightGBM regressor per target column on feature set 2, report the
# validation MAE per target and overall, then persist the models and the
# validation predictions.
num_targets = 4  # one model per target column (columns 0-3 of y_tr / y_vl)

train_sets = [lgb.Dataset(X_tr, y_tr[:, k]) for k in range(num_targets)]
valid_sets = [
    lgb.Dataset(X_vl, y_vl[:, k], reference=train_sets[k]) for k in range(num_targets)
]
# Keep the individual names available for any cell that refers to them.
tr1, tr2, tr3, tr4 = train_sets
vl1, vl2, vl3, vl4 = valid_sets

params1 = {
    "n_estimators": 5000,
    "learning_rate": 0.02,
    "num_leaves": 255,
    "max_depth": -1,
    "min_data_in_leaf": 20,
    "colsample_bytree": 0.4,
    "subsample": 0.95,
    "bagging_freq": 1,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "extra_trees": False,
    "max_bin": 127,
    # GPU training is currently disabled; re-enable by uncommenting:
    # 'device': 'gpu',
    # 'gpu_use_dp': False,
    # 'gpu_device_id': 0,
    "boost_from_average": True,
    "reg_sqrt": True,
    "objective": "mae",
    "metric": "mae",
    "verbose": -1,
    "seed": SEED2,
    "min_data_per_group": 10,
    "cat_l2": 10,
    "cat_smooth": 10,
    "num_threads": 16,
}

boosters = []
val_preds = []
for k in range(num_targets):
    booster = lgb.train(
        params1,
        train_sets[k],
        valid_sets=[valid_sets[k]],
        early_stopping_rounds=200,
        verbose_eval=50,
    )
    pred = booster.predict(X_vl)
    print(mae(y_vl[:, k], pred))
    boosters.append(booster)
    val_preds.append(pred)

bst1, bst2, bst3, bst4 = boosters
pred21, pred22, pred23, pred24 = val_preds

# Columns of preds_2 follow the target order of y_vl.
preds_2 = np.vstack(val_preds).T
# The original message had a stray "f" inside the f-string, printing e.g. "-> f0.8870".
print(f"Overall score for params 2 -> {mae(y_vl, preds_2):6.4f}")

# NOTE: the ".pkl" suffix is kept because get_preds() loads these exact paths via
# lgb.Booster(model_file=...); the files are written by Booster.save_model, not pickle.
for k, booster in enumerate(boosters, start=1):
    booster.save_model(f"artifacts/bst{k}_train_v402_2.pkl")

for k, pred in enumerate(val_preds, start=1):
    np.save(f"data/lgb_t{k}_logv402_skip10_2.npy", pred)

Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.983373
[100]	valid_0's l1: 0.934525
[150]	valid_0's l1: 0.915653
[200]	valid_0's l1: 0.908637
[250]	valid_0's l1: 0.903885
[300]	valid_0's l1: 0.901091
[350]	valid_0's l1: 0.899427
[400]	valid_0's l1: 0.898176
[450]	valid_0's l1: 0.897821
[500]	valid_0's l1: 0.89714
[550]	valid_0's l1: 0.896778
[600]	valid_0's l1: 0.89623
[650]	valid_0's l1: 0.895597
[700]	valid_0's l1: 0.895021
[750]	valid_0's l1: 0.893831
[800]	valid_0's l1: 0.893006
[850]	valid_0's l1: 0.892819
[900]	valid_0's l1: 0.89213
[950]	valid_0's l1: 0.891513
[1000]	valid_0's l1: 0.891239
[1050]	valid_0's l1: 0.89072
[1100]	valid_0's l1: 0.890075
[1150]	valid_0's l1: 0.889427
[1200]	valid_0's l1: 0.889048
[1250]	valid_0's l1: 0.888425
[1300]	valid_0's l1: 0.888265
[1350]	valid_0's l1: 0.887853
[1400]	valid_0's l1: 0.887697
[1450]	valid_0's l1: 0.887461
[1500]	valid_0's l1: 0.887388
[1550]	valid_0's l1: 0.887214
[1600]	valid_0's l1: 0.887251
[1

In [3]:
import joblib
from src.nn_utils import RandomData, LitModel
import torch
from torch.utils.data import DataLoader as TDataLoader

def get_preds(X, version='401_1'):
    """Predict all four targets with the saved LightGBM boosters of one version.

    Loads ``artifacts/bst{1..4}_train_v{version}.pkl`` in order, predicts on ``X``
    with each booster, and returns the predictions stacked column-wise, i.e. an
    array with one row per row of ``X`` and one column per booster.
    """
    per_target = [
        lgb.Booster(model_file=f'artifacts/bst{target}_train_v{version}.pkl').predict(X)
        for target in range(1, 5)
    ]
    return np.vstack(per_target).T

def _get_preds(model, loader, device='cuda'):
    model.eval()
    model.to(device)
    out = []
    with torch.no_grad():
        for batch in loader:
            x, y = batch
            x = x.to(device)
            yhat = torch.square(10*model(x))
            out.append(yhat)
    return torch.cat(out, 0).cpu().numpy()


def get_nn_preds(X_vl, weights=('artifacts/epoch=35-val_loss=1.1030.ckpt', 'artifacts/epoch=45-val_loss=1.1023.ckpt'), scaler=None):
    """Average neural-network predictions over one or more checkpoints.

    Parameters
    ----------
    X_vl : array-like
        Feature matrix to predict on. The network is built with ``IN_F = 639``
        inputs, so ``X_vl`` is expected to have that many columns.
    weights : str or iterable of str
        Checkpoint path(s). Each file must contain a ``"state_dict"`` entry. A
        single path may be given as a plain string.
    scaler : object, optional
        Forwarded unchanged to ``RandomData``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the per-checkpoint predictions from ``_get_preds``.

    Raises
    ------
    ValueError
        If ``weights`` is empty (the mean over zero checkpoints is undefined).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(weights, str):
        weights = [weights]
    weights = list(weights)
    if not weights:
        raise ValueError("get_nn_preds needs at least one checkpoint path in `weights`")

    BATCH_SIZE = 4096
    NUM_WORKERS = 8
    IN_F = 639
    D = 300
    P = 0.1
    LR = 0.01
    WD = 0.001
    GRAD_AVG = True
    STEPS = [20, 40, 60]
    GAMMA = 0.2
    model = LitModel(IN_F, 4, D, P, lr=LR, wd=WD, grad_avg=GRAD_AVG, steps=STEPS, gamma=GAMMA)
    # Dummy all-ones labels: the loader must yield (x, y) pairs, but y is ignored at inference.
    vl_ds = RandomData(X_vl, labels=np.ones(shape=(len(X_vl), 4)), scaler=scaler)
    te_dl = TDataLoader(vl_ds, batch_size=BATCH_SIZE*2, drop_last=False, shuffle=False, num_workers=NUM_WORKERS)
    preds_nn = []
    for file in weights:
        # Load onto CPU first so the checkpoint does not depend on the device it was
        # saved from; _get_preds moves the model to the inference device afterwards.
        checkpoint = torch.load(str(file), map_location="cpu")
        # NOTE(review): strict=False silently ignores missing/unexpected keys, so a
        # checkpoint from a different architecture would load without complaint.
        model.load_state_dict(checkpoint["state_dict"], strict=False)
        preds = _get_preds(model, te_dl, device='cuda')
        preds_nn.append(preds)

    return np.mean(preds_nn, 0)

In [4]:
# NOTE(review): despite the "vl" names, these load the *training* feature matrices
# (X_tr_v203_f1 / X_tr_v203_f2) and the training index (tr_index_smallv03), i.e. the
# "v03" files rather than the "v02" files written earlier in this notebook — confirm
# this is intended. This also rebinds vl_index and y_vl.
X_vl1 = np.load("data/X_tr_v203_f1.npy")
X_vl2 = np.load("data/X_tr_v203_f2.npy")
vl_index = pd.read_csv("data/tr_index_smallv03.csv")
y_vl = vl_index[TARGETS].values

In [5]:
# Shape of the index rows whose date value is >= 20210501.
vl_index[vl_index.date >= 20210501].shape

(92586, 7)

In [84]:
# Boolean mask over vl_index: date 20210427 and playerId 656669.
cond = (vl_index.date == 20210427) & (vl_index.playerId == 656669)

In [119]:
# Select the index row(s) for date 20210427 / playerId 656669 and compute their
# features with the second test-time pipeline.
cond = (vl_index.date == 20210427) & (vl_index.playerId == 656669)
tmp_index = vl_index.loc[cond]
X_tmp = feature_pipeline_te2.transform(tmp_index)

In [125]:
# LightGBM predictions (model version 403_2) for the rows in X_tmp.
tpreds = get_preds(X_tmp, '403_2')
tpreds

array([[1.60236123, 2.62685046, 0.04648284, 1.25399516]])

In [129]:
# Neural-network predictions for the rows in X_tmp from a single checkpoint, using
# the scaler loaded from artifacts/min_max_scaler_v202_2.pkl.
tpreds = get_nn_preds(X_tmp, weights=['artifacts/nn_v202_2_8452.ckpt'],
                         scaler=joblib.load('artifacts/min_max_scaler_v202_2.pkl'))
tpreds

array([[1.0752183 , 2.3611927 , 0.02461948, 1.2032437 ]], dtype=float32)

In [85]:
# LightGBM version 401_1 on feature matrix X_vl1, rows selected by cond; cell value is the MAE.
preds_401_1 = get_preds(X_vl1[cond], '401_1')
mae(y_vl[cond], preds_401_1)

0.22969782241156145

In [86]:
preds_401_1

array([[1.5205035 , 3.03167104, 0.06904988, 1.31427229]])

In [87]:
# LightGBM version 401_2 on feature matrix X_vl2, rows selected by cond; cell value is the MAE.
preds_401_2 = get_preds(X_vl2[cond], '401_2')
mae(y_vl[cond], preds_401_2)

0.404371536827785

In [88]:
preds_401_2

array([[1.29328925, 2.34659451, 0.05047757, 1.08980788]])

In [89]:
# LightGBM version 402_1 on feature matrix X_vl1, rows selected by cond; cell value is the MAE.
preds_402_1 = get_preds(X_vl1[cond], '402_1')
mae(y_vl[cond], preds_402_1)

0.5024855578740295

In [90]:
preds_402_1

array([[1.48854037, 2.63366135, 0.08542706, 0.57058658]])

In [91]:
# LightGBM version 402_2 on feature matrix X_vl2, rows selected by cond; cell value is the MAE.
preds_402_2 = get_preds(X_vl2[cond], '402_2')
mae(y_vl[cond], preds_402_2)

0.46269321083299497

In [92]:
preds_402_2

array([[1.05146596, 2.31498275, 0.06072585, 0.63606137]])

In [93]:
# Per-target MAE of the 402_2 LightGBM predictions on the rows selected by cond,
# one line per target column in order.
for k in range(4):
    print(mae(preds_402_2[:, k], y_vl[cond, k]))


0.10831209056372237
0.7155862116839504
0.19010250068828388
0.8367720403960232


In [94]:
# LightGBM version 403_1 on feature matrix X_vl1, rows selected by cond; cell value is the MAE.
preds_403_1 = get_preds(X_vl1[cond], '403_1')
mae(y_vl[cond], preds_403_1)

0.2586609672717792

In [95]:
preds_403_1

array([[1.56156573, 3.04493518, 0.066376  , 1.25541997]])

In [96]:
# LightGBM version 403_2 on feature matrix X_vl2, rows selected by cond; cell value is the MAE.
preds_403_2 = get_preds(X_vl2[cond], '403_2')
mae(y_vl[cond], preds_403_2)

0.43232651106605985

In [97]:
# NN ensemble 1: mean of two nn_v201_1 checkpoints on X_vl1 rows selected by cond,
# with the scaler from min_max_scaler_v2.pkl; cell value is the MAE.
preds_nn1 = get_nn_preds(X_vl1[cond], weights=['artifacts/nn_v201_1_1p1023.ckpt', 'artifacts/nn_v201_1_1p1034.ckpt'],
                         scaler=joblib.load('artifacts/min_max_scaler_v2.pkl'))
mae(y_vl[cond], preds_nn1)

0.33593089739453474

In [98]:
preds_nn1

array([[0.9835156 , 2.606041  , 0.03456144, 0.8102665 ]], dtype=float32)

In [99]:
# NN ensemble 2: mean of two nn_v201_2 checkpoints on X_vl2 rows selected by cond,
# with the scaler from min_max_scaler_v3.pkl; cell value is the MAE.
preds_nn2 = get_nn_preds(X_vl2[cond], weights=['artifacts/nn_v201_2_1p1054.ckpt', 'artifacts/nn_v201_2_1p1071.ckpt'],
                         scaler=joblib.load('artifacts/min_max_scaler_v3.pkl'))
mae(y_vl[cond], preds_nn2)

0.4457592854081336

In [100]:
# NN ensemble 3: mean of two nn_v202_1 checkpoints on X_vl1 rows selected by cond,
# with the scaler from min_max_scaler_v202_1.pkl; cell value is the MAE.
preds_nn3 = get_nn_preds(X_vl1[cond], weights=['artifacts/nn_v202_1_8484.ckpt', 'artifacts/nn_v202_1_8502.ckpt'],
                        scaler=joblib.load('artifacts/min_max_scaler_v202_1.pkl'))
mae(y_vl[cond], preds_nn3)

0.3730482715241281

In [101]:
# NN ensemble 4: mean of two nn_v202_2 checkpoints on X_vl2 rows selected by cond,
# with the scaler from min_max_scaler_v202_2.pkl; cell value is the MAE.
preds_nn4 = get_nn_preds(X_vl2[cond], weights=['artifacts/nn_v202_2_8415.ckpt', 'artifacts/nn_v202_2_8452.ckpt'],
                         scaler=joblib.load('artifacts/min_max_scaler_v202_2.pkl'))
mae(y_vl[cond], preds_nn4)

0.4942892586617413

In [102]:
# Per-target MAE of the preds_nn4 predictions on the rows selected by cond,
# one line per target column in order.
for k in range(4):
    print(mae(preds_nn4[:, k], y_vl[cond, k]))


0.09530032737683747
0.8690530118388486
0.2288091863944397
0.7839945090368394


In [103]:
from scipy.stats import gmean, hmean

In [104]:
# Simple-average ensemble. Each LightGBM prediction appears once and each NN
# prediction twice, so the NN models carry double weight in the mean.
ensemble_members = [
    preds_401_1,
    preds_401_2,
    preds_402_1,
    preds_402_2,
    preds_403_1,
    preds_403_2,
    preds_nn1,
    preds_nn1,
    preds_nn2,
    preds_nn2,
    preds_nn3,
    preds_nn3,
    preds_nn4,
    preds_nn4,
]
preds_mean = np.mean(ensemble_members, axis=0)
mae(y_vl[cond], preds_mean)

0.3769520434512544

In [105]:
preds_mean

array([[1.12946938, 2.55800561, 0.04292508, 0.83180737]])

In [35]:
preds_401_1

array([[5.31789843e-01, 7.65647447e+00, 5.98406365e-01, 1.02882387e+01],
       [3.70787573e-03, 3.31678870e-01, 1.73157094e-02, 6.76764033e-01],
       [6.67933057e-04, 1.92282070e+00, 1.02099941e-02, 8.51832823e-01],
       ...,
       [9.98481262e-05, 3.80502423e-01, 2.94933294e-03, 5.36500579e-01],
       [1.66230585e-03, 1.19416076e+00, 3.85852277e-03, 1.00327400e+00],
       [9.69871691e-04, 2.11688193e-01, 1.81770919e-02, 2.51616777e-01]])