In [11]:
"""Train LGB models."""
import sys
import itertools

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error as mae
from sklearn.pipeline import make_pipeline, make_union

from mllib.transformers import (
    DateLagN,
    ExpandingCount,
    ExpandingMean,
    ExpandingSum,
    FunctionTransfomer,
    LagN,
)
from src.constants import (
    TARGETS,
    awards_artifact,
    event_artifact,
    player_twitter_artifact,
    rosters_artifact,
    scores1_mean_artifact,
    scores2_mean_artifact,
    scores3_mean_artifact,
    scores4_mean_artifact,
    scores5_mean_artifact,
    targets_artifact,
    transactions_artifact,
)
from src.feature_gen1 import get_feature_pipeline1
from src.pipelines.artifacts import DataLoader, MapToCol, ParseJsonField



In [12]:
# --- Input data and artifact locations ---
TRAIN_FILE = "data/train_updated.csv"
PLAYERS_FILE = "data/players.csv"
artifacts_path = "data/artifacts/v01"

# --- Train/validation split ---
# Compared against the raw `date` column in the (commented-out) index-building
# code of the next cell; presumably a YYYYMMDD integer -- TODO confirm.
VAL_START_DATE = 20210601

# --- Compute device (upper- and lower-case spellings are both kept) ---
DEVICE = device = "gpu"

# --- Feature caching and row filtering switches ---
SAVE_FEATURES = True       # write feature matrices to .npy
LOAD_FEATURES = False      # read cached .npy matrices instead of running the pipelines
TRAIN_SEASON_ONLY = True   # keep only rows whose last feature column is > 0

# --- Random seeds, one per model set ---
SEED1 = 786
SEED2 = 20201102

In [13]:
# Provenance (kept for reference, not executed): the commented-out code below
# appears to be how the cached index CSVs read at the bottom of this cell were
# originally produced from the raw training file -- TODO confirm before relying
# on it to regenerate them.
#
# raw_data = pd.read_csv(TRAIN_FILE)
# tr = raw_data.loc[raw_data.date < VAL_START_DATE]
# val = raw_data.loc[raw_data.date >= VAL_START_DATE]
# print(raw_data.shape, val.shape)

# roster_2021 = pd.read_csv(PLAYERS_FILE)
# roster_2021 = roster_2021.loc[roster_2021.playerForTestSetAndFuturePreds == True]
# target_enc = ParseJsonField(
#     date_field="date", data_field="nextDayPlayerEngagement", use_cols=TARGETS+['playerId']
# )
# tr_index = target_enc.transform(tr).reset_index(drop=False)
# tr_index = tr_index.loc[tr_index.playerId.isin(roster_2021.playerId.astype(str))]
# del tr

# vl_index = target_enc.transform(val).reset_index(drop=False)
# vl_index = vl_index.loc[vl_index.playerId.isin(roster_2021.playerId.astype(str))]
# del raw_data, val
# # tr_index.to_csv("data/tr_index_smallv01.csv", index=False)
# # vl_index.to_csv("data/vl_index_smallv01.csv", index=False)

# Load the cached train / validation index frames (training file first).
index_files = ("data/tr_index_smallv01.csv", "data/vl_index_smallv01.csv")
tr_index, vl_index = map(pd.read_csv, index_files)
print(tr_index.shape, vl_index.shape)

(1480189, 6) (55789, 6)


In [16]:
# Build the train / test feature pipelines for the first feature set.
# The device comes from the config cell (DEVICE) rather than a hard-coded
# string, so switching devices there actually takes effect here.
# NOTE(review): the three integer lists look like window sizes for the lag /
# expanding features -- confirm against get_feature_pipeline1's signature.
feature_pipeline_tr1, feature_pipeline_te1 = get_feature_pipeline1(
    artifacts_path,
    DEVICE,
    [7, 30, 150, 1500],
    [10, 30, 150],
    [30, 150],
)

In [17]:
# Feature matrices for feature set 1: either load the cached arrays or run the
# pipelines on the index frames. Freshly computed matrices are cached to disk
# when SAVE_FEATURES is set; matrices that were just loaded are not rewritten.
if LOAD_FEATURES:
    X_tr = np.load("data/X_tr_v201_f1.npy")
    X_vl = np.load("data/X_vl_v201_f1.npy")
else:
    X_tr = feature_pipeline_tr1.transform(tr_index)
    X_vl = feature_pipeline_te1.transform(vl_index)
    if SAVE_FEATURES:
        np.save("data/X_tr_v201_f1.npy", X_tr)
        np.save("data/X_vl_v201_f1.npy", X_vl)

# Targets are taken from the index frames, so features and index must line up
# row for row; a stale cache would otherwise pair features with wrong targets.
if X_tr.shape[0] != len(tr_index) or X_vl.shape[0] != len(vl_index):
    raise ValueError(
        "Feature matrix rows do not match the index frames "
        f"(train {X_tr.shape[0]} vs {len(tr_index)}, "
        f"valid {X_vl.shape[0]} vs {len(vl_index)}); cached features may be stale."
    )

y_tr = tr_index[TARGETS].values
y_vl = vl_index[TARGETS].values
print(X_tr.shape, X_vl.shape)

if TRAIN_SEASON_ONLY:
    # Keep only rows whose last feature column is positive.
    # NOTE(review): presumably an in-season indicator -- confirm in feature_gen1.
    tr_mask = X_tr[:, -1] > 0
    X_tr = X_tr[tr_mask]
    y_tr = y_tr[tr_mask]

    vl_mask = X_vl[:, -1] > 0
    X_vl = X_vl[vl_mask]
    y_vl = y_vl[vl_mask]
    print(X_tr.shape, X_vl.shape, y_tr.shape, y_vl.shape)

(1480189, 639) (55789, 639)
(1003015, 639) (55789, 639) (1003015, 4) (55789, 4)


In [None]:
# Train one LightGBM regressor per target column (model set 1), score each on
# the validation split, and persist every model and its validation predictions
# as soon as it is trained so an error later in the cell cannot lose the work.

params1 = {
    "n_estimators": 5000,
    "learning_rate": 0.02,
    "num_leaves": 255,
    "max_depth": -1,
    "min_data_in_leaf": 20,
    "colsample_bytree": 0.5,
    "subsample": 0.95,
    "bagging_freq": 1,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "extra_trees": False,
    "max_bin": 127,
    # 'device': 'gpu',
    # 'gpu_use_dp': False,
    # 'gpu_device_id': 0,
    "boost_from_average": True,
    "reg_sqrt": True,
    "objective": "mae",
    "metric": "mae",
    "verbose": -1,
    "seed": SEED1,
    "min_data_per_group": 10,
    "cat_l2": 10,
    "cat_smooth": 10,
    "num_threads": 16,
}

# One (train, valid) Dataset pair per target column; every validation Dataset
# references its own training Dataset.
train_sets = [lgb.Dataset(X_tr, y_tr[:, k]) for k in range(4)]
valid_sets = [
    lgb.Dataset(X_vl, y_vl[:, k], reference=train_sets[k]) for k in range(4)
]

boosters = []
val_preds = []
for k, (dtrain, dvalid) in enumerate(zip(train_sets, valid_sets), start=1):
    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword
    # arguments were removed from lgb.train in LightGBM 4.0; move to the
    # early_stopping / log_evaluation callbacks when upgrading.
    booster = lgb.train(
        params1,
        dtrain,
        valid_sets=[dvalid],
        early_stopping_rounds=200,
        verbose_eval=50,
    )
    pred = booster.predict(X_vl)
    print(mae(y_vl[:, k - 1], pred))

    # NOTE: Booster.save_model writes LightGBM's own model file, not a pickle,
    # despite the ".pkl" suffix; the paths are left unchanged in case other
    # code loads them by name.
    booster.save_model(f"artifacts/bst{k}_train_v401_1.pkl")
    np.save(f"data/lgb_t{k}_logv401_skip10_1.npy", pred)

    boosters.append(booster)
    val_preds.append(pred)

# Keep the per-target names bound for any cell that refers to them.
tr1, tr2, tr3, tr4 = train_sets
vl1, vl2, vl3, vl4 = valid_sets
bst1, bst2, bst3, bst4 = boosters
pred21, pred22, pred23, pred24 = val_preds

# Stack per-target predictions into an (n_rows, 4) array for the overall MAE.
preds_2 = np.vstack(val_preds).T
print(f"Overall score for params 1 -> {mae(y_vl, preds_2):6.4f}")




Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.07487
[100]	valid_0's l1: 1.01235


In [None]:
# Build the train / test feature pipelines for the second feature set.
# The device comes from the config cell (DEVICE) rather than a hard-coded
# string, so switching devices there actually takes effect here.
# NOTE(review): the three integer lists look like window sizes for the lag /
# expanding features -- confirm against get_feature_pipeline1's signature.
feature_pipeline_tr2, feature_pipeline_te2 = get_feature_pipeline1(
    artifacts_path,
    DEVICE,
    [7, 30, 90, 500],
    [7, 21, 90],
    [21, 90],
)

In [None]:
# Feature matrices for feature set 2: either load the cached arrays or run the
# pipelines on the index frames. Freshly computed matrices are cached to disk
# when SAVE_FEATURES is set; matrices that were just loaded are not rewritten.
if LOAD_FEATURES:
    X_tr = np.load("data/X_tr_v201_f2.npy")
    X_vl = np.load("data/X_vl_v201_f2.npy")
else:
    X_tr = feature_pipeline_tr2.transform(tr_index)
    X_vl = feature_pipeline_te2.transform(vl_index)
    if SAVE_FEATURES:
        np.save("data/X_tr_v201_f2.npy", X_tr)
        np.save("data/X_vl_v201_f2.npy", X_vl)

# Targets are taken from the index frames, so features and index must line up
# row for row; a stale cache would otherwise pair features with wrong targets.
if X_tr.shape[0] != len(tr_index) or X_vl.shape[0] != len(vl_index):
    raise ValueError(
        "Feature matrix rows do not match the index frames "
        f"(train {X_tr.shape[0]} vs {len(tr_index)}, "
        f"valid {X_vl.shape[0]} vs {len(vl_index)}); cached features may be stale."
    )

y_tr = tr_index[TARGETS].values
y_vl = vl_index[TARGETS].values
print(X_tr.shape, X_vl.shape)

if TRAIN_SEASON_ONLY:
    # Keep only rows whose last feature column is positive.
    # NOTE(review): presumably an in-season indicator -- confirm in feature_gen1.
    tr_mask = X_tr[:, -1] > 0
    X_tr = X_tr[tr_mask]
    y_tr = y_tr[tr_mask]

    vl_mask = X_vl[:, -1] > 0
    X_vl = X_vl[vl_mask]
    y_vl = y_vl[vl_mask]
    print(X_tr.shape, X_vl.shape, y_tr.shape, y_vl.shape)

In [None]:
# Train one LightGBM regressor per target column (model set 2), score each on
# the validation split, and persist every model and its validation predictions
# as soon as it is trained so an error later in the cell cannot lose the work.

# params2 differs from params1 in `colsample_bytree` (0.55) and `seed` (SEED2).
params2 = {
    "n_estimators": 5000,
    "learning_rate": 0.02,
    "num_leaves": 255,
    "max_depth": -1,
    "min_data_in_leaf": 20,
    "colsample_bytree": 0.55,
    "subsample": 0.95,
    "bagging_freq": 1,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "extra_trees": False,
    "max_bin": 127,
    # 'device': 'gpu',
    # 'gpu_use_dp': False,
    # 'gpu_device_id': 0,
    "boost_from_average": True,
    "reg_sqrt": True,
    "objective": "mae",
    "metric": "mae",
    "verbose": -1,
    "seed": SEED2,
    "min_data_per_group": 10,
    "cat_l2": 10,
    "cat_smooth": 10,
    "num_threads": 16,
}

# One (train, valid) Dataset pair per target column; every validation Dataset
# references its own training Dataset.
train_sets = [lgb.Dataset(X_tr, y_tr[:, k]) for k in range(4)]
valid_sets = [
    lgb.Dataset(X_vl, y_vl[:, k], reference=train_sets[k]) for k in range(4)
]

boosters = []
val_preds = []
for k, (dtrain, dvalid) in enumerate(zip(train_sets, valid_sets), start=1):
    # This model set must be trained with params2 (not params1), otherwise its
    # colsample_bytree and seed settings are never used.
    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword
    # arguments were removed from lgb.train in LightGBM 4.0; move to the
    # early_stopping / log_evaluation callbacks when upgrading.
    booster = lgb.train(
        params2,
        dtrain,
        valid_sets=[dvalid],
        early_stopping_rounds=200,
        verbose_eval=50,
    )
    pred = booster.predict(X_vl)
    print(mae(y_vl[:, k - 1], pred))

    # NOTE: Booster.save_model writes LightGBM's own model file, not a pickle,
    # despite the ".pkl" suffix; the paths are left unchanged in case other
    # code loads them by name.
    booster.save_model(f"artifacts/bst{k}_train_v401_2.pkl")
    np.save(f"data/lgb_t{k}_logv401_skip10_2.npy", pred)

    boosters.append(booster)
    val_preds.append(pred)

# Keep the per-target names bound for any cell that refers to them.
tr1, tr2, tr3, tr4 = train_sets
vl1, vl2, vl3, vl4 = valid_sets
bst1, bst2, bst3, bst4 = boosters
pred21, pred22, pred23, pred24 = val_preds

# Stack per-target predictions into an (n_rows, 4) array for the overall MAE.
preds_2 = np.vstack(val_preds).T
print(f"Overall score for params 2 -> {mae(y_vl, preds_2):6.4f}")