In [1]:
import json
import hydra
from omegaconf import OmegaConf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.pipeline import make_union, make_pipeline

from mllib.transformers import *
from src.pipelines.artifacts import ParsePlayerData
from src.utils.utils import print_config

In [2]:
# Configuration for building the train / validation player index.
# Input files (paths are relative to the working directory).
TRAIN_FILE = "data/train.csv"
PLAYERS_FILE = "data/players.csv"

# Rows with date >= this value form the validation split; earlier rows are
# training data. (presumably a YYYYMMDD integer -- confirm against train.csv)
VAL_START_DATE = 2021_04_15

# Target columns; passed to ParsePlayerData when the index is built.
TARGETS = [
    "target1",
    "target2",
    "target3",
    "target4",
]


In [3]:
# Load the raw training table and split it at VAL_START_DATE:
# strictly earlier dates go to `tr`, the cutoff date and later go to `val`.
raw_data = pd.read_csv(TRAIN_FILE)

before_cutoff = raw_data["date"] < VAL_START_DATE
from_cutoff = raw_data["date"] >= VAL_START_DATE

tr = raw_data.loc[before_cutoff]
val = raw_data.loc[from_cutoff]

# Shapes of the full table and of the validation slice.
print(raw_data.shape, val.shape)

(1216, 12) (16, 12)


In [4]:
from src.pipelines.artifacts import ParseJsonField

In [5]:
# parser = ParseJsonField('date', 'rosters', use_cols=['status', 'statusCode'])
# roster_df = parser.transform(raw_data)

In [6]:
# Restrict the roster to players flagged for the test set / future predictions.
roster_2021 = pd.read_csv(PLAYERS_FILE)
roster_2021 = roster_2021.loc[roster_2021["playerForTestSetAndFuturePreds"].eq(True)]
# Compared as strings because the parsed index is filtered against str ids.
eligible_ids = roster_2021["playerId"].astype(str)

# Project transformer configured with the "nextDayPlayerEngagement" field and the
# target names (presumably it expands that field into per-player rows -- confirm).
target_enc = ParsePlayerData("nextDayPlayerEngagement", TARGETS)


def _player_index(frame):
    """Run `target_enc.fit_transform` on `frame`, move the index into columns,
    and keep only rows whose playerId belongs to the filtered roster."""
    parsed = target_enc.fit_transform(frame).reset_index(drop=False)
    return parsed.loc[parsed["playerId"].isin(eligible_ids)]


tr_index = _player_index(tr)
# Unfinished idea kept from the original cell:
# tr_index['debutdate'] = tr_index.map()
vl_index = _player_index(val)

# Persist both indexes so later cells can reload them from disk.
tr_index.to_csv("data/tr_index_small.csv", index=False)
vl_index.to_csv("data/vl_index_small.csv", index=False)


100%|██████████| 1200/1200 [00:17<00:00, 67.85it/s]
100%|██████████| 16/16 [00:00<00:00, 64.58it/s]


In [7]:
# Reload the saved train / validation indexes from disk (pandas re-infers the
# column dtypes on read), then display their shapes.
tr_index, vl_index = map(
    pd.read_csv,
    ("data/tr_index_small.csv", "data/vl_index_small.csv"),
)
(tr_index.shape, vl_index.shape)

((1424400, 6), (18992, 6))

In [17]:
device = 'gpu'
artifacts_path = 'data/artifacts/tmp'


def _build_feature_pipeline():
    """Assemble the feature union of time-series transformers.

    Every transformer is constructed with the same leading arguments
    ('date', 'playerId', <column indices>, <artifact file>, 'data/') plus
    N / skip / device keywords. The order in which transformers are appended
    fixes the column order of the resulting feature matrix, so it must not be
    changed without also updating any positional feature indices used later.
    The meaning of N and skip is defined by the mllib transformers and is not
    visible in this notebook.
    """
    # Artifact files the transformers are pointed at.
    plscores1 = f'{artifacts_path}/train_plscores1.pkl'
    plscores2 = f'{artifacts_path}/train_plscores2.pkl'
    plscores3 = f'{artifacts_path}/train_plscores3.pkl'
    plscores4 = f'{artifacts_path}/train_plscores4.pkl'
    plscores5 = f'{artifacts_path}/train_plscores5.pkl'
    target_hist = f'{artifacts_path}/train_targets.pkl'
    awards = f'{artifacts_path}/train_awards.pkl'
    transactions = f'{artifacts_path}/train_transactions.pkl'
    rosters = f'{artifacts_path}/train_rosters.pkl'

    def ts(kind, cols, artifact, N, skip=0, **extra):
        # One transformer instance; each gets its own fresh list of column indices.
        return kind('date', 'playerId', list(cols), artifact, 'data/',
                    N=N, skip=skip, device=device, **extra)

    def quads(count):
        # Consecutive 4-wide index blocks: [0..3], [4..7], ...
        return [range(4 * b, 4 * (b + 1)) for b in range(count)]

    lags = (1, 2)
    windows = (10, 30, 300)
    mean_then_sum = (ExpandingMean, ExpandingSum)
    steps = []

    # --- plscores3: two lags, then two expanding sums over columns 0..3 ---
    steps += [ts(LagN, range(4), plscores3, N=lag) for lag in lags]
    steps += [ts(ExpandingSum, range(4), plscores3, N=win) for win in (30, 300)]

    # --- plscores1: per lag, columns 0..3 followed by column 4 on its own ---
    for lag in lags:
        steps.append(ts(LagN, range(4), plscores1, N=lag))
        steps.append(ts(LagN, [4], plscores1, N=lag))

    # --- plscores2: six 4-column blocks ---
    steps += [ts(LagN, blk, plscores2, N=lag) for lag in lags for blk in quads(6)]
    for kind in mean_then_sum:
        steps += [ts(kind, blk, plscores2, N=win) for win in windows for blk in quads(6)]

    # --- plscores4: three 4-column blocks ---
    steps += [ts(LagN, blk, plscores4, N=lag) for lag in lags for blk in quads(3)]
    for kind in mean_then_sum:
        steps += [ts(kind, blk, plscores4, N=win) for win in windows for blk in quads(3)]

    # --- plscores4: remaining columns 12..14 ---
    steps += [ts(LagN, [12, 13, 14], plscores4, N=lag) for lag in lags]
    for kind in mean_then_sum:
        steps += [ts(kind, [12, 13, 14], plscores4, N=win) for win in windows]

    # --- plscores5: five 4-column blocks plus column 20, lag 1 only ---
    steps += [ts(LagN, blk, plscores5, N=1) for blk in quads(5)]
    steps.append(ts(LagN, [20], plscores5, N=1))

    # --- target history (skip=3 for all of these) ---
    steps += [ts(ExpandingMean, range(0, 4), target_hist, N=win, skip=3)
              for win in (10, 28, 365, 1500)]
    steps += [ts(kind, range(0, 4), target_hist, N=28, skip=3)
              for kind in (ExpandingMax, ExpandingQ75, ExpandingQ25, ExpandingQ05)]

    # --- awards / transactions / rosters (explicit NaN fill value) ---
    steps += [
        ts(LagN, [0], awards, N=1, fill_value=np.nan),
        ts(ExpandingCount, [0], awards, N=365, fill_value=np.nan),
        ts(LagN, [0], transactions, N=1, fill_value=np.nan),
        ts(LagN, [0, 1], rosters, N=1, fill_value=np.nan),
    ]

    return make_union(*steps, verbose=10)


feature_pipeline = _build_feature_pipeline()

In [18]:
%%time
# Build the training feature matrix from the training index.
# The union is applied with transform() only; no fit() call appears in this
# notebook. NOTE(review): presumably the transformers load precomputed state
# from the artifact files passed at construction -- confirm.
X_tr1 = feature_pipeline.transform(tr_index)

Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied 

In [19]:
%%time
# Build the validation feature matrix with the same feature union that was
# applied to the training index, so both matrices share one column layout.
X_vl1 = feature_pipeline.transform(vl_index)

Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied time series transformer
Applied 

In [20]:
# Sanity check: (rows, feature columns) of the training feature matrix.
X_tr1.shape

(1424400, 396)

In [21]:
import lightgbm as lgb

# Label matrices: one column per target (same names as TARGETS in the config cell).
targets = ['target1', 'target2', 'target3', 'target4']
y_tr = tr_index[targets].values
y_vl = vl_index[targets].values
# print(np.unique(X_tra[:, 235]))

# One training Dataset per target column, all sharing the same feature matrix.
# NOTE(review): categorical_feature uses positional indices 13..16. If every
# transformer in the feature union emits one column per selected input column
# (which adds up to the 396 columns reported by X_tr1.shape), those positions
# are three ExpandingSum(N=300) columns and the first LagN(N=1) plscores1
# column -- confirm these are really the intended categorical features.
tr1, tr2, tr3, tr4 = (
    lgb.Dataset(X_tr1, y_tr[:, k], categorical_feature=[13, 14, 15, 16])
    for k in range(4)
)

# Matching validation Datasets, each referencing its own training Dataset.
vl1, vl2, vl3, vl4 = (
    lgb.Dataset(X_vl1, y_vl[:, k], reference=ref)
    for k, ref in enumerate((tr1, tr2, tr3, tr4))
)

# Earlier parameter set, kept for reference only (not used):
#   n_estimators=4000, learning_rate=0.08, num_leaves=31, colsample_bytree=0.3,
#   subsample=0.5, reg_alpha=0.1, reg_lambda=0.1, max_bin=255,
#   objective='mae', metric='mae'

params = {
    # boosting length / step size
    'n_estimators': 4000,
    'learning_rate': 0.05,
    # tree structure
    'num_leaves': 255,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    # column / row sampling
    'colsample_bytree': 0.5,
    'subsample': 0.95,
    'bagging_freq': 1,
    # L1 / L2 regularisation
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'extra_trees': False,
    'max_bin': 127,
    # GPU training is currently disabled:
    # 'device': 'gpu',
    # 'gpu_use_dp': False,
    # 'gpu_device_id': 0,
    'boost_from_average': True,
    'reg_sqrt': True,
    # optimise and report mean absolute error
    'objective': 'mae',
    'metric': 'mae',
    'verbose': -1,
    'seed': 123478659,
    # categorical-feature handling
    'min_data_per_group': 10,
    'cat_l2': 1,
    'cat_smooth': 10,
    'num_threads': 16
}

# Train the booster for the first target column, stopping once the validation
# metric has not improved for 200 rounds and logging every 50 rounds.
# NOTE(review): `early_stopping_rounds` / `verbose_eval` were removed from
# lgb.train in LightGBM 4.0; they need to become callbacks when upgrading.
bst1 = lgb.train(
    params,
    train_set=tr1,
    valid_sets=[vl1],
    early_stopping_rounds=200,
    verbose_eval=50,
)


New categorical_feature is [13, 14, 15, 16]


Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.16975
[100]	valid_0's l1: 1.12823
[150]	valid_0's l1: 1.11685
[200]	valid_0's l1: 1.11377
[250]	valid_0's l1: 1.11337
[300]	valid_0's l1: 1.11277
[350]	valid_0's l1: 1.11186
[400]	valid_0's l1: 1.11161
[450]	valid_0's l1: 1.11103
[500]	valid_0's l1: 1.10969
[550]	valid_0's l1: 1.10909
[600]	valid_0's l1: 1.10883
[650]	valid_0's l1: 1.1086
[700]	valid_0's l1: 1.10856
[750]	valid_0's l1: 1.1084
[800]	valid_0's l1: 1.10836
[850]	valid_0's l1: 1.10819
[900]	valid_0's l1: 1.10815
[950]	valid_0's l1: 1.10796
[1000]	valid_0's l1: 1.10784
[1050]	valid_0's l1: 1.10735
[1100]	valid_0's l1: 1.10664
[1150]	valid_0's l1: 1.10564
[1200]	valid_0's l1: 1.10523
[1250]	valid_0's l1: 1.10512
[1300]	valid_0's l1: 1.1047
[1350]	valid_0's l1: 1.10421
[1400]	valid_0's l1: 1.10407
[1450]	valid_0's l1: 1.10405
[1500]	valid_0's l1: 1.10419
Early stopping, best iteration is:
[1333]	valid_0's l1: 1.10402


In [22]:
# Booster for the second target column: same params, validated on vl2 with
# early stopping after 200 non-improving rounds, logging every 50 rounds.
bst2 = lgb.train(
    params,
    train_set=tr2,
    valid_sets=[vl2],
    early_stopping_rounds=200,
    verbose_eval=50,
)

Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 2.16356
[100]	valid_0's l1: 2.11912
[150]	valid_0's l1: 2.09684
[200]	valid_0's l1: 2.08897
[250]	valid_0's l1: 2.08489
[300]	valid_0's l1: 2.08102
[350]	valid_0's l1: 2.07738
[400]	valid_0's l1: 2.07209
[450]	valid_0's l1: 2.06966
[500]	valid_0's l1: 2.06726
[550]	valid_0's l1: 2.06591
[600]	valid_0's l1: 2.06266
[650]	valid_0's l1: 2.06032
[700]	valid_0's l1: 2.05895
[750]	valid_0's l1: 2.05855
[800]	valid_0's l1: 2.05772
[850]	valid_0's l1: 2.05713
[900]	valid_0's l1: 2.05702
[950]	valid_0's l1: 2.05705
[1000]	valid_0's l1: 2.05689
[1050]	valid_0's l1: 2.05657
[1100]	valid_0's l1: 2.05603
[1150]	valid_0's l1: 2.05529
[1200]	valid_0's l1: 2.05537
[1250]	valid_0's l1: 2.05408
[1300]	valid_0's l1: 2.05401
[1350]	valid_0's l1: 2.0535
[1400]	valid_0's l1: 2.05321
[1450]	valid_0's l1: 2.05337
[1500]	valid_0's l1: 2.05306
[1550]	valid_0's l1: 2.0529
[1600]	valid_0's l1: 2.05253
[1650]	valid_0's l1: 2.05252
[17

In [None]:
# Booster for the third target column: same params, validated on vl3 with
# early stopping after 200 non-improving rounds, logging every 50 rounds.
bst3 = lgb.train(
    params,
    train_set=tr3,
    valid_sets=[vl3],
    early_stopping_rounds=200,
    verbose_eval=50,
)

Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.914136
[100]	valid_0's l1: 0.901449
[150]	valid_0's l1: 0.899079


In [None]:
# Booster for the fourth target column: same params, validated on vl4 with
# early stopping after 200 non-improving rounds, logging every 50 rounds.
bst4 = lgb.train(
    params,
    train_set=tr4,
    valid_sets=[vl4],
    early_stopping_rounds=200,
    verbose_eval=50,
)

In [None]:
from sklearn.metrics import mean_absolute_error as mae

# Predict every target on the validation features, one booster per target.
pred1, pred2, pred3, pred4 = (
    booster.predict(X_vl1) for booster in (bst1, bst2, bst3, bst4)
)

# Arrange predictions as (n_rows, 4) so the columns line up with y_vl.
preds = np.array([pred1, pred2, pred3, pred4]).T

# Single MAE over all four target columns (sklearn averages the per-column errors).
print(mae(y_vl, preds))   # 1.367

In [None]:
# 1.7930
# 1.7708 - added lag of flags
# 1.7368 - added categorical features from box score
# 1.6982 - added batter scores
# 1.6890       - added pitcher scores
# 1.6892   - added remaining features ( :-( )
# 1.6883     - added pitcher lags 
# 1.6810   - added batter lags
# 1.6777   - more pitcher lags
# 1.5774 - added target mean (skip - 30 and last 365)
# change hyperparams - colsamplebytree 0.7 --> 0.4; 1.6814 (reverting)
# 1.534 - changed val to start from 10 April
# 1.517 - added lag3
# 1.5146 - changed hyperparams - colsample to 0.5
# 1.509 - num_leaves 255
# 1.5078  - min_leaf_samples 20
# 1.4884 - added statusCode
# 1.4891 - made it cat
# 1.4795 - removed cat encoding
# 1.4455 - fixed last n expanding mean and added last 10
# 1.4396 - added last 10 stats
# 1.4379 - last 5 innings sum of pitching scores
# 1.4322 - last 5 mean scores
# 1.4278 - expanding mean batter scores
# 1.4257 - expanding mean pitcher scores
# 1.4070 - expanding sum - last 10 scores 
# 1.3687 -- changed validation to last 15 days
# 1.3602 - changed target means from 10 days to 15 days 
# 1.3556 - changed from 15 to 30 days
# 1.355 - changes days a bit
# 1.3499 - changed lr to 0.05
# 1.346 - changed hyperparams

In [24]:
# Persist the four trained boosters, one file per target.
# Booster.save_model writes LightGBM's own text model format, so despite the
# ".pkl" suffix these files are not pickles: reload them with
# lgb.Booster(model_file=...), not pickle.load.
# NOTE(review): the files go to "artifacts/", while the feature artifacts are
# read from artifacts_path = 'data/artifacts/tmp' -- confirm this directory is
# the intended one.
bst1.save_model("artifacts/bst1_train_v2.pkl")
bst2.save_model("artifacts/bst2_train_v2.pkl")
bst3.save_model("artifacts/bst3_train_v2.pkl")
# Last expression of the cell: its return value (the Booster) is what gets displayed.
bst4.save_model("artifacts/bst4_train_v2.pkl")

<lightgbm.basic.Booster at 0x7f6a9cc48d90>

In [38]:
# Display the season calendar table (one row per season, 2017-2021 in the
# output below). The "Unnamed: 0" column in the output is the CSV's unnamed
# first column, read in as ordinary data.
pd.read_csv("data/seasons.csv")

Unnamed: 0,seasonId,seasonStartDate,seasonEndDate,preSeasonStartDate,preSeasonEndDate,regularSeasonStartDate,regularSeasonEndDate,lastDate1stHalf,allStarDate,firstDate2ndHalf,postSeasonStartDate,postSeasonEndDate
0,2017,2017-04-02,2017-11-01,2017-02-22,2017-04-01,2017-04-02,2017-10-01,2017-07-09,2017-07-11,2017-07-14,2017-10-03,2017-11-01
1,2018,2018-03-29,2018-10-28,2018-02-21,2018-03-27,2018-03-29,2018-10-01,2018-07-15,2018-07-17,2018-07-19,2018-10-02,2018-10-28
2,2019,2019-03-20,2019-10-30,2019-02-21,2019-03-26,2019-03-20,2019-09-29,2019-07-07,2019-07-09,2019-07-11,2019-10-01,2019-10-30
3,2020,2020-07-23,2020-10-28,2020-02-21,2020-07-22,2020-07-23,2020-09-27,2020-08-25,,2020-08-26,2020-09-29,2020-10-28
4,2021,2021-02-28,2021-10-31,2021-02-28,2021-03-30,2021-04-01,2021-10-03,2021-07-11,2021-07-13,2021-07-15,2021-10-04,2021-10-31


In [32]:
# TODO
# Running stats on rank per day