In [1]:
import json
import hydra
from omegaconf import OmegaConf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.pipeline import make_union, make_pipeline

from mllib.transformers import *
from src.pipelines.artifacts import ParsePlayerData
from src.utils.utils import print_config

In [2]:
# Generate index
TRAIN_FILE = "data/train.csv"
PLAYERS_FILE = "data/players.csv"
VAL_START_DATE = 20210415
TARGETS = ["target1", "target2", "target3", "target4"]


In [None]:
raw_data = pd.read_csv(TRAIN_FILE)
tr = raw_data.loc[raw_data.date < VAL_START_DATE]
val = raw_data.loc[raw_data.date >= VAL_START_DATE]
print(raw_data.shape, val.shape)

In [4]:
from src.pipelines.artifacts import ParseJsonField

In [5]:
parser = ParseJsonField('date', 'rosters', use_cols=['status', 'statusCode'])
roster_df = parser.transform(raw_data)

100%|██████████| 1216/1216 [00:08<00:00, 143.27it/s]


In [6]:
roster_2021 = pd.read_csv(PLAYERS_FILE)
roster_2021 = roster_2021.loc[roster_2021.playerForTestSetAndFuturePreds == True]
target_enc = ParsePlayerData("nextDayPlayerEngagement", TARGETS)
tr_index = target_enc.fit_transform(tr).reset_index(drop=False)
tr_index = tr_index.loc[tr_index.playerId.isin(roster_2021.playerId.astype(str))]
# tr_index['debutdate'] = tr_index.map()
vl_index = target_enc.fit_transform(val).reset_index(drop=False)
vl_index = vl_index.loc[vl_index.playerId.isin(roster_2021.playerId.astype(str))]
tr_index.to_csv("data/tr_index_small.csv", index=False)
vl_index.to_csv("data/vl_index_small.csv", index=False)


100%|██████████| 1195/1195 [00:17<00:00, 69.11it/s]
100%|██████████| 21/21 [00:00<00:00, 66.49it/s]


A      1185608
RM      198985
D60      56561
D10      47279
D7         475
PL         264
SU         251
BRV        157
FME         71
RES          8
DEC          1
Name: statusCode, dtype: int64

In [43]:
tr_index.shape

(2444346, 6)

In [44]:
tr_index = pd.read_csv("data/tr_index_small.csv")
vl_index = pd.read_csv("data/vl_index_small.csv")
tr_index.shape, vl_index.shape

((1418465, 6), (24927, 6))

In [45]:
tr_index.loc[tr_index.target2 == 100]

Unnamed: 0,playerId,target1,target2,target3,target4,date
803,543333,54.108365,100.0,100.000000,51.862745,20180101
1630,543333,100.000000,100.0,100.000000,100.000000,20180102
2820,518397,1.815118,100.0,18.715575,65.434222,20180103
3945,518397,2.968885,100.0,33.840564,38.551258,20180104
7901,609280,0.012035,100.0,0.039505,6.037152,20180107
...,...,...,...,...,...,...
1413205,502624,1.473673,100.0,0.048067,20.067744,20210405
1414394,592791,27.535777,100.0,3.864680,92.858369,20210406
1415688,606213,25.360294,100.0,37.778370,67.678398,20210407
1416338,605397,38.580060,100.0,85.044774,100.000000,20210408


In [46]:
# Start with a simple features whether they batted or pitched or player scores
pipe1 = make_union(LagN('date', 'playerId', [0, 1, 2, 3], 'data/artifacts/full/train_plscores3.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device='gpu'),
 LagN('date', 'playerId', [0, 1, 2, 3], 'data/artifacts/full/train_plscores3.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device='gpu'))

pipe2 = make_union(LagN('date', 'playerId', [0, 1, 2, 3], 'data/artifacts/full/train_plscores3.pkl', 'data/', fill_value=np.nan, N=2, skip=0, device='gpu'),
 LagN('date', 'playerId', [0, 1, 2, 3], 'data/artifacts/full/train_plscores3.pkl', 'data/', fill_value=np.nan, N=2, skip=0, device='gpu'))


X_tr11 = pipe1.transform(tr_index)
X_vl11 = pipe1.transform(vl_index)

X_tr12 = pipe2.transform(tr_index)
X_vl12 = pipe2.transform(vl_index)

In [47]:
pipe3 = make_union(LagN('date', 'playerId', [0, 1, 2, 3], 'data/artifacts/full/train_plscores1.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device='gpu'),
                     LagN('date', 'playerId', [4], 'data/artifacts/full/train_plscores1.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device='gpu'))
X_tr13 = pipe3.transform(tr_index)
X_vl13 = pipe3.transform(vl_index)

In [48]:
from sklearn.pipeline import make_union
pipe4 = make_union(
    *[
        LagN('date', 'playerId', list(range(i*4, (i+1)*4)), 'data/artifacts/full/train_plscores2.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device='gpu')
        for i in range(6)
     ],
    verbose=True
)
X_tr14 = pipe4.transform(tr_index)
X_vl14 = pipe4.transform(vl_index)

In [49]:
pipe41 = make_union(
    *[
        LagN('date', 'playerId', list(range(i*4, (i+1)*4)), 'data/artifacts/full/train_plscores2.pkl', 'data/', fill_value=np.nan, N=2, skip=0, device='gpu')
        for i in range(6)
     ],
    verbose=True
)
X_tr141 = pipe41.transform(tr_index)
X_vl141 = pipe41.transform(vl_index)

In [50]:
pipe42 = make_union(
    *[
        ExpandingMean('date', 'playerId', list(range(i*4, (i+1)*4)), 'data/artifacts/full/train_plscores2.pkl', 'data/', fill_value=np.nan, N=100, skip=0, device='gpu')
        for i in range(6)
     ],
    verbose=True
)
X_tr142 = pipe42.transform(tr_index)
X_vl142 = pipe42.transform(vl_index)

In [51]:
pipe43 = make_union(
    *[
        ExpandingMean('date', 'playerId', list(range(i*4, (i+1)*4)), 'data/artifacts/full/train_plscores2.pkl', 'data/', fill_value=np.nan, N=5, skip=0, device='gpu')
        for i in range(6)
     ],
    verbose=True
)
X_tr143 = pipe43.transform(tr_index)
X_vl143 = pipe43.transform(vl_index)

In [None]:
pipe44 = make_union(
    *[
        ExpandingSum('date', 'playerId', list(range(i*4, (i+1)*4)), 'data/artifacts/full/train_plscores2.pkl', 'data/', fill_value=np.nan, N=10, skip=0, device='gpu')
        for i in range(6)
     ],
    verbose=True
)
X_tr144 = pipe44.transform(tr_index)
X_vl144 = pipe44.transform(vl_index)

In [None]:
pipe5 = make_union(
    *[
        LagN('date', 'playerId', list(range(i*4, (i+1)*4)), 'data/artifacts/full/train_plscores4.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device='gpu')
        for i in range(3)
     ],
    LagN('date', 'playerId', [12, 13, 14], 'data/artifacts/full/train_plscores4.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device='gpu'),
    verbose=True
)
X_tr15 = pipe5.transform(tr_index)
X_vl15 = pipe5.transform(vl_index)

In [None]:
pipe6 = make_union(
    *[
        LagN('date', 'playerId', list(range(i*4, (i+1)*4)), 'data/artifacts/full/train_plscores4.pkl', 'data/', fill_value=np.nan, N=2, skip=0, device='gpu')
        for i in range(3)
     ],
    LagN('date', 'playerId', [12, 13, 14], 'data/artifacts/full/train_plscores4.pkl', 'data/', fill_value=np.nan, N=2, skip=0, device='gpu'),
    verbose=True
)
X_tr16 = pipe6.transform(tr_index)
X_vl16 = pipe6.transform(vl_index)

In [None]:
pipe62 = make_union(
    *[
        LagN('date', 'playerId', list(range(i*4, (i+1)*4)), 'data/artifacts/full/train_plscores4.pkl', 'data/', fill_value=np.nan, N=3, skip=0, device='gpu')
        for i in range(3)
     ],
    LagN('date', 'playerId', [12, 13, 14], 'data/artifacts/full/train_plscores4.pkl', 'data/', fill_value=np.nan, N=3, skip=0, device='gpu'),
    verbose=True
)
X_tr162 = pipe62.transform(tr_index)
X_vl162 = pipe62.transform(vl_index)

In [None]:
pipe63 = make_union(
    *[
        ExpandingMean('date', 'playerId', list(range(i*4, (i+1)*4)), 'data/artifacts/full/train_plscores4.pkl', 'data/', fill_value=np.nan, N=5, skip=0, device='gpu')
        for i in range(3)
     ],
    ExpandingMean('date', 'playerId', [12, 13, 14], 'data/artifacts/full/train_plscores4.pkl', 'data/', fill_value=np.nan, N=5, skip=0, device='gpu'),
    verbose=True
)
X_tr163 = pipe63.transform(tr_index)
X_vl163 = pipe63.transform(vl_index)

In [None]:
pipe64 = make_union(
    *[
        ExpandingMean('date', 'playerId', list(range(i*4, (i+1)*4)), 'data/artifacts/full/train_plscores4.pkl', 'data/', fill_value=np.nan, N=100, skip=0, device='gpu')
        for i in range(3)
     ],
    ExpandingMean('date', 'playerId', [12, 13, 14], 'data/artifacts/full/train_plscores4.pkl', 'data/', fill_value=np.nan, N=100, skip=0, device='gpu'),
    verbose=True
)
X_tr164 = pipe64.transform(tr_index)
X_vl164 = pipe64.transform(vl_index)

In [None]:
pipe65 = make_union(
    *[
        ExpandingSum('date', 'playerId', list(range(i*4, (i+1)*4)), 'data/artifacts/full/train_plscores4.pkl', 'data/', fill_value=np.nan, N=10, skip=0, device='gpu')
        for i in range(3)
     ],
    ExpandingSum('date', 'playerId', [12, 13, 14], 'data/artifacts/full/train_plscores4.pkl', 'data/', fill_value=np.nan, N=10, skip=0, device='gpu'),
    verbose=True
)
X_tr165 = pipe65.transform(tr_index)
X_vl165 = pipe65.transform(vl_index)

In [None]:
pipe66 = make_union(
    *[
        ExpandingSum('date', 'playerId', list(range(i*4, (i+1)*4)), 'data/artifacts/full/train_plscores4.pkl', 'data/', fill_value=np.nan, N=200, skip=0, device='gpu')
        for i in range(3)
     ],
    ExpandingSum('date', 'playerId', [12, 13, 14], 'data/artifacts/full/train_plscores4.pkl', 'data/', fill_value=np.nan, N=200, skip=0, device='gpu'),
    verbose=True
)
X_tr166 = pipe66.transform(tr_index)
X_vl166 = pipe66.transform(vl_index)

In [None]:
pipe7 = make_union(
    *[
        LagN('date', 'playerId', list(range(i*4, (i+1)*4)), 'data/artifacts/full/train_plscores5.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device='gpu')
        for i in range(5)
     ],
    LagN('date', 'playerId', [20], 'data/artifacts/full/train_plscores5.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device='gpu'),
    verbose=True
)
X_tr17 = pipe7.transform(tr_index)
X_vl17 = pipe7.transform(vl_index)

In [None]:
# pipe71 = make_union(
#     *[
#         ExpandingSum('date', 'playerId', list(range(i*4, (i+1)*4)), 'data/artifacts/full/train_plscores5.pkl', 'data/', fill_value=np.nan, N=100, skip=0, device='gpu')
#         for i in range(5)
#      ],
#     ExpandingSum('date', 'playerId', [20], 'data/artifacts/full/train_plscores5.pkl', 'data/', fill_value=np.nan, N=100, skip=0, device='gpu'),
#     verbose=True
# )
# X_tr171 = pipe71.transform(tr_index)
# X_vl171 = pipe71.transform(vl_index)

In [None]:
import gc
gc.collect()

In [None]:
pipe8 = make_union(
        ExpandingMean('date', 'playerId', list(range(0, 4)), 'data/artifacts/full/tr_targets.pkl', 'data/', fill_value=np.nan, N=10, skip=3, device='gpu'), 
        ExpandingMax('date', 'playerId', list(range(0, 4)), 'data/artifacts/full/tr_targets.pkl', 'data/', fill_value=np.nan, N=10, skip=3, device='gpu'), 
        ExpandingQ75('date', 'playerId', list(range(0, 4)), 'data/artifacts/full/tr_targets.pkl', 'data/', fill_value=np.nan, N=10, skip=3, device='gpu'), 
        ExpandingQ25('date', 'playerId', list(range(0, 4)), 'data/artifacts/full/tr_targets.pkl', 'data/', fill_value=np.nan, N=10, skip=3, device='gpu'), 
        ExpandingQ05('date', 'playerId', list(range(0, 4)), 'data/artifacts/full/tr_targets.pkl', 'data/', fill_value=np.nan, N=10, skip=3, device='gpu'), 
        ExpandingMean('date', 'playerId', list(range(0, 4)), 'data/artifacts/full/tr_targets.pkl', 'data/', fill_value=np.nan, N=365, skip=3, device='gpu'), 

    ExpandingMean('date', 'playerId', list(range(0, 4)), 'data/artifacts/full/tr_targets.pkl', 'data/', fill_value=np.nan, N=1500, skip=3, device='gpu'), 
        #ExpandingMedian('date', 'playerId', list(range(0, 4)), 'data/artifacts/full/tr_targets.pkl', 'data/', fill_value=np.nan, N=1500, skip=3, device='gpu'), 
        #LagN('date', 'playerId', [0, 1], 'data/artifacts/full/tr_targets.pkl', 'data/', fill_value=np.nan, N=1, skip=3, device='gpu'),
        #LagN('date', 'playerId', [2, 3], 'data/artifacts/full/tr_targets.pkl', 'data/', fill_value=np.nan, N=1, skip=3, device='gpu'),
        #LagN('date', 'playerId', [0, 1], 'data/artifacts/full/tr_targets.pkl', 'data/', fill_value=np.nan, N=1, skip=4, device='gpu'),
        #LagN('date', 'playerId', [2, 3], 'data/artifacts/full/tr_targets.pkl', 'data/', fill_value=np.nan, N=1, skip=4, device='gpu'),

    # ExpandingMedian('date', 'playerId', list(range(0, 4)), 'data/artifacts/full/train_targets.pkl', 'data/', fill_value=np.nan, N=365, skip=30, device='gpu')
    #,
    verbose=True
)
X_tr18 = pipe8.transform(tr_index)
X_vl18 = pipe8.transform(vl_index)

In [28]:
pipe9 = make_union(
        LagN('date', 'playerId', [0], 'data/artifacts/full/train_awards.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device='gpu'),
        ExpandingCount('date', 'playerId', [0], 'data/artifacts/full/train_awards.pkl', 'data/', fill_value=np.nan, N=365, skip=0, device='gpu'),
        LagN('date', 'playerId', [0], 'data/artifacts/full/train_transactions.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device='gpu'),
        LagN('date', 'playerId', [0], 'data/artifacts/full/train_rosters.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device='gpu'),
        #ExpandingCount('date', 'playerId', [0], 'data/artifacts/full/train_rosters.pkl', 'data/', fill_value=np.nan, N=1500, skip=0, device='gpu'),

        #LagN('date', 'playerId', [0], 'data/artifacts/full/train_rosters.pkl', 'data/', fill_value=np.nan, N=2, skip=0, device='gpu'),

    # ExpandingMedian('date', 'playerId', list(range(0, 4)), 'data/artifacts/full/train_targets.pkl', 'data/', fill_value=np.nan, N=365, skip=30, device='gpu')
    #,
    verbose=True
)
X_tr19 = pipe9.transform(tr_index)
X_vl19 = pipe9.transform(vl_index)

In [29]:
X_tr11.shape, X_tr12.shape, X_tr13.shape, X_tr14.shape, X_tr15.shape, X_tr16.shape, X_tr17.shape

((2444346, 8),
 (2444346, 8),
 (2444346, 5),
 (2444346, 24),
 (2444346, 15),
 (2444346, 15),
 (2444346, 21))

In [30]:
X_tr1 = np.hstack((X_tr11, X_tr12, X_tr13, X_tr14,  X_tr141, X_tr15, X_tr16, X_tr17, X_tr162, X_tr19, X_tr18, X_tr143, X_tr142, X_tr163, X_tr164, X_tr165, X_tr144, X_tr166))
X_vl1 = np.hstack((X_vl11, X_vl12, X_vl13, X_vl14,  X_vl141, X_vl15, X_vl16, X_vl17, X_vl162, X_vl19, X_vl18, X_vl143, X_vl142, X_vl163, X_vl164, X_vl165, X_vl144, X_vl166))

In [31]:
X_tr1.shape

(2444346, 299)

In [32]:
config = OmegaConf.load("configs/data_preparation/config.yaml")
config.keys()

dict_keys(['read_root', 'save_root', 'data_flag', 'artifacts_path', 'date_field', 'user_field', 'save_prefix', 'filter_query', 'player_artifact_name', 'team_artifact_name', 'targets_artifact_name', 'plscores1_artifact_name', 'plscores2_artifact_name', 'plscores3_artifact_name', 'plscores4_artifact_name', 'plscores5_artifact_name', 'tscores1_artifact_name', 'tscores2_artifact_name', 'prepare_playeridartifact', 'prepare_teamidartifact', 'prepare_targets', 'prepare_targets2', 'prepare_scores1', 'prepare_scores2', 'prepare_scores3', 'prepare_scores4', 'prepare_scores5', 'team_scores1', 'team_scores2', 'awards', 'rosters', 'transactions', 'pltwitter', 'prepare_3Dartifacts'])

In [33]:
ftargets = config.prepare_targets.steps[0][1].use_cols[1:]

fplscores3 = config.prepare_scores3.steps[0][1].use_cols[1:]
fplscores31 = [f+"_lag1" for f in fplscores3]
fplscores1 = config.prepare_scores1.steps[0][1].use_cols[1:]

fplscores2 = config.prepare_scores2.steps[0][1].use_cols[1:]
fplscores21 = [f+"_lag1" for f in fplscores2]

fplscores4 = config.prepare_scores4.steps[0][1].use_cols[1:]
fplscores41 = [f+"_lag1" for f in fplscores4]

fplscores5 = config.prepare_scores3.steps[0][1].use_cols[1:]
fplscores42 = [f+"_lag2" for f in fplscores4]


In [34]:
feature_names = fplscores3 + fplscores31 + fplscores1 + fplscores2 + fplscores21 + fplscores4 + fplscores41 + fplscores5 + fplscores42

In [35]:
len(feature_names)

131

In [36]:
X_tr1.shape

(2444346, 299)

In [37]:
pd.DataFrame(X_tr1).notnull().sum(axis=1).mean()

104.4180230622015

In [38]:
import lightgbm as lgb
targets = ['target1', 'target2', 'target3', 'target4']
y_tr = tr_index[targets].values
y_vl = vl_index[targets].values
# print(np.unique(X_tra[:, 235]))
tr1 = lgb.Dataset(X_tr1, y_tr[:, 0]) #, categorical_feature=[13, 14, 15, 16, ])
tr2 = lgb.Dataset(X_tr1, y_tr[:, 1]) #, categorical_feature=[13, 14, 15, 16, ])
tr3 = lgb.Dataset(X_tr1, y_tr[:, 2]) #, categorical_feature=[13, 14, 15, 16, ])
tr4 = lgb.Dataset(X_tr1, y_tr[:, 3]) #, categorical_feature=[13, 14, 15, 16, ])

vl1 = lgb.Dataset(X_vl1, y_vl[:, 0], reference=tr1)
vl2 = lgb.Dataset(X_vl1, y_vl[:, 1], reference=tr2)
vl3 = lgb.Dataset(X_vl1, y_vl[:, 2], reference=tr3)
vl4 = lgb.Dataset(X_vl1, y_vl[:, 3], reference=tr4)

# params = {
#     'n_estimators': 4000,
#     'learning_rate': 0.08,
#     'num_leaves': 31,
#     'colsample_bytree': 0.3,
#     'subsample': 0.5,
#     'reg_alpha': 0.1,
#     'reg_lambda': 0.1,
#     'max_bin': 255,
#     'objective': 'mae',
#     'metric': 'mae'
# }

params = {
    'n_estimators': 4000,
    'learning_rate': 0.1,
    'num_leaves': 255,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    'colsample_bytree': 0.5,
    'subsample': 0.95,
    'bagging_freq': 1,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'extra_trees': False,
    'max_bin': 127,
    #'device': 'gpu',
    #'gpu_use_dp': False,
    #'gpu_device_id': 0,
    'boost_from_average': True,
    'reg_sqrt': True,
    'objective': 'mae',
    'metric': 'mae',
    'verbose': -1,
    'seed': 123478659,
    'min_data_per_group': 10,
    'cat_l2': 1,
    'cat_smooth': 10,
    'num_threads': 16
}
bst1 = lgb.train(params, tr1, valid_sets=[vl1], early_stopping_rounds=200, verbose_eval=50)




Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.12692
[100]	valid_0's l1: 1.11319
[150]	valid_0's l1: 1.11267
[200]	valid_0's l1: 1.11232
[250]	valid_0's l1: 1.11192
[300]	valid_0's l1: 1.10941
[350]	valid_0's l1: 1.10804
[400]	valid_0's l1: 1.10754
[450]	valid_0's l1: 1.10693
[500]	valid_0's l1: 1.10659
[550]	valid_0's l1: 1.10617
[600]	valid_0's l1: 1.10561
[650]	valid_0's l1: 1.10504
[700]	valid_0's l1: 1.10251
[750]	valid_0's l1: 1.10103
[800]	valid_0's l1: 1.09966
[850]	valid_0's l1: 1.09937
[900]	valid_0's l1: 1.09937
[950]	valid_0's l1: 1.09936
[1000]	valid_0's l1: 1.09916
[1050]	valid_0's l1: 1.09801
[1100]	valid_0's l1: 1.09771
[1150]	valid_0's l1: 1.09757
[1200]	valid_0's l1: 1.09753
[1250]	valid_0's l1: 1.09742
[1300]	valid_0's l1: 1.09728
[1350]	valid_0's l1: 1.0973
[1400]	valid_0's l1: 1.0974
[1450]	valid_0's l1: 1.09726
[1500]	valid_0's l1: 1.09716
[1550]	valid_0's l1: 1.09682
[1600]	valid_0's l1: 1.09644
[1650]	valid_0's l1: 1.09641
[17

In [39]:
bst2 = lgb.train(params, tr2, valid_sets=[vl2], early_stopping_rounds=200, verbose_eval=50)

Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 2.11272
[100]	valid_0's l1: 2.08055
[150]	valid_0's l1: 2.07282
[200]	valid_0's l1: 2.06687
[250]	valid_0's l1: 2.06363
[300]	valid_0's l1: 2.06283
[350]	valid_0's l1: 2.06149
[400]	valid_0's l1: 2.0603
[450]	valid_0's l1: 2.05845
[500]	valid_0's l1: 2.05818
[550]	valid_0's l1: 2.0569
[600]	valid_0's l1: 2.05698
[650]	valid_0's l1: 2.05498
[700]	valid_0's l1: 2.05491
[750]	valid_0's l1: 2.05351
[800]	valid_0's l1: 2.05282
[850]	valid_0's l1: 2.05151
[900]	valid_0's l1: 2.05109
[950]	valid_0's l1: 2.05102
[1000]	valid_0's l1: 2.05096
[1050]	valid_0's l1: 2.05009
[1100]	valid_0's l1: 2.04975
[1150]	valid_0's l1: 2.04901
[1200]	valid_0's l1: 2.04869
[1250]	valid_0's l1: 2.04831
[1300]	valid_0's l1: 2.04825
[1350]	valid_0's l1: 2.04848
[1400]	valid_0's l1: 2.0483
[1450]	valid_0's l1: 2.04815
Early stopping, best iteration is:
[1260]	valid_0's l1: 2.04805


In [40]:
bst3 = lgb.train(params, tr3, valid_sets=[vl3], early_stopping_rounds=200, verbose_eval=50)

Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.944355
[100]	valid_0's l1: 0.941649
[150]	valid_0's l1: 0.941493
[200]	valid_0's l1: 0.941331
[250]	valid_0's l1: 0.940501
[300]	valid_0's l1: 0.939741
[350]	valid_0's l1: 0.938797
[400]	valid_0's l1: 0.938426
[450]	valid_0's l1: 0.937755
[500]	valid_0's l1: 0.937637
[550]	valid_0's l1: 0.935904
[600]	valid_0's l1: 0.935315
[650]	valid_0's l1: 0.934638
[700]	valid_0's l1: 0.934451
[750]	valid_0's l1: 0.933937
[800]	valid_0's l1: 0.93369
[850]	valid_0's l1: 0.933844
[900]	valid_0's l1: 0.933825
[950]	valid_0's l1: 0.93365
[1000]	valid_0's l1: 0.932781
[1050]	valid_0's l1: 0.932824
[1100]	valid_0's l1: 0.932643
[1150]	valid_0's l1: 0.932523
[1200]	valid_0's l1: 0.932483
[1250]	valid_0's l1: 0.932398
[1300]	valid_0's l1: 0.932325
[1350]	valid_0's l1: 0.932272
[1400]	valid_0's l1: 0.932303
[1450]	valid_0's l1: 0.932383
[1500]	valid_0's l1: 0.932414
[1550]	valid_0's l1: 0.932686
Early stopping, best iteration

In [41]:
bst4 = lgb.train(params, tr4, valid_sets=[vl4], early_stopping_rounds=200, verbose_eval=50)

Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.57579
[100]	valid_0's l1: 1.56367
[150]	valid_0's l1: 1.56218
[200]	valid_0's l1: 1.55994
[250]	valid_0's l1: 1.55851
[300]	valid_0's l1: 1.55783
[350]	valid_0's l1: 1.55667
[400]	valid_0's l1: 1.55588
[450]	valid_0's l1: 1.55524
[500]	valid_0's l1: 1.55453
[550]	valid_0's l1: 1.55357
[600]	valid_0's l1: 1.55409
[650]	valid_0's l1: 1.55404
[700]	valid_0's l1: 1.55402
[750]	valid_0's l1: 1.55366
Early stopping, best iteration is:
[551]	valid_0's l1: 1.55357


In [42]:
from sklearn.metrics import mean_absolute_error as mae
pred1 = bst1.predict(X_vl1)
pred2 = bst2.predict(X_vl1)
pred3 = bst3.predict(X_vl1)
pred4 = bst4.predict(X_vl1)
preds = np.vstack((pred1, pred2, pred3, pred4)).T
print(mae(y_vl, preds))

1.4073541618501342


In [90]:
raw_data.columns

Index(['date', 'nextDayPlayerEngagement', 'games', 'rosters',
       'playerBoxScores', 'teamBoxScores', 'transactions', 'standings',
       'awards', 'events', 'playerTwitterFollowers', 'teamTwitterFollowers'],
      dtype='object')

In [125]:
bst1.feature_name = feature_names
lgb.plot_importance(bst1)

TypeError: 'list' object is not callable

In [126]:
imp = bst4.feature_importance(importance_type='gain')
idx = np.argsort(imp)[::-1]
np.array(feature_names)[idx]

array(['gamesPlayedPitching_lag1', 'gamesStartedPitching_lag1',
       'saveOpportunities', 'rbi', 'winsPitching_lag1', 'holds', 'saves',
       'completeGamesPitching_lag1', 'runsScored_lag1', 'strikeOuts_lag1',
       'atBatsPitching_lag2', 'airOutsPitching_lag2',
       'stolenBasesPitching', 'runsPitching_lag2',
       'intentionalWalksPitching_lag1', 'doublesPitching_lag1',
       'groundOutsPitching_lag1', 'gamesStartedPitching',
       'shutoutsPitching_lag1', 'sacFlies_lag1',
       'strikeOutsPitching_lag2', 'groundOuts_lag1', 'lossesPitching',
       'groundOutsPitching_lag2', 'atBatsPitching',
       'triplesPitching_lag1', 'homeRunsPitching',
       'hitByPitchPitching_lag1', 'blownSaves', 'groundIntoDoublePlay',
       'hitsPitching_lag1', 'intentionalWalks', 'strikeOuts', 'saves',
       'baseOnBalls', 'groundIntoTriplePlay', 'flyOutsPitching_lag1',
       'groundOuts', 'jerseyNum', 'strikeOutsPitching_lag1',
       'groundOutsPitching', 'home', 'winsPitching',
       'pl

In [38]:
X_tr1.shape

(1418465, 142)

In [None]:
# 1.7930
# 1.7708 - added lag of flags
# 1.7368 - added categorical features from box score
# 1.6982 - added batter scores
# 1.6890       - added pitcher scores
# 1.6892   - added remaining features ( :-( )
# 1.6883     - added pitcher lags 
# 1.6810   - adde batter lags
# 1.6777   - more pitcher lags
# 1.5774 - added target mean (skip - 30 and last 365)
# change hyperparams - colsamplebytree 0.7 --> 0.4; 1.6814 (reverting)
# 1.534 - changed val to from 10 april - 
# 1.517 - added lag3
# 1.5146 - changed hyperparams - colsample to 0.5
# 1.509 - num_leaves 255
# 1.5078  - min_leaf_samples 20
# 1.4884 - added statusCode
# 1.4891 - made it cat
# 1.4795 - removed cat encoding
# 1.4455 - fixed last n expanding mean and added last 10
# 1.4396 - added last 10 stats
# 1.4379 - last 5 innings sum of pitching scores
# 1.4322 - last 5 mean scores
# 1.4278 - expading mean batter scores
# 1.4257 - expanding mean pitcher scores
# 1.4070 - expanding sum - last 10 scores 
# 

In [38]:
pd.read_csv("data/seasons.csv")

Unnamed: 0,seasonId,seasonStartDate,seasonEndDate,preSeasonStartDate,preSeasonEndDate,regularSeasonStartDate,regularSeasonEndDate,lastDate1stHalf,allStarDate,firstDate2ndHalf,postSeasonStartDate,postSeasonEndDate
0,2017,2017-04-02,2017-11-01,2017-02-22,2017-04-01,2017-04-02,2017-10-01,2017-07-09,2017-07-11,2017-07-14,2017-10-03,2017-11-01
1,2018,2018-03-29,2018-10-28,2018-02-21,2018-03-27,2018-03-29,2018-10-01,2018-07-15,2018-07-17,2018-07-19,2018-10-02,2018-10-28
2,2019,2019-03-20,2019-10-30,2019-02-21,2019-03-26,2019-03-20,2019-09-29,2019-07-07,2019-07-09,2019-07-11,2019-10-01,2019-10-30
3,2020,2020-07-23,2020-10-28,2020-02-21,2020-07-22,2020-07-23,2020-09-27,2020-08-25,,2020-08-26,2020-09-29,2020-10-28
4,2021,2021-02-28,2021-10-31,2021-02-28,2021-03-30,2021-04-01,2021-10-03,2021-07-11,2021-07-13,2021-07-15,2021-10-04,2021-10-31


In [32]:
# TODO
# Running stats on rank per day