In [None]:
import sys
import time
from contextlib import contextmanager

import lightgbm as lgb

sys.path = ['../'] + sys.path

from src.parser import *
from src.train import *
from src.util import TimeSeriesSplit
from src.event_level_model import prep_events
from src.dataset_helper import *

@contextmanager
def timer(name):
    """Context manager that prints how long the enclosed block took.

    Usage: ``with timer("load"): ...`` prints ``[load] 1.234s`` on exit.

    Args:
        name: Label shown in square brackets in the printed line.

    The elapsed time is printed even when the enclosed block raises; the
    exception then propagates unchanged to the caller.
    """
    # perf_counter is monotonic, so the measurement cannot go negative or jump
    # if the system clock is adjusted while the block is running.
    start = time.perf_counter()
    try:
        yield
    finally:
        # `finally` guarantees the report is emitted on the failure path too;
        # without it the generator is aborted at `yield` and nothing is printed.
        elapsed = time.perf_counter() - start
        print(f"[{name}] {elapsed:.3f}s")
    
# Names of the four target columns: target1 .. target4.
TARGET_COLS = [f'target{i}' for i in range(1, 5)]

# Input directory passed to the data loaders, and output directory for everything
# this notebook writes (model files and the feature file).
DATA_DIR = '../input/mlb-player-digital-engagement-forecasting'
ARTIFACT_DIR = 'artifacts'

# Forwarded as the third argument of every `load_subdata` call.
USE_UPDATED = True

# Create the output directory up front; a no-op when it already exists.
os.makedirs(ARTIFACT_DIR, exist_ok=True)

In [None]:
# Base frame built from the 'nextDayPlayerEngagement' sub-table; the later cells
# use its 'dailyDataDate' / 'playerId' keys and the target columns.
base_df = make_df_base_from_train_engagement(
    load_subdata(DATA_DIR, 'nextDayPlayerEngagement', USE_UPDATED)
)

# Remaining inputs: the events and rosters sub-tables and the static players file.
players = pd.read_csv(os.path.join(DATA_DIR, 'players.csv'))
events = load_subdata(DATA_DIR, 'events', USE_UPDATED)
rosters = load_subdata(DATA_DIR, 'rosters', USE_UPDATED)

# Roster dates are parsed from the YYYYMMDD encoding into datetimes so they can
# serve as the time key of the as-of merge in the final cell.
rosters['dailyDataDate'] = pd.to_datetime(
    rosters['dailyDataDate'],
    format='%Y%m%d',
)

print(len(base_df))

events.head()

In [None]:
# Run the project helper `prep_events` (from src.event_level_model) over the raw
# events table. The result is joined to the targets on ['dailyDataDate', 'playerId']
# in the next cell, so it must expose both of those columns.
# NOTE(review): the name suggests one stacked row per (event, player) — confirm in
# src/event_level_model.py.
events_stacked = prep_events(events)

In [None]:
# Left-join the base frame onto the prepared events by (date, player), then keep
# only the rows that actually received a target1 value from that join.
merged = (
    events_stacked
    .merge(base_df, how='left', on=['dailyDataDate', 'playerId'])
    .dropna(subset=['target1'])
)
print(merged.shape)

In [None]:
# Hyper-parameters for the per-event LightGBM regressors (one model per target).
params = {
    'objective': 'mae',
    'metrics': 'mae',
    'num_leaves': 256,
    'max_depth': 16,
    'n_estimators': 100
}

# Expanding-window folds, each ((train_start, train_end), (valid_start, valid_end)).
# Every fold trains from 2018-01-01 and validates on the period right after its
# training window. Dates before 2018-05-01 are never part of a validation window.
splits = [
    (('2018-01-01', '2018-05-01'), ('2018-05-01', '2018-06-01')),
    (('2018-01-01', '2018-06-01'), ('2018-06-01', '2018-07-01')),
    (('2018-01-01', '2018-07-01'), ('2018-07-01', '2018-08-01')),
    (('2018-01-01', '2018-08-01'), ('2018-08-01', '2019-01-01')),
    (('2018-01-01', '2019-01-01'), ('2019-01-01', '2019-08-01')),
    (('2018-01-01', '2019-08-01'), ('2019-08-01', '2020-01-01')),
    (('2018-01-01', '2020-01-01'), ('2020-01-01', '2020-08-01')),
    (('2018-01-01', '2020-08-01'), ('2020-08-01', '2021-04-01')),
    (('2018-01-01', '2021-04-01'), ('2021-04-01', '2022-01-01')),
]

# Key columns kept alongside the out-of-fold predictions so they can be grouped
# per player, per day and per team below.
d_base = merged[['dailyDataDate', 'playerId', 'teamId']].copy()

# One aggregated frame per target is appended to each list:
#   aggregated_features -> grouped by (dailyDataDate, playerId)
#   agg_g_features      -> grouped by dailyDataDate
#   agg_t_features      -> grouped by (dailyDataDate, teamId)
aggregated_features = []
agg_g_features = []
agg_t_features = []

target_cols = ['target1', 'target2', 'target3', 'target4']
key_cols = ['dailyDataDate', 'playerId', 'teamId']
categorical_cols = ['atBatEvent', 'event', 'menOnBase', 'gameType', 'pitchType', 'call']

# The feature matrix is identical for every target, so it is built once instead
# of being re-dropped and re-cast on each iteration of the target loop.
X = merged.drop(target_cols + key_cols, axis=1).astype(np.float32)

for tgt in target_cols:
    y = merged[tgt]

    # NOTE(review): TimeSeriesSplit is a project class (src.util); its indices are
    # used positionally with .iloc below — confirm that is what split() yields.
    cv = TimeSeriesSplit('dailyDataDate', splits)

    model = lgb.LGBMRegressor(**params)

    # Start from NaN rather than 0.0: rows that no validation fold covers must not
    # look like genuine predictions of zero when min/max/mean are taken below.
    # Groups without any prediction then aggregate to NaN, the same "missing"
    # marker that unmatched rows receive in the later merges.
    oof = np.full(len(X), np.nan)

    for i, (train_index, valid_index) in enumerate(cv.split(merged)):
        X_tr, X_va = X.iloc[train_index], X.iloc[valid_index]
        y_tr, y_va = y.iloc[train_index], y.iloc[valid_index]

        # Rows without a label cannot be trained on; on the validation side they
        # are still predicted but left out of the reported metric.
        tr_has_target = y_tr.notnull()
        va_has_target = y_va.notnull().to_numpy()

        X_tr = X_tr[tr_has_target]
        y_tr = y_tr[tr_has_target]

        print(f'fold {i} tr: {len(X_tr)}, va: {len(X_va)}')
        model.fit(X_tr, y_tr, categorical_feature=categorical_cols)

        # Predict once and reuse the result for both the OOF vector and the metric.
        va_pred = model.predict(X_va)
        oof[valid_index] = va_pred

        mae = mean_absolute_error(y_va[va_has_target], va_pred[va_has_target])

        print(f"{tgt} fold {i} : {mae}")

    # Collapse the event-level predictions to one row per (date, player).
    d_base['oof'] = oof
    aggregated = d_base.groupby(['dailyDataDate', 'playerId'])['oof'].agg(['min', 'max', 'mean']).reset_index()
    aggregated.columns = ['dailyDataDate', 'playerId', f'events_oof_{tgt}_min', f'events_oof_{tgt}_max', f'events_oof_{tgt}_mean']

    aggregated_features.append(aggregated)

    # Same predictions summarised over all players of a day ...
    aggregated2 = d_base.groupby(['dailyDataDate'])['oof'].agg(['max', 'mean']).reset_index()
    aggregated2.columns = ['dailyDataDate', f'events_oof_{tgt}_g_max', f'events_oof_{tgt}_g_mean']

    agg_g_features.append(aggregated2)

    # ... and per team and day.
    aggregated3 = d_base.groupby(['dailyDataDate', 'teamId'])['oof'].agg(['max', 'mean']).reset_index()
    aggregated3.columns = ['dailyDataDate', 'teamId', f'events_oof_{tgt}_t_max', f'events_oof_{tgt}_t_mean']

    agg_t_features.append(aggregated3)

    # `model` was refitted on every fold, so the booster saved here is the one
    # from the last fold that ran, not a model fitted on all rows.
    model_path = os.path.join(ARTIFACT_DIR, f'meta_model_{tgt}.bin')
    model.booster_.save_model(model_path)


In [None]:
# Start from the (date, player) keys of the base frame and attach, per player, the
# teamId of the closest roster row that is not later than the date (merge_asof's
# default backward match). Players without any such roster row get teamId -1.
o_base = pd.merge_asof(
    base_df[['dailyDataDate', 'playerId']].copy(),
    rosters[['playerId', 'dailyDataDate', 'teamId']],
    on='dailyDataDate',
    by='playerId',
)
o_base['teamId'] = o_base['teamId'].fillna(-1).astype(int)

# If merge_asof is allowed to match rows that are too old, the predictions used
# for test data would include predictions for events from the training period.
# The effect of play-level records on engagement is assumed to be short-lived,
# so the look-back is cut off at a reasonable window.
tolerance = pd.Timedelta('30d')

# Player-level aggregates: as-of match within the look-back window.
for player_agg in aggregated_features:
    o_base = pd.merge_asof(
        o_base,
        player_agg,
        on='dailyDataDate',
        by='playerId',
        tolerance=tolerance,
    )

# Day-level aggregates: exact match on the date.
for day_agg in agg_g_features:
    o_base = o_base.merge(day_agg, on='dailyDataDate', how='left')

# Team-level aggregates: exact match on (date, team). The team id is cast to int
# first so its dtype lines up with the integer teamId column of o_base.
for team_agg in agg_t_features:
    team_agg['teamId'] = team_agg['teamId'].astype(int)
    o_base = o_base.merge(team_agg, on=['dailyDataDate', 'teamId'], how='left')

# None of the merges may add or drop rows: the feature file is written without
# its key columns, so it has to stay row-aligned with base_df.
assert len(base_df) == len(o_base)

feature_path = os.path.join(ARTIFACT_DIR, 'events_oof_asof_4tgt_3.f')
o_base.drop(columns=['dailyDataDate', 'playerId', 'teamId']).to_feather(feature_path)