In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.pipeline import make_union, make_pipeline

from mllib.transformers import *
from src.pipelines.artifacts import *
from src.constants import (
    TARGETS,
    PLTWITTER,
    SCORES1,
    SCORES2,
    SCORES3,
    SCORES4,
    SCORES5,
    TEAM_SCORES1,
    TEAM_SCORES2,
    TEAM_SCORES3,
    TEAM_STANDINGS,
    AWARDS,
    ROSTERS,
    TRANSACTIONS,
    AWARDID_DICT
)
from src.constants import (
    playerid_mapping,
    teamid_mapping,
    targets_artifact,
    scores1_mean_artifact,
    scores1_first_artifact,
    scores1_last_artifact,
    scores2_mean_artifact,
    scores2_first_artifact,
    scores2_last_artifact,
    scores3_mean_artifact,
    scores3_first_artifact,
    scores3_last_artifact,
    scores4_mean_artifact,
    scores4_first_artifact,
    scores4_last_artifact,
    scores5_mean_artifact,
    scores5_first_artifact,
    scores5_last_artifact,
    team_scores1_mean_artifact,
    team_scores2_mean_artifact,
    team_scores3_mean_artifact,
    awards_artifact,
    rosters_artifact,
    player_twitter_artifact,
    transactions_artifact,
    team_standings_artifact
)

In [2]:
# Generate index
TRAIN_FILE = "data/train.csv"
VAL_START_DATE = 20210415
DEVICE = 'gpu'
device = DEVICE
artifacts_path = 'data/artifacts/v0'

In [3]:
# raw_data = pd.read_csv(TRAIN_FILE)
# tr = raw_data.loc[raw_data.date < VAL_START_DATE]
# val = raw_data.loc[raw_data.date >= VAL_START_DATE]
# print(raw_data.shape, val.shape)

# roster_2021 = pd.read_csv(PLAYERS_FILE)
# roster_2021 = roster_2021.loc[roster_2021.playerForTestSetAndFuturePreds == True]
# target_enc = ParsePlayerData("nextDayPlayerEngagement", TARGETS)
# tr_index = target_enc.fit_transform(tr).reset_index(drop=False)
# tr_index = tr_index.loc[tr_index.playerId.isin(roster_2021.playerId.astype(str))]
# # tr_index['debutdate'] = tr_index.map()
# vl_index = target_enc.fit_transform(val).reset_index(drop=False)
# vl_index = vl_index.loc[vl_index.playerId.isin(roster_2021.playerId.astype(str))]
# tr_index.to_csv("data/tr_index_small.csv", index=False)
# vl_index.to_csv("data/vl_index_small.csv", index=False)

In [415]:
raw_data = pd.read_csv("data/train_updated.csv")
vl = raw_data.loc[raw_data.date > 20210501]
roster_2021 = pd.read_csv("data/players.csv")
roster_2021 = roster_2021.loc[roster_2021.playerForTestSetAndFuturePreds == True]
target_enc = ParsePlayerData("nextDayPlayerEngagement", TARGETS)
vl_index2 = target_enc.fit_transform(vl).reset_index(drop=False)
vl_index2 = vl_index2.loc[vl_index2.playerId.isin(roster_2021.playerId.astype(str))]
vl_index2.to_csv("data/vl_index_small2.csv", index=False)

100%|██████████| 77/77 [00:01<00:00, 63.72it/s]


In [None]:
tr_index = pd.read_csv("data/tr_index_small.csv")
vl_index = pd.read_csv("data/vl_index_small.csv")
vl_indexw = pd.read_csv("data/vl_index_small2.csv")

tr_index.shape, vl_index.shape, vl_index2.shape

In [5]:
f1 = [f'scores3_{i}' for i in range(11)]
f2 = [f'scores3_count_last{j}_last{i}' for i in range(11) for j in [10, 30, 300]]
f3 = [f'scores1_{i}_lag{j}' for j in range(2) for i in range(5)]
f4 = [f'scores2_{i}_lag{j}' for i in range(24) for j in range(2)]
f5 = [f'scores2_{i}_mean{j}' for i in range(24) for j in [10, 30, 300]]
f6 = [f'scores2_{i}_sum{j}' for i in range(24) for j in [10, 30, 300]]
f7 = [f'scores4_{i}_lag{j}' for i in range(15) for j in range(2)]
f8 = [f'scores4_{i}_mean{j}' for i in range(15) for j in [10, 30, 300]]
f9 = [f'scores4_{i}_sum{j}' for i in range(15) for j in [10, 30, 300]]
f10 = [f'scores5_{i}_lag{j}' for i in range(21) for j in range(1)]
f11 = [f'target_{i}_mean{j}' for j in [10, 28, 365, 1500] for i in range(4)]
f12 = [f'target_{i}_{stat}' for stat in ['max', 'min', 'q75', 'q25'] for i in range(4)]
f13 = ['last_award', 'num_awards', 'last_txn', 'is_active', 'player_twitter', 'position_code']
fnames = f1 + f2 + f3 + f4 + f5 + f6 +f7 +f8 + f9 + f10 + f11 + f12 + f13
len(fnames)

425

In [6]:
len(f1+f2+f3), len(f1+f2+f3+f4+f5+f6)

(54, 246)

In [332]:
target_stats = make_union(
    ExpandingMean('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/{targets_artifact}', N=10, skip=3, device=DEVICE, fill_value=0),
    ExpandingMean('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/{targets_artifact}', N=30, skip=3, device=DEVICE, fill_value=0), 
    ExpandingMean('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/{targets_artifact}', N=300, skip=3, device=DEVICE, fill_value=0),
    ExpandingMedian('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/{targets_artifact}', N=2000, skip=3, device=DEVICE, fill_value=0), 
    ExpandingMax('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/{targets_artifact}', N=20, skip=3, device=DEVICE, fill_value=0), 
    ExpandingMin('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/{targets_artifact}', N=20, skip=3, device=DEVICE, fill_value=0),
    ExpandingMedian('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/{targets_artifact}', N=20, skip=3, device=DEVICE, fill_value=0), 
    ExpandingQ75('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/{targets_artifact}', N=20, skip=3, device=DEVICE, fill_value=0), 
    ExpandingQ25('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/{targets_artifact}', N=20, skip=3, device=DEVICE, fill_value=0),
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{targets_artifact}', N=1, skip=2, device=DEVICE, fill_value=0),
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{targets_artifact}', N=2, skip=2, device=DEVICE, fill_value=0),
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{targets_artifact}', N=3, skip=2, device=DEVICE, fill_value=0),
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{targets_artifact}', N=4, skip=2, device=DEVICE, fill_value=0),
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{targets_artifact}', N=5, skip=2, device=DEVICE, fill_value=0),
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{targets_artifact}', N=6, skip=2, device=DEVICE, fill_value=0),
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{targets_artifact}', N=7, skip=2, device=DEVICE, fill_value=0),
)

other_features = make_union(
    LagN('date', 'playerId', [0], f'{artifacts_path}/train_awards.pkl', fill_value=-1, N=1, skip=0, device=device),
    ExpandingCount('date', 'playerId', [0], f'{artifacts_path}/train_awards.pkl', fill_value=0, N=365, skip=0, device=device),
    LagN('date', 'playerId', [0], f'{artifacts_path}/train_transactions.pkl', fill_value=-1, N=1, skip=0, device=device),
    LagN('date', 'playerId', [0], f'{artifacts_path}/train_rosters.pkl', fill_value=-1, N=1, skip=0, device=device),
    ExpandingCount('date', 'playerId', [0], f'{artifacts_path}/train_rosters.pkl', fill_value=0, N=20, skip=0, device=device),
    make_pipeline(LagN('date', 'playerId', [0], f'{artifacts_path}/train_pltwitter.pkl', fill_value=0, N=1, skip=0, device=device),
                  FunctionTransfomer(np.log1p)),
)

scores1 = make_union(
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{scores1_last_artifact}', fill_value=0, N=1, skip=0, device=device),
    LagN('date', 'playerId', [4], f'{artifacts_path}/{scores1_last_artifact}', N=1, skip=0, fill_value=0, device=device),
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{scores1_last_artifact}', fill_value=0, N=2, skip=0, device=device),
    LagN('date', 'playerId', [4], f'{artifacts_path}/{scores1_last_artifact}', N=2, skip=0, fill_value=0, device=device),
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{scores1_last_artifact}', fill_value=0, N=3, skip=0, device=device),
    LagN('date', 'playerId', [4], f'{artifacts_path}/{scores1_last_artifact}', N=3, skip=0, fill_value=0, device=device),
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{scores1_last_artifact}', fill_value=0, N=4, skip=0, device=device),
    LagN('date', 'playerId', [4], f'{artifacts_path}/{scores1_last_artifact}', N=4, skip=0, fill_value=0, device=device),
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{scores1_last_artifact}', fill_value=0, N=5, skip=0, device=device),
    LagN('date', 'playerId', [4], f'{artifacts_path}/{scores1_last_artifact}', N=5, skip=0, fill_value=0, device=device),
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{scores1_last_artifact}', fill_value=0, N=6, skip=0, device=device),
    LagN('date', 'playerId', [4], f'{artifacts_path}/{scores1_last_artifact}', N=6, skip=0, fill_value=0, device=device),
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{scores1_last_artifact}', fill_value=0, N=7, skip=0, device=device),
    LagN('date', 'playerId', [4], f'{artifacts_path}/{scores1_last_artifact}', N=7, skip=0, fill_value=0, device=device),
    LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{scores1_last_artifact}', fill_value=0, N=8, skip=0, device=device),
    LagN('date', 'playerId', [4], f'{artifacts_path}/{scores1_last_artifact}', N=8, skip=0, fill_value=0, device=device),
    ExpandingCount('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{scores1_last_artifact}', fill_value=0, N=20, skip=0, device=device),
    ExpandingCount('date', 'playerId', [4], f'{artifacts_path}/{scores1_last_artifact}', N=20, skip=0, fill_value=0, device=device),
)

scores3 = make_union(
    *[
        ExpandingSum('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores3_last_artifact}', fill_value=0, N=10, skip=0, device=device)
         for i in range(2)
     ],
    ExpandingSum('date', 'playerId', [8, 9, 10], f'{artifacts_path}/{scores3_last_artifact}', fill_value=0, N=10, skip=0, device=device),
    *[
        ExpandingSum('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores3_last_artifact}', fill_value=0, N=200, skip=0, device=device)
         for i in range(2)
     ],
    ExpandingSum('date', 'playerId', [8, 9, 10], f'{artifacts_path}/{scores3_last_artifact}', fill_value=0, N=200, skip=0, device=device),

)

scores2 = make_union(
    *[
        LagN('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores2_last_artifact}', fill_value=0, N=j+1, skip=0, device=device)
        for i in range(6) for j in range(4)
     ],
)

scores4 = make_union(
    *[
        LagN('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores4_last_artifact}', fill_value=0, N=j+1, skip=0, device=device)
        for i in range(3) for j in range(2)
     ],
    *[
        LagN('date', 'playerId', [12, 13, 14], f'{artifacts_path}/{scores4_last_artifact}', fill_value=0, N=j+1, skip=0, device=device)
        for j in range(2)
     ],
)

scores5 = make_union(
    *[
        LagN('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores5_last_artifact}', fill_value=0, N=j+1, skip=0, device=device)
        for i in range(5) for j in range(1)
     ],
    *[
        LagN('date', 'playerId', [20], f'{artifacts_path}/{scores5_last_artifact}', fill_value=0, N=j+1, skip=0, device=device)
        for j in range(1)
     ],
)

all_players = make_union(
    *[
        DateLagN('date', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores2_last_artifact}', N=j+1, skip=0, device=device)
        for i in range(6) for j in range(1)
     ],
    *[
        DateLagN('date', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores4_last_artifact}', N=j+1, skip=0, device=device)
        for i in range(6) for j in range(1)
     ],
    *[
        DateLagN('date', [12, 13, 14], f'{artifacts_path}/{scores4_last_artifact}', N=j+1, skip=0, device=device)
        for j in range(1)
     ],
)

In [333]:
feature_pipeline = make_union(target_stats, other_features, scores1, scores2, scores3, scores4, scores5, all_players)

In [None]:
# X_tr01 = target_stats.transform(tr_index)
# X_vl01 = target_stats.transform(vl_index)

# X_tr02 = other_features.transform(tr_index)
# X_vl02 = other_features.transform(vl_index)

# X_tr03 = scores1.transform(tr_index)
# X_vl03 = scores1.transform(vl_index)

# X_tr04 = scores3.transform(tr_index)
# X_vl04 = scores3.transform(vl_index)

# X_tr05 = scores2.transform(tr_index)
# X_vl05 = scores2.transform(vl_index)

# X_tr06 = scores4.transform(tr_index)
# X_vl06 = scores4.transform(vl_index)

# X_tr07 = scores5.transform(tr_index)
# X_vl07 = scores5.transform(vl_index)

# X_tr08 = all_players.transform(tr_index)
# X_vl08 = all_players.transform(vl_index)

In [36]:
# feature_pipeline = make_union(
#     *[
#         LagN('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{targets_artifact}', fill_value=np.nan, N=j, skip=0, deviLagNdevice)
#          for i in range(2) for j in range(1)
#      ],
#     LagN('date', 'playerId', [8, 9, 10], f'{artifacts_path}/train_plscores3.pkl', fill_value=np.nan, N=1, skip=0, device=device),
#     *[
#         ExpandingSum('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/train_plscores3.pkl', fill_value=np.nan, N=j, skip=0, device=device)
#          for i in range(2) for j in [10, 30, 300]
#      ],
#     *[
#         ExpandingSum('date', 'playerId', [8, 9, 10], f'{artifacts_path}/train_plscores3.pkl', 'data/', fill_value=np.nan, N=j, skip=0, device=device)
#         for j in [10, 30, 300]
#     ],
#     LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/train_plscores1.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device=device),
#     LagN('date', 'playerId', [4], f'{artifacts_path}/train_plscores1.pkl', 'data/', N=1, skip=0, fill_value=np.nan, device=device),
#     LagN('date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/train_plscores1.pkl', 'data/', fill_value=np.nan, N=2, skip=0, device=device),
#     LagN('date', 'playerId', [4], f'{artifacts_path}/train_plscores1.pkl', 'data/', N=2, skip=0, fill_value=np.nan, device=device),
#     *[
#         LagN('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/train_plscores2.pkl', 'data/', fill_value=np.nan, N=j+1, skip=0, device=device)
#         for i in range(6) for j in range(2)
#      ],
#     *[
#         ExpandingMean('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/train_plscores2.pkl', 'data/', fill_value=np.nan, N=j, skip=0, device=device)
#          for i in range(6) for j in [10, 30, 300]
#      ],
#     *[
#         ExpandingSum('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/train_plscores2.pkl', 'data/', fill_value=np.nan, N=j, skip=0, device=device)
#         for i in range(6) for j in [10, 30, 300] 
#      ],
#     *[
#         LagN('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/train_plscores4.pkl', 'data/', fill_value=np.nan, N=j+1, skip=0, device=device)
#         for i in range(3) for j in range(2)
#      ],
#     *[
#         LagN('date', 'playerId', [12, 13, 14], f'{artifacts_path}/train_plscores4.pkl', 'data/', fill_value=np.nan, N=j+1, skip=0, device=device)
#         for j in range(2)
#      ],
#     *[
#         ExpandingMean('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/train_plscores4.pkl', 'data/', fill_value=np.nan, N=j, skip=0, device=device)
#         for i in range(3) for j in [10, 30, 300] 
#      ],
#     *[
#         ExpandingMean('date', 'playerId', [12, 13, 14], f'{artifacts_path}/train_plscores4.pkl', 'data/', fill_value=np.nan, N=j, skip=0, device=device)
#         for j in [10, 30, 300]
#      ],
#     *[
#         ExpandingSum('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/train_plscores4.pkl', 'data/', fill_value=np.nan, N=j, skip=0, device=device)
#         for i in range(3) for j in [10, 30, 300] 
#      ],
#     *[
#         ExpandingSum('date', 'playerId', [12, 13, 14], f'{artifacts_path}/train_plscores4.pkl', 'data/', N=j, fill_value=np.nan, skip=0, device=device)
#         for j in [10, 30, 300]
#      ],
#     *[
#         LagN('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/train_plscores5.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device=device)
#         for i in range(5)
#      ],
#     LagN('date', 'playerId', [20], f'{artifacts_path}/train_plscores5.pkl', 'data/', N=1, skip=0, device=device, fill_value=np.nan),
#     ExpandingMean('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/train_targets.pkl', 'data/', N=10, skip=3, device=device, fill_value=np.nan),
#     # LagN('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/train_targets.pkl', 'data/', N=3, skip=0, device=device),
#     ExpandingMean('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/train_targets.pkl', 'data/', N=28, skip=3, device=device, fill_value=np.nan), 
#     ExpandingMean('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/train_targets.pkl', 'data/', N=365, skip=3, device=device, fill_value=np.nan),
#     # ExpandingMedian('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/train_targets.pkl', 'data/', N=365, skip=3, device=device), 
#     ExpandingMean('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/train_targets.pkl', 'data/', N=1500, skip=3, device=device, fill_value=np.nan), 
#     ExpandingMax('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/train_targets.pkl', 'data/', N=28, skip=3, device=device, fill_value=np.nan), 
#     ExpandingMin('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/train_targets.pkl', 'data/', N=28, skip=3, device=device, fill_value=np.nan), 
#     ExpandingQ75('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/train_targets.pkl', 'data/', N=28, skip=3, device=device, fill_value=np.nan), 
#     ExpandingQ25('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/train_targets.pkl', 'data/', N=28, skip=3, device=device, fill_value=np.nan), 
#     LagN('date', 'playerId', [0], f'{artifacts_path}/train_awards.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device=device),
#     ExpandingCount('date', 'playerId', [0], f'{artifacts_path}/train_awards.pkl', 'data/', fill_value=0, N=365, skip=0, device=device),
#     LagN('date', 'playerId', [0], f'{artifacts_path}/train_transactions.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device=device),
#     LagN('date', 'playerId', [0], f'{artifacts_path}/train_rosters.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device=device),
#     LagN('date', 'playerId', [0], f'{artifacts_path}/train_pltwitter.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device=device),
#     make_pipeline(
#         MapToCol(map_col='playerId', attr='primaryPositionCode', mapper_input='players.csv', mapper_pipeline=make_pipeline(
#             DataLoader(load_path='data'),
#             MapCol(field_name='primaryPositionCode', mapping={
#                 '8': 8, '3': 3, '1': 1, '4': 4, '6': 6, '2': 2, '9': 9, '7': 7, '5': 5, 'O': 0, '10': 10, 'I': 11,
#             })
#         )
#         )
#     ),
#     # LagN('date', 'playerId', [0], f'{artifacts_path}/train_pltwitter.pkl', 'data/', fill_value=np.nan, N=1, skip=0, device=device),
#     verbose=10
# )

In [334]:
%%time
X_tr = feature_pipeline.transform(tr_index)

CPU times: user 3min 57s, sys: 12.3 s, total: 4min 10s
Wall time: 4min 8s


In [335]:
%%time
X_vl = feature_pipeline.transform(vl_index)

CPU times: user 13.6 s, sys: 548 ms, total: 14.2 s
Wall time: 14.2 s


In [336]:
np.save("data/X_tr1_v1.npy", X_tr1)
np.save("data/X_vl1_v1.npy", X_vl1)

In [337]:
X_tr = np.load("data/X_tr1_v1.npy")
X_vl = np.load("data/X_vl1_v1.npy")

In [338]:
# feature_pipeline3 = make_union(
#     *[
#         DateLagN('date', list(range(i*4, (i+1)*4)), f'{artifacts_path}/train_plscores2.pkl', 'data/', N=j+1, skip=0, device=device)
#         for i in range(6) for j in range(2)
#      ],
#     *[
#         DateLagN('date', list(range(i*4, (i+1)*4)), f'{artifacts_path}/train_plscores4.pkl', 'data/', N=j+1, skip=0, device=device)
#         for i in range(3) for j in range(2)
#      ],
#     *[
#         DateLagN('date', [12, 13, 14], f'{artifacts_path}/train_plscores4.pkl', 'data/', N=j+1, skip=0, device=device)
#         for j in range(2)
#      ],

# )
# X_tr11 = feature_pipeline3.transform(tr_index)
# X_vl11 = feature_pipeline3.transform(vl_index)
# print(X_tr11.shape, X_vl11.shape)

In [339]:
# feature_pipeline2 = make_pipeline(
#     AddFeature(
#         name='teamId',
#         pipe=LagN('date', 'playerId', [1], f'{artifacts_path}/train_rosters.pkl', 'data/', fill_value=-1, N=1, skip=0, device=device),
#     ),
#     make_union(
#         *[
#             LagN('date', 'teamId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/train_tscores1.pkl', 'data/', N=j+1, skip=0, device=device)
#             for i in range(6) for j in range(2)
#          ],
#         *[
#             LagN('date', 'teamId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/train_tscores2.pkl', 'data/', N=j+1, skip=0, device=device)
#             for i in range(4) for j in range(2)
#          ],
#         *[
#             LagN('date', 'teamId', [12, 13], f'{artifacts_path}/train_tscores2.pkl', 'data/', N=j+1, skip=0, device=device)
#             for j in range(2)
#         ],
# #         *[
# #             LagN('date', 'teamId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/train_tscores3.pkl', 'data/', N=j+1, skip=0, device=device)
# #             for i in range(4) for j in range(1)
# #          ], 
# #         *[
# #             LagN('date', 'teamId', [12, 13], f'{artifacts_path}/train_tscores3.pkl', 'data/', N=j+1, skip=0, device=device)
# #             for j in range(1)
# #         ],
#         *[
#             LagN('date', 'teamId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/train_standings.pkl', 'data/', N=1, skip=0, device=device)
#             for i in range(10)
#          ],

#         verbose=True
#     ), verbose=True
# )

In [340]:
# X_tr2 = feature_pipeline2.transform(tr_index)
# X_vl2 = feature_pipeline2.transform(vl_index)
# X_tr = np.hstack((X_tr1, X_tr1[:,54:54+48]/(1e-6 + X_tr11[:, :48]), X_tr1[:,246:246+30]/(1e-6 + X_tr11[:, 48:]),X_tr2))
# X_vl = np.hstack((X_vl1, X_vl1[:,54:54+48]/(1e-6 + X_vl11[:, :48]), X_vl1[:,246:246+30]/(1e-6 + X_vl11[:, 48:]),X_vl2))

In [311]:
# X_tr = np.hstack((X_tr01, X_tr02, X_tr03, X_tr04, X_tr05, X_tr06, X_tr07, X_tr08))
# X_vl = np.hstack((X_vl01, X_vl02, X_vl03, X_vl04, X_vl05, X_vl06, X_vl07, X_vl08))

In [389]:
import lightgbm as lgb
targets = ['target1', 'target2', 'target3', 'target4']
y_tr = tr_index[targets].values
y_vl = vl_index[targets].values
# print(np.unique(X_tra[:, 235]))
y_tr_log = np.log1p(y_tr)
y_vl_log = np.log1p(y_vl)

tr1 = lgb.Dataset(X_tr, y_tr_log[:, 0]) #, categorical_feature=[13, 14, 15, 16, 419])
tr2 = lgb.Dataset(X_tr, y_tr_log[:, 1]) #, categorical_feature=[13, 14, 15, 16, 419])
tr3 = lgb.Dataset(X_tr, y_tr_log[:, 2]) #, categorical_feature=[13, 14, 15, 16, 419])
tr4 = lgb.Dataset(X_tr, y_tr_log[:, 3]) #, categorical_feature=[13, 14, 15, 16, 419])

vl1 = lgb.Dataset(X_vl, y_vl_log[:, 0], reference=tr1)
vl2 = lgb.Dataset(X_vl, y_vl_log[:, 1], reference=tr2)
vl3 = lgb.Dataset(X_vl, y_vl_log[:, 2], reference=tr3)
vl4 = lgb.Dataset(X_vl, y_vl_log[:, 3], reference=tr4)

# params = {
#     'n_estimators': 4000,
#     'learning_rate': 0.08,
#     'num_leaves': 31,
#     'colsample_bytree': 0.3,
#     'subsample': 0.5,
#     'reg_alpha': 0.1,
#     'reg_lambda': 0.1,
#     'max_bin': 255,
#     'objective': 'mae',
#     'metric': 'mae'
# }

params = {
    'n_estimators': 4000,
    'learning_rate': 0.02,
    'num_leaves': 255,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    'colsample_bytree': 0.5,
    'subsample': 0.95,
    'bagging_freq': 1,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'extra_trees': False,
    'max_bin': 127,
    #'device': 'gpu',
    #'gpu_use_dp': False,
    #'gpu_device_id': 0,
    'boost_from_average': True,
    'reg_sqrt': False,
    'objective': 'mae',
    'metric': 'mae',
    'verbose': -1,
    'seed': 1234786591000,
    'min_data_per_group': 10,
    'cat_l2': 10,
    'cat_smooth': 10,
    'num_threads': 16
}
from sklearn.metrics import mean_absolute_error as mae


In [326]:
bst1 = lgb.train(params, tr1, valid_sets=[vl1], early_stopping_rounds=200, verbose_eval=50)
pred1 = bst1.predict(X_vl)
print(mae(y_vl[:, 0], np.expm1(pred1)))   # 1.08



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.253433
[100]	valid_0's l1: 0.232622
[150]	valid_0's l1: 0.226322
[200]	valid_0's l1: 0.223521
[250]	valid_0's l1: 0.222098
[300]	valid_0's l1: 0.221225
[350]	valid_0's l1: 0.220672
[400]	valid_0's l1: 0.220318
[450]	valid_0's l1: 0.219994
[500]	valid_0's l1: 0.219812
[550]	valid_0's l1: 0.219675
[600]	valid_0's l1: 0.219442
[650]	valid_0's l1: 0.219273
[700]	valid_0's l1: 0.219136
[750]	valid_0's l1: 0.219028
[800]	valid_0's l1: 0.218942
[850]	valid_0's l1: 0.218893
[900]	valid_0's l1: 0.218879
[950]	valid_0's l1: 0.218855
[1000]	valid_0's l1: 0.218836
[1050]	valid_0's l1: 0.21882
[1100]	valid_0's l1: 0.21882
[1150]	valid_0's l1: 0.218821
[1200]	valid_0's l1: 0.218825
[1250]	valid_0's l1: 0.218812
[1300]	valid_0's l1: 0.218811
[1350]	valid_0's l1: 0.218816
[1400]	valid_0's l1: 0.218813
[1450]	valid_0's l1: 0.218814
Early stopping, best iteration is:
[1287]	valid_0's l1: 0.21881
1.0975758711811299


In [327]:
bst2 = lgb.train(params, tr2, valid_sets=[vl2], early_stopping_rounds=200, verbose_eval=50)
pred2 = bst2.predict(X_vl)
print(mae(y_vl[:, 1], np.expm1(pred2))) 
# 2.22(target means - zero fill value)
# 2.2332 (-1 fill value)
# 2.31 remove 10 mean
# 2.21 add lag 3
# 2.26 removed quantiles
# 2.18 quantiles back and lags till 4
# 2.18 add median for last 30 as well
# 2.177 added lag till 7 days
# 2.179 added lag till 15 days
# 2.114 keep only 7 day lag and try skip 2 days for 1 and 2 day lag
# 2.116 skip 2 for everything
# 2.118 skip 2 for lags, others3 and changed 30 to 40 days for stats
# 2.091 changed 40 to 20 days
# 2.106 changed 20 to 10
# 2.107 changed 10 to 15
# 2.060 added xtra features, awards, roster, txns, pltwitter
# 2.060 added non nan count in rosters for last 20 days
# 1.9986 added scores1 - current
# 1.9734 scores1 lag1
# 1.9643 added scores1 lag 2
# 1.9613 added scores1 lag 3
# 1.9552 added scores1 expnding count last 20
# 1.9588 added scores1 expanding count last 1000
# 1.9537 added scores1 lag 5
# 1.9487 added scores1 lag6
# 1.9429 added scores1 lag7
# 1.9429 added scores1 lag 8
# 1.9438 added scores3 lag0
# 1.9232 replaced scores3 lag0 with expanding sum last 10
# 1.9294 added scores3 sum last 20
# 1.9209 replaced scores3 sum 20 by sum 200
# 1.939 replace scores3 sum 10 by sum 20
# 1.9223 replace scores3 sum 200 by sum 100 
# rolled back to 1.9209 version
# 1.8997 added scores2 lag0, lag1
# 1.8846 added scores2 lag2, lag3
# 1.894 added scores2 lag4, lag5
# 1.8857 removed lag4,5 and added mean last 10
# 1.8874 scores2 -- replace mean10 - sum10
# 1.8932 scores2 sum10 --> sum100
# 1.892 scores2 sum100 -> max10
# rolled back to only lags for scores2
# 1.8675 added scores4 lag0,1
# 1.873 added scores4 lag2
# 1.876 added scores4 lag3
# 1.8798 remove lag2, 3 and add mean 20
# 1.8807 mean20 -> count20
# 1.8722 count20 -> mean200
# rolling back to only lags0,1 for scores4
# 1.8645 scores5 lag0
# 1.8864 add team standings cols (0, 1, 6, 7, 8)
# 1.881 standings - only win loss pct
# removing standings completely
# 1.872 adding team scores2 lag0
# 1.8693 team scores1 lag0
# removed team features completely
# 1.892 add all players scores1 sum for given date
# 1.8610 all players scores2 sum - lag0
# 1.8635 all players scores4 sum - lag0



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.405786
[100]	valid_0's l1: 0.341385
[150]	valid_0's l1: 0.3238
[200]	valid_0's l1: 0.318141
[250]	valid_0's l1: 0.315744
[300]	valid_0's l1: 0.313835
[350]	valid_0's l1: 0.312163
[400]	valid_0's l1: 0.311034
[450]	valid_0's l1: 0.310392
[500]	valid_0's l1: 0.309711
[550]	valid_0's l1: 0.309219
[600]	valid_0's l1: 0.308713
[650]	valid_0's l1: 0.308492
[700]	valid_0's l1: 0.308251
[750]	valid_0's l1: 0.308087
[800]	valid_0's l1: 0.307828
[850]	valid_0's l1: 0.307763
[900]	valid_0's l1: 0.307673
[950]	valid_0's l1: 0.30748
[1000]	valid_0's l1: 0.307292
[1050]	valid_0's l1: 0.307139
[1100]	valid_0's l1: 0.306957
[1150]	valid_0's l1: 0.306854
[1200]	valid_0's l1: 0.306623
[1250]	valid_0's l1: 0.306565
[1300]	valid_0's l1: 0.306528
[1350]	valid_0's l1: 0.306488
[1400]	valid_0's l1: 0.306408
[1450]	valid_0's l1: 0.306334
[1500]	valid_0's l1: 0.306271
[1550]	valid_0's l1: 0.306151
[1600]	valid_0's l1: 0.306102
[

In [328]:
bst3 = lgb.train(params, tr3, valid_sets=[vl3], early_stopping_rounds=200, verbose_eval=50)
pred3 = bst3.predict(X_vl)
print(mae(y_vl[:, 2], np.expm1(pred3)))   # 0.892



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.189993
[100]	valid_0's l1: 0.183489
[150]	valid_0's l1: 0.180153
[200]	valid_0's l1: 0.178669
[250]	valid_0's l1: 0.177954
[300]	valid_0's l1: 0.177592
[350]	valid_0's l1: 0.177408
[400]	valid_0's l1: 0.17728
[450]	valid_0's l1: 0.177172
[500]	valid_0's l1: 0.176985
[550]	valid_0's l1: 0.176949
[600]	valid_0's l1: 0.176857
[650]	valid_0's l1: 0.176795
[700]	valid_0's l1: 0.176765
[750]	valid_0's l1: 0.176728
[800]	valid_0's l1: 0.176734
[850]	valid_0's l1: 0.176708
[900]	valid_0's l1: 0.1767
[950]	valid_0's l1: 0.176692
[1000]	valid_0's l1: 0.17666
[1050]	valid_0's l1: 0.176631
[1100]	valid_0's l1: 0.176484
[1150]	valid_0's l1: 0.176409
[1200]	valid_0's l1: 0.176397
[1250]	valid_0's l1: 0.17637
[1300]	valid_0's l1: 0.176364
[1350]	valid_0's l1: 0.176366
[1400]	valid_0's l1: 0.176366
[1450]	valid_0's l1: 0.176366
Early stopping, best iteration is:
[1283]	valid_0's l1: 0.176364
0.8767324179204725


In [410]:
params1 = {
    'n_estimators': 4000,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    'colsample_bytree': 0.55,
    'subsample': 0.95,
    'bagging_freq': 1,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'extra_trees': False,
    'max_bin': 127,
    #'device': 'gpu',
    #'gpu_use_dp': False,
    #'gpu_device_id': 0,
    'boost_from_average': True,
    'reg_sqrt': False,
    'objective': 'mae',
    'metric': 'mae',
    'verbose': -1,
    'seed': 1234786,
    'min_data_per_group': 10,
    'cat_l2': 10,
    'cat_smooth': 10,
    'num_threads': 16
}
bst4 = lgb.train(params1, tr4, valid_sets=[vl4], early_stopping_rounds=200, verbose_eval=50)



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.315039
[100]	valid_0's l1: 0.316094
[150]	valid_0's l1: 0.319409
[200]	valid_0's l1: 0.321343
[250]	valid_0's l1: 0.322876
Early stopping, best iteration is:
[74]	valid_0's l1: 0.311756


In [411]:
pred4 = bst4.predict(X_vl)
print(mae(y_vl[:, 3], np.expm1(pred4)))   # 1.4466

1.5059481170024789


In [412]:
#pred1 = bst1.predict(X_vl)
#pred2 = bst2.predict(X_vl)
#pred3 = bst3.predict(X_vl)
#pred4 = bst4.predict(X_vl)
preds = np.vstack((pred1, pred2, pred3, pred4)).T
print(mae(y_vl, np.expm1(preds)))   # 1.3549

1.3343184249474347


In [None]:
# 1.7930
# 1.7708 - added lag of flags
# 1.7368 - added categorical features from box score
# 1.6982 - added batter scores
# 1.6890       - added pitcher scores
# 1.6892   - added remaining features ( :-( )
# 1.6883     - added pitcher lags 
# 1.6810   - adde batter lags
# 1.6777   - more pitcher lags
# 1.5774 - added target mean (skip - 30 and last 365)
# change hyperparams - colsamplebytree 0.7 --> 0.4; 1.6814 (reverting)
# 1.534 - changed val to from 10 april - 
# 1.517 - added lag3
# 1.5146 - changed hyperparams - colsample to 0.5
# 1.509 - num_leaves 255
# 1.5078  - min_leaf_samples 20
# 1.4884 - added statusCode
# 1.4891 - made it cat
# 1.4795 - removed cat encoding
# 1.4455 - fixed last n expanding mean and added last 10
# 1.4396 - added last 10 stats
# 1.4379 - last 5 innings sum of pitching scores
# 1.4322 - last 5 mean scores
# 1.4278 - expading mean batter scores
# 1.4257 - expanding mean pitcher scores
# 1.4070 - expanding sum - last 10 scores 
# 1.3687 -- changed validation to last 15 days
# 1.3602 - changed target means from 10 days to 15 days 
# 1.3556 - changed from 15 to 30 days
# 1.355 - changes days a bit
# 1.3499 - changed lr to 0.05
# 1.346 - changed hyperparams

In [406]:
bst1.save_model("artifacts/bst1_train_v3.pkl")
bst2.save_model("artifacts/bst2_train_v3.pkl")
bst3.save_model("artifacts/bst3_train_v3.pkl")
bst4.save_model("artifacts/bst4_train_v3.pkl")

<lightgbm.basic.Booster at 0x7f18d033acd0>

In [None]:
np.save("data/lgb_t1.npy", pred1)
np.save("data/lgb_t2.npy", pred2)
np.save("data/lgb_t3.npy", pred3)
np.save("data/lgb_t4.npy", pred4)


In [None]:
from catboost import CatBoostRegressor

In [None]:
model1 = CatBoostRegressor(loss_function='MAE', learning_rate=0.1, iterations=2000, grow_policy='Lossguide', max_leaves=255, min_data_in_leaf=20, 
                          )
model1.fit(X_tr1, y_tr[:, 0], eval_set=(X_vl1, y_vl[:, 0]), verbose_eval=50, early_stopping_rounds=200)

In [None]:
pred12= model1.predict(X_vl1)


In [None]:
mae(y_vl[:, 0], np.mean([pred1, pred12], 0))

In [None]:
mae(y_vl[:, 0], pred1*0.7+pred12*0.3)

In [None]:
pd.read_csv("data/seasons.csv")

In [None]:
# TODO
# Running stats on rank per day