In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.pipeline import make_union, make_pipeline

from mllib.transformers import *
from src.pipelines.artifacts import *
from src.constants import (
    TARGETS,
    PLTWITTER,
    SCORES1,
    SCORES2,
    SCORES3,
    SCORES4,
    SCORES5,
    TEAM_SCORES1,
    TEAM_SCORES2,
    TEAM_SCORES3,
    TEAM_STANDINGS,
    AWARDS,
    ROSTERS,
    TRANSACTIONS,
    AWARDID_DICT
)
from src.constants import (
    playerid_mapping,
    teamid_mapping,
    targets_artifact,
    scores1_mean_artifact,
    scores1_first_artifact,
    scores1_last_artifact,
    scores2_mean_artifact,
    scores2_first_artifact,
    scores2_last_artifact,
    scores3_mean_artifact,
    scores3_first_artifact,
    scores3_last_artifact,
    scores4_mean_artifact,
    scores4_first_artifact,
    scores4_last_artifact,
    scores5_mean_artifact,
    scores5_first_artifact,
    scores5_last_artifact,
    team_scores1_mean_artifact,
    team_scores2_mean_artifact,
    team_scores3_mean_artifact,
    awards_artifact,
    rosters_artifact,
    player_twitter_artifact,
    transactions_artifact,
    team_standings_artifact,
    event_artifact
)

In [2]:
# Run configuration shared by the cells below.
TRAIN_FILE = "data/train_updated.csv"  # raw training CSV loaded with pd.read_csv
VAL_START_DATE = 20210701  # rows with raw_data.date >= this value form the validation split
DEVICE = 'gpu'  # forwarded as `device=` to the feature transformers
device = DEVICE  # lower-case alias: some pipeline definitions use `device`, others `DEVICE`
artifacts_path = 'data/artifacts/v02'  # directory prefix of the artifact files the transformers read

In [3]:
# Chronological split of the raw rows at VAL_START_DATE.
raw_data = pd.read_csv(TRAIN_FILE)
tr = raw_data[raw_data.date < VAL_START_DATE]
val = raw_data[raw_data.date >= VAL_START_DATE]
print(raw_data.shape, val.shape)

# Keep only the players flagged for the test set / future predictions.
roster_2021 = pd.read_csv("data/players.csv").loc[
    lambda players: players.playerForTestSetAndFuturePreds == True
]

target_enc = ParsePlayerData("nextDayPlayerEngagement", TARGETS)


def _index_for_roster(daily_rows):
    """Parse the per-player targets from ``daily_rows`` and keep roster players only."""
    parsed = target_enc.fit_transform(daily_rows).reset_index(drop=False)
    on_roster = parsed.playerId.isin(roster_2021.playerId.astype(str))
    return parsed.loc[on_roster]


tr_index = _index_for_roster(tr)
# tr_index['debutdate'] = tr_index.map()
vl_index = _index_for_roster(val)

# Cache both index frames on disk for the cells that follow.
tr_index.to_csv("data/tr_index_smallv02.csv", index=False)
vl_index.to_csv("data/vl_index_smallv02.csv", index=False)

  1%|          | 7/1277 [00:00<00:19, 65.40it/s]

(1294, 12) (17, 12)


100%|██████████| 1277/1277 [00:18<00:00, 68.02it/s]
100%|██████████| 17/17 [00:00<00:00, 69.03it/s]


In [11]:
# raw_data = pd.read_csv("data/train_updated.csv")
# vl = raw_data.loc[raw_data.date > 20210501]
# roster_2021 = pd.read_csv("data/players.csv")
# roster_2021 = roster_2021.loc[roster_2021.playerForTestSetAndFuturePreds == True]
# target_enc = ParsePlayerData("nextDayPlayerEngagement", TARGETS)
# vl_index2 = target_enc.fit_transform(vl).reset_index(drop=False)
# vl_index2 = vl_index2.loc[vl_index2.playerId.isin(roster_2021.playerId.astype(str))]
# vl_index2.to_csv("data/vl_index_small2.csv", index=False)

In [4]:
# Reload the cached train / validation index frames from disk.
tr_index, vl_index = (
    pd.read_csv(csv_path)
    for csv_path in ("data/tr_index_smallv02.csv", "data/vl_index_smallv02.csv")
)

(tr_index.shape, vl_index.shape)

((1515799, 6), (20179, 6))

In [5]:
# Per-date season calendar; the preview shows date, seasonflag, season_start, season_end and all_star columns.
seasons = pd.read_csv("data/seasons_formatted.csv")
seasons.head()

Unnamed: 0,date,seasonflag,season_start,season_end,all_star
0,20180101,0,2018-03-29,2018-10-28,2018-07-17
1,20180102,0,2018-03-29,2018-10-28,2018-07-17
2,20180103,0,2018-03-29,2018-10-28,2018-07-17
3,20180104,0,2018-03-29,2018-10-28,2018-07-17
4,20180105,0,2018-03-29,2018-10-28,2018-07-17


In [6]:
# Human-readable names for feature columns, built one group at a time.
# The nesting order of each comprehension fixes the order of the names,
# so outer/inner loops are deliberately different from group to group.
_WINDOWS = [10, 30, 300]

f1 = [f'scores3_{col}' for col in range(11)]
f2 = [f'scores3_count_last{win}_last{col}' for col in range(11) for win in _WINDOWS]
f3 = [f'scores1_{col}_lag{lag}' for lag in range(2) for col in range(5)]
f4 = [f'scores2_{col}_lag{lag}' for col in range(24) for lag in range(2)]
f5 = [f'scores2_{col}_mean{win}' for col in range(24) for win in _WINDOWS]
f6 = [f'scores2_{col}_sum{win}' for col in range(24) for win in _WINDOWS]
f7 = [f'scores4_{col}_lag{lag}' for col in range(15) for lag in range(2)]
f8 = [f'scores4_{col}_mean{win}' for col in range(15) for win in _WINDOWS]
f9 = [f'scores4_{col}_sum{win}' for col in range(15) for win in _WINDOWS]
f10 = [f'scores5_{col}_lag{lag}' for col in range(21) for lag in range(1)]
f11 = [f'target_{tgt}_mean{win}' for win in [10, 28, 365, 1500] for tgt in range(4)]
f12 = [f'target_{tgt}_{agg}' for agg in ['max', 'min', 'q75', 'q25'] for tgt in range(4)]
f13 = ['last_award', 'num_awards', 'last_txn', 'is_active', 'player_twitter', 'position_code']

# Flatten the groups in order f1 .. f13.
fnames = [
    name
    for group in (f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13)
    for name in group
]
len(fnames)

425

In [7]:
# Cumulative name counts after groups f1-f3 and after groups f1-f6.
sum(map(len, (f1, f2, f3))), sum(map(len, (f1, f2, f3, f4, f5, f6)))

(54, 246)

In [8]:
# ExpandingMean features over target columns 0-3, one transformer per N.
# The training union uses skip=10; `target_stats_test` uses skip=1.
target_stats_train = make_union(
    *[
        ExpandingMean('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/{targets_artifact}', N=n_hist, skip=10, device=DEVICE, fill_value=0)
        for n_hist in (7, 30, 60, 300)
    ],
    # Disabled experiments on the same target columns (kept as a record):
    #   ExpandingMedian (N=2000, N=20), ExpandingMax / ExpandingMin (N=30),
    #   ExpandingQ75 / ExpandingQ25 (N=30 and N=100), LagN with N=1..7.
)

# Test-time counterpart of `target_stats_train`: same ExpandingMean
# transformers over target columns 0-3, but with skip=1 instead of skip=10.
target_stats_test = make_union(
    *[
        ExpandingMean('date', 'playerId', list(range(0, 4)), f'{artifacts_path}/{targets_artifact}', N=n_hist, skip=1, device=DEVICE, fill_value=0)
        for n_hist in (7, 30, 60, 300)
    ],
    # Disabled experiments on the same target columns (kept as a record):
    #   ExpandingMedian (N=2000, N=20), ExpandingMax / ExpandingMin (N=30),
    #   ExpandingQ75 / ExpandingQ25 (N=30 and N=100), LagN with N=1..7.
)

# Awards, transactions, roster and player-twitter features.
# Artifact file names are spelled out here rather than taken from src.constants.
_awards_pkl = f'{artifacts_path}/train_awards.pkl'
_transactions_pkl = f'{artifacts_path}/train_transactions.pkl'
_rosters_pkl = f'{artifacts_path}/train_rosters.pkl'
_pltwitter_pkl = f'{artifacts_path}/train_pltwitter.pkl'

other_features = make_union(
    # awards: lagged value plus a count with N=365 (an N=30 count was tried and disabled)
    LagN('date', 'playerId', [0], _awards_pkl, fill_value=-1, N=1, skip=0, device=device),
    ExpandingCount('date', 'playerId', [0], _awards_pkl, fill_value=0, N=365, skip=0, device=device),
    # transactions: three columns, lagged
    LagN('date', 'playerId', [0, 1, 2], _transactions_pkl, fill_value=-1, N=1, skip=0, device=device),
    # rosters: lagged value plus counts with N=30 and N=300
    LagN('date', 'playerId', [0], _rosters_pkl, fill_value=-1, N=1, skip=0, device=device),
    *[
        ExpandingCount('date', 'playerId', [0], _rosters_pkl, fill_value=0, N=n_hist, skip=0, device=device)
        for n_hist in (30, 300)
    ],
    # player twitter: lagged value passed through log1p
    make_pipeline(
        LagN('date', 'playerId', [0], _pltwitter_pkl, fill_value=0, N=1, skip=0, device=device),
        FunctionTransfomer(np.log1p),
    ),
)

# scores1 features, assembled in the order the union emits them:
#   1. LagN N=1 for columns 0-3 (fill -1) and column 4 (fill 0)
#   2. LagN N=2..14 for columns 0 and 4
#   3. ExpandingCount N=30 on column 0
# (An ExpandingCount on column 4 was tried and disabled.)
_scores1_path = f'{artifacts_path}/{scores1_mean_artifact}'

_scores1_parts = [
    LagN('date', 'playerId', [0, 1, 2, 3], _scores1_path, fill_value=-1, N=1, skip=0, device=device),
    LagN('date', 'playerId', [4], _scores1_path, N=1, skip=0, fill_value=0, device=device),
]
_scores1_parts.extend(
    LagN('date', 'playerId', [0, 4], _scores1_path, fill_value=-1, N=n_back, skip=0, device=device)
    for n_back in range(2, 15)
)
_scores1_parts.append(
    ExpandingCount('date', 'playerId', [0], _scores1_path, fill_value=0, N=30, skip=0, device=device)
)

scores1 = make_union(*_scores1_parts)

# scores3 features: the 11 columns are processed in three groups; every group
# gets a LagN (N=1) and ExpandingSum transformers with N=10 and N=200.
_scores3_path = f'{artifacts_path}/{scores3_mean_artifact}'
_scores3_groups = ([0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10])

scores3 = make_union(
    *[
        LagN('date', 'playerId', list(cols), _scores3_path, fill_value=0, N=1, skip=0, device=device)
        for cols in _scores3_groups
    ],
    *[
        ExpandingSum('date', 'playerId', list(cols), _scores3_path, fill_value=0, N=n_hist, skip=0, device=device)
        for n_hist in (10, 200)
        for cols in _scores3_groups
    ],
)

# scores2 features: 24 columns handled as six groups of four.
# Union order: LagN (N=1..3 per group), ExpandingMean N=30,
# ExpandingSum N=30, ExpandingSum N=150.
_scores2_path = f'{artifacts_path}/{scores2_mean_artifact}'
_scores2_groups = [list(range(first, first + 4)) for first in range(0, 24, 4)]

scores2 = make_union(
    *[
        LagN('date', 'playerId', list(cols), _scores2_path, fill_value=0, N=n_back, skip=0, device=device)
        for cols in _scores2_groups
        for n_back in (1, 2, 3)
    ],
    *[
        ExpandingMean('date', 'playerId', list(cols), _scores2_path, fill_value=0, N=30, skip=0, device=device)
        for cols in _scores2_groups
    ],
    *[
        ExpandingSum('date', 'playerId', list(cols), _scores2_path, fill_value=0, N=n_hist, skip=0, device=device)
        for n_hist in (30, 150)
        for cols in _scores2_groups
    ],
)

# scores4 features over columns 0-14 (three groups of four plus [12, 13, 14]):
# LagN with N=1 and N=2, ExpandingMean with N=30, ExpandingSum with N=30 and N=150.
# The order of the entries determines the order of the transformers in the union.
# NOTE(review): the two ExpandingSum entries for columns [12, 13, 14] read
# `scores4_last_artifact`, while every other transformer here reads
# `scores4_mean_artifact` — confirm this is intentional before reusing or
# refactoring; changing it alters the features the saved models were trained on.
scores4 = make_union(
    # Lags N=1, 2 for columns 0-11, in groups of four
    *[
        LagN('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores4_mean_artifact}', fill_value=0, N=j+1, skip=0, device=device)
        for i in range(3) for j in range(2)
     ],
    # Lags N=1, 2 for columns 12-14
    *[
        LagN('date', 'playerId', [12, 13, 14], f'{artifacts_path}/{scores4_mean_artifact}', fill_value=0, N=j+1, skip=0, device=device)
        for j in range(2)
     ],
    # ExpandingMean N=30 for columns 0-11
    *[
        ExpandingMean('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores4_mean_artifact}', fill_value=0, N=j, skip=0, device=device)
        for i in range(3) for j in [30]
     ],
    # ExpandingMean N=30 for columns 12-14
    *[
        ExpandingMean('date', 'playerId', [12, 13, 14], f'{artifacts_path}/{scores4_mean_artifact}', fill_value=0, N=j, skip=0, device=device)
        for j in [30]
     ],
    # ExpandingSum N=30 for columns 0-11
    *[
        ExpandingSum('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores4_mean_artifact}', fill_value=0, N=j, skip=0, device=device)
        for i in range(3) for j in [30]
     ],
    # ExpandingSum N=30 for columns 12-14 — reads the *last* artifact (see NOTE above)
    *[
        ExpandingSum('date', 'playerId', [12, 13, 14], f'{artifacts_path}/{scores4_last_artifact}', fill_value=0, N=j, skip=0, device=device)
        for j in [30]
     ],
    # ExpandingSum N=150 for columns 0-11
    *[
        ExpandingSum('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores4_mean_artifact}', fill_value=0, N=j, skip=0, device=device)
        for i in range(3) for j in [150]
     ],
    # ExpandingSum N=150 for columns 12-14 — reads the *last* artifact (see NOTE above)
    *[
        ExpandingSum('date', 'playerId', [12, 13, 14], f'{artifacts_path}/{scores4_last_artifact}', fill_value=0, N=j, skip=0, device=device)
        for j in [150]
     ],
)


# scores5 features. LagN (N=1) covers columns 0-19 in groups of four plus column 20;
# the ExpandingSum (N=30, N=150) and ExpandingMean (N=30) entries only cover
# columns 0-11 (groups of four) and [12, 13, 14].
# NOTE(review): columns 15-20 get a lag but no expanding statistics — confirm
# that this is deliberate and not a carry-over from the scores4 column layout.
scores5 = make_union(
    # Lag N=1 for columns 0-19
    *[
        LagN('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores5_mean_artifact}', fill_value=0, N=j+1, skip=0, device=device)
        for i in range(5) for j in range(1)
     ],
    # Lag N=1 for column 20
    *[
        LagN('date', 'playerId', [20], f'{artifacts_path}/{scores5_mean_artifact}', fill_value=0, N=j+1, skip=0, device=device)
        for j in range(1)
     ],
    # ExpandingSum N=30 for columns 0-11
    *[
        ExpandingSum('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores5_mean_artifact}', fill_value=0, N=j, skip=0, device=device)
        for i in range(3) for j in [30]
     ],
    # ExpandingSum N=150 for columns 0-11
    *[
        ExpandingSum('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores5_mean_artifact}', fill_value=0, N=j, skip=0, device=device)
        for i in range(3) for j in [150]
     ],
    # ExpandingSum N=30 for columns 12-14
    *[
        ExpandingSum('date', 'playerId', [12, 13, 14], f'{artifacts_path}/{scores5_mean_artifact}', fill_value=0, N=j, skip=0, device=device)
        for j in [30]
     ],
    # ExpandingSum N=150 for columns 12-14
    *[
        ExpandingSum('date', 'playerId', [12, 13, 14], f'{artifacts_path}/{scores5_mean_artifact}', fill_value=0, N=j, skip=0, device=device)
        for j in [150]
     ],
    # ExpandingMean N=30 for columns 0-11
    *[
        ExpandingMean('date', 'playerId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores5_mean_artifact}', fill_value=0, N=j, skip=0, device=device)
        for i in range(3) for j in [30]
     ],
    # ExpandingMean N=30 for columns 12-14
    *[
        ExpandingMean('date', 'playerId', [12, 13, 14], f'{artifacts_path}/{scores5_mean_artifact}', fill_value=0, N=j, skip=0, device=device)
        for j in [30]
     ],
)

# DateLagN features keyed on 'date' only (no player key is passed), built from
# the scores2 and scores4 artifacts with N=1.
# NOTE(review): the scores4 comprehension iterates range(6), i.e. column indices
# 0-23, and [12, 13, 14] is then requested again, whereas the per-player `scores4`
# union only touches columns 0-14. This looks copied from the scores2 entry —
# confirm the intended column set (changing it alters the feature-matrix width).
all_players = make_union(
    # scores2: columns 0-23 in groups of four
    *[
        DateLagN('date', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores2_mean_artifact}', N=j+1, skip=0, device=device)
        for i in range(6) for j in range(1)
     ],
    # scores4: column indices 0-23 in groups of four (see NOTE above)
    *[
        DateLagN('date', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{scores4_mean_artifact}', N=j+1, skip=0, device=device)
        for i in range(6) for j in range(1)
     ],
    # scores4: columns 12-14 once more
    *[
        DateLagN('date', [12, 13, 14], f'{artifacts_path}/{scores4_mean_artifact}', N=j+1, skip=0, device=device)
        for j in range(1)
     ],
)

# Team-standings features: a two-step pipeline that first adds a 'teamId'
# feature from roster column 1, then looks up standings columns 0-39 keyed on
# that 'teamId' (ten LagN transformers of four columns each).
# This pipeline is defined but is not part of feature_pipeline_tr / feature_pipeline_te.
team_standings = make_pipeline(
    AddFeature(
        name='teamId',
        pipe=LagN('date', 'playerId', [1], f'{artifacts_path}/train_rosters.pkl', fill_value=-1, N=1, skip=0, device=device),
    ),
    make_union(
        *[
            LagN('date', 'teamId', list(range(i*4, (i+1)*4)), f'{artifacts_path}/{team_standings_artifact}', N=1, fill_value=-1, skip=0, device=device)
            for i in range(10)
         ],

        verbose=True
    ), verbose=True
)



In [9]:
# TODO: One more round of feature engineering -> get all MLB stats
# more datetime features
# Tune on new data, tune features to this season
# Current score prediction task
# stats for current seasons
# only in season data ??

In [10]:
# The train and test pipelines differ only in their target-statistics union;
# every other feature block is the same transformer object in both.
_shared_feature_blocks = (other_features, scores1, scores2, scores3, scores4, scores5, all_players)

feature_pipeline_tr = make_union(target_stats_train, *_shared_feature_blocks)
feature_pipeline_te = make_union(target_stats_test, *_shared_feature_blocks)

In [11]:
#X_tr01 = target_stats_train.transform(tr_index)
#X_vl01 = target_stats_test.transform(vl_index)

# X_tr02 = other_features.transform(tr_index)
# X_vl02 = other_features.transform(vl_index)

# X_tr03 = scores1.transform(tr_index)
# X_vl03 = scores1.transform(vl_index)

# X_tr04 = scores3.transform(tr_index)
# X_vl04 = scores3.transform(vl_index)

# X_tr05 = scores2.transform(tr_index)
# X_vl05 = scores2.transform(vl_index)

#X_tr06 = scores4.transform(tr_index)
#X_vl06 = scores4.transform(vl_index)

# X_tr07 = scores5.transform(tr_index)
# X_vl07 = scores5.transform(vl_index)

#X_tr08 = all_players.transform(tr_index)
#X_vl08 = all_players.transform(vl_index)

#X_tr09 = team_standings.transform(tr_index)
#X_vl09 = team_standings.transform(vl_index)


In [12]:
%%time
# Build the training feature matrix; the recorded run took about 8min 47s of wall time.
X_tr = feature_pipeline_tr.transform(tr_index)

CPU times: user 8min 29s, sys: 18.4 s, total: 8min 47s
Wall time: 8min 47s


In [13]:
%%time
# Build the validation feature matrix with the test-time pipeline; about 28 s in the recorded run.
X_vl = feature_pipeline_te.transform(vl_index)

CPU times: user 26 s, sys: 1.98 s, total: 28 s
Wall time: 27.9 s


In [14]:
# Event features: LagN (N=1) over event-artifact columns 0-15,
# requested in four consecutive groups of four columns.
new_pipe = make_union(
    *[
        LagN('date', 'playerId', list(range(first_col, first_col + 4)), f'{artifacts_path}/{event_artifact}', fill_value=0, N=1, skip=0, device=device)
        for first_col in range(0, 16, 4)
    ]
)

In [15]:
# Event features for the training and validation index frames, from the same `new_pipe` union.
X_tr2 = new_pipe.transform(tr_index)
X_vl2 = new_pipe.transform(vl_index)

In [16]:
# Summary statistics of event-feature column 15 on the validation split (all zeros in the recorded output).
pd.Series(X_vl2[:, 15]).describe()

count    20179.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
dtype: float64

In [17]:
# Shape of the validation event-feature matrix: (20179, 16) in the recorded run.
X_vl2.shape

(20179, 16)

In [18]:
#X_tr_sub = X_tr[np.sum(X_tr[:, 21] != 0]
#X_tr_sub.shape

In [19]:
#np.corrcoef(X_tr2[:, 14], y_tr[:, 2])

In [20]:
#X_tr.shape, X_vl.shape

In [22]:
# Cache the base feature matrices as .npy files (event features and the season flag are stacked on later).
np.save("data/X_tr1_v202_skip10.npy", X_tr)
np.save("data/X_vl1_v202_skip10.npy", X_vl)

In [None]:
# X_tr = np.load("data/X_tr1_v201_skip10.npy")
# X_vl = np.load("data/X_vl1_v201_skip10.npy")
# # remove scores2 - 15

In [None]:
# Tag each index row with the season flag for its date, append the event
# features and the flag as extra columns, and keep only rows with flag > 0.
# The training cell reads tr_index.seasonflag / vl_index.seasonflag, so this
# cell has to run before it (the recorded AttributeError there is what happens
# when it has not).
_flag_by_date = seasons.set_index('date')['seasonflag']
tr_index['seasonflag'] = tr_index.date.map(_flag_by_date)
vl_index['seasonflag'] = vl_index.date.map(_flag_by_date)

_tr_flag_column = tr_index.seasonflag.values.reshape(-1, 1)
_vl_flag_column = vl_index.seasonflag.values.reshape(-1, 1)

X_tr1 = np.hstack((X_tr, X_tr2, _tr_flag_column))[tr_index.seasonflag > 0]
X_vl1 = np.hstack((X_vl, X_vl2, _vl_flag_column))[vl_index.seasonflag > 0]

In [23]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error as mae

targets = ['target1', 'target2', 'target3', 'target4']

# Labels restricted to the same in-season rows as X_tr1 / X_vl1.
# Needs the `seasonflag` column on both index frames.
y_tr = tr_index[targets].values[tr_index.seasonflag > 0]
y_vl = vl_index[targets].values[vl_index.seasonflag > 0]
# print(np.unique(X_tra[:, 235]))

# One Dataset per target column; every validation set references its own
# training set. (categorical_feature=[13, 14, 15, 16, 419] was tried and disabled.)
tr1, tr2, tr3, tr4 = (lgb.Dataset(X_tr1, y_tr[:, col]) for col in range(4))
vl1, vl2, vl3, vl4 = (
    lgb.Dataset(X_vl1, y_vl[:, col], reference=train_set)
    for col, train_set in enumerate((tr1, tr2, tr3, tr4))
)

# An earlier, lighter configuration (n_estimators 4000, learning_rate 0.08,
# num_leaves 31, colsample_bytree 0.3, subsample 0.5, reg_alpha/reg_lambda 0.1,
# max_bin 255, objective/metric 'mae') was replaced by the one below.
params = {
    'n_estimators': 5000,
    'learning_rate': 0.02,
    'num_leaves': 255,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    'colsample_bytree': 0.4,
    'subsample': 0.95,
    'bagging_freq': 1,
    'reg_alpha': 0.2,
    'reg_lambda': 0.2,
    'extra_trees': False,
    'max_bin': 127,
    #'device': 'gpu',
    #'gpu_use_dp': False,
    #'gpu_device_id': 0,
    'boost_from_average': True,
    'reg_sqrt': True,
    'objective': 'mae',
    'metric': 'mae',
    'verbose': -1,
    'seed': 786,
    'min_data_per_group': 10,
    'cat_l2': 10,
    'cat_smooth': 10,
    'num_threads': 16
}

# Target 1: train with early stopping on the validation set, then score by MAE.
bst1 = lgb.train(
    params,
    tr1,
    valid_sets=[vl1],
    early_stopping_rounds=200,
    verbose_eval=50,
)
pred21 = bst1.predict(X_vl1)
print(mae(y_vl[:, 0], pred21))   # 0.9509 (skip 3); 0.9478 (skip 10); 0.9481 (skip 15); 0.9473 (skip 7); 0.9475 (skip 12)
# sticking with skip10
# targetmean7 -> 10; 0.9477
#

AttributeError: 'DataFrame' object has no attribute 'seasonflag'

In [29]:
from itertools import zip_longest
a = [1, 2, 3, 4, 5]


def batch_list(a, n=4):
    """Split ``a`` into consecutive batches of at most ``n`` items.

    Args:
        a: Any iterable (list, tuple, string, generator, ...). It is consumed
            once, front to back, so inputs without ``len()`` or indexing work.
        n: Maximum number of items per batch; must be at least 1.

    Returns:
        A list of lists. Every batch holds exactly ``n`` items except possibly
        the last one, which holds the remainder. An empty input gives ``[]``.

    Raises:
        ValueError: If ``n`` is smaller than 1 (the previous implementation
            failed with an unrelated ZeroDivisionError for ``n == 0``).
    """
    if n < 1:
        raise ValueError(f"batch size n must be >= 1, got {n!r}")

    out = []
    tmp = []
    for item in a:
        tmp.append(item)
        # A full batch is flushed immediately so `tmp` never exceeds n items.
        if len(tmp) == n:
            out.append(tmp)
            tmp = []
    # Whatever is left over forms the final, shorter batch.
    if tmp:
        out.append(tmp)
    return out


batch_list(a, 4)

[[1, 2, 3, 4], [5]]

In [None]:
def _fit_and_score(train_set, valid_set, target_col):
    """Train one booster with early stopping, predict on X_vl1 and print its validation MAE.

    Returns the trained booster and its validation predictions.
    """
    booster = lgb.train(
        params,
        train_set,
        valid_sets=[valid_set],
        early_stopping_rounds=200,
        verbose_eval=50,
    )
    valid_pred = booster.predict(X_vl1)
    print(mae(y_vl[:, target_col], valid_pred))
    return booster, valid_pred


# Target 2
# (skip 15) 1.5609, 1.5578 (skip 10), 1.5864 (skip 3), 1.5665 (skip 7); 1.5591 (skip 12)
# targetmean7 -> 10 ; 1.5473
# targetmean10, 30 -> 15, 60; 1.5417
bst2, pred22 = _fit_and_score(tr2, vl2, 1)

# Target 3
# (skip 15) 0.7469, 0.7459 (skip 10), 0.7488 (skip 3), 0.7463 (skip 7); 0.7442 (skip 12)
# targetmean7 -> 10 ; 0.7465
# targetmean10, 30 -> 15, 60; 0.7470
bst3, pred23 = _fit_and_score(tr3, vl3, 2)

# Target 4
# (skip 15) 1.1930 , 1.1812 (skip 10), 1.1863 (skip 3); 1.1818 (skip 7); 1.1834 (skip 12)
# targetmean7 -> 10 1.1974;
# targetmean10, 30 -> 15, 60; 1.1931
bst4, pred24 = _fit_and_score(tr4, vl4, 3)

# All four targets side by side, scored jointly.
# 1.1122 (skip 15), 1.1082 (skip 10), 1.1181 (skip 3); 1.1105 (skip 7); 1.1085 (skip 12)
# targetmean7 -> 10 1.1097;
# targetmean10, 30 -> 15, 60; 1.1076
preds_2 = np.vstack((pred21, pred22, pred23, pred24)).T
print(mae(y_vl, preds_2))



In [None]:
# Persist the four trained boosters, one file per target.
# NOTE(review): Booster.save_model writes LightGBM's own model-file format rather
# than a pickle, so the .pkl suffix is misleading — confirm that whatever loads
# these files uses lgb.Booster(model_file=...) and not pickle.
bst1.save_model("artifacts/bst1_train_v401.pkl")
bst2.save_model("artifacts/bst2_train_v401.pkl")
bst3.save_model("artifacts/bst3_train_v401.pkl")
bst4.save_model("artifacts/bst4_train_v401.pkl")

In [None]:
# Save the per-target validation predictions.
# NOTE(review): the file names say "skip3", but target_stats_train uses skip=10 and
# the cached feature matrices are named "..._skip10" — confirm the label is right.
np.save("data/lgb_t1_logv401_skip3.npy", pred21)
np.save("data/lgb_t2_logv401_skip3.npy", pred22)
np.save("data/lgb_t3_logv401_skip3.npy", pred23)
np.save("data/lgb_t4_logv401_skip3.npy", pred24)


In [None]:
# iterations with leaky validation

# 2.22 (target means and stats over 20 days - zero fill value)
# 2.2332 (-1 fill value)
# 2.31 remove 10 mean
# 2.21 add lag 3
# 2.26 removed quantiles
# 2.18 quantiles back and lags till 4
# 2.18 add median for last 30 as well
# 2.177 added lag till 7 days
# 2.179 added lag till 15 days
# 2.114 keep only 7 day lag and try skip 2 days for 1 and 2 day lag
# 2.116 skip 2 for everything
# 2.118 skip 2 for lags, others3 and changed 30 to 40 days for stats
# 2.091 changed 40 to 20 days
# 2.106 changed 20 to 10
# 2.107 changed 10 to 15
# 2.060 added xtra features, awards, roster, txns, pltwitter
# 2.060 added non nan count in rosters for last 20 days
# 1.9986 added scores1 - current
# 1.9734 scores1 lag1
# 1.9643 added scores1 lag 2
# 1.9613 added scores1 lag 3
# 1.9552 added scores1 expanding count last 20
# 1.9588 added scores1 expanding count last 1000
# 1.9537 added scores1 lag 5
# 1.9487 added scores1 lag6
# 1.9429 added scores1 lag7
# 1.9429 added scores1 lag 8
# 1.9438 added scores3 lag0
# 1.9232 replaced scores3 lag0 with expanding sum last 10
# 1.9294 added scores3 sum last 20
# 1.9209 replaced scores3 sum 20 by sum 200
# 1.939 replace scores3 sum 10 by sum 20
# 1.9223 replace scores3 sum 200 by sum 100 
# rolled back to 1.9209 version
# 1.8997 added scores2 lag0, lag1
# 1.8846 added scores2 lag2, lag3
# 1.894 added scores2 lag4, lag5
# 1.8857 removed lag4,5 and added mean last 10
# 1.8874 scores2 -- replace mean10 - sum10
# 1.8932 scores2 sum10 --> sum100
# 1.892 scores2 sum100 -> max10
# rolled back to only lags for scores2
# 1.8675 added scores4 lag0,1
# 1.873 added scores4 lag2
# 1.876 added scores4 lag3
# 1.8798 remove lag2, 3 and add mean 20
# 1.8807 mean20 -> count20
# 1.8722 count20 -> mean200
# rolling back to only lags0,1 for scores4
# 1.8645 scores5 lag0
# 1.8864 add team standings cols (0, 1, 6, 7, 8)
# 1.881 standings - only win loss pct
# removing standings completely
# 1.872 adding team scores2 lag0
# 1.8693 team scores1 lag0
# removed team features completely
# 1.892 add all players scores1 sum for given date
# 1.8610 all players scores2 sum - lag0
# 1.8635 all players scores4 sum - lag0

In [None]:
# Load per-target predictions stored under the "logv1" file names.
# No visible cell writes these files, so they presumably come from an earlier run — verify.
predst1_lgb = np.load("data/lgb_t1_logv1.npy")
predst2_lgb = np.load("data/lgb_t2_logv1.npy")
predst3_lgb = np.load("data/lgb_t3_logv1.npy")
predst4_lgb = np.load("data/lgb_t4_logv1.npy")

In [None]:
# Equal-weight blend of the loaded predictions with this notebook's predictions, scored by MAE.
# Assumes the loaded arrays are row-aligned with y_vl / preds_2 (same in-season rows) — TODO confirm.
preds1_lgb = np.vstack((predst1_lgb, predst2_lgb, predst3_lgb, predst4_lgb)).T
mae(y_vl, 0.5*preds1_lgb+0.5*preds_2)

In [None]:
    # Comprehension-based variants of target_stats_train / target_stats_test:
    # one ExpandingMean per N in [7, 30, 60, 300], with skip=10 for training and skip=1 for test.
    # NOTE(review): these calls pass key_cols / hist_data_path by keyword and omit the
    # 'date' and 'playerId' positional arguments used by the earlier definitions —
    # confirm against ExpandingMean's signature before relying on them.
    target_stats_train2 = make_union(
        *[
            ExpandingMean(
                key_cols=[0, 1, 2, 3],
                hist_data_path=f"{artifacts_path}/{targets_artifact}",
                N=j,
                skip=10,
                device=DEVICE,
                fill_value=0,
            )
            for j in [7, 30, 60, 300]
        ],
    )

    target_stats_test2 = make_union(
        *[
            ExpandingMean(
                key_cols=[0, 1, 2, 3],
                hist_data_path=f"{artifacts_path}/{targets_artifact}",
                N=j,
                skip=1,
                device=DEVICE,
                fill_value=0,
            )
            for j in [7, 30, 60, 300]
        ],
    )

In [None]:
# Declarative (transformer name, keyword arguments) spec for the awards lag feature.
# Nothing in the visible cells consumes this list.
award_features = [('LagN', {'N': 1,  'hist_data_path': f"{artifacts_path}/{awards_artifact}"})]

In [None]:
    # Same transformer line-up and order as `other_features`, but the artifact file
    # names come from the imported *_artifact constants instead of hard-coded
    # 'train_*.pkl' names.
    # NOTE(review): confirm the constants resolve to the same files before using
    # this in place of `other_features`.
    other_features2 = make_union(
        # awards: lagged value
        LagN(
            "date",
            "playerId",
            [0],
            f"{artifacts_path}/{awards_artifact}",
            fill_value=-1,
            N=1,
            skip=0,
            device=device,
        ),
        # awards: count with N=365
        ExpandingCount(
            "date",
            "playerId",
            [0],
            f"{artifacts_path}/{awards_artifact}",
            fill_value=0,
            N=365,
            skip=0,
            device=device,
        ),
        # transactions: three columns, lagged
        LagN(
            "date",
            "playerId",
            [0, 1, 2],
            f"{artifacts_path}/{transactions_artifact}",
            fill_value=-1,
            N=1,
            skip=0,
            device=device,
        ),
        # rosters: lagged value
        LagN(
            "date",
            "playerId",
            [0],
            f"{artifacts_path}/{rosters_artifact}",
            fill_value=-1,
            N=1,
            skip=0,
            device=device,
        ),
        # rosters: counts with N=30 and N=300
        *[
            ExpandingCount(
                "date",
                "playerId",
                [0],
                f"{artifacts_path}/{rosters_artifact}",
                fill_value=0,
                N=j,
                skip=0,
                device=device,
            )
            for j in [30, 300]
        ],
        # player twitter: lagged value passed through log1p
        make_pipeline(
            LagN(
                "date",
                "playerId",
                [0],
                f"{artifacts_path}/{player_twitter_artifact}",
                fill_value=0,
                N=1,
                skip=0,
                device=device,
            ),
            FunctionTransfomer(np.log1p),
        ),
    )


In [None]:
# NOTE(review): `lag_features` is not assigned in any visible cell; unless one of the
# star imports provides it, evaluating this line raises NameError.
lag_features

In [34]:
# Length of the SCORES1 constant (5 in the recorded output).
len(SCORES1)

5