In [8]:
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import pytorch_lightning as pl
from sklearn.metrics import mean_absolute_error as mae
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from timm.optim import NovoGrad
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import wandb

from src.constants import TARGETS, SCORES1, SCORES2, SCORES3, SCORES4, SCORES5
from src.constants import scores1_mean_artifact, scores2_mean_artifact, scores3_mean_artifact, scores4_mean_artifact, scores5_mean_artifact, event_artifact
from mllib.transformers import LagN
seed=123786000
pl.seed_everything(seed)

Global seed set to 123786000


123786000

In [20]:
vl_index = pd.read_csv("data/vl_index_smallv02.csv")
seasons = pd.read_csv("data/seasons_formatted.csv")


In [22]:
# Runtime settings shared by the LagN feature transformers below.
DEVICE = 'gpu'
device = DEVICE
artifacts_path = 'data/artifacts/v02'

# Union of four LagN transformers over the event artifact, one per consecutive
# group of four column indices: [0..3], [4..7], [8..11], [12..15].
new_pipe = make_union(
    *(
        LagN(
            'date',
            'playerId',
            list(range(first_col, first_col + 4)),
            f'{artifacts_path}/{event_artifact}',
            fill_value=0,
            N=1,
            skip=0,
            device=device,
        )
        for first_col in range(0, 16, 4)
    )
)
# transform() is called directly on the validation index, with no fit() beforehand.
X_vl2 = new_pipe.transform(vl_index)

In [28]:
# Load the four saved boosters (files bst1..bst4, version v401). The files carry a
# .pkl suffix but are handed to lgb.Booster through its model_file argument.
bst12, bst22, bst32, bst42 = (
    lgb.Booster(model_file=f'artifacts/bst{model_no}_train_v401.pkl')
    for model_no in (1, 2, 3, 4)
)

In [24]:
X_test = np.load("data/X_vl1_v202_skip10.npy")
X_test.shape

(20179, 426)

In [25]:
# Attach a season flag to every validation row by looking its date up in `seasons`.
vl_index['seasonflag'] = vl_index.date.map(seasons.set_index('date')['seasonflag'])
# Widen the feature matrix: existing features, then the LagN event features (X_vl2),
# then the season flag itself as a final single column.
X_test = np.hstack((X_test, X_vl2, vl_index.seasonflag.values.reshape(-1, 1)))
# Keep only rows whose flag is > 0; rows whose date had no match (NaN flag) compare
# False and are dropped as well.
# NOTE: X_test is reassigned in place, so re-running this cell without reloading
# X_test would stack the extra columns onto the already-filtered matrix.
X_test = X_test[vl_index.seasonflag > 0]

In [26]:
# Targets for the validation rows. X_test is filtered with `seasonflag > 0`, so the
# same mask is applied here to keep targets and predictions row-aligned; when every
# flag is positive this selects all rows and changes nothing.
y_vl = vl_index[TARGETS].values[(vl_index.seasonflag > 0).values]

In [29]:
# Run each loaded booster over the same validation feature matrix.
preds_lgb12, preds_lgb22, preds_lgb32, preds_lgb42 = (
    booster.predict(X_test) for booster in (bst12, bst22, bst32, bst42)
)
# Stack the four prediction vectors as rows, then transpose so that each booster's
# predictions form one column (same column order as the boosters above).
preds_lgb2 = np.vstack([preds_lgb12, preds_lgb22, preds_lgb32, preds_lgb42]).T

In [30]:
print(mae(y_vl[:, 0], preds_lgb12))

0.8864539146348119


In [31]:
print(mae(y_vl, preds_lgb2))

0.8872481885315682


In [32]:
print(mae(y_vl[:, 1], preds_lgb22))

1.241082491313306


In [33]:
print(mae(y_vl[:, 2], preds_lgb32))

0.6757235671370531


In [34]:
print(mae(y_vl[:, 3], preds_lgb42))

0.7457327810411016


In [2]:
# Encode box scores - last 30 (also try, 29, 30th as avg of history before that)
# Also need days since last 30 games
# Predict next box scores/targets
# ~ batch_size, 30, 70, use MSE
# TFM1
# 

In [3]:
data = pd.read_csv("data/train.csv")

In [4]:
data.head()

Unnamed: 0,date,nextDayPlayerEngagement,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
0,20180101,"[{""engagementMetricsDate"":""2018-01-02"",""player...",,"[{""playerId"":400121,""gameDate"":""2018-01-01"",""t...",,,"[{""transactionId"":340732,""playerId"":547348,""pl...",,,,"[{""date"":""2018-01-01"",""playerId"":545361,""playe...","[{""date"":""2018-01-01"",""teamId"":147,""teamName"":..."
1,20180102,"[{""engagementMetricsDate"":""2018-01-03"",""player...",,"[{""playerId"":134181,""gameDate"":""2018-01-02"",""t...",,,"[{""transactionId"":339458,""playerId"":621173,""pl...",,,,,
2,20180103,"[{""engagementMetricsDate"":""2018-01-04"",""player...",,"[{""playerId"":425492,""gameDate"":""2018-01-03"",""t...",,,"[{""transactionId"":347527,""playerId"":572389,""pl...",,,,,
3,20180104,"[{""engagementMetricsDate"":""2018-01-05"",""player...",,"[{""playerId"":282332,""gameDate"":""2018-01-04"",""t...",,,"[{""transactionId"":339549,""playerId"":545343,""pl...",,,,,
4,20180105,"[{""engagementMetricsDate"":""2018-01-06"",""player...",,"[{""playerId"":282332,""gameDate"":""2018-01-05"",""t...",,,"[{""transactionId"":341195,""playerId"":628336,""pl...",,,,,


In [5]:
from src.pipelines.artifacts import ParseJsonField
# Event-record fields of interest: game/team/player identifiers, handedness,
# batted-ball and pitch measurements, the event label, and the inning/score state.
EVENT_COLS = [
    "gameType",
    "pitcherTeamId",
    "pitcherId",
    "hitterTeamId",
    "hitterId",
    "pitcherHand",
    "batSide",
    "launchSpeed",
    "launchAngle",
    "totalDistance",
    "startSpeed",
    "endSpeed",
    "spinRate",
    "spinDirection",
    "event",
    "inning",
    "halfInning",
    "homeScore",
    "awayScore"
]
# Parse the JSON held in the 'events' column of the raw training frame.
# NOTE(review): presumably the third argument selects the columns to keep, but
# ParseJsonField is defined in src.pipelines.artifacts — confirm its semantics there.
enc = ParseJsonField('date', 'events', EVENT_COLS)
# Slow step: the progress bar below reports well over a minute for 1216 rows.
tmp = enc.transform(data)

100%|██████████| 1216/1216 [01:42<00:00, 11.85it/s]


In [273]:
tmp.halfInning.unique()

array(['top', 'bottom'], dtype=object)

In [268]:
# Give every distinct event label an integer id, numbered from 0 in the order
# the labels are returned by unique().
# astype(str) stringifies missing events too, which is why a 'None' key appears
# in the resulting mapping.
eventdesc_dict = {ev: i for i, ev in enumerate(tmp.event.astype(str).unique())}
eventdesc_dict

{'Balk': 0,
 'Injury': 1,
 'Ejection': 2,
 'Pickoff 1B': 3,
 'Runner Out': 4,
 'Wild Pitch': 5,
 'Passed Ball': 6,
 'Defensive Sub': 7,
 'Game Advisory': 8,
 'Stolen Base 2B': 9,
 'Stolen Base 3B': 10,
 'Pitch Challenge': 11,
 'Defensive Indiff': 12,
 'Defensive Switch': 13,
 'Pickoff Error 1B': 14,
 'Pitching Substitution': 15,
 'Offensive Substitution': 16,
 'None': 17,
 'Walk': 18,
 'Double': 19,
 'Flyout': 20,
 'Single': 21,
 'Triple': 22,
 'Lineout': 23,
 'Pop Out': 24,
 'Forceout': 25,
 'Home Run': 26,
 'Sac Bunt': 27,
 'Groundout': 28,
 'Double Play': 29,
 'Field Error': 30,
 'Intent Walk': 31,
 'Hit By Pitch': 32,
 'Grounded Into DP': 33,
 'Caught Stealing 2B': 34,
 'Sac Fly': 35,
 'Catcher Interference': 36,
 'Strikeout': 37,
 'Fielders Choice Out': 38,
 'Umpire Substitution': 39,
 'Bunt Groundout': 40,
 'Fielders Choice': 41,
 'Strikeout Double Play': 42,
 'Stolen Base Home': 43,
 'Bunt Pop Out': 44,
 'Pickoff Error 2B': 45,
 'Batter Interference': 46,
 'Caught Stealing 3B': 

In [None]:
# Event labels in the order used for the hand-numbered draft of this mapping.
# 'Game Advisory' and 'None' (both present in eventdesc_dict) are intentionally
# not listed here, exactly as in the draft.
EVENT_NAMES = [
    'Balk', 'Injury', 'Ejection', 'Pickoff 1B', 'Runner Out',
    'Wild Pitch', 'Passed Ball', 'Defensive Sub',
    'Stolen Base 2B', 'Stolen Base 3B', 'Pitch Challenge',
    'Defensive Indiff', 'Defensive Switch', 'Pickoff Error 1B',
    'Pitching Substitution', 'Offensive Substitution', 'Walk',
    'Double', 'Flyout', 'Single', 'Triple', 'Lineout', 'Pop Out',
    'Forceout', 'Home Run', 'Sac Bunt', 'Groundout', 'Double Play',
    'Field Error', 'Intent Walk', 'Hit By Pitch', 'Grounded Into DP',
    'Caught Stealing 2B', 'Sac Fly', 'Catcher Interference',
    'Strikeout', 'Fielders Choice Out', 'Umpire Substitution',
    'Bunt Groundout', 'Fielders Choice', 'Strikeout Double Play',
    'Stolen Base Home', 'Bunt Pop Out', 'Pickoff Error 2B',
    'Batter Interference', 'Caught Stealing 3B', 'Other Advance',
    'Pickoff Caught Stealing 2B', 'Caught Stealing Home',
    'Bunt Lineout', 'Pickoff 2B', 'Error',
    'Pickoff Caught Stealing Home', 'Fan Interference', 'Triple Play',
    'Sac Fly Double Play', 'Pickoff 3B', 'Pickoff Caught Stealing 3B',
    'Sac Bunt Double Play', 'Pitcher Switch', 'Pickoff Error 3B',
    'Runner Placed On Base',
]

# Ids start at 1, which reproduces the numbers typed by hand in the draft
# ('Balk' -> 1 ... 'Double Play' -> 28) and continues the sequence for the rest.
# NOTE(review): id 0 is left unassigned — presumably reserved for unlisted or
# missing events; confirm before using this as an embedding index.
EVENT_IDS = {name: idx for idx, name in enumerate(EVENT_NAMES, start=1)}
EVENT_IDS

In [260]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

tmp

Unnamed: 0,gamePk,gameDate,gameTimeUTC,season,gameType,playId,eventId,inning,halfInning,homeScore,awayScore,menOnBase,atBatIndex,atBatDesc,atBatEvent,hasOut,pitcherTeamId,isPitcherHome,pitcherTeam,hitterTeamId,hitterTeam,pitcherId,pitcherName,isStarter,pitcherHand,hitterId,hitterName,batSide,pitchNumber,balls,strikes,isGB,isLD,isFB,isPU,launchSpeed,launchAngle,totalDistance,event,description,rbi,pitchType,call,outs,inPlay,isPaOver,startSpeed,endSpeed,nastyFactor,breakAngle,breakLength,breakY,spinRate,spinDirection,pX,pZ,aX,aY,aZ,pfxX,pfxZ,vX0,vY0,vZ0,x,y,x0,y0,z0,type,zone
0,634581,2021-04-14,2021-04-14T23:20:00Z,2021,R,,3,8,top,5,5,,63,Miguel Rojas walks.,Walk,0,144,1,Atlanta Braves,146,Miami Marlins,592426,Luke Jackson,0,R,500743,Miguel Rojas,R,2,1,0,,,,,,,,Balk,"With Miguel Rojas batting, Jazz Chisholm Jr. ...",,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,action,
1,634582,2021-04-14,2021-04-14T19:45:00Z,2021,R,,1,8,bottom,2,0,,56,Sal Romano intentionally walks Mike Yastrzemski.,Intent Walk,0,113,0,Cincinnati Reds,137,San Francisco Giants,607219,Sal Romano,0,R,573262,Mike Yastrzemski,L,1,0,0,,,,,,,,Balk,"With Mike Yastrzemski batting, Evan Longoria a...",,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,action,
2,634521,2021-04-14,2021-04-14T23:10:00Z,2021,R,,1,5,bottom,2,6,,44,Myles Straw strikes out swinging.,Strikeout,1,116,0,Detroit Tigers,117,Houston Astros,605242,Michael Fulmer,1,R,664702,Myles Straw,R,1,0,0,,,,,,,,Injury,Tigers right fielder Nomar Mazara left the gam...,,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,action,
3,634582,2021-04-14,2021-04-14T19:45:00Z,2021,R,,1,7,bottom,2,0,,49,Austin Slater strikes out swinging.,Strikeout,1,113,0,Cincinnati Reds,137,San Francisco Giants,607219,Sal Romano,0,R,596103,Austin Slater,R,2,0,1,,,,,,,,Injury,Giants pitcher Johnny Cueto left the game due ...,,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,action,
4,634614,2021-04-14,2021-04-15T02:10:00Z,2021,R,,2,3,top,2,0,,20,Garrett Hampson called out on strikes.,Strikeout,1,119,1,Los Angeles Dodgers,115,Colorado Rockies,669160,Dustin May,1,R,641658,Garrett Hampson,R,3,0,2,,,,,,,,Ejection,Rockies manager Bud Black ejected by HP umpire...,,,,1,,,,,,,,,,,,,,,,,,,,,,,,,,action,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4238,632223,2021-04-14,2021-04-14T03:33:00Z,2021,R,8ebd7ab1-7432-4511-8288-7289498cb514,3,1,top,0,0,Empty,1,"Alex Verdugo grounds into a double play, secon...",Grounded Into DP,1,142,1,Minnesota Twins,111,Boston Red Sox,621244,Jose Berrios,1,R,657077,Alex Verdugo,L,4,1,2,1.0,0.0,0.0,0.0,81.4,8.0,107.0,Grounded Into DP,"Alex Verdugo grounds into a double play, secon...",,CH,X,2,1.0,1.0,85.4,78.2,,21.6,7.2,24.0,1494.0,253.0,-0.63,1.88,-12.34,25.18,-26.31,-7.93,3.77,8.38,-124.06,-3.23,110.52,162.10,-3.03,50.00,5.39,pitch,7.0
4239,634523,2021-04-14,2021-04-14T22:35:00Z,2021,R,25c38468-58bf-4f4c-a219-32d62e03151a,3,3,top,1,0,Empty,18,"Trent Grisham grounds into a double play, shor...",Grounded Into DP,1,134,1,Pittsburgh Pirates,135,San Diego Padres,542881,Tyler Anderson,1,L,663757,Trent Grisham,L,4,1,2,1.0,0.0,0.0,0.0,105.9,-12.0,7.0,Grounded Into DP,"Trent Grisham grounds into a double play, shor...",,FC,X,2,1.0,1.0,81.7,76.0,,3.6,7.2,24.0,2600.0,163.0,-0.36,1.74,-0.71,20.40,-25.75,-0.49,4.43,-5.63,-118.90,-4.42,130.65,191.93,2.09,50.00,5.92,pitch,7.0
4240,634513,2021-04-14,2021-04-14T17:07:00Z,2021,R,19b54a24-3eb0-463f-bb3d-3f3071507bb0,4,2,top,0,1,Empty,10,"Jay Bruce grounds into a double play, third ba...",Grounded Into DP,1,141,1,Toronto Blue Jays,147,New York Yankees,643615,T.J. Zeuch,1,R,457803,Jay Bruce,L,5,2,2,1.0,0.0,0.0,0.0,98.0,4.0,78.0,Grounded Into DP,"Jay Bruce grounds into a double play, third ba...",,SI,X,2,1.0,1.0,94.2,86.5,,36.0,7.2,24.0,2268.0,225.0,-0.52,1.66,-19.61,30.30,-23.53,-10.32,4.55,4.70,-137.01,-7.87,136.63,193.97,-0.91,50.00,6.18,pitch,7.0
4241,634513,2021-04-14,2021-04-14T17:07:00Z,2021,R,d0091b13-58a6-41f3-bf11-b9e66cacac59,4,4,bottom,3,4,Empty,36,"Bo Bichette grounds into a double play, second...",Grounded Into DP,1,147,0,New York Yankees,141,Toronto Blue Jays,446372,Corey Kluber,1,R,666182,Bo Bichette,R,4,0,2,1.0,0.0,0.0,0.0,93.1,-28.0,6.0,Grounded Into DP,"Bo Bichette grounds into a double play, second...",,SI,X,3,1.0,1.0,90.4,83.3,,20.4,4.8,24.0,2150.0,211.0,0.19,3.31,-9.53,25.41,-21.41,-5.38,6.08,6.71,-131.56,-0.62,109.82,149.47,-1.69,50.00,5.12,pitch,2.0


In [254]:
tmp.loc[tmp.gamePk == 634554].sort_values(by=['homeScore', 'inning', 'halfInning', 'atBatIndex', 'pitchNumber']).head(100)

Unnamed: 0,gamePk,gameDate,gameTimeUTC,season,gameType,playId,eventId,inning,halfInning,homeScore,awayScore,menOnBase,atBatIndex,atBatDesc,atBatEvent,hasOut,pitcherTeamId,isPitcherHome,pitcherTeam,hitterTeamId,hitterTeam,pitcherId,pitcherName,isStarter,pitcherHand,hitterId,hitterName,batSide,pitchNumber,balls,strikes,isGB,isLD,isFB,isPU,launchSpeed,launchAngle,totalDistance,event,description,rbi,pitchType,call,outs,inPlay,isPaOver,startSpeed,endSpeed,nastyFactor,breakAngle,breakLength,breakY,spinRate,spinDirection,pX,pZ,aX,aY,aZ,pfxX,pfxZ,vX0,vY0,vZ0,x,y,x0,y0,z0,type,zone


In [289]:
%%time
# Per-hitter contact quality over every parsed event: mean and max exit velocity
# plus mean launch angle.
a = tmp.groupby('hitterId').agg(**{
    'avg_exit_velocity': ('launchSpeed', 'mean'),
    'maxexit_velocity': ('launchSpeed', 'max'),
    'avg_launch_ange': ('launchAngle', 'mean'),
})
# Number of events per hitter whose launch angle lies strictly between 8 and 32.
# (Counting inside agg() with a lambda was tried and dropped in favour of this filter.)
b = (
    tmp.loc[(8 < tmp.launchAngle) & (tmp.launchAngle < 32)]
    .groupby('hitterId')['launchAngle']
    .count()
)
# Index-on-index merge with the default join: hitters without any event in that
# band are dropped, and the count arrives as a column named 'launchAngle'.
a = pd.merge(left=a, right=b, left_index=True, right_index=True)

CPU times: user 118 ms, sys: 7.87 ms, total: 126 ms
Wall time: 128 ms


In [12]:
from src.pipelines.artifacts import DfTransformer
from src.constants import GAME_TYPE_MAP

class ParseEventField(DfTransformer):
    """Aggregate each input row's raw events JSON into one output row per hitter.

    For every row of the input frame, the JSON string stored in ``data_field``
    is parsed with ``pd.read_json``, restricted to ``use_cols`` and grouped by
    ``hitterId``. The result has the columns ``playerId`` (the hitter id),
    ``avg_exit_velocity``, ``maxexit_velocity``, ``avg_launch_ange``,
    ``game_type``, ``sweet_spot_pct``, the date column, ``final_score_diff``,
    ``avg_score_diff`` and ``std_score_diff``.

    Parameters
    ----------
    date_field : str
        Name of the column whose value is copied onto every output row.
    data_field : str
        Name of the column holding the events JSON string.
    use_cols : list of str
        Columns selected from the parsed events. The aggregations below read
        ``hitterId``, ``launchSpeed``, ``launchAngle``, ``gameType``,
        ``homeScore`` and ``awayScore``, so these must be included.

    Notes
    -----
    NOTE(review): the ``transform`` entry point comes from ``DfTransformer``
    (src.pipelines.artifacts) and presumably delegates to ``_transform`` —
    confirm there.
    """

    def __init__(self, date_field="date", data_field=None, use_cols=None):
        self.date_field = date_field
        self.data_field = data_field
        self.use_cols = use_cols

    @staticmethod
    def _hitter_stats(events):
        """Return batted-ball stats indexed by ``hitterId``, or None if they cannot be built."""
        try:
            stats = events.groupby('hitterId').agg(
                avg_exit_velocity=('launchSpeed', 'mean'),
                maxexit_velocity=('launchSpeed', 'max'),
                avg_launch_ange=('launchAngle', 'mean'),
                game_type=('gameType', 'first'),
            )
            stats['game_type'] = stats['game_type'].map(GAME_TYPE_MAP)
        except Exception:
            print("launhspeed and launchangle stats failed")
            return None

        try:
            # Count of events per hitter with a launch angle strictly between 8 and 32.
            # Despite the "_pct" suffix this is a raw count, not a percentage.
            sweet_spot = events.loc[
                (events.launchAngle > 8) & (events.launchAngle < 32)
            ].groupby('hitterId').agg(
                sweet_spot_pct=('launchAngle', 'count'),
            )
            # NOTE(review): default (inner) join — hitters without any event in
            # that band are dropped from the output rather than kept with 0.
            # Confirm this is intended before switching to a left join.
            stats = pd.merge(stats, sweet_spot, right_index=True, left_index=True)
        except Exception:
            print("Sweetspot calculation failed")
            stats['sweet_spot_pct'] = 0
        return stats

    def _transform(self, X):
        """Build the per-hitter frame for all rows of ``X``.

        Returns None when either configured column is missing from ``X`` or
        when no row yields any data; otherwise the concatenation of the
        per-row frames (row indices are not reset).
        """
        # Both configured columns are required: the data column is parsed and
        # the date column is copied onto every output row.
        if (self.date_field not in X.columns) or (self.data_field not in X.columns):
            return None

        data = []
        for _, row in tqdm(X.iterrows(), total=len(X)):
            row_data = row[self.data_field]
            try:
                events = pd.read_json(row_data)[self.use_cols]

                # Score-state summaries over every event of this input row.
                score_diff = events['homeScore'] - events['awayScore']
                final_score_diff = events['homeScore'].max() - events['awayScore'].max()
                avg_score_diff = score_diff.mean()
                std_score_diff = score_diff.std()

                stats = self._hitter_stats(events)
                if stats is None:
                    # Aggregation failed (already reported): skip this row
                    # instead of dereferencing None below.
                    continue
                stats.index.name = 'playerId'
                row_df = stats.reset_index(drop=False)
            except (ValueError, KeyError):
                # Rows without parseable events (e.g. missing values) or without
                # the requested columns contribute nothing.
                continue

            row_df[self.date_field] = row[self.date_field]
            row_df['final_score_diff'] = final_score_diff
            row_df['avg_score_diff'] = avg_score_diff
            row_df['std_score_diff'] = std_score_diff
            data.append(row_df)

        if len(data) == 0:
            return None
        return pd.concat(data)

        

In [13]:
from sklearn.pipeline import make_pipeline
from tqdm import tqdm
pipe = ParseEventField('date', 'events', EVENT_COLS)

In [14]:
df = pipe.transform(data)

100%|██████████| 1216/1216 [01:51<00:00, 10.90it/s]


In [15]:
df

Unnamed: 0,playerId,avg_exit_velocity,maxexit_velocity,avg_launch_ange,game_type,sweet_spot_pct,date
0,134181,85.260000,103.41,29.520000,1,2,20180329
1,400284,104.090000,104.09,10.650000,1,1,20180329
2,405395,99.445000,106.30,18.525000,1,3,20180329
3,408045,86.812000,100.12,3.390000,1,2,20180329
4,408236,89.023333,107.23,16.646667,1,3,20180329
...,...,...,...,...,...,...,...
157,669289,71.000000,90.20,-33.500000,1,1,20210430
158,669720,90.766667,105.90,25.666667,1,1,20210430
159,670541,94.350000,114.70,8.750000,1,1,20210430
160,672386,101.425000,109.40,32.750000,1,3,20210430


In [227]:
tmp.loc[(tmp.gamePk == 634554) & (tmp.event == 'Ejection'), 'description'].values

array(['Red Sox manager Alex Cora ejected by HP umpire Jordan Baker.'],
      dtype=object)

In [256]:
tmp.loc[tmp.pitcherId == 607074].sort_values(by=['homeScore', 'inning', 'halfInning', 'atBatIndex', 'pitchNumber']).head(100)

Unnamed: 0,gamePk,gameDate,gameTimeUTC,season,gameType,playId,eventId,inning,halfInning,homeScore,awayScore,menOnBase,atBatIndex,atBatDesc,atBatEvent,hasOut,pitcherTeamId,isPitcherHome,pitcherTeam,hitterTeamId,hitterTeam,pitcherId,pitcherName,isStarter,pitcherHand,hitterId,hitterName,batSide,pitchNumber,balls,strikes,isGB,isLD,isFB,isPU,launchSpeed,launchAngle,totalDistance,event,description,rbi,pitchType,call,outs,inPlay,isPaOver,startSpeed,endSpeed,nastyFactor,breakAngle,breakLength,breakY,spinRate,spinDirection,pX,pZ,aX,aY,aZ,pfxX,pfxZ,vX0,vY0,vZ0,x,y,x0,y0,z0,type,zone
62,634516,2021-04-14,2021-04-15T00:10:00Z,2021,R,,1,1,top,0,0,,0,Jordan Luplow flies out to left fielder Andrew...,Flyout,1,145,1,Chicago White Sox,114,Cleveland Indians,607074,Carlos Rodon,1,L,656669,Jordan Luplow,R,1,0,0,,,,,,,,Game Advisory,Status Change - Warmup,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,action,
90,634516,2021-04-14,2021-04-15T00:10:00Z,2021,R,,0,1,top,0,0,,0,Jordan Luplow flies out to left fielder Andrew...,Flyout,1,145,1,Chicago White Sox,114,Cleveland Indians,607074,Carlos Rodon,1,L,656669,Jordan Luplow,R,1,0,0,,,,,,,,Game Advisory,Status Change - Pre-Game,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,action,
133,634516,2021-04-14,2021-04-15T00:10:00Z,2021,R,,2,1,top,0,0,,0,Jordan Luplow flies out to left fielder Andrew...,Flyout,1,145,1,Chicago White Sox,114,Cleveland Indians,607074,Carlos Rodon,1,L,656669,Jordan Luplow,R,1,0,0,,,,,,,,Game Advisory,Status Change - In Progress,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,action,
1618,634516,2021-04-14,2021-04-15T00:10:00Z,2021,R,705b99b9-bcac-4630-8430-26ec896cab41,3,1,top,0,0,Empty,0,Jordan Luplow flies out to left fielder Andrew...,Flyout,1,145,1,Chicago White Sox,114,Cleveland Indians,607074,Carlos Rodon,1,L,656669,Jordan Luplow,R,1,0,0,0.0,0.0,1.0,0.0,92.4,36.0,333.0,Flyout,Jordan Luplow flies out to left fielder Andrew...,,FF,X,1,1.0,1.0,91.6,83.4,,13.2,4.8,24.0,2037.0,148.0,-0.38,2.41,6.0,30.63,-16.69,3.37,8.7,-7.06,-133.08,-7.54,131.37,173.82,1.88,50.0,6.5,pitch,4.0
595,634516,2021-04-14,2021-04-15T00:10:00Z,2021,R,4c3362bd-dad3-42d0-8221-c2598b904b2d,0,1,top,0,0,Empty,1,"Cesar Hernandez grounds out, shortstop Leury G...",Groundout,1,145,1,Chicago White Sox,114,Cleveland Indians,607074,Carlos Rodon,1,L,514917,Cesar Hernandez,R,1,0,1,0.0,0.0,0.0,0.0,,,,,,,FF,C,2,0.0,0.0,91.3,83.8,,24.0,3.6,24.0,2276.0,154.0,0.61,2.52,7.97,27.48,-14.09,4.45,10.11,-5.14,-132.69,-7.82,93.67,170.76,1.99,50.01,6.52,pitch,6.0
2094,634516,2021-04-14,2021-04-15T00:10:00Z,2021,R,7f967fa5-4809-492d-81bc-9ceee6933171,1,1,top,0,0,Empty,1,"Cesar Hernandez grounds out, shortstop Leury G...",Groundout,1,145,1,Chicago White Sox,114,Cleveland Indians,607074,Carlos Rodon,1,L,514917,Cesar Hernandez,R,2,1,1,0.0,0.0,0.0,0.0,,,,,,,FF,B,2,0.0,0.0,93.1,85.1,,21.6,3.6,24.0,2221.0,155.0,0.82,1.1,6.11,29.98,-10.79,3.32,11.6,-4.43,-135.04,-12.1,85.75,209.05,2.05,50.0,6.4,pitch,14.0
1840,634516,2021-04-14,2021-04-15T00:10:00Z,2021,R,a494b3d5-0f3d-4a06-98eb-d2f98ae60f06,2,1,top,0,0,Empty,1,"Cesar Hernandez grounds out, shortstop Leury G...",Groundout,1,145,1,Chicago White Sox,114,Cleveland Indians,607074,Carlos Rodon,1,L,514917,Cesar Hernandez,R,3,1,2,0.0,0.0,0.0,0.0,,,,,,,CH,S,2,0.0,0.0,84.5,77.4,,24.0,6.0,24.0,1648.0,132.0,0.08,1.3,10.55,25.2,-19.76,6.95,8.17,-6.69,-122.7,-8.04,113.83,203.81,1.95,50.0,6.31,pitch,14.0
2803,634516,2021-04-14,2021-04-15T00:10:00Z,2021,R,d7f97d71-ed32-4911-8807-bdf36f2bdf06,3,1,top,0,0,Empty,1,"Cesar Hernandez grounds out, shortstop Leury G...",Groundout,1,145,1,Chicago White Sox,114,Cleveland Indians,607074,Carlos Rodon,1,L,514917,Cesar Hernandez,R,4,2,2,0.0,0.0,0.0,0.0,,,,,,,CH,B,2,0.0,0.0,84.3,76.4,,28.8,6.0,24.0,1817.0,123.0,1.13,1.42,12.39,27.43,-19.27,8.29,8.63,-4.83,-122.38,-7.59,73.74,200.5,2.07,50.0,6.25,pitch,14.0
3585,634516,2021-04-14,2021-04-15T00:10:00Z,2021,R,a6ef5266-19af-44e6-a952-0eed2f08854c,4,1,top,0,0,Empty,1,"Cesar Hernandez grounds out, shortstop Leury G...",Groundout,1,145,1,Chicago White Sox,114,Cleveland Indians,607074,Carlos Rodon,1,L,514917,Cesar Hernandez,R,5,3,2,0.0,0.0,0.0,0.0,,,,,,,SL,B,2,0.0,0.0,84.0,77.5,,13.2,7.2,24.0,2420.0,234.0,-1.56,1.4,-4.67,24.31,-23.82,-3.1,5.55,-7.86,-122.01,-7.06,176.51,200.92,2.11,50.0,6.39,pitch,13.0
3933,634516,2021-04-14,2021-04-15T00:10:00Z,2021,R,789bd914-7fb5-49c8-8c91-d1b4633b1e9f,5,1,top,0,0,Empty,1,"Cesar Hernandez grounds out, shortstop Leury G...",Groundout,1,145,1,Chicago White Sox,114,Cleveland Indians,607074,Carlos Rodon,1,L,514917,Cesar Hernandez,R,6,3,2,1.0,0.0,0.0,0.0,93.2,-25.0,6.0,Groundout,"Cesar Hernandez grounds out, shortstop Leury G...",,FF,X,2,1.0,1.0,94.8,86.5,,28.8,2.4,24.0,2297.0,151.0,0.06,2.39,7.8,31.14,-9.62,4.07,11.77,-6.54,-137.66,-9.34,114.85,174.26,1.94,50.0,6.48,pitch,5.0


In [204]:
# Load the saved training-targets artifact. joblib.load unpickles the file, so it
# must only ever be pointed at trusted, locally produced artifacts.
scores1_art= joblib.load("data/artifacts/v0/train_targets.pkl")
# The artifact is indexed by key: 'data' holds the value array and 'playerId'
# the player ids used to label it.
scores1_data = scores1_art["data"]
players = scores1_art['playerId']

In [205]:
scores1_date = scores1_art['date']
scores1_date[0]

'20180101'

In [206]:
scores1_data.shape

(1200, 1187, 4)

In [240]:
[players[pid] for pid in np.where(scores1_data[-1, :, 1] > 50)[0]]

[606213, 660670, 607074]

In [249]:
pldf = pd.read_csv("data/players.csv")
pldf.loc[pldf.playerId == 607074]

Unnamed: 0,playerId,playerName,DOB,mlbDebutDate,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds
1329,607074,Carlos Rodon,1992-12-10,2015-04-21,Miami,FL,USA,75,245,1,Pitcher,True


In [176]:
pd.Series(scores1_data[:, :, 3].flatten()).describe()   # scores3 (3, 4), scores2 (15, 22, 23), scores5 - 8, [14]

count    1.424400e+06
mean     1.314379e+00
std      4.614010e+00
min      0.000000e+00
25%      5.013369e-02
50%      2.572016e-01
75%      9.199489e-01
max      1.000000e+02
dtype: float64

In [173]:
1200*1187

1424400

In [164]:
pd.Series(scores1_data[:, :, 16].flatten()).sum()

2047.0

In [108]:
np.where(scores1_data[:, :, 15] == 1)

(array([138, 264, 484]), array([ 487,  943, 1166]))

In [110]:
tr_index.loc[(tr_index.date == int(scores1_date[264])) & (tr_index.playerId == players[943])]

Unnamed: 0,playerId,target1,target2,target3,target4,date
601083,545350,3.838867,3.252564,4.289893,2.309711,20190522


In [29]:
# 29068/(538*1187)


0.04551789366213283

In [7]:
np.unique(scores1_data[:,:,2])

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., nan],
      dtype=float32)

In [10]:
tr_index = pd.read_csv("data/tr_index_small.csv")
vl_index = pd.read_csv("data/vl_index_small.csv")

In [11]:
tr_index.shape, vl_index.shape

((1424400, 6), (18992, 6))

In [None]:
# Feature union over the v0 artifacts: two LagN blocks on the scores1 artifact
# (columns 0-3 and column 4) followed by two on the scores3 artifact
# (columns 0-3 and 4-7).
artifacts_path = 'data/artifacts/v0'
sc1_cols = [0, 1, 2, 3, 4, 5]  # defined here but not referenced by the union below
scores_pipeline = make_union(
    LagN(
        'date', 'playerId', [0, 1, 2, 3],
        f'{artifacts_path}/{scores1_mean_artifact}',
        fill_value=0, N=1, skip=0, device=device,
    ),
    LagN(
        'date', 'playerId', [4],
        f'{artifacts_path}/{scores1_mean_artifact}',
        fill_value=0, N=1, skip=0, device=device,
    ),
    *(
        LagN(
            'date', 'playerId', list(range(start, start + 4)),
            f'{artifacts_path}/{scores3_mean_artifact}',
            fill_value=0, N=1, skip=0, device=device,
        )
        for start in (0, 4)
    ),
)
