In [49]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

from pathlib import Path

from utils.data_loader import Dataset
from utils.data_cleaning import optimize

In [50]:
ds = Dataset('train_1')
ds.load_games(start_date='2000-01-01', end_date='2004-01-01');

In [51]:
class Files:
    # Data directory
    base_dir = Path('../data')
    # Game-by-game covariate data, coming from the paper
    games = base_dir / Path('mlb_games_df.csv')
    # Pitchers summary data, (primarily reference keys, not much in the way of stats)
    pitchers = base_dir / Path('pitchers_summary.csv')
    # Team-level pitching stats year-by-year
    team_pitching = base_dir / Path('team_pitching_stats.csv')
    # Team-level general data (attendance, W-L, etc)
    teams = base_dir / Path('team_stats.csv')
    # Game-level pitcher stats
    pitchers_games = base_dir / Path('starting_pitchers_games.csv')

In [52]:
def merge_right(data, home_or_away, cols, game_offset=1, season_avg=False):
    home_or_away = home_or_away.lower()
    assert home_or_away in ['home', 'away'], "home_or_away must be one of ['home', 'away']"

    pitchers_df = pd.read_csv(Files.pitchers_games)
    pitchers_df = optimize(pitchers_df)
    pitchers_df['Date'] = pd.to_datetime(pitchers_df['Date'])
    pitchers_df['Year'] = pitchers_df['Date'].dt.year
    pitchers_df = pitchers_df[['name', 'Date', 'Year'] + cols]

    # Rank the game number for each pitcher for each season
    pitchers_df['season_game'] = pitchers_df.groupby(['name', 'Year'])['Date'].rank('min')
    # Create a column which is just the game number offset by game_offset.
    # This is simply make the join later on easier.
    for i in range(1, game_offset+1):
        pitchers_df[f'season_game_offset{i}'] = pitchers_df['season_game'] + i
    # Do the same ranking (of pitcher season game number) in the non-pitchers data
    data[f'{home_or_away}_pitcher_season_game'] = data.groupby([f'{home_or_away}_pitcher', 'Y'])['date'].rank('min')
    # Merge the non-pitchers data and the pitchers data by matching up to the past game
    for i in range(1, game_offset+1):
        data = data.merge(pitchers_df,
                          left_on=[f'{home_or_away}_pitcher', 'Y', f'{home_or_away}_pitcher_season_game'],
                          right_on=['name', 'Year', f'season_game_offset{i}'],
                          how='left',
                          suffixes=('', f'_{i}'))
        cols_to_drop = [f'{c}_{i}' for c in data.columns if c not in cols]
        cols_to_drop = list(set(data.columns).intersection(set(cols_to_drop)))
        data = data.drop(cols_to_drop, axis='columns')

    cols_to_drop = ['name', 'Year', 'Date', 'season_game']
    cols_to_drop += [f'season_game_offset{i}' for i in range(1, game_offset+1)]
    cols_to_drop += [f'season_game_offset{i}' for i in range(1, game_offset+1)]
    cols_to_drop += ['season_game']
    cols_to_drop = list(set(data.columns).intersection(set(cols_to_drop)))
    data = data.drop(cols_to_drop, axis='columns')
    for c in cols:
        c_cols = [col for col in data.columns if col.startswith(c)]
        print(f'Averaging {c_cols} for stat {c}')
        data[f'{home_or_away}_pitcher_{c}_avg_{game_offset}games'] = data[c_cols].mean(axis=1)
        data = data.drop(c_cols, axis='columns')
    
    return data

In [53]:
ds.data.head()

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,avg_diff,obp_diff,slg_diff,avg_pct_diff,obp_pct_diff,slg_pct_diff,home_rest,away_rest,away_team_season_game_num,home_team_season_game_num
0,2001-04-01,2001,4.0,1.0,TOR,TEX,1.0,loaizes01,helliri01,1499.562988,...,-0.00806,-0.010103,0.023271,-2.947374,-2.977845,4.989568,5.0,5.0,0,0
1,2001-04-02,2001,4.0,2.0,SEA,OAK,1.0,garcifr03,hudsoti01,1519.463989,...,-0.000864,0.00119,-0.016229,-0.323318,0.331871,-3.70521,5.0,5.0,0,0
2,2001-04-02,2001,4.0,2.0,NYA,KCA,1.0,clemero02,suppaje01,1529.510986,...,-0.010188,0.006929,0.024787,-3.703559,1.970596,5.554343,5.0,5.0,0,0
3,2001-04-02,2001,4.0,2.0,CIN,ATL,0.0,harnipe01,burkejo03,1527.274048,...,0.003972,-0.001729,0.020216,1.459194,-0.50696,4.555242,5.0,5.0,0,0
4,2001-04-02,2001,4.0,2.0,CHN,WAS,0.0,liebejo01,vazquja01,1462.51001,...,-0.010158,0.009335,-0.018992,-3.99634,2.80356,-4.646432,5.0,5.0,0,0


In [54]:
cols = ['WHIP', 'ERA']
game_offset = 162
ds.data = merge_right(ds.data, 'home', cols, game_offset)

Averaging ['WHIP', 'WHIP_2', 'WHIP_3', 'WHIP_4', 'WHIP_5', 'WHIP_6', 'WHIP_7', 'WHIP_8', 'WHIP_9', 'WHIP_10', 'WHIP_11', 'WHIP_12', 'WHIP_13', 'WHIP_14', 'WHIP_15', 'WHIP_16', 'WHIP_17', 'WHIP_18', 'WHIP_19', 'WHIP_20', 'WHIP_21', 'WHIP_22', 'WHIP_23', 'WHIP_24', 'WHIP_25', 'WHIP_26', 'WHIP_27', 'WHIP_28', 'WHIP_29', 'WHIP_30', 'WHIP_31', 'WHIP_32', 'WHIP_33', 'WHIP_34', 'WHIP_35', 'WHIP_36', 'WHIP_37', 'WHIP_38', 'WHIP_39', 'WHIP_40', 'WHIP_41', 'WHIP_42', 'WHIP_43', 'WHIP_44', 'WHIP_45', 'WHIP_46', 'WHIP_47', 'WHIP_48', 'WHIP_49', 'WHIP_50', 'WHIP_51', 'WHIP_52', 'WHIP_53', 'WHIP_54', 'WHIP_55', 'WHIP_56', 'WHIP_57', 'WHIP_58', 'WHIP_59', 'WHIP_60', 'WHIP_61', 'WHIP_62', 'WHIP_63', 'WHIP_64', 'WHIP_65', 'WHIP_66', 'WHIP_67', 'WHIP_68', 'WHIP_69', 'WHIP_70', 'WHIP_71', 'WHIP_72', 'WHIP_73', 'WHIP_74', 'WHIP_75', 'WHIP_76', 'WHIP_77', 'WHIP_78', 'WHIP_79', 'WHIP_80', 'WHIP_81', 'WHIP_82', 'WHIP_83', 'WHIP_84', 'WHIP_85', 'WHIP_86', 'WHIP_87', 'WHIP_88', 'WHIP_89', 'WHIP_90', 'WHIP_91',

In [55]:
ds.data.head()

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,avg_pct_diff,obp_pct_diff,slg_pct_diff,home_rest,away_rest,away_team_season_game_num,home_team_season_game_num,home_pitcher_season_game,home_pitcher_WHIP_avg_162games,home_pitcher_ERA_avg_162games
0,2001-04-01,2001,4.0,1.0,TOR,TEX,1.0,loaizes01,helliri01,1499.562988,...,-2.947374,-2.977845,4.989568,5.0,5.0,0,0,1.0,,
1,2001-04-02,2001,4.0,2.0,SEA,OAK,1.0,garcifr03,hudsoti01,1519.463989,...,-0.323318,0.331871,-3.70521,5.0,5.0,0,0,1.0,,
2,2001-04-02,2001,4.0,2.0,NYA,KCA,1.0,clemero02,suppaje01,1529.510986,...,-3.703559,1.970596,5.554343,5.0,5.0,0,0,1.0,,
3,2001-04-02,2001,4.0,2.0,CIN,ATL,0.0,harnipe01,burkejo03,1527.274048,...,1.459194,-0.50696,4.555242,5.0,5.0,0,0,1.0,,
4,2001-04-02,2001,4.0,2.0,CHN,WAS,0.0,liebejo01,vazquja01,1462.51001,...,-3.99634,2.80356,-4.646432,5.0,5.0,0,0,1.0,,


In [56]:
ds.data = merge_right(ds.data, 'away', cols, game_offset)

Averaging ['WHIP', 'WHIP_2', 'WHIP_3', 'WHIP_4', 'WHIP_5', 'WHIP_6', 'WHIP_7', 'WHIP_8', 'WHIP_9', 'WHIP_10', 'WHIP_11', 'WHIP_12', 'WHIP_13', 'WHIP_14', 'WHIP_15', 'WHIP_16', 'WHIP_17', 'WHIP_18', 'WHIP_19', 'WHIP_20', 'WHIP_21', 'WHIP_22', 'WHIP_23', 'WHIP_24', 'WHIP_25', 'WHIP_26', 'WHIP_27', 'WHIP_28', 'WHIP_29', 'WHIP_30', 'WHIP_31', 'WHIP_32', 'WHIP_33', 'WHIP_34', 'WHIP_35', 'WHIP_36', 'WHIP_37', 'WHIP_38', 'WHIP_39', 'WHIP_40', 'WHIP_41', 'WHIP_42', 'WHIP_43', 'WHIP_44', 'WHIP_45', 'WHIP_46', 'WHIP_47', 'WHIP_48', 'WHIP_49', 'WHIP_50', 'WHIP_51', 'WHIP_52', 'WHIP_53', 'WHIP_54', 'WHIP_55', 'WHIP_56', 'WHIP_57', 'WHIP_58', 'WHIP_59', 'WHIP_60', 'WHIP_61', 'WHIP_62', 'WHIP_63', 'WHIP_64', 'WHIP_65', 'WHIP_66', 'WHIP_67', 'WHIP_68', 'WHIP_69', 'WHIP_70', 'WHIP_71', 'WHIP_72', 'WHIP_73', 'WHIP_74', 'WHIP_75', 'WHIP_76', 'WHIP_77', 'WHIP_78', 'WHIP_79', 'WHIP_80', 'WHIP_81', 'WHIP_82', 'WHIP_83', 'WHIP_84', 'WHIP_85', 'WHIP_86', 'WHIP_87', 'WHIP_88', 'WHIP_89', 'WHIP_90', 'WHIP_91',

In [57]:
ds.data.tail()

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,home_rest,away_rest,away_team_season_game_num,home_team_season_game_num,home_pitcher_season_game,home_pitcher_WHIP_avg_162games,home_pitcher_ERA_avg_162games,away_pitcher_season_game,away_pitcher_WHIP_avg_162games,away_pitcher_ERA_avg_162games
7280,2003-09-28,2003,9.0,28.0,KCA,CHA,0.0,limajo01,loaizes01,1478.23999,...,1.0,1.0,80,79,7.0,1.04225,3.83,16.0,1.171062,1.908667
7281,2003-09-28,2003,9.0,28.0,ANA,TEX,1.0,shielsc01,thomsjo01,1502.781006,...,1.0,1.0,80,81,7.0,1.344999,1.795,18.0,1.755261,5.715882
7282,2003-09-28,2003,9.0,28.0,ARI,SLN,0.0,webbbr01,williwo02,1518.597046,...,1.0,1.0,80,80,14.0,1.068221,1.848461,15.0,1.156947,1.514286
7283,2003-09-28,2003,9.0,28.0,TBA,BOS,1.0,zambrvi01,wakefti01,1454.175049,...,1.0,1.0,80,80,14.0,1.462154,5.973076,16.0,1.553808,5.027334
7284,2003-09-28,2003,9.0,28.0,DET,MIN,1.0,marotmi01,lohseky01,1377.172974,...,1.0,1.0,80,80,17.0,1.591635,5.033125,19.0,1.5251,3.281111
