In [47]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

from pathlib import Path

In [3]:
base_dir = Path('../data')

In [221]:
class Files:
    # Game-by-game covariate data, coming from the paper
    games = base_dir / Path('mlb_games_df.csv')
    # Pitchers summary data, (primarily reference keys, not much in the way of stats)
    pitchers = base_dir / Path('pitchers_summary.csv')
    # Team-level pitching stats year-by-year
    team_pitching = base_dir / Path('team_pitching_stats.csv')
    # Team-level general data (attendance, W-L, etc)
    teams = base_dir / Path('team_stats.csv')
    
    # Directory containing game-by-game stats for all pitchers
    pitchers_games_dir = base_dir / Path('pitchers_games')

In [223]:
class Dataset:
    def __init__(self, name):
        self.name = name
        
        self.created_at = datetime.now()
        self.modified_at = self.created_at
        
        self.data = None
        
# Trying to make a function to abstract out this whole merge-filter-rename process,
# but the function is uglier than just copy-pasting it.
#     def _year_offset(self, left_df, right_df, left_on, right_on, 
#                      right_stats_cols, year_offset, home_away, 
#                      left_year='Year', right_year='Year',
#                      left_team='Team', right_team='Team'):
#         left_df = left_df.merge(right_df[[right_team, right_year] + right_stats_cols], left_on=left_on, right_on=right_on)
#         left_df = left_df[left_df[left_year] + year_offset == left_df[right_year]]
#         left_df = left_df.drop([right_team, right_year], axis='columns')
#         all_cols = list(left_df.columns)
#         all_cols[-len(right_stats_cols):] = [f'{home_away}_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
#         left_df.columns = all_cols
        
    def load_games(self, start_date='2000-01-01', end_date='2015-12-31'):
        '''
        Load all games between supplied dates.
        start_date (str or date): (Default 2000-01-01) 
        end_date (str or date): (Default 2015-12-31) 
        '''
        games_df = pd.read_csv(Files.games)
        games_df['date'] = pd.to_datetime(games_df['date'])
        games_df = games_df[games_df['date'].between(start_date, end_date)]
        games_df['Y'] = games_df['Y'].astype(int)
        self.data = games_df
        return games_df
        
    def add_team_pitching(self, year_offset=-1, cols=[]):
        '''
        Load team pitching data and join it to the game data. Note that you can run this more
        than once to join several years of pitching data.
        year_offset (int): (Default -1) If None, join pitching data for the same year as the
            game occurred in. So a year_offset of -1 means games in 2015 would join in pitching
            data from 2014 (one year earlier).
        cols (list): (Default []) Columns from team pitching data to include. By default no
            data is included.
        '''
        assert self.data is not None, 'First run Dataset.load_games() to load some games into memory'
        if isinstance(cols, str):
            cols = [cols]
        
        pitching_df = pd.read_csv(Files.team_pitching)
        
        # Append columns for the home team
        self.data = self.data.merge(pitching_df[['Team', 'Year'] + cols], left_on='home_team', right_on='Team')
        self.data = self.data[self.data['Y'] + year_offset == self.data['Year']]
        self.data = self.data.drop(['Team', 'Year'], axis='columns')
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'home_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        
        # Repeat for away team
        self.data = self.data.merge(pitching_df[['Team', 'Year'] + cols], left_on='away_team', right_on='Team')
        self.data = self.data[self.data['Y'] + year_offset == self.data['Year']]
        self.data = self.data.drop(['Team', 'Year'], axis='columns')
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'away_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        return self.data
    
    def add_team_stats(self, year_offset=-1, cols=[]):
        '''
        Load team statistics (attendance, W-L%, etc.). Note that you can run this more
        than once to join several years of data.
        year_offset (int): (Default -1) If None, join team data for the same year as the
            game occurred in. So a year_offset of -1 means games in 2015 would join in team
            data from 2014 (one year earlier).
        cols (list): (Default []) Columns from team data to include. By default no
            data is included.
        '''
        assert self.data is not None, 'First run Dataset.load_games() to load some games into memory'
        if isinstance(cols, str):
            cols = [cols]
            
        teams_df = pd.read_csv(Files.teams)
        
        # Append columns for the home team
        self.data = self.data.merge(teams_df[['Team', 'Year'] + cols], left_on='home_team', right_on='Team')
        self.data = self.data[self.data['Y'] + year_offset == self.data['Year']]
        self.data = self.data.drop(['Team', 'Year'], axis='columns')
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'home_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        
        # Repeat for away team
        self.data = self.data.merge(teams_df[['Team', 'Year'] + cols], left_on='away_team', right_on='Team')
        self.data = self.data[self.data['Y'] + year_offset == self.data['Year']]
        self.data = self.data.drop(['Team', 'Year'], axis='columns')
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'away_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        
        return self.data
    
    def _load_pitcher(self, key_bbref):
        if isinstance(key_bbref, str):
            key_bbref = [key_bbref]
            
        f_list = [Files.pitchers_games_dir / Path(f'{key}.csv') for key in key_bbref]
        
        pitchers_subset_df = None
        for f in f_list:
            pitcher_df = pd.read_csv(f)
            if pitchers_subset_df is None:
                pitchers_subset_df = pitcher_df
            else:
                pitchers_subset_df = pd.concat([pitchers_subset_df, pitcher_df])
        
        return pitchers_subset_df
    
    def add_pitcher_stats(self, game_offset=-1, cols=[]):
        '''
        Load pitcher statistics (IP, ERA, etc.). Note that you can run this more
        than once to join several games worth of data.
        game_offset (int): (Default -1) If None, join pitcher data for the same game as the
            game occurred in. So a game_offset of -1 means games on 2015-01-30 would join on the
            the first game occurring before this (one game backward).
        cols (list): (Default []) Columns from pitcher data to include. By default no
            data is included.
        '''
        

In [216]:
ds = Dataset('train')
ds.load_games();

In [217]:
ds.data.head()

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,elo_diff,elo_pct_diff,avg_diff,obp_diff,slg_diff,avg_pct_diff,obp_pct_diff,slg_pct_diff,home_rest,away_rest
0,2001-04-01,2001,4.0,1.0,TOR,TEX,1.0,loaizes01,helliri01,1499.563,...,20.4,1.360396,-0.00806,-0.010103,0.023271,-2.947374,-2.977845,4.989568,5.0,5.0
1,2001-04-02,2001,4.0,2.0,SFN,SDN,1.0,hernali01,williwo02,1540.841,...,48.041,3.117843,0.024251,0.032292,0.070273,8.784099,8.985458,14.998766,5.0,5.0
2,2001-04-02,2001,4.0,2.0,SEA,OAK,1.0,garcifr03,hudsoti01,1519.464,...,-15.232,-1.002459,-0.000864,0.00119,-0.016229,-0.323318,0.331871,-3.70521,5.0,5.0
3,2001-04-02,2001,4.0,2.0,NYA,KCA,1.0,clemero02,suppaje01,1529.511,...,36.359,2.377165,-0.010188,0.006929,0.024787,-3.703559,1.970596,5.554343,5.0,5.0
4,2001-04-02,2001,4.0,2.0,LAN,MIL,1.0,parkch01,wrighja01,1515.925,...,25.529,1.684054,0.01066,0.014276,0.026359,4.193722,4.236467,6.181414,5.0,5.0


In [218]:
ds.add_team_pitching(cols='ERA');

In [219]:
ds.data.head()

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,avg_diff,obp_diff,slg_diff,avg_pct_diff,obp_pct_diff,slg_pct_diff,home_rest,away_rest,home_ERA_offset1,away_ERA_offset1
0,2001-04-01,2001,4.0,1.0,TOR,TEX,1.0,loaizes01,helliri01,1499.563,...,-0.00806,-0.010103,0.023271,-2.947374,-2.977845,4.989568,5.0,5.0,5.17,5.52
20,2001-04-24,2001,4.0,24.0,TOR,TEX,1.0,hamiljo02,davisdo02,1513.202,...,-0.061344,-0.052413,-0.092964,-24.448645,-15.287162,-21.483376,2.0,2.0,5.17,5.52
40,2001-04-25,2001,4.0,25.0,TOR,TEX,1.0,carpech01,rogerke01,1514.574,...,-0.062848,-0.064907,-0.108138,-25.324224,-19.726679,-25.543036,1.0,1.0,5.17,5.52
60,2001-08-17,2001,8.0,17.0,TOR,TEX,1.0,loaizes01,oliveda02,1486.627,...,0.052072,0.038208,0.114988,16.891756,10.999306,20.949847,1.0,1.0,5.17,5.52
80,2001-08-18,2001,8.0,18.0,TOR,TEX,0.0,hallaro01,bellro01,1489.93,...,0.032679,0.016909,0.079077,10.907021,4.948137,15.053859,1.0,1.0,5.17,5.52


In [220]:
ds.add_team_stats(cols=['W-L-pct', 'Avg_Attendance'])

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,obp_pct_diff,slg_pct_diff,home_rest,away_rest,home_ERA_offset1,away_ERA_offset1,home_W-L-pct_offset1,home_Avg_Attendance_offset1,away_W-L-pct_offset1,away_Avg_Attendance_offset1
0,2001-04-01,2001,4.0,1.0,TOR,TEX,1.0,loaizes01,helliri01,1499.563,...,-2.977845,4.989568,5.0,5.0,5.17,5.52,0.512346,24861.419753,0.438272,32341.993789
20,2001-04-24,2001,4.0,24.0,TOR,TEX,1.0,hamiljo02,davisdo02,1513.202,...,-15.287162,-21.483376,2.0,2.0,5.17,5.52,0.512346,24861.419753,0.438272,32341.993789
40,2001-04-25,2001,4.0,25.0,TOR,TEX,1.0,carpech01,rogerke01,1514.574,...,-19.726679,-25.543036,1.0,1.0,5.17,5.52,0.512346,24861.419753,0.438272,32341.993789
60,2001-08-17,2001,8.0,17.0,TOR,TEX,1.0,loaizes01,oliveda02,1486.627,...,10.999306,20.949847,1.0,1.0,5.17,5.52,0.512346,24861.419753,0.438272,32341.993789
80,2001-08-18,2001,8.0,18.0,TOR,TEX,0.0,hallaro01,bellro01,1489.930,...,4.948137,15.053859,1.0,1.0,5.17,5.52,0.512346,24861.419753,0.438272,32341.993789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714573,2014-05-17,2014,5.0,17.0,TEX,TOR,0.0,rossro01,buehrma01,1505.675,...,-18.336315,-23.994152,1.0,1.0,3.63,4.26,0.558282,33473.073620,0.456790,29677.623457
714593,2014-05-18,2014,5.0,18.0,TEX,TOR,1.0,martini01,dicker.01,1503.770,...,-20.297000,-28.585825,1.0,1.0,3.63,4.26,0.558282,33473.073620,0.456790,29677.623457
714614,2015-08-25,2015,8.0,25.0,TEX,TOR,0.0,hollade01,buehrma01,1503.437,...,-31.835425,-32.908732,2.0,2.0,4.49,4.00,0.413580,30360.061728,0.512346,29518.018519
714634,2015-08-26,2015,8.0,26.0,TEX,TOR,0.0,lewisco01,priceda01,1502.258,...,-30.941935,-34.173435,1.0,1.0,4.49,4.00,0.413580,30360.061728,0.512346,29518.018519
