In [40]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

from pathlib import Path

In [41]:
base_dir = Path('../data')

In [42]:
class Files:
    # Game-by-game covariate data, coming from the paper
    games = base_dir / Path('mlb_games_df.csv')
    # Pitchers summary data, (primarily reference keys, not much in the way of stats)
    pitchers = base_dir / Path('pitchers_summary.csv')
    # Team-level pitching stats year-by-year
    team_pitching = base_dir / Path('team_pitching_stats.csv')
    # Team-level general data (attendance, W-L, etc)
    teams = base_dir / Path('team_stats.csv')
    # Game-level pitcher stats
    pitchers_games = base_dir / Path('pitchers_games.csv')

In [259]:
class Dataset:
    def __init__(self, name):
        self.name = name
        
        self.created_at = datetime.now()
        self.modified_at = self.created_at
        
        self.data = None
        
    def load_games(self, start_date='2000-01-01', end_date='2015-12-31', cols=[]):
        '''
        Load all games between supplied dates.
        start_date (str or date): (Default 2000-01-01) 
        end_date (str or date): (Default 2015-12-31) 
        '''
        games_df = pd.read_csv(Files.games)
        games_df['date'] = pd.to_datetime(games_df['date'])
        games_df = games_df[games_df['date'].between(start_date, end_date)]
        games_df['Y'] = games_df['Y'].astype(int)
        self.data = games_df
        self.data = self._downcast(self.data)
        return self.data
        
    def add_team_pitching_stats(self, year_offset=-1, cols=[]):
        '''
        Load team pitching data and join it to the game data. Note that you can run this more
        than once to join several years of pitching data.
        year_offset (int): (Default -1) If None, join pitching data for the same year as the
            game occurred in. So a year_offset of -1 means games in 2015 would join in pitching
            data from 2014 (one year earlier).
        cols (list): (Default []) Columns from team pitching data to include. By default no
            data is included.
        '''
        assert self.data is not None, 'First run Dataset.load_games() to load some games into memory'
        if isinstance(cols, str):
            cols = [cols]
        
        pitching_df = pd.read_csv(Files.team_pitching)
        
        # Append columns for the home team
        self.data = self.data.merge(pitching_df[['Team', 'Year'] + cols], left_on='home_team', right_on='Team')
        self.data = self.data[self.data['Y'] + year_offset == self.data['Year']]
        self.data = self.data.drop(['Team', 'Year'], axis='columns')
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'home_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        self.data = self._downcast(self.data)
        
        # Repeat for away team
        self.data = self.data.merge(pitching_df[['Team', 'Year'] + cols], left_on='away_team', right_on='Team')
        self.data = self.data[self.data['Y'] + year_offset == self.data['Year']]
        self.data = self.data.drop(['Team', 'Year'], axis='columns')
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'away_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        self.data = self._downcast(self.data)
        return self.data
    
    def add_team_stats(self, year_offset=1, cols=[]):
        '''
        Load team statistics (attendance, W-L%, etc.). Note that you can run this more
        than once to join several years of data.
        year_offset (int): (Default 1) If None, join team data for the same year as the
            game occurred in. So a year_offset of 1 means games in 2015 would join in team
            data from 2014 (one year earlier).
        cols (list): (Default []) Columns from team data to include. By default no
            data is included.
        '''
        assert self.data is not None, 'First run Dataset.load_games() to load some games into memory'
        if isinstance(cols, str):
            cols = [cols]
            
        teams_df = pd.read_csv(Files.teams)
        
        # Append columns for the home team
        self.data = self.data.merge(teams_df[['Team', 'Year'] + cols], left_on='home_team', right_on='Team')
        self.data = self.data[self.data['Y'] - year_offset == self.data['Year']]
        self.data = self.data.drop(['Team', 'Year'], axis='columns')
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'home_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        self.data = self._downcast(self.data)
        
        # Repeat for away team
        self.data = self.data.merge(teams_df[['Team', 'Year'] + cols], left_on='away_team', right_on='Team')
        self.data = self.data[self.data['Y'] - year_offset == self.data['Year']]
        self.data = self.data.drop(['Team', 'Year'], axis='columns')
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'away_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        self.data = self._downcast(self.data)
        
        return self.data
    
    def add_pitcher_stats(self, game_offset=1, cols=[]):
        '''
        Load pitcher statistics (IP, ERA, etc.). Note that you can run this more
        than once to join several games worth of data.
        game_offset (int): (Default 1) If None, join pitcher data for the same game as the
            game occurred in. So a game_offset of 1 means games on 2015-01-30 would join on the
            the first game occurring before this (one game backward).
        cols (list): (Default []) Columns from pitcher data to include. By default no
            data is included.
        '''
        assert self.data is not None, 'First run Dataset.load_games() to load some games into memory'
        if isinstance(cols, str):
            cols = [cols]
            
        # Home team
        pitchers_df = pd.read_csv(Files.pitchers_games)
        pitchers_df = self._downcast(pitchers_df)
        pitchers_df['Date'] = pd.to_datetime(pitchers_df['Date'])
        pitchers_df['Year'] = pitchers_df['Date'].dt.year
        pitchers_df = pitchers_df[['name', 'Date', 'Year'] + cols]
        self.data = self.data.merge(pitchers_df, left_on=['home_pitcher', 'Y'], right_on=['name', 'Year'], how='left')
        # Do <= in case it's the pitchers first ever start. If you do < then that row of data
        # will just be thrown away (not good!)
        self.data = self.data[self.data['Date'] <= self.data['date']]
        
        processed_df = None
        for _, game_df in self.data.groupby(['home_pitcher', 'date']):
            # Check if this is the pitchers first start, i.e. only "previous" game is the
            # one they're currently playing. If so, null-out the stats.
            if game_df.shape[0] == 1:
                game_df[cols] = None
            game_df = game_df.sort_values('Date', ascending=False)
            game_df = game_df.iloc[[game_offset-1]]
            if processed_df is None:
                processed_df = game_df
            else:
                processed_df = pd.concat([processed_df, game_df])
        
        processed_df = processed_df.drop(['name', 'Year', 'Date'], axis='columns')
        self.data = processed_df
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'home_{c}_offset{np.abs(game_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        self.data = self._downcast(self.data)
        
        # Away team
        pitchers_df = pd.read_csv(Files.pitchers_games)
        pitchers_df = self._downcast(pitchers_df)
        pitchers_df['Date'] = pd.to_datetime(pitchers_df['Date'])
        pitchers_df['Year'] = pitchers_df['Date'].dt.year
        pitchers_df = pitchers_df[['name', 'Date', 'Year'] + cols]
        self.data = self.data.merge(pitchers_df, left_on=['away_pitcher', 'Y'], right_on=['name', 'Year'], how='left')
        # Do <= in case it's the pitchers first ever start. If you do < then that row of data
        # will just be thrown away (not good!)
        self.data = self.data[self.data['Date'] <= self.data['date']]
        
        processed_df = None
        for _, game_df in self.data.groupby(['away_pitcher', 'date']):
            # Check if this is the pitchers first start, i.e. only "previous" game is the
            # one they're currently playing. If so, null-out the stats.
            if game_df.shape[0] == 1:
                game_df[cols] = None
            game_df = game_df.sort_values('Date', ascending=False)
            game_df = game_df.iloc[[game_offset-1]]
            if processed_df is None:
                processed_df = game_df
            else:
                processed_df = pd.concat([processed_df, game_df])
        
        processed_df = processed_df.drop(['name', 'Year', 'Date'], axis='columns')
        self.data = processed_df
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'away_{c}_offset{np.abs(game_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        self.data = self._downcast(self.data)
        
        return self.data
    
    def _downcast(self, df, show_reduction=False):
        original_mem_usage = sum(df.memory_usage() / 10**6)
        for c in df.select_dtypes(int).columns:
            # Positive integers
            if df[c].min() > 0:
                if df[c].max() < 255:
                    df[c] = df[c].astype(np.uint8)
                elif df[c].max() < 65535:
                    df[c] = df[c].astype(np.uint16)
                elif df[c].max() < 4294967295:
                    df[c] = df[c].astype(np.uint32)
                else:
                    df[c] = df[c].astype(np.uint64)
            # Negative integers
            else:
                if df[c].max() < 127 and df[c].min() > -127:
                    df[c] = df[c].astype(np.int8)
                elif df[c].max() < 32767 and df[c].min() > -32767:
                    df[c] = df[c].astype(np.int16)
                elif df[c].max() < 2147483648 and df[c].min() > -2147483648:
                    df[c] = df[c].astype(np.int32)
                else:
                    df[c] = df[c].astype(np.int64)

            # Downcast all floats to 32 bits (unlikely to need more precision than that)
            for c in df.select_dtypes(float).columns:
                df[c] = df[c].astype(np.float32)

        if show_reduction:
            reduced_mem_usage = sum(df.memory_usage() / 10**6)
            print(f'{original_mem_usage:.2f}MB -> {reduced_mem_usage:.2f}MB ({100*(1-reduced_mem_usage/original_mem_usage):.2f}% reduction)')
        return df

In [260]:
ds = Dataset('train')
ds.load_games(start_date='2000-01-01', end_date='2005-01-01');

In [261]:
ds.add_team_pitching_stats(cols='ERA');

In [262]:
ds.add_team_stats(cols=['W-L-pct', 'Avg_Attendance']);

In [263]:
ds.add_pitcher_stats(cols=['ERA']);

  if (await self.run_code(code, result,  async_=asy)):


Index(['away_team_season_game_num', 'home_team_season_game_num', 'date', 'Y',
       'M', 'D', 'home_team', 'away_team', 'home_win', 'home_pitcher',
       'away_pitcher', 'home_elo', 'away_elo', 'home_avg', 'away_avg',
       'home_obp', 'away_obp', 'home_slg', 'away_slg', 'home_iso', 'away_iso',
       'elo_diff', 'elo_pct_diff', 'avg_diff', 'obp_diff', 'slg_diff',
       'avg_pct_diff', 'obp_pct_diff', 'slg_pct_diff', 'home_rest',
       'away_rest', 'home_ERA_offset1', 'away_ERA_offset1',
       'home_W-L-pct_offset1', 'home_Avg_Attendance_offset1',
       'away_W-L-pct_offset1', 'away_Avg_Attendance_offset1',
       'home_ERA_offset1', 'name', 'Date', 'Year', 'ERA'],
      dtype='object')


In [264]:
ds.data.head()

Unnamed: 0,away_team_season_game_num,home_team_season_game_num,date,Y,M,D,home_team,away_team,home_win,home_pitcher,...,away_rest,home_ERA_offset1,away_ERA_offset1,home_W-L-pct_offset1,home_Avg_Attendance_offset1,away_W-L-pct_offset1,away_Avg_Attendance_offset1,home_ERA_offset1.1,Date,away_ERA_offset1.1
16213,13,13,2001-04-28,2001,4.0,28.0,CHA,SEA,0.0,biddlro01,...,1.0,4.67,4.53,0.58642,26695.111111,0.561728,33215.67284,3.86,2001-04-28,
55542,18,19,2001-05-11,2001,5.0,11.0,TOR,SEA,0.0,hamiljo02,...,1.0,5.17,4.53,0.512346,24861.419753,0.561728,33215.67284,4.99,2001-05-11,4.7
141565,21,22,2001-05-22,2001,5.0,22.0,MIN,SEA,1.0,radkebr01,...,2.0,5.16,4.53,0.425926,20939.512346,0.561728,33215.67284,3.39,2001-05-22,6.66
34596,26,26,2001-05-28,2001,5.0,28.0,KCA,SEA,0.0,durbich01,...,1.0,5.48,4.53,0.475309,24699.740741,0.561728,33215.67284,5.2,2001-05-28,5.67
7496,28,35,2001-06-14,2001,6.0,14.0,COL,SEA,0.0,astacpe01,...,2.0,5.29,4.53,0.506173,34330.888199,0.561728,33215.67284,5.28,2001-06-14,4.25


In [220]:
pitchers_df['Date'] = pd.to_datetime(pitchers_df['Date'])
pitchers_df['Year'] = pitchers_df['Date'].dt.year
pitchers_df = pitchers_df[['name', 'Date', 'Year', 'Tm', 'ERA', 'WHIP']]

In [221]:
df = ds.data.merge(pitchers_df, left_on='home_pitcher', right_on='name', how='left')

In [222]:
df = df[df['Date'] < df['date']]
df = df[df['Y'] == df['Year']]

In [216]:
test_p = pitchers_df[(pitchers_df['Tm'] == 'ANA') & (pitchers_df['Date'] <= '2010-05-11') & ((pitchers_df['Date'] > '2010-04-01'))]

In [225]:
test_g = df[(df['date'] == '2010-04-19') & (df['home_team'] == 'ANA')]

In [226]:
test_g

Unnamed: 0,away_team_season_game_num,home_team_season_game_num,date,Y,M,D,home_team,away_team,home_win,home_pitcher,...,obp_pct_diff,slg_pct_diff,home_rest,away_rest,name,Date,Year,Tm,ERA,WHIP
4734005,6,7,2010-04-19,2010,4.0,19.0,ANA,DET,1.0,pineijo01,...,-5.89986,2.817477,1.0,1.0,pineijo01,2010-04-08,2010.0,ANA,4.5,1.333333
4734006,6,7,2010-04-19,2010,4.0,19.0,ANA,DET,1.0,pineijo01,...,-5.89986,2.817477,1.0,1.0,pineijo01,2010-04-14,2010.0,ANA,2.77,0.714286


In [184]:
pitchers_df[(pitchers_df['Tm'] == 'ANA') & (pitchers_df['Date'] <= '2001-05-01') & (pitchers_df['Date'] > '2001-01-01')]

Unnamed: 0,name,Date,Year,Tm,ERA,WHIP
16615,potelo01,2001-04-04,2001,ANA,4.5,0.500000
16616,potelo01,2001-04-10,2001,ANA,1.93,2.272727
16617,potelo01,2001-04-12,2001,ANA,2.25,1.290323
16618,potelo01,2001-04-16,2001,ANA,1.74,2.380952
16619,potelo01,2001-04-21,2001,ANA,1.5,0.000000
...,...,...,...,...,...,...
364314,weberbe01,2001-04-15,2001,ANA,1.5,2.000000
364315,weberbe01,2001-04-19,2001,ANA,1.42,10.000000
364316,weberbe01,2001-04-21,2001,ANA,1.23,2.000000
364317,weberbe01,2001-04-27,2001,ANA,1.08,1.000000


In [135]:
i = 0

for _, game_df in df.groupby(['home_team', 'date']):
    game_df = game_df.sort_values('Date')
    print(game_df[['date', 'Date', 'Tm']].tail(1))
    if i > 3:
        break
    i += 1
    print('='*40)

           date       Date   Tm
2173 2001-04-10 2001-04-04  ANA
           date       Date   Tm
2514 2001-04-11 2001-04-05  ANA
           date       Date   Tm
2581 2001-04-12 2001-04-06  ANA
           date       Date   Tm
3565 2001-04-15 2001-04-10  ANA
           date       Date   Tm
4282 2001-04-17 2001-04-11  ANA


In [153]:
games_df.head()

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,elo_diff,elo_pct_diff,avg_diff,obp_diff,slg_diff,avg_pct_diff,obp_pct_diff,slg_pct_diff,home_rest,away_rest
0,2001-04-01,2001,4.0,1.0,TOR,TEX,1.0,loaizes01,helliri01,1499.563,...,20.4,1.360396,-0.00806,-0.010103,0.023271,-2.947374,-2.977845,4.989568,5.0,5.0
1,2001-04-02,2001,4.0,2.0,SFN,SDN,1.0,hernali01,williwo02,1540.841,...,48.041,3.117843,0.024251,0.032292,0.070273,8.784099,8.985458,14.998766,5.0,5.0
2,2001-04-02,2001,4.0,2.0,SEA,OAK,1.0,garcifr03,hudsoti01,1519.464,...,-15.232,-1.002459,-0.000864,0.00119,-0.016229,-0.323318,0.331871,-3.70521,5.0,5.0
3,2001-04-02,2001,4.0,2.0,NYA,KCA,1.0,clemero02,suppaje01,1529.511,...,36.359,2.377165,-0.010188,0.006929,0.024787,-3.703559,1.970596,5.554343,5.0,5.0
4,2001-04-02,2001,4.0,2.0,LAN,MIL,1.0,parkch01,wrighja01,1515.925,...,25.529,1.684054,0.01066,0.014276,0.026359,4.193722,4.236467,6.181414,5.0,5.0


In [154]:
pitchers_df.head()

Unnamed: 0,Gcar,Gtm,Date,Tm,Opp,Inngs,Dec,DR,IP,H,...,Double_Header,Home_Tm,WHIP,Result,Tm_Score,Opp_Score,name,DFS(DK),DFS(FD),Year
0,478,2,2000-04-05,ARI,PHI,6-GF,S(1),99,4.0,3,...,,ARI,0.75,W,11,3,morgami01,,,2000
1,479,7,2000-04-10,ARI,SDP,7-GF,S(2),4,3.0,4,...,,SDP,1.333333,W,8,4,morgami01,,,2000
2,480,10,2000-04-13,ARI,SDP,8-GF,S(3),2,2.0,1,...,,SDP,0.5,W,5,4,morgami01,,,2000
3,481,13,2000-04-17,ARI,COL,6-7,,3,1.2,0,...,,ARI,0.0,L,1,9,morgami01,,,2000
4,482,15,2000-04-19,ARI,COL,8-8,W(1-0),1,1.0,0,...,,ARI,0.0,W,8,7,morgami01,,,2000
