In [40]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

from pathlib import Path

In [41]:
base_dir = Path('../data')

In [42]:
class Files:
    # Game-by-game covariate data, coming from the paper
    games = base_dir / Path('mlb_games_df.csv')
    # Pitchers summary data, (primarily reference keys, not much in the way of stats)
    pitchers = base_dir / Path('pitchers_summary.csv')
    # Team-level pitching stats year-by-year
    team_pitching = base_dir / Path('team_pitching_stats.csv')
    # Team-level general data (attendance, W-L, etc)
    teams = base_dir / Path('team_stats.csv')
    # Game-level pitcher stats
    pitchers_games = base_dir / Path('pitchers_games.csv')

In [43]:
class Dataset:
    def __init__(self, name):
        self.name = name
        
        self.created_at = datetime.now()
        self.modified_at = self.created_at
        
        self.data = None
        
# Trying to make a function to abstract out this whole merge-filter-rename process,
# but the function is uglier than just copy-pasting it.
#     def _year_offset(self, left_df, right_df, left_on, right_on, 
#                      right_stats_cols, year_offset, home_away, 
#                      left_year='Year', right_year='Year',
#                      left_team='Team', right_team='Team'):
#         left_df = left_df.merge(right_df[[right_team, right_year] + right_stats_cols], left_on=left_on, right_on=right_on)
#         left_df = left_df[left_df[left_year] + year_offset == left_df[right_year]]
#         left_df = left_df.drop([right_team, right_year], axis='columns')
#         all_cols = list(left_df.columns)
#         all_cols[-len(right_stats_cols):] = [f'{home_away}_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
#         left_df.columns = all_cols
        
    def load_games(self, start_date='2000-01-01', end_date='2015-12-31'):
        '''
        Load all games between supplied dates.
        start_date (str or date): (Default 2000-01-01) 
        end_date (str or date): (Default 2015-12-31) 
        '''
        games_df = pd.read_csv(Files.games)
        games_df['date'] = pd.to_datetime(games_df['date'])
        games_df = games_df[games_df['date'].between(start_date, end_date)]
        games_df['Y'] = games_df['Y'].astype(int)
        self.data = games_df
        self.data = self._downcast(self.data)
        return self.data
        
    def add_team_pitching(self, year_offset=-1, cols=[]):
        '''
        Load team pitching data and join it to the game data. Note that you can run this more
        than once to join several years of pitching data.
        year_offset (int): (Default -1) If None, join pitching data for the same year as the
            game occurred in. So a year_offset of -1 means games in 2015 would join in pitching
            data from 2014 (one year earlier).
        cols (list): (Default []) Columns from team pitching data to include. By default no
            data is included.
        '''
        assert self.data is not None, 'First run Dataset.load_games() to load some games into memory'
        if isinstance(cols, str):
            cols = [cols]
        
        pitching_df = pd.read_csv(Files.team_pitching)
        
        # Append columns for the home team
        self.data = self.data.merge(pitching_df[['Team', 'Year'] + cols], left_on='home_team', right_on='Team')
        self.data = self.data[self.data['Y'] + year_offset == self.data['Year']]
        self.data = self.data.drop(['Team', 'Year'], axis='columns')
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'home_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        self.data = self._downcast(self.data)
        
        # Repeat for away team
        self.data = self.data.merge(pitching_df[['Team', 'Year'] + cols], left_on='away_team', right_on='Team')
        self.data = self.data[self.data['Y'] + year_offset == self.data['Year']]
        self.data = self.data.drop(['Team', 'Year'], axis='columns')
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'away_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        self.data = self._downcast(self.data)
        return self.data
    
    def add_team_stats(self, year_offset=-1, cols=[]):
        '''
        Load team statistics (attendance, W-L%, etc.). Note that you can run this more
        than once to join several years of data.
        year_offset (int): (Default -1) If None, join team data for the same year as the
            game occurred in. So a year_offset of -1 means games in 2015 would join in team
            data from 2014 (one year earlier).
        cols (list): (Default []) Columns from team data to include. By default no
            data is included.
        '''
        assert self.data is not None, 'First run Dataset.load_games() to load some games into memory'
        if isinstance(cols, str):
            cols = [cols]
            
        teams_df = pd.read_csv(Files.teams)
        
        # Append columns for the home team
        self.data = self.data.merge(teams_df[['Team', 'Year'] + cols], left_on='home_team', right_on='Team')
        self.data = self.data[self.data['Y'] + year_offset == self.data['Year']]
        self.data = self.data.drop(['Team', 'Year'], axis='columns')
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'home_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        self.data = self._downcast(self.data)
        
        # Repeat for away team
        self.data = self.data.merge(teams_df[['Team', 'Year'] + cols], left_on='away_team', right_on='Team')
        self.data = self.data[self.data['Y'] + year_offset == self.data['Year']]
        self.data = self.data.drop(['Team', 'Year'], axis='columns')
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'away_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        self.data = self._downcast(self.data)
        
        return self.data
    
    def add_pitcher_stats(self, game_offset=-1, cols=[]):
        '''
        Load pitcher statistics (IP, ERA, etc.). Note that you can run this more
        than once to join several games worth of data.
        game_offset (int): (Default -1) If None, join pitcher data for the same game as the
            game occurred in. So a game_offset of -1 means games on 2015-01-30 would join on the
            the first game occurring before this (one game backward).
        cols (list): (Default []) Columns from pitcher data to include. By default no
            data is included.
        '''
        assert self.data is not None, 'First run Dataset.load_games() to load some games into memory'
        if isinstance(cols, str):
            cols = [cols]
            
        pitchers_df = pd.read_csv(Files.pitchers_games)
        pitchers_df['Date'] = pd.to_datetime(pitchers_df['Date'])
        
        # Append columns for the home team
        self.data = self.data.merge(pitchers_df[['name', 'Date'] + cols], left_on='home_pitcher', right_on='name')
        self.data = self.data[self.data['Y'] + year_offset == self.data['Date'].dt.year]
        self.data = self.data.drop(['name', 'Date'], axis='columns')
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'home_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        self.data = self._downcast(self.data)
        
        # Repeat for away team
        self.data = self.data.merge(pitchers_df[['name', 'Date'] + cols], left_on='away_pitcher', right_on='name')
        self.data = self.data[self.data['Y'] + year_offset == self.data['Date'].dt.year]
        self.data = self.data.drop(['name', 'Date'], axis='columns')
        all_cols = list(self.data.columns)
        all_cols[-len(cols):] = [f'away_{c}_offset{np.abs(year_offset)}' for c in all_cols[-len(cols):]]
        self.data.columns = all_cols
        self.data = self._downcast(self.data)
        
        return self.data
    
    def _downcast(self, df, show_reduction=False):
        original_mem_usage = sum(df.memory_usage() / 10**6)
        for c in df.select_dtypes(int).columns:
            # Positive integers
            if df[c].min() > 0:
                if df[c].max() < 255:
                    df[c] = df[c].astype(np.uint8)
                elif df[c].max() < 65535:
                    df[c] = df[c].astype(np.uint16)
                elif df[c].max() < 4294967295:
                    df[c] = df[c].astype(np.uint32)
                else:
                    df[c] = df[c].astype(np.uint64)
            # Negative integers
            else:
                if df[c].max() < 127 and df[c].min() > -127:
                    df[c] = df[c].astype(np.int8)
                elif df[c].max() < 32767 and df[c].min() > -32767:
                    df[c] = df[c].astype(np.int16)
                elif df[c].max() < 2147483648 and df[c].min() > -2147483648:
                    df[c] = df[c].astype(np.int32)
                else:
                    df[c] = df[c].astype(np.int64)

            # Downcast all floats to 32 bits (unlikely to need more precision than that)
            for c in df.select_dtypes(float).columns:
                df[c] = df[c].astype(np.float32)

        if show_reduction:
            reduced_mem_usage = sum(df.memory_usage() / 10**6)
            print(f'{original_mem_usage:.2f}MB -> {reduced_mem_usage:.2f}MB ({100*(1-reduced_mem_usage/original_mem_usage):.2f}% reduction)')
        return df

In [44]:
ds = Dataset('train')
ds.load_games();

In [13]:
ds.add_team_pitching(cols='ERA');

In [None]:
ds.add_team_stats(cols=['W-L-pct', 'Avg_Attendance']);

In [15]:
ds.add_pitcher_stats(cols=['ERA']);

  if (await self.run_code(code, result,  async_=asy)):


NameError: name 'year_offset' is not defined

In [48]:
ds.data.head()

Unnamed: 0,away_team_season_game_num,home_team_season_game_num,date,Y,M,D,home_team,away_team,home_win,home_pitcher,...,elo_diff,elo_pct_diff,avg_diff,obp_diff,slg_diff,avg_pct_diff,obp_pct_diff,slg_pct_diff,home_rest,away_rest
0,0,0,2001-04-01,2001,4.0,1.0,TOR,TEX,1.0,loaizes01,...,20.4,1.360396,-0.00806,-0.010103,0.023271,-2.947374,-2.977845,4.989568,5.0,5.0
1,0,0,2001-04-02,2001,4.0,2.0,SEA,OAK,1.0,garcifr03,...,-15.232,-1.002459,-0.000864,0.00119,-0.016229,-0.323318,0.331871,-3.70521,5.0,5.0
2,0,0,2001-04-02,2001,4.0,2.0,NYA,KCA,1.0,clemero02,...,36.359001,2.377165,-0.010188,0.006929,0.024787,-3.703559,1.970596,5.554343,5.0,5.0
3,0,0,2001-04-02,2001,4.0,2.0,CIN,ATL,0.0,harnipe01,...,3.41,0.223274,0.003972,-0.001729,0.020216,1.459194,-0.50696,4.555242,5.0,5.0
4,0,0,2001-04-02,2001,4.0,2.0,CHN,WAS,0.0,liebejo01,...,0.745,0.05094,-0.010158,0.009335,-0.018992,-3.99634,2.80356,-4.646432,5.0,5.0


In [138]:
pitchers_df = pd.read_csv(Files.pitchers_games)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [139]:
pitchers_df['Date'] = pd.to_datetime(pitchers_df['Date'])
pitchers_df['Year'] = pitchers_df['Date'].dt.year
# pitchers_df = pitchers_df[['Date', 'Year', 'Tm', 'ERA', 'WHIP']]

In [150]:
df = ds.data.merge(pitchers_df, left_on=['home_team', 'Y', 'home_pitcher'], right_on=['Home_Tm', 'Year', 'name'], how='left')

In [151]:
df = df[df['Date'] < df['date']]

In [152]:
df[(df['date'] <= '2001-05-01') & (df['home_team'] == 'ANA')][['date', 'home_team', 'away_team', 'Date']].head(30)

Unnamed: 0,date,home_team,away_team,Date
1271,2001-04-15,ANA,SEA,2001-04-10
1490,2001-04-17,ANA,OAK,2001-04-11
2713,2001-05-01,ANA,CHA,2001-04-10
2714,2001-05-01,ANA,CHA,2001-04-15


In [149]:
pitchers_df[(pitchers_df['Tm'] == 'ANA') & (pitchers_df['Date'] <= '2001-05-01') & (pitchers_df['Date'] > '2001-01-01')][['Tm', 'Opp', 'Home_Tm', 'Date']]

Unnamed: 0,Tm,Opp,Home_Tm,Date
16615,ANA,TEX,TEX,2001-04-04
16616,ANA,TEX,ANA,2001-04-10
16617,ANA,TEX,ANA,2001-04-12
16618,ANA,OAK,ANA,2001-04-16
16619,ANA,SEA,SEA,2001-04-21
...,...,...,...,...
364314,ANA,SEA,ANA,2001-04-15
364315,ANA,SEA,SEA,2001-04-19
364316,ANA,SEA,SEA,2001-04-21
364317,ANA,TOR,TOR,2001-04-27


In [135]:
i = 0

for _, game_df in df.groupby(['home_team', 'date']):
    game_df = game_df.sort_values('Date')
    print(game_df[['date', 'Date', 'Tm']].tail(1))
    if i > 3:
        break
    i += 1
    print('='*40)

           date       Date   Tm
2173 2001-04-10 2001-04-04  ANA
           date       Date   Tm
2514 2001-04-11 2001-04-05  ANA
           date       Date   Tm
2581 2001-04-12 2001-04-06  ANA
           date       Date   Tm
3565 2001-04-15 2001-04-10  ANA
           date       Date   Tm
4282 2001-04-17 2001-04-11  ANA
