# Data loader

I wrote some helper functions to make loading data easier for experimentation. The code is located in `Notebook/utils/data_loader.py`. Below are examples of how to use it.

In [35]:
from utils.data_loader import Dataset

import pandas as pd

Everything is housed inside a `Dataset`. It only requires a name (used for saving and loading).

In [36]:
# `name` is the only required argument; save() and load() use it to locate the file.
train_ds = Dataset(name='my_training_dataset')

Initially it does nothing. You can always see what data it has actually loaded by accessing the `.data` attribute, which is a Pandas DataFrame.

In [37]:
# Identity check against None. `== None` would be evaluated element-wise by pandas
# once `.data` holds a DataFrame, returning a frame of booleans instead of a single bool.
train_ds.data is None

True

## Games

Load games using the `load_games` method. Optionally you can specify start and end dates. By default it gets all data from 2000 to 2015 inclusive.

In [38]:
# Narrow the load to a sub-window of the default 2000-2015 range.
train_ds.load_games(start_date='2000-01-01', end_date='2003-01-01')

Unnamed: 0,date,Y,M,D,home_team,away_team,home_win,home_pitcher,away_pitcher,home_elo,...,avg_diff,obp_diff,slg_diff,avg_pct_diff,obp_pct_diff,slg_pct_diff,home_rest,away_rest,away_team_season_game_num,home_team_season_game_num


This returns all games in the requested date range (data comes from `data/mlb_games_df.csv`). Besides being returned, the data is always stored on the `.data` attribute.

In [15]:
# `.data` holds the frame that load_games() returned above; show its first five rows.
train_ds.data.head()

Unnamed: 0,away_team_season_game_num,home_team_season_game_num,date,Y,M,D,home_team,away_team,home_win,home_pitcher,...,elo_diff,elo_pct_diff,avg_diff,obp_diff,slg_diff,avg_pct_diff,obp_pct_diff,slg_pct_diff,home_rest,away_rest
0,0,0,2001-04-01,2001,4.0,1.0,TOR,TEX,1.0,loaizes01,...,20.4,1.360396,-0.00806,-0.010103,0.023271,-2.947374,-2.977845,4.989568,5.0,5.0
1,0,0,2001-04-02,2001,4.0,2.0,SEA,OAK,1.0,garcifr03,...,-15.232,-1.002459,-0.000864,0.00119,-0.016229,-0.323318,0.331871,-3.70521,5.0,5.0
2,0,0,2001-04-02,2001,4.0,2.0,NYA,KCA,1.0,clemero02,...,36.359001,2.377165,-0.010188,0.006929,0.024787,-3.703559,1.970596,5.554343,5.0,5.0
3,0,0,2001-04-02,2001,4.0,2.0,CIN,ATL,0.0,harnipe01,...,3.41,0.223274,0.003972,-0.001729,0.020216,1.459194,-0.50696,4.555242,5.0,5.0
4,0,0,2001-04-02,2001,4.0,2.0,CHN,WAS,0.0,liebejo01,...,0.745,0.05094,-0.010158,0.009335,-0.018992,-3.99634,2.80356,-4.646432,5.0,5.0


## Team stats

You can add team stats (from `data/team_stats.csv`) from previous years. To do so, use the `.add_team_stats()` method. By default it uses the previous year's data, but you can add different/more years by specifying `year_offset` (default of 1). So, for example, `year_offset=2` would get team stats from two years prior. Columns are named `{home/away}_{col_name}_offset{year_offset}`. So, for example, `home_Avg_Attendance_offset1` is the home team's average attendance last year.

By default, **no columns are actually loaded from the team stats**, so you need to specify them using `cols=[...]`. Again, check out the `team_stats.csv` file to see what columns are available.

In [16]:
# No team-stat columns are added unless listed in `cols`; year_offset is left at its default of 1.
train_ds.add_team_stats(cols=['Avg_Attendance', 'W-L-pct'])

Unnamed: 0,away_team_season_game_num,home_team_season_game_num,date,Y,M,D,home_team,away_team,home_win,home_pitcher,...,slg_diff,avg_pct_diff,obp_pct_diff,slg_pct_diff,home_rest,away_rest,home_Avg_Attendance_offset1,home_W-L-pct_offset1,away_Avg_Attendance_offset1,away_W-L-pct_offset1
0,0,0,2001-04-01,2001,4.0,1.0,TOR,TEX,1.0,loaizes01,...,0.023271,-2.947374,-2.977845,4.989568,5.0,5.0,24861.419753,0.512346,32341.993789,0.438272
20,10,10,2001-04-24,2001,4.0,24.0,TOR,TEX,1.0,hamiljo02,...,-0.092964,-24.448645,-15.287162,-21.483376,2.0,2.0,24861.419753,0.512346,32341.993789,0.438272
40,11,11,2001-04-25,2001,4.0,25.0,TOR,TEX,1.0,carpech01,...,-0.108138,-25.324224,-19.726679,-25.543036,1.0,1.0,24861.419753,0.512346,32341.993789,0.438272
60,61,63,2001-08-17,2001,8.0,17.0,TOR,TEX,1.0,loaizes01,...,0.114988,16.891756,10.999306,20.949846,1.0,1.0,24861.419753,0.512346,32341.993789,0.438272
80,62,64,2001-08-18,2001,8.0,18.0,TOR,TEX,0.0,hallaro01,...,0.079077,10.907022,4.948137,15.053859,1.0,1.0,24861.419753,0.512346,32341.993789,0.438272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92951,54,55,2002-08-03,2002,8.0,3.0,SDN,CIN,0.0,tomkobr01,...,0.072507,23.059618,4.474744,15.819613,1.0,1.0,30563.919255,0.487654,28449.643750,0.407407
92971,55,56,2002-08-04,2002,8.0,4.0,SDN,CIN,0.0,jonesbo03,...,-0.002086,14.271653,-2.340771,-0.447244,1.0,1.0,30563.919255,0.487654,28449.643750,0.407407
92991,29,31,2002-06-07,2002,6.0,7.0,ANA,CIN,1.0,seleaa01,...,0.143131,16.378798,12.287816,29.475273,1.0,1.0,27714.104938,0.462963,28449.643750,0.407407
93011,30,32,2002-06-08,2002,6.0,8.0,ANA,CIN,0.0,ortizra02,...,0.145953,16.756199,11.639690,31.446281,1.0,1.0,27714.104938,0.462963,28449.643750,0.407407


## Team pitching

Similarly, you can add team pitching stats (at a team level, _not_ a pitcher level) using `.add_team_pitching_stats()`. The parameters are the same as `.add_team_stats()`. Data comes from `data/team_pitching_stats.csv`. Columns are named `team_{home/away}_{col_name}_offset{year_offset}`, e.g. `team_home_WHIP_offset1`.

In [17]:
# Adds team-level WHIP and ERA; year_offset is not passed, so the default applies.
train_ds.add_team_pitching_stats(cols=['WHIP', 'ERA'])

Unnamed: 0,away_team_season_game_num,home_team_season_game_num,date,Y,M,D,home_team,away_team,home_win,home_pitcher,...,home_rest,away_rest,home_Avg_Attendance_offset1,home_W-L-pct_offset1,away_Avg_Attendance_offset1,away_W-L-pct_offset1,team_home_WHIP_offset1,team_home_ERA_offset1,team_away_WHIP_offset1,team_away_ERA_offset1
0,0,0,2001-04-01,2001,4.0,1.0,TOR,TEX,1.0,loaizes01,...,5.0,5.0,24861.419753,0.512346,32341.993789,0.438272,1.513465,5.17,1.640308,5.52
20,10,10,2001-04-24,2001,4.0,24.0,TOR,TEX,1.0,hamiljo02,...,2.0,2.0,24861.419753,0.512346,32341.993789,0.438272,1.513465,5.17,1.640308,5.52
40,11,11,2001-04-25,2001,4.0,25.0,TOR,TEX,1.0,carpech01,...,1.0,1.0,24861.419753,0.512346,32341.993789,0.438272,1.513465,5.17,1.640308,5.52
60,61,63,2001-08-17,2001,8.0,17.0,TOR,TEX,1.0,loaizes01,...,1.0,1.0,24861.419753,0.512346,32341.993789,0.438272,1.513465,5.17,1.640308,5.52
80,62,64,2001-08-18,2001,8.0,18.0,TOR,TEX,0.0,hallaro01,...,1.0,1.0,24861.419753,0.512346,32341.993789,0.438272,1.513465,5.17,1.640308,5.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90520,16,17,2001-05-08,2001,5.0,8.0,ARI,CIN,1.0,johnsra05,...,1.0,1.0,33104.598765,0.524691,34630.858025,0.524691,1.344928,4.36,1.445642,4.33
90540,17,18,2001-05-09,2001,5.0,9.0,ARI,CIN,1.0,ellisro02,...,1.0,1.0,33104.598765,0.524691,34630.858025,0.524691,1.344928,4.36,1.445642,4.33
90561,59,59,2002-08-20,2002,8.0,20.0,ARI,CIN,1.0,johnsra05,...,2.0,1.0,32990.739130,0.567901,28449.643750,0.407407,1.242462,3.88,1.447095,4.78
90581,60,60,2002-08-21,2002,8.0,21.0,ARI,CIN,1.0,schilcu01,...,1.0,1.0,32990.739130,0.567901,28449.643750,0.407407,1.242462,3.88,1.447095,4.78


## Pitcher stats

Finally, you can add stats for an individual pitcher (from `data/pitchers_games.csv`). Rather than a year offset, this uses `game_offset`, and only considers games from the same season (so for the first game of the season any "previous" stats will be `None`). By default it uses the previous game's stats. Note that these are games where that pitcher started, so it's possible to have stats come from several games back, if that was the most recent game in which the pitcher started (I can change this if you'd like it to ignore who the starting pitcher is). The new columns are prefixed with `pitcher_`, e.g. `pitcher_home_WHIP_offset1`.

**Note:** This function is fairly slow, especially for a big block of seasons. It's grabbing _all_ past games for this pitcher, filtering, and manipulating. So if you've loaded all the data you want, consider using the `.save()` method below so you don't have to reload it over and over.

In [18]:
# Slow on large date ranges. game_offset is not passed, so the default (the pitcher's previous start) applies.
train_ds.add_pitcher_stats(cols=['WHIP', 'ERA', 'IP'])

  if (await self.run_code(code, result,  async_=asy)):


Unnamed: 0,away_team_season_game_num,home_team_season_game_num,date,Y,M,D,home_team,away_team,home_win,home_pitcher,...,team_home_WHIP_offset1,team_home_ERA_offset1,team_away_WHIP_offset1,team_away_ERA_offset1,pitcher_home_WHIP_offset1,pitcher_home_ERA_offset1,pitcher_home_IP_offset1,pitcher_away_WHIP_offset1,pitcher_away_ERA_offset1,pitcher_away_IP_offset1
9549,13,13,2001-04-28,2001,4.0,28.0,CHA,SEA,0.0,biddlro01,...,1.464037,4.67,1.440466,4.53,2.549020,3.86,5.1,,,
28030,18,19,2001-05-11,2001,5.0,11.0,TOR,SEA,0.0,hamiljo02,...,1.513465,5.17,1.440466,4.53,1.285714,4.99,7.0,1.568627,4.7,5.1
72000,21,22,2001-05-22,2001,5.0,22.0,MIN,SEA,1.0,radkebr01,...,1.501187,5.16,1.440466,4.53,2.195122,3.39,4.1,4.545455,6.66,2.2
19077,26,26,2001-05-28,2001,5.0,28.0,KCA,SEA,0.0,durbich01,...,1.582934,5.48,1.440466,4.53,3.333333,5.2,3.0,1.000000,5.67,9.0
4683,28,35,2001-06-14,2001,6.0,14.0,COL,SEA,0.0,astacpe01,...,1.507692,5.29,1.440466,4.53,2.000000,5.28,6.0,1.142857,4.25,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3334,46,49,2002-07-23,2002,7.0,23.0,ANA,OAK,0.0,appieke01,...,1.375591,4.20,1.246668,3.59,0.750000,4.5,8.0,1.475410,3.04,6.1
47901,54,54,2002-08-08,2002,8.0,8.0,BOS,OAK,1.0,lowede01,...,1.350829,4.18,1.246668,3.59,0.857143,2.09,7.0,1.375000,3.04,8.0
70814,62,62,2002-08-23,2002,8.0,23.0,DET,OAK,0.0,powelbr01,...,1.523336,5.01,1.246668,3.59,2.195122,5.09,4.1,0.857143,2.89,7.0
54885,70,71,2002-09-08,2002,9.0,8.0,MIN,OAK,0.0,miltoer01,...,1.345500,4.51,1.246668,3.59,1.500000,4.68,4.0,0.714286,2.74,7.0


## Saving and loading

Finally, you can save and load your data easily. By default files are saved to `data/saved_datasets/{your_dataset_name}.csv`.

In [19]:
# No arguments: writes to the default location derived from the dataset name.
train_ds.save()

If you want to load some data that you previously saved, create a new blank dataset with the same name as your saved data (so for example, to load the dataset we just saved we would create a new dataset with the same `name='my_training_dataset'`). Then run `.load()`.

In [20]:
# Reusing the name given to `train_ds` is what lets load() find the file saved earlier.
new_train_ds = Dataset(name='my_training_dataset')
new_train_ds.load()

In [21]:
# First five rows of the reloaded frame.
new_train_ds.data.head()

Unnamed: 0,away_team_season_game_num,home_team_season_game_num,date,Y,M,D,home_team,away_team,home_win,home_pitcher,...,team_home_WHIP_offset1,team_home_ERA_offset1,team_away_WHIP_offset1,team_away_ERA_offset1,pitcher_home_WHIP_offset1,pitcher_home_ERA_offset1,pitcher_home_IP_offset1,pitcher_away_WHIP_offset1,pitcher_away_ERA_offset1,pitcher_away_IP_offset1
0,13,13,2001-04-28,2001,4.0,28.0,CHA,SEA,0.0,biddlro01,...,1.464037,4.67,1.440466,4.53,2.54902,3.86,5.1,,,
1,18,19,2001-05-11,2001,5.0,11.0,TOR,SEA,0.0,hamiljo02,...,1.513465,5.17,1.440466,4.53,1.285714,4.99,7.0,1.568628,4.7,5.1
2,21,22,2001-05-22,2001,5.0,22.0,MIN,SEA,1.0,radkebr01,...,1.501187,5.16,1.440466,4.53,2.195122,3.39,4.1,4.545454,6.66,2.2
3,26,26,2001-05-28,2001,5.0,28.0,KCA,SEA,0.0,durbich01,...,1.582934,5.48,1.440466,4.53,3.333333,5.2,3.0,1.0,5.67,9.0
4,28,35,2001-06-14,2001,6.0,14.0,COL,SEA,0.0,astacpe01,...,1.507692,5.29,1.440466,4.53,2.0,5.28,6.0,1.142857,4.25,7.0


In [23]:
# Sanity check: the reloaded frame has the same (rows, columns) shape as the in-memory one.
# This compares dimensions only, not cell values.
new_train_ds.data.shape == train_ds.data.shape

True

## Delete me

In [43]:
from pathlib import Path

# Folder holding the raw CSV inputs (relative path).
base_dir = Path('../data')


class Files:
    """Namespace of paths to the raw CSV inputs, all located under ``base_dir``."""

    # Per-game covariates, taken from the paper
    games = base_dir / 'mlb_games_df.csv'
    # Pitcher summaries (mostly reference keys, few actual stats)
    pitchers = base_dir / 'pitchers_summary.csv'
    # Year-by-year pitching stats at team level
    team_pitching = base_dir / 'team_pitching_stats.csv'
    # General team-level data (attendance, W-L, etc.)
    teams = base_dir / 'team_stats.csv'
    # Pitcher stats at game level
    pitchers_games = base_dir / 'pitchers_games.csv'

In [44]:
# Read the games CSV directly with pandas rather than through the Dataset helper.
games_df = pd.read_csv(Files.games)

In [45]:
# Preview the first five rows of the raw CSV.
games_df.head()

Unnamed: 0,away_team_season_game_num,home_team_season_game_num,date,Y,M,D,home_team,away_team,home_win,home_pitcher,...,elo_diff,elo_pct_diff,avg_diff,obp_diff,slg_diff,avg_pct_diff,obp_pct_diff,slg_pct_diff,home_rest,away_rest
0,0,0,2001-04-01,2001,4.0,1.0,TOR,TEX,1.0,loaizes01,...,20.4,1.360396,-0.00806,-0.010103,0.023271,-2.947374,-2.977845,4.989568,5.0,5.0
1,0,0,2001-04-02,2001,4.0,2.0,SEA,OAK,1.0,garcifr03,...,-15.232,-1.002459,-0.000864,0.00119,-0.016229,-0.323318,0.331871,-3.70521,5.0,5.0
2,0,0,2001-04-02,2001,4.0,2.0,NYA,KCA,1.0,clemero02,...,36.359,2.377165,-0.010188,0.006929,0.024787,-3.703559,1.970596,5.554343,5.0,5.0
3,0,0,2001-04-02,2001,4.0,2.0,CIN,ATL,0.0,harnipe01,...,3.41,0.223274,0.003972,-0.001729,0.020216,1.459194,-0.50696,4.555242,5.0,5.0
4,0,0,2001-04-02,2001,4.0,2.0,CHN,WAS,0.0,liebejo01,...,0.745,0.05094,-0.010158,0.009335,-0.018992,-3.99634,2.80356,-4.646432,5.0,5.0


In [47]:
# Materialise the column labels as a plain Python list and display it.
cols = games_df.columns.tolist()
cols

['away_team_season_game_num',
 'home_team_season_game_num',
 'date',
 'Y',
 'M',
 'D',
 'home_team',
 'away_team',
 'home_win',
 'home_pitcher',
 'away_pitcher',
 'home_elo',
 'away_elo',
 'home_avg',
 'away_avg',
 'home_obp',
 'away_obp',
 'home_slg',
 'away_slg',
 'home_iso',
 'away_iso',
 'elo_diff',
 'elo_pct_diff',
 'avg_diff',
 'obp_diff',
 'slg_diff',
 'avg_pct_diff',
 'obp_pct_diff',
 'slg_pct_diff',
 'home_rest',
 'away_rest']

In [None]:
# NOTE(review): `g` is not defined in any cell visible in this notebook, so unless it is
# defined elsewhere this assignment raises NameError. Looks like an unfinished scratch
# cell (it was never executed) - TODO complete or remove.
games_df = g