In [1]:
import pandas as pd
import numpy as np
import os

from utils.data_cleaning import optimize, uniform_name

In [8]:
def load_rs(year):
    usecols = [0, 3, 6, 9, 10] + list(range(21, 77))

    cols = ['date', 'away_team', 'home_team', 'away_score', 'home_score']

    cols += [f'away_off_{i}' for i in range(22, 39)]
    cols += [f'away_pitch_{i}' for i in range(39, 44)]
    cols += [f'away_def_{i}' for i in range(44, 50)]

    cols += [f'home_off_{i}' for i in range(50, 67)]
    cols += [f'home_pitch_{i}' for i in range(67, 72)]
    cols += [f'home_def_{i}' for i in range(72, 78)]

    assert len(usecols) == len(cols)

    rs_df = pd.read_csv('../data/retrosheet/GL2000.txt', usecols=usecols, names=cols)

    rs_df['home_win'] = rs_df['home_score'] > rs_df['away_score']
    
    rs_df['away_team'] = rs_df['away_team'].apply(uniform_name)
    rs_df['home_team'] = rs_df['home_team'].apply(uniform_name)
    
    rs_df['date'] = pd.to_datetime(rs_df['date'], format='%Y%m%d')
    
    return rs_df

In [11]:
rs_all_years_df = None

for year in range(2000, 2020):
    rs_year_df = load_rs(year)
    if rs_all_years_df is None:
        rs_all_years_df = rs_year_df
    else:
        rs_all_years_df = pd.concat([rs_all_years_df, rs_year_df])
        
rs_all_years_df = optimize(rs_all_years_df)

In [12]:
rs_all_years_df.head()

Unnamed: 0,date,away_team,home_team,away_score,home_score,away_off_22,away_off_23,away_off_24,away_off_25,away_off_26,...,home_pitch_69,home_pitch_70,home_pitch_71,home_def_72,home_def_73,home_def_74,home_def_75,home_def_76,home_def_77,home_win
0,2000-03-29,CHN,NYN,5,3,33,12,1,0,2,...,5,1,0,27,12,0,0,4,0,False
1,2000-03-30,NYN,CHN,5,1,37,6,2,0,1,...,5,0,0,33,14,0,0,0,0,False
2,2000-04-03,COL,ATL,0,2,31,6,2,0,0,...,0,0,0,27,12,0,0,1,0,True
3,2000-04-03,MIL,CIN,3,3,22,7,1,0,0,...,2,0,0,16,8,2,0,0,0,False
4,2000-04-03,SFN,MIA,4,6,35,10,2,2,1,...,4,0,0,27,15,0,0,2,0,True


In [13]:
rs_all_years_df.shape

(48580, 62)

In [14]:
rs_all_years_df.to_csv('../data/retrosheet.csv', index=False)