In [15]:
import os
import subprocess
from zipfile import ZipFile

import pandas as pd
import wget
pd.set_option('display.max_columns', None)

In [16]:
def download_zipfile_from_url(url_str, output_name):
    """Download a zip archive into ``<cwd>/zip_files`` and return its path.

    Args:
        url_str: URL of the archive to download.
        output_name: File name to store the archive under inside the
            ``zip_files`` directory of the current working directory.

    Returns:
        The path of the file that was written by this call.
    """
    print(f"Downloading data from {url_str}")
    zip_dir = os.path.join(os.getcwd(), 'zip_files')
    # On a fresh checkout zip_files/ does not exist yet and the download would
    # fail when the file is moved into place, so create it up front.
    os.makedirs(zip_dir, exist_ok=True)
    requested_filepath = os.path.join(zip_dir, output_name)
    # NOTE(review): when the target already exists (e.g. on a notebook re-run)
    # wget appears to save under a " (1)"-style suffixed name instead of
    # overwriting -- confirm against the installed wget version. Returning the
    # path reported by wget keeps callers from unzipping a stale archive.
    output_filepath = wget.download(url_str, requested_filepath) or requested_filepath
    print(f"Downloaded zip file to {output_filepath}\n")
    return output_filepath

def unzip_file(zip_filepath, output_dir):
    """Extract every member of a zip archive into ``output_dir``.

    Args:
        zip_filepath: Path of the zip archive to read.
        output_dir: Directory the archive contents are extracted into.

    Returns:
        None. A confirmation line is printed once extraction has finished.
    """
    with ZipFile(zip_filepath) as archive:
        archive.extractall(path=output_dir)
    print(f"Unzipped File to {output_dir}\n")

def parse_retrosheet_events(raw_dir='retrosheet_raw', year=2020):
    """Convert Retrosheet event files to CSV using Chadwick's ``cwevent``.

    Every file in ``raw_dir`` whose extension starts with ``.EV`` is run
    through ``cwevent -n -f 0-96 -y <year>`` and the output is written to
    ``<name before the first dot>.csv`` in the same directory. The raw event
    file is deleted only after its conversion succeeded.

    The working directory of the calling process is never changed; the parser
    is started with ``raw_dir`` as its own working directory instead, so a
    failure cannot leave the notebook kernel in the wrong directory.

    Args:
        raw_dir: Directory holding the unzipped Retrosheet files.
        year: Season passed to ``cwevent`` via ``-y``.

    Returns:
        None.

    Raises:
        OSError: If ``cwevent`` cannot be started (e.g. it is not installed)
            or the output file cannot be written.
        subprocess.CalledProcessError: If ``cwevent`` exits with a non-zero
            status.
    """
    # Sorted so files are processed in a reproducible order.
    for filename in sorted(os.listdir(raw_dir)):
        if not os.path.splitext(filename)[1].startswith('.EV'):
            continue
        csv_path = os.path.join(raw_dir, filename.split('.')[0] + '.csv')
        # Argument list (no shell) so unusual file names cannot be
        # misinterpreted as shell syntax.
        cmd = ['cwevent', '-n', '-f', '0-96', '-y', str(year), filename]
        try:
            with open(csv_path, 'wb') as csv_file:
                subprocess.run(cmd, cwd=raw_dir, stdout=csv_file, check=True)
        except (OSError, subprocess.CalledProcessError):
            # Drop the partial/empty csv so later steps do not read it, and
            # keep the raw event file so the conversion can be retried.
            if os.path.exists(csv_path):
                os.remove(csv_path)
            raise
        os.remove(os.path.join(raw_dir, filename))
    print("Parsed Retrosheet data")
    return None

In [17]:
# 01. LAHMAN DATA

## Download the Lahman database archive from the Chadwick Bureau repository
lahman_url = "https://github.com/chadwickbureau/baseballdatabank/archive/master.zip"
lahman_zip_filepath = download_zipfile_from_url(lahman_url, 'lahman_db_master.zip')

## Unzip the archive into final_data/ under the current working directory
lahman_output_dir = os.path.join(os.getcwd(), 'final_data')
unzip_file(lahman_zip_filepath, lahman_output_dir)

# 02. RETROSHEET PLAY-BY-PLAY DATA

## Download (https, consistent with the game log download from the same host)
retrosheet_event_zip_filepath = download_zipfile_from_url(
    url_str="https://www.retrosheet.org/events/2020eve.zip",
    output_name='2020eve.zip')

## Unzip the raw Retrosheet event archive (event and roster files)
unzip_file(
    zip_filepath=retrosheet_event_zip_filepath,
    output_dir='retrosheet_raw')

# Convert Retrosheet event files to csv, using Chadwick's parser
parse_retrosheet_events()

# Combine the per-file csvs into one play-by-play frame and one roster frame.
# Roster files carry no header row, so column names are supplied here.
roster_headers = ['PlayerID', 'LastName', 'FirstName', 'Bats', 'Pitches', 'Team', 'Position']
retrosheet_raw_dir = os.path.join(os.getcwd(), 'retrosheet_raw')

# Collect the frames in lists and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copies the accumulated data on every call.
pbp_frames = []
roster_frames = []
# Sorted so the row order of the combined files is reproducible across runs.
for filename in sorted(os.listdir(retrosheet_raw_dir)):
    filepath = os.path.join(retrosheet_raw_dir, filename)
    if filename.endswith('.csv'):
        pbp_frames.append(pd.read_csv(filepath, header=0))
    elif filename.endswith('.ROS'):
        roster_frames.append(pd.read_csv(filepath, header=None, names=roster_headers))

# pd.concat raises on an empty list; fall back to an empty frame in that case.
all_pbp_data = pd.concat(pbp_frames) if pbp_frames else pd.DataFrame()
all_roster_data = pd.concat(roster_frames) if roster_frames else pd.DataFrame()

# to_csv does not create missing parent directories.
os.makedirs('final_data/retrosheet_pbp', exist_ok=True)
os.makedirs('final_data/retrosheet_rosters', exist_ok=True)
all_pbp_data.to_csv('final_data/retrosheet_pbp/all_pbp_data.csv')
all_roster_data.to_csv('final_data/retrosheet_rosters/all_roster_data.csv')

# Download Retrosheet game log data
retrosheet_gamelog_zip_filepath = download_zipfile_from_url(
    url_str="https://www.retrosheet.org/gamelogs/gl2020.zip",
    output_name='gl2020.zip')

# Unzip the raw Retrosheet game log into retrosheet_raw/
unzip_file(
    zip_filepath=retrosheet_gamelog_zip_filepath,
    output_dir=os.path.join(os.getcwd(), 'retrosheet_raw'))

# The game log is read without a header row; column names are taken from the
# header file published in the baseball_R repository.
gamelog_headers = list(pd.read_csv('https://raw.githubusercontent.com/maxtoki/baseball_R/master/data/game_log_header.csv').columns)

# Locate the extracted file case-insensitively so the lookup also works on
# case-sensitive filesystems if the archive member is named e.g. GL2020.TXT.
# Falls back to the original name so a missing file still fails in read_csv.
gamelog_filename = next(
    (name for name in os.listdir('retrosheet_raw') if name.lower() == 'gl2020.txt'),
    'GL2020.txt')
gamelog_df = pd.read_csv(
    os.path.join('retrosheet_raw', gamelog_filename),
    header=None,
    names=gamelog_headers)

# to_csv does not create missing parent directories.
os.makedirs('final_data/retrosheet_gamelogs', exist_ok=True)
gamelog_df.to_csv('final_data/retrosheet_gamelogs/gamelog_2020.csv')

Downloading data from https://github.com/chadwickbureau/baseballdatabank/archive/master.zip
Downloaded zip file to C:\Users\Michael Kalmus\python_dev\retrosheet_parsing\retrosheet_parsing\zip_files\lahman_db_master.zip

Unzipped File to C:\Users\Michael Kalmus\python_dev\retrosheet_parsing\retrosheet_parsing\final_data

Downloading data from http://www.retrosheet.org/events/2020eve.zip
Downloaded zip file to C:\Users\Michael Kalmus\python_dev\retrosheet_parsing\retrosheet_parsing\zip_files\2020eve.zip

Unzipped File to retrosheet_raw

Parsed Retrosheet data
Downloading data from https://www.retrosheet.org/gamelogs/gl2020.zip
Downloaded zip file to C:\Users\Michael Kalmus\python_dev\retrosheet_parsing\retrosheet_parsing\zip_files\gl2020.zip

Unzipped File to C:\Users\Michael Kalmus\python_dev\retrosheet_parsing\retrosheet_parsing\retrosheet_raw

