In [1]:
import glob
import pandas as pd
import bs4
from bs4 import BeautifulSoup
import datetime
import requests
import urllib2
import re
import sys
import os

In [2]:
def get_soup(filepath):
    """Read the file at ``filepath`` and parse it with BeautifulSoup.

    The file content is handed to BeautifulSoup with the 'lxml-xml' parser
    and the resulting soup object is returned.
    """
    with open(filepath, 'r') as xml_file:
        xml_text = xml_file.read()
    return BeautifulSoup(xml_text, 'lxml-xml')

In [3]:
# Base URL of the game-data server; download_data() appends the
# year_/month_/day_ path components to it.
server_url = 'http://gdx.mlb.com/components/game/mlb/'

In [4]:
def extract_field(document, tag, attr=None):
    """Collect the text, or one attribute, of every ``tag`` element in ``document``.

    Parameters:
        document: object exposing ``find_all(tag)`` (e.g. a BeautifulSoup tree).
        tag: tag name passed to ``find_all``.
        attr: when given, the attribute to read from each match; when None,
            the text of each match is used instead.

    Returns:
        None when nothing matches; the single value when exactly one element
        matches; otherwise a list with one value per match. Each value is
        encoded to ASCII with unencodable characters dropped. A match that
        does not carry ``attr`` contributes None.
    """
    matches = document.find_all(tag)

    if len(matches) == 0:
        return None

    def _to_ascii(value):
        # .get(attr) yields None for a tag without that attribute; pass the
        # None through instead of failing on None.encode(...).
        if value is None:
            return None
        return value.encode('ASCII', 'ignore')

    if attr is not None:
        values = [_to_ascii(match.get(attr)) for match in matches]
    else:
        values = [_to_ascii(match.text) for match in matches]

    if len(values) == 1:
        return values[0]
    return values

In [5]:
def extract_game_info(game_soup):
    """Build a dict of game-level facts from a parsed box-score document.

    Parameters:
        game_soup: parsed document exposing ``find`` and ``find_all``.

    Returns:
        dict with the keys 'game_type', 'venue_name', 'venue_id', 'game_id'
        and 'date' copied from the <boxscore> attributes, 'teams' (a list with
        the attribute dict of every <team> element), and, when a child of
        <umpires> has position 'HP', 'umpire' ({'id': ..., 'name': ...}).

    Raises:
        ValueError: the document has no <boxscore> element.
        KeyError: a required attribute is missing.
    """
    boxscore = game_soup.find('boxscore')
    if boxscore is None:
        raise ValueError("no <boxscore> element found in game document")

    game_dict = {}
    for key in ('game_type', 'venue_name', 'venue_id', 'game_id', 'date'):
        game_dict[key] = boxscore[key]

    umpires = game_soup.find('umpires')
    if umpires is None:
        umpire_tags = []
    else:
        # Iterating the <umpires> tag directly also yields the text nodes
        # between its children, and indexing a text node by attribute name
        # raises. find_all(True, recursive=False) returns only the direct
        # child tags.
        umpire_tags = umpires.find_all(True, recursive=False)

    for umpire in umpire_tags:
        # .get() so that a child without a 'position' attribute is skipped.
        if umpire.get('position') == 'HP':
            game_dict['umpire'] = {'id': umpire['id'], 'name': umpire['name']}
            break

    game_dict['teams'] = [team.attrs for team in game_soup.find_all('team')]

    return game_dict

In [6]:
def extract_pitch_info(inning_soup):
    """Flatten every usable <pitch> of a parsed innings document into a list of dicts.

    Parameters:
        inning_soup: parsed document exposing ``find_all('atbat')``; each
            at-bat exposes ``find_all('pitch')``, attribute access by key and
            a ``parent`` chain (half-inning, then inning).

    Returns:
        list of dicts, one per pitch, in document order. A pitch is omitted
        when any required attribute is missing or a coordinate cannot be
        converted to float, because such a record is of no use for plotting.
        'pitch_count' is the 1-based position of the pitch within its at-bat
        and counts omitted pitches too. 'inning_count' is stored as read
        (not converted); 'atbat_count' is converted to int.
    """
    pitch_records = []

    for atbat in inning_soup.find_all('atbat'):
        for pitch_count, pitch in enumerate(atbat.find_all('pitch'), 1):
            try:
                record = {
                    # The at-bat's parent element is named after the half
                    # inning; anything other than 'top' is flagged 1.
                    'isHome': 0 if atbat.parent.name == 'top' else 1,
                    'inning_count': atbat.parent.parent['num'],

                    'atbat_count': int(atbat['num']),
                    'batter_height': atbat['b_height'],
                    'batter_id': atbat['batter'],
                    'pitcher_id': atbat['pitcher'],
                    'handedness': atbat['stand'],
                    'p_throws': atbat['p_throws'],

                    'pitch_count': pitch_count,
                    'des': pitch['des'],
                    'type': pitch['type'],
                    'px': float(pitch['px']),
                    'pz': float(pitch['pz']),
                    'sz_top': float(pitch['sz_top']),
                    'sz_bottom': float(pitch['sz_bot']),
                }
            except (KeyError, ValueError, TypeError, AttributeError):
                # Missing attribute, unparsable number or missing parent:
                # drop this pitch. Anything else (e.g. KeyboardInterrupt)
                # is deliberately allowed to propagate.
                continue

            pitch_records.append(record)

    return pitch_records

In [7]:
def parse_game(gid_dir):
    """Parse the two XML files of one game directory into a single dict.

    ``gid_dir`` may be given with or without trailing slashes. The files
    ``rawboxscore.xml`` and ``inning_all.xml`` are read from it; the result
    is the dict produced by extract_game_info() with the output of
    extract_pitch_info() stored under 'pitches'.
    """
    base = gid_dir.rstrip('/') + "/"

    # Both files are parsed before any field extraction starts.
    boxscore_soup, innings_soup = [
        get_soup(base + xml_name)
        for xml_name in ("rawboxscore.xml", "inning_all.xml")
    ]

    parsed_game = extract_game_info(boxscore_soup)
    parsed_game['pitches'] = extract_pitch_info(innings_soup)
    return parsed_game

In [8]:
# Root directory for downloaded game folders. grab_page_with_pattern()
# appends "/<name>" to this value, hence no trailing slash here.
data_dir = './data'

In [9]:
def make_dir(designated_dir):
    """Create ``designated_dir`` (including missing parents) unless the path already exists."""
    if os.path.exists(designated_dir):
        return
    os.makedirs(designated_dir)

In [10]:
def download_page(url, download_dir):
    """
    Download the resource at url into download_dir unless it is already there.

    The local file name is the last '/'-separated component of url and the
    target path is download_dir + filename, so download_dir must end with a
    path separator.

    Nothing is fetched when the target file already exists. A failed fetch or
    a failed write is reported on stderr and otherwise ignored; a failed fetch
    never creates a file, so a later run will try that URL again.

    Returns None.
    """
    filename = url.split('/')[-1]
    target = download_dir + filename

    # Check first: a file that is already on disk needs no network round trip.
    if os.path.exists(target):
        return

    try:
        response = urllib2.urlopen(url)
        try:
            payload = response.read()
        finally:
            response.close()
    except Exception as err:
        # Bail out before touching the disk; otherwise an empty file would be
        # left behind and make every later run skip this URL.
        sys.stderr.write("url: {0}\n\t{1}\n".format(url, str(err)))
        return

    # Label the download with the name of the target folder. basename() is
    # used rather than str.strip('./data/'), which removes a *set of
    # characters* from both ends and can eat into the folder name.
    folder = os.path.basename(download_dir.rstrip('/'))
    print("Downloading %s" % (folder + "/" + filename))

    try:
        # Binary mode: the payload is written exactly as received.
        with open(target, 'wb') as handle:
            handle.write(payload)
    except (IOError, OSError) as err:
        sys.stderr.write("file: {0}\n\t{1}\n".format(target, str(err)))

In [11]:
def grab_page_with_pattern(url, pattern):
    """
    Recursively walk the directory listing at url and download the game files below it.

    pattern is a list of regular expressions, one per directory level still to
    descend. At each level the listing page is fetched, the text of every <a>
    element is matched (re.match, i.e. anchored at the start) against
    pattern[0], and the function recurses into each matching entry with the
    remaining patterns.

    When pattern is empty, url is taken to be a game directory: its
    rawboxscore.xml and inning/inning_all.xml are saved with download_page()
    into <data_dir>/<last path component of url>/, which is created if needed.

    url may be given with or without a trailing slash. Errors raised while
    fetching a listing page propagate to the caller.
    """
    # Canonical form with exactly one trailing slash, so that child URLs can
    # be built by plain concatenation whatever form the caller passed in.
    base_url = url.rstrip('/') + '/'

    if len(pattern) == 0:
        directory = data_dir.rstrip('/') + "/" + base_url.split('/')[-2] + "/"
        make_dir(directory)
        download_page(base_url + 'rawboxscore.xml', directory)
        download_page(base_url + 'inning/inning_all.xml', directory)
        return

    response = urllib2.urlopen(base_url)
    try:
        listing = response.read()
    finally:
        response.close()

    # Name the parser explicitly so the result does not depend on which
    # parsers happen to be installed. 'lxml' is assumed to be available
    # because get_soup() in this notebook uses 'lxml-xml' -- TODO confirm.
    soup = BeautifulSoup(listing, 'lxml')

    wanted_pattern = pattern[0]
    matched_names = []

    for anchor in soup.find_all('a'):
        for content in anchor.contents:
            text = str(content).strip()
            if re.match(wanted_pattern, text):
                # Keep the bare entry name; slashes are re-added below.
                matched_names.append(text.strip('/'))

    for name in matched_names:
        grab_page_with_pattern(base_url + name + '/', pattern[1:])

In [12]:
def _gdx_date_part(value):
    """Return a month/day argument as the string used in directory names ('' or None -> '')."""
    if value is None:
        return ""
    text = str(value).strip()
    # Integers and unpadded strings (4, "4") become "04"; other text is kept.
    return text.zfill(2) if text.isdigit() else text


def download_data(year = 2017, month = "", day = ""):

    """
    download_data() downloads the game files for the requested period.

    year, month and day may be strings or integers ("YYYY", "MM", "DD");
    single-digit months and days are zero-padded. An empty month or day
    (the default) means "all of them":

      * month empty  -> every month of the year (day, if given, restricts
                        the days within each month),
      * day empty    -> every day of the given month,
      * both given   -> that single date.

    By default the whole 2017 season is downloaded.
    The work is done by grab_page_with_pattern(); nothing is returned.
    """

    year = str(year)
    month = _gdx_date_part(month)
    day = _gdx_date_part(day)

    year_pattern = "year_" + year
    month_pattern = "month_" + month
    day_pattern = "day_" + day
    game_pattern  = "gid_"

    # Compare by value: `is ""` tests object identity and only works by accident.
    if month == "":
        url = server_url + year_pattern + "/"
        pattern = [month_pattern, day_pattern, game_pattern]

    elif day == "":
        # The trailing slash matters: at this level grab_page_with_pattern()
        # may append the day directory name to url by plain concatenation.
        url = server_url + year_pattern + "/" + month_pattern + "/"
        pattern = [day_pattern, game_pattern]

    else:
        url = server_url + year_pattern + "/" + month_pattern + "/" + day_pattern
        pattern = [game_pattern]

    print("Downloading data for %s %s %s" % (year, month, day))

    grab_page_with_pattern(url, pattern)

In [13]:
import pymongo
from pymongo import MongoClient

In [14]:
# Client for the MongoDB server at localhost, port 27017.
client = MongoClient('mongodb://localhost:27017/')

In [15]:
# Handles for the database "test_database" and its collection
# "test_collection"; parsed games are inserted into this collection.
pitchfx_db2 = client.test_database
pitchfx_collection2 = pitchfx_db2.test_collection

In [16]:
# NOTE(review): this rebinds the global data_dir (set to './data' earlier),
# now with a trailing slash. Code that concatenates "/" onto data_dir may
# produce a doubled slash if it runs after this cell -- confirm that is intended.
data_dir = "./data/"
# Every entry under data_dir whose name starts with gid_; the trailing slash
# in the glob pattern restricts the matches to directories.
gid_dirs = glob.glob(data_dir + 'gid_*/')

In [None]:
# Destructive: the empty filter matches every document, so this empties the
# collection before games are inserted again.
pitchfx_collection2.delete_many({})

<pymongo.results.DeleteResult at 0x7fc16175c870>

In [None]:
# Parse every downloaded game directory and insert it into MongoDB,
# timing the whole run.
start_time = datetime.datetime.now()
print(start_time)

# Directories that could not be parsed or inserted, kept for inspection.
failed_dirs = []

for d in gid_dirs:
    print(d)
    try:
        pitchfx_collection2.insert_one(parse_game(d))
    except Exception as err:
        # One bad game must not stop the load, but it must not vanish
        # silently either. `except Exception` (rather than a bare except)
        # leaves KeyboardInterrupt free to stop the loop.
        failed_dirs.append(d)
        sys.stderr.write("skipped {0}\n\t{1}\n".format(d, str(err)))

end_time = datetime.datetime.now()
print('Elapsed: %s' % (end_time - start_time))

2018-09-11 15:43:06.972714
./data/gid_2017_04_23_tormlb_anamlb_1/
./data/gid_2017_09_09_sdnmlb_arimlb_1/
./data/gid_2017_08_26_houmlb_anamlb_1/
./data/gid_2017_06_02_lanmlb_milmlb_1/
./data/gid_2017_04_24_sdnmlb_arimlb_1/
./data/gid_2017_08_13_sdnmlb_lanmlb_1/
./data/gid_2017_10_06_awcmlb_adtmlb_1/
./data/gid_2017_06_14_atlmlb_wasmlb_1/
./data/gid_2017_08_18_arimlb_minmlb_1/
./data/gid_2017_08_04_detmlb_balmlb_1/
./data/gid_2017_08_02_tbamlb_houmlb_1/
./data/gid_2017_05_03_milmlb_slnmlb_1/
./data/gid_2017_07_08_oakmlb_seamlb_1/
./data/gid_2017_07_01_sfnmlb_pitmlb_1/
./data/gid_2017_08_25_sdnmlb_miamlb_1/
./data/gid_2017_05_01_chamlb_kcamlb_1/
./data/gid_2017_05_17_wasmlb_pitmlb_1/
./data/gid_2017_04_25_houmlb_clemlb_1/
./data/gid_2017_09_19_texmlb_seamlb_1/
./data/gid_2017_08_29_miamlb_wasmlb_1/
./data/gid_2017_09_27_cinmlb_milmlb_1/
./data/gid_2017_06_28_colmlb_sfnmlb_1/
./data/gid_2017_09_10_tbamlb_bosmlb_1/
./data/gid_2017_05_03_texmlb_houmlb_1/
./data/gid_2017_06_11_colmlb_chnmlb_1

./data/gid_2017_06_22_clemlb_balmlb_1/
./data/gid_2017_09_12_chamlb_kcamlb_1/
./data/gid_2017_04_15_chamlb_minmlb_1/
./data/gid_2017_09_08_colmlb_lanmlb_1/
./data/gid_2017_05_29_lanmlb_slnmlb_1/
./data/gid_2017_07_14_slnmlb_pitmlb_1/
./data/gid_2017_07_21_texmlb_tbamlb_1/
./data/gid_2017_07_25_houmlb_phimlb_1/
./data/gid_2017_10_19_lanmlb_chnmlb_1/
./data/gid_2017_07_14_nyamlb_bosmlb_1/
./data/gid_2017_06_19_houmlb_oakmlb_1/
./data/gid_2017_04_18_wasmlb_atlmlb_1/
./data/gid_2017_08_06_texmlb_minmlb_1/
./data/gid_2017_08_23_arimlb_nynmlb_1/
./data/gid_2017_06_11_miamlb_pitmlb_1/
./data/gid_2017_08_06_slnmlb_cinmlb_1/
./data/gid_2017_06_20_nynmlb_lanmlb_1/
./data/gid_2017_04_19_texmlb_oakmlb_1/
./data/gid_2017_06_06_chamlb_tbamlb_1/
./data/gid_2017_04_04_sdnmlb_lanmlb_1/
./data/gid_2017_08_15_atlmlb_colmlb_1/
./data/gid_2017_04_30_colmlb_arimlb_1/
./data/gid_2017_06_07_wasmlb_lanmlb_1/
./data/gid_2017_09_07_nyamlb_balmlb_1/
./data/gid_2017_07_05_miamlb_slnmlb_1/
./data/gid_2017_07_24_kca

./data/gid_2017_09_03_arimlb_colmlb_1/
./data/gid_2017_04_25_tbamlb_balmlb_1/
./data/gid_2017_09_02_cinmlb_pitmlb_1/
./data/gid_2017_06_28_kcamlb_detmlb_1/
./data/gid_2017_06_01_milmlb_nynmlb_1/
./data/gid_2017_06_26_nyamlb_chamlb_1/
./data/gid_2017_06_17_chamlb_tormlb_1/
./data/gid_2017_09_23_slnmlb_pitmlb_1/
./data/gid_2017_07_20_sdnmlb_sfnmlb_1/
./data/gid_2017_06_08_sfnmlb_milmlb_1/
./data/gid_2017_06_03_sfnmlb_phimlb_1/
./data/gid_2017_08_22_lanmlb_pitmlb_1/
./data/gid_2017_06_27_chnmlb_wasmlb_1/
./data/gid_2017_04_01_cifmin_cinmlb_1/
./data/gid_2017_06_12_nyamlb_anamlb_1/
./data/gid_2017_06_17_lanmlb_cinmlb_1/
./data/gid_2017_09_21_colmlb_sdnmlb_1/
./data/gid_2017_07_06_sdnmlb_clemlb_1/
./data/gid_2017_05_19_texmlb_detmlb_1/
./data/gid_2017_09_04_slnmlb_sdnmlb_1/
./data/gid_2017_07_09_sdnmlb_phimlb_1/
./data/gid_2017_07_09_miamlb_sfnmlb_1/
./data/gid_2017_04_06_miamlb_wasmlb_1/
./data/gid_2017_06_04_minmlb_anamlb_1/
./data/gid_2017_07_18_seamlb_houmlb_1/
./data/gid_2017_07_26_cin