In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import datetime
import pandas as pd
from mlb_data_utils import get_game_dict, get_ab_pitches, finish_up, close_con

Connecting to the PostgreSQL database...
PostgreSQL database version:
('PostgreSQL 9.6.6 on x86_64-pc-linux-gnu, compiled by gcc (Debian 4.7.2-5) 4.7.2, 64-bit',)


In [2]:
# Date window for the scrape, built directly as date objects.
start_date = datetime.date(2015, 2, 15)
end_date = datetime.date(2016, 11, 15)
# The individual components stay available as module-level names.
start_year, start_month, start_day = start_date.year, start_date.month, start_date.day
end_year, end_month, end_day = end_date.year, end_date.month, end_date.day
# Span between the two dates as a timedelta.
delta = end_date - start_date

In [3]:
# Root of the Gameday file tree; per-day paths are appended to this prefix.
base_url = "http://gd2.mlb.com/components/game/mlb/"

# Per-batch accumulators, each starting out as its own empty list.
game_dicts = []
ab_dfs = []
p_dfs = []
retro_fixes = []

# Running total of games processed so far.
game_count = 0

In [4]:
def _fetch_soup(url):
    """Download ``url`` and parse the response with BeautifulSoup's lxml parser.

    The HTTP response is closed as soon as parsing finishes, so connections
    are not left open over the course of a long scrape. Any exception raised
    while opening or parsing propagates to the caller.
    """
    with urlopen(url) as response:
        return BeautifulSoup(response, 'lxml')


# Walk every date from start_date through end_date (the range includes both
# endpoints), fetch that day's miniscoreboard, and for each listed game pull
# the inning-by-inning data plus the game metadata. Results are accumulated
# in the module-level lists and handed to finish_up() every 100 games.
# Problems with individual URLs are logged to error_messages.txt and skipped.
with open('error_messages.txt', 'w') as f_err:
    for i in range(delta.days + 1):
        active_date = (start_date + datetime.timedelta(days=i))
        day_url = '{}year_{}/month_{:02}/day_{:02}/'.format(base_url, active_date.year,
                                                            active_date.month, active_date.day)
        games_url = day_url + 'miniscoreboard.xml'
        # `except Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still stop the run instead of being logged as URL errors.
        try:
            game_soup = _fetch_soup(games_url).games
        except Exception:
            f_err.write('Error opening {}\n'.format(games_url))
            continue
        if game_soup is None:
            # The document parsed but contains no <games> element; calling
            # len() on None would raise TypeError and abort the whole scrape.
            f_err.write('No games element in {}\n'.format(games_url))
            continue
        if not len(game_soup):
            # <games> element has no children: nothing to process for this day.
            continue
        for game in game_soup.find_all('game'):
            game_id = game.attrs['gameday_link']
            # Skip games whose home and away sport codes differ.
            if game.attrs['home_sport_code'] != game.attrs['away_sport_code']:
                continue
            inning_url = day_url + 'gid_' + game_id + '/inning/inning_all.xml'
            try:
                innings = _fetch_soup(inning_url).game.find_all('inning')
            except Exception:
                # Covers download failures and a missing <game> element
                # (attribute access on None raises AttributeError).
                f_err.write('Error opening {}\n'.format(inning_url))
                continue
            # Require at least one inning whose first at-bat exists.
            # NOTE(review): the previous probe also called .find('pitch') on
            # the first at-bat, but a missing pitch returns None without
            # raising, so pitch-less at-bats were never filtered out. That
            # behaviour is preserved here; confirm whether such games should
            # be skipped as the log message suggests.
            if not innings or innings[0].find('atbat') is None:
                f_err.write('No AB or pitch info in {}\n'.format(inning_url))
                continue
            game_url = day_url + 'gid_' + game_id + '/game.xml'
            try:
                game_info = _fetch_soup(game_url)
            except Exception:
                f_err.write('Error opening {}\n'.format(game_url))
                # The game is still recorded; get_game_dict receives None here
                # (presumably it tolerates that -- verify in mlb_data_utils).
                game_info = None
            game_number = int(game_id[-1]) - 1
            if game_number:  # digit on end of link was 2 and is now 1
                game_number += 1  # set number back to 2
                retro_fixes.append(game_id[:-1] + '1')
                # store the game id for a first game that needs its retro game
                # id changed so that the last character can later be changed
                # from 0 to 1
            game_dict = get_game_dict(game_info, game_id)
            game_dict['retro_game_id'] = '{}{}{:02}{:02}{}'.format(game_dict['home_team_id'].upper(),
                                                                   active_date.year,
                                                                   active_date.month,
                                                                   active_date.day,
                                                                   game_number)
            game_dicts.append(game_dict)
            game_abs, game_pitches = get_ab_pitches(innings, game_id)
            ab_dfs += game_abs
            p_dfs += game_pitches
            game_count += 1
            if game_count % 100 == 0:
                # Hand the current batch to finish_up and start fresh lists.
                finish_up(ab_dfs, p_dfs, game_dicts, retro_fixes)
                game_dicts, ab_dfs, p_dfs, retro_fixes = [], [], [], []
                print('{} games processed and written'.format(game_count))

# Hand over whatever remains after the last full batch of 100, then close the
# connection via the project helper.
finish_up(ab_dfs, p_dfs, game_dicts, retro_fixes)
print('{} games processed and written\n'.format(game_count))
close_con()


100 games processed and written
200 games processed and written
300 games processed and written
400 games processed and written
500 games processed and written
600 games processed and written
700 games processed and written
800 games processed and written
900 games processed and written
1000 games processed and written
1100 games processed and written
1200 games processed and written
1300 games processed and written
1400 games processed and written
1500 games processed and written
1600 games processed and written
1700 games processed and written
1800 games processed and written
1900 games processed and written
2000 games processed and written
2100 games processed and written
2200 games processed and written
2300 games processed and written
2400 games processed and written
2500 games processed and written
2600 games processed and written
2700 games processed and written
2800 games processed and written
2900 games processed and written
3000 games processed and written
3100 games processe