# Download

Baseball data download routines

In [1]:
import os
import requests
import collections
import pandas as pd
from bs4 import Comment
from bs4 import BeautifulSoup
from datetime import date

### Baseball Reference lineups

In [16]:
# Seasons 1908 through 2018 inclusive (range() excludes its stop value).
year_list = range(1908, 2018 + 1)

In [17]:
def parse_batter(cell):
    """Return the label for one batting-order cell.

    Args:
        cell: Object exposing an ``a`` element with ``data-entry-id`` and
            ``title`` attributes, and a ``small`` element whose text has the
            position code after the first "-".

    Returns:
        The string ``'pitcher'`` when the position code is ``'P'``;
        otherwise the last whitespace-separated word of the ``title``.
    """
    link = cell.a
    # Not part of the result, but still looked up so that a link missing the
    # attribute fails the same way it always has.
    _entry_id = link['data-entry-id']
    full_name = link['title']
    position = cell.small.text.split("-")[1]
    if position == 'P':
        return 'pitcher'
    return full_name.split()[-1]

In [18]:
def parse_game(game):
    """Turn one table row into an ordered record of its batting order.

    Args:
        game: A parsed row; its ``<th>`` cells followed by its ``<td>`` cells
            are treated as one sequence.

    Returns:
        An ``OrderedDict`` with ``number`` (the ``name`` attribute of the
        first cell's link) followed by ``one`` .. ``nine``, each the
        ``parse_batter`` result for cells 1 through 9.
    """
    cells = game.find_all("th") + game.find_all("td")
    record = collections.OrderedDict()
    record['number'] = cells[0].a['name']
    slot_names = (
        'one', 'two', 'three', 'four', 'five',
        'six', 'seven', 'eight', 'nine',
    )
    # Cell 0 holds the game link, so batting slots start at cell 1.
    for position, slot in enumerate(slot_names, start=1):
        record[slot] = parse_batter(cells[position])
    return record

In [34]:
def download_lineup_year(year):
    """Scrape one season of CHC batting orders from Baseball Reference.

    Args:
        year: Season to request; substituted into the batting-orders URL.

    Returns:
        A list with one ``parse_game`` result per ``<tr>`` in the body of
        the page's ``grid_table`` table.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        ValueError: If the page contains no ``grid_table`` table body.
    """
    url = 'https://www.baseball-reference.com/teams/CHC/{}-batting-orders.shtml'.format(year)
    r = requests.get(url)
    # Fail on 4xx/5xx instead of trying to scrape an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    table = soup.find("table", class_="grid_table")
    # A missing table used to surface as an AttributeError on None; say what
    # is actually wrong and where.
    if table is None or table.tbody is None:
        raise ValueError("No grid_table found at {}".format(url))
    return [parse_game(game) for game in table.tbody.find_all("tr")]

In [37]:
# Fetch every season that is not already saved under ./input; files that
# exist are left untouched.
for year in year_list:
    path = "./input/CHC-batting-orders-{}.csv".format(year)
    if not os.path.exists(path):
        print("Downloading {}".format(year))
        pd.DataFrame(download_lineup_year(year)).to_csv(
            path,
            index=False,
            encoding="utf-8",
        )

In [42]:
# Read each season's CSV, tag its rows with the season, and write the
# combined table with the columns in a fixed order.
df_list = [
    pd.read_csv("./input/CHC-batting-orders-{}.csv".format(year)).assign(year=year)
    for year in year_list
]
df = pd.concat(df_list, sort=True)
output_columns = ['year', 'number']
output_columns += ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
df[output_columns].to_csv(
    "./input/CHC_batting_orders_all.csv",
    index=False,
    encoding="utf-8",
)

### Baseball Reference WAR archive

For pitchers

In [2]:
!curl -o ./input/war_daily_pitch.txt https://www.baseball-reference.com/data/war_daily_pitch.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 11.2M  100 11.2M    0     0   897k      0  0:00:12  0:00:12 --:--:-- 1883k


For hitters

In [4]:
!curl -o ./input/war_daily_bat.txt https://www.baseball-reference.com/data/war_daily_bat.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 27.8M  100 27.8M    0     0   356k      0  0:01:19  0:01:19 --:--:--  635k


### Baseball Reference pitchers by year

In [15]:
# Seasons 1871 through 2018 inclusive (range() excludes its stop value).
year_list = range(1871, 2018 + 1)

In [4]:
def get_standard_player_pitching(year):
    """Scrape the MLB standard pitching table for one season.

    Args:
        year: Season to request; substituted into the standard-pitching URL
            and stored on every returned record as ``year_ID``.

    Returns:
        A list of ``OrderedDict`` records, one per player row, holding
        ``year_ID``, ``player_ID``, ``name_common`` and then one entry per
        remaining cell keyed by that cell's ``data-stat`` attribute.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    url = "https://www.baseball-reference.com/leagues/MLB/{}-standard-pitching.shtml".format(year)
    r = requests.get(url)
    # Fail on 4xx/5xx instead of trying to scrape an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    # The table markup is read out of an HTML comment and parsed separately.
    # NOTE(review): index 16 is positional and will break if the page gains
    # or loses a comment — confirm against the live page.
    table = BeautifulSoup(comments[16], 'html.parser')
    players = []
    # The first row is skipped.
    for row in table.find_all("tr")[1:]:
        # Skip repeated header rows. A row with no class attribute yields
        # None from .get(), which used to raise TypeError on the `in` test.
        if 'thead' in (row.attrs.get("class") or []):
            continue
        # Get all the cells
        cells = row.find_all("td")
        # A row without data cells has no player to record.
        if not cells:
            continue
        # If it's a footer, skip it
        if cells[0]['csk'] == 'ZZZZZZ':
            continue
        d = collections.OrderedDict((
            ('year_ID', year),
            ('player_ID', cells[0]["data-append-csv"]),
            ('name_common', cells[0].a.string),
        ))
        for stat in cells[1:]:
            d[stat['data-stat']] = stat.string
        players.append(d)
    return players

In [7]:
# Fetch every season that is not already saved under ./input; files that
# exist are left untouched.
for year in year_list:
    path = "./input/standard_player_pitching_stats_{}.csv".format(year)
    if not os.path.exists(path):
        print("Downloading {}".format(year))
        pd.DataFrame(get_standard_player_pitching(year)).to_csv(
            path,
            index=False,
            encoding="utf-8",
        )

In [11]:
# Stack every per-season pitching CSV into a single file.
df_list = [
    pd.read_csv("./input/standard_player_pitching_stats_{}.csv".format(year))
    for year in year_list
]
df = pd.concat(df_list, sort=True)
df.to_csv(
    "./input/standard_player_pitching_stats_all.csv",
    index=False,
    encoding="utf-8",
)

### Baseball Reference batters by year

In [1]:
def get_standard_player_batting(year):
    """Scrape the MLB standard batting table for one season.

    Args:
        year: Season to request; substituted into the standard-batting URL
            and stored on every returned record as ``year_ID``.

    Returns:
        A list of ``OrderedDict`` records, one per player row, holding
        ``year_ID``, ``player_ID``, ``name_common`` and then one entry per
        remaining cell keyed by that cell's ``data-stat`` attribute.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    url = "https://www.baseball-reference.com/leagues/MLB/{}-standard-batting.shtml".format(year)
    r = requests.get(url)
    # Fail on 4xx/5xx instead of trying to scrape an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    # The table markup is read out of an HTML comment and parsed separately.
    # NOTE(review): index 16 is positional and will break if the page gains
    # or loses a comment — confirm against the live page.
    table = BeautifulSoup(comments[16], 'html.parser')
    players = []
    # The first row is skipped.
    for row in table.find_all("tr")[1:]:
        # Skip repeated header rows. A row with no class attribute yields
        # None from .get(), which used to raise TypeError on the `in` test.
        if 'thead' in (row.attrs.get("class") or []):
            continue
        # Get all the cells
        cells = row.find_all("td")
        # A row without data cells has no player to record.
        if not cells:
            continue
        # If it's a footer, skip it
        if cells[0]['csk'] == 'ZZZZZZ':
            continue
        d = collections.OrderedDict((
            ('year_ID', year),
            ('player_ID', cells[0]["data-append-csv"]),
            ('name_common', cells[0].a.string),
        ))
        for stat in cells[1:]:
            d[stat['data-stat']] = stat.string
        players.append(d)
    return players

In [7]:
# Fetch every season that is not already saved under ./input; files that
# exist are left untouched.
for year in year_list:
    path = "./input/standard_player_batting_stats_{}.csv".format(year)
    if not os.path.exists(path):
        print("Downloading {}".format(year))
        pd.DataFrame(get_standard_player_batting(year)).to_csv(
            path,
            index=False,
            encoding="utf-8",
        )

In [16]:
# Stack every per-season batting CSV into a single file.
df_list = [
    pd.read_csv("./input/standard_player_batting_stats_{}.csv".format(year))
    for year in year_list
]
df = pd.concat(df_list, sort=True)
df.to_csv(
    "./input/standard_player_batting_stats_all.csv",
    index=False,
    encoding="utf-8",
)

### Baseball Reference batter game log

In [9]:
def get_batter_gamelog(player_id, year, force=False):
    """Download one batter's game log for a season and cache it as a CSV.

    Args:
        player_id: Player id placed in the ``id`` query parameter and in the
            output file name.
        year: Season placed in the ``year`` query parameter and in the
            output file name.
        force: When true, download and overwrite even if the CSV exists.

    Returns:
        The path of the CSV file, whether it was just written or was already
        present.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        ValueError: If the page has no ``batting_gamelogs`` table.
    """
    path = "./input/player_{}_batting_game_log_{}.csv".format(player_id, year)
    # Cache hit: return the existing path (this used to return None) so a
    # caller can read the file the same way as after a fresh download.
    if os.path.exists(path) and not force:
        return path
    url_template = "https://www.baseball-reference.com/players/gl.fcgi?id={}&t=b&year={}"
    url = url_template.format(player_id, year)
    print("- Requesting {}".format(url))
    r = requests.get(url)
    # Fail on 4xx/5xx instead of trying to scrape an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    table = soup.find('table', id='batting_gamelogs')
    if table is None:
        raise ValueError("No batting_gamelogs table found at {}".format(url))
    # Game rows are the <tr> elements whose id starts with the table's id.
    game_list = table.find_all('tr', id=lambda x: x and x.startswith('batting_gamelogs'))
    print("- Scraping {} games".format(len(game_list)))
    stat_list = []
    for game in game_list:
        d = collections.OrderedDict()
        # The first <td> of each row is skipped.
        for stat in game.find_all("td")[1:]:
            # Prefer the cell's `csk` attribute over its display text when
            # one is present (presumably the site's sort key — confirm).
            if 'csk' in stat.attrs:
                d[stat['data-stat']] = stat['csk']
            else:
                d[stat['data-stat']] = stat.string
        stat_list.append(d)
    df = pd.DataFrame(stat_list)
    print("- Writing {} rows".format(len(df)))
    df.to_csv(path, index=False, encoding="utf-8")
    return path

In [10]:
# Player-season pairs; each entry is a [player id, season year] list.
the2550club = [
    ["cedence01", 1973],
    ["morgajo02", 1973],
    ["cedence01", 1974],
    ["morgajo02", 1976],
    ["sandbry01", 1985],
    ["daviser01", 1986],
    ["henderi01", 1986],
    ["daviser01", 1987],
    ["bondsba01", 1990],
    ["henderi01", 1990],
    ["ramirha01", 2007],
]

In [15]:
# Re-download each player-season game log, label its rows with the player
# and season, and write them all to one CSV.
df_list = []
for player_id, year in the2550club:
    # force=True refreshes the file even when a copy already exists.
    path = get_batter_gamelog(player_id, year, force=True)
    df_list.append(
        pd.read_csv(path).assign(player_ID=player_id, year_ID=year)
    )
pd.concat(df_list).to_csv(
    "./input/the_2550_club_gamelogs.csv",
    index=False,
    encoding="utf-8",
)

- Requesting https://www.baseball-reference.com/players/gl.fcgi?id=cedence01&t=b&year=1973
- Scraping 139 games
- Writing 139 rows
- Requesting https://www.baseball-reference.com/players/gl.fcgi?id=morgajo02&t=b&year=1973
- Scraping 157 games
- Writing 157 rows
- Requesting https://www.baseball-reference.com/players/gl.fcgi?id=cedence01&t=b&year=1974
- Scraping 160 games
- Writing 160 rows
- Requesting https://www.baseball-reference.com/players/gl.fcgi?id=morgajo02&t=b&year=1976
- Scraping 141 games
- Writing 141 rows
- Requesting https://www.baseball-reference.com/players/gl.fcgi?id=sandbry01&t=b&year=1985
- Scraping 153 games
- Writing 153 rows
- Requesting https://www.baseball-reference.com/players/gl.fcgi?id=daviser01&t=b&year=1986
- Scraping 132 games
- Writing 132 rows
- Requesting https://www.baseball-reference.com/players/gl.fcgi?id=henderi01&t=b&year=1986
- Scraping 153 games
- Writing 153 rows
- Requesting https://www.baseball-reference.com/players/gl.fcgi?id=daviser01&t=b&ye

### FanGraphs starter pitch value

In [None]:
def download(year):
    """Request a FanGraphs leaderboard page and parse its HTML.

    Args:
        year: Value substituted into the ``season1`` query parameter.

    Raises:
        Whatever ``requests.get`` raises on a connection failure.

    NOTE(review): ``season`` is fixed at 2018 in the URL while ``season1``
    varies with ``year`` — confirm that is intended. The parsed page is not
    used or returned by the code visible here.
    """
    url_template = "https://www.fangraphs.com/leaders.aspx?pos=all&stats=sta&lg=all&qual=y&type=7&season=2018&month=0&season1={}&ind=0&team=0&rost=0&age=0&filter=&players=0&page=1_50"
    url = url_template.format(year)
    # Was `r = r.request(url)`, which read `r` before it was assigned and
    # raised UnboundLocalError on every call.
    r = requests.get(url)
    # Name the parser explicitly, matching the other scrapers in this file,
    # so the result does not depend on which parsers are installed.
    soup = BeautifulSoup(r.text, 'html.parser')
    