# Download

Baseball data download routines

In [2]:
import os
import requests
import collections
import pandas as pd
from bs4 import Comment
from bs4 import BeautifulSoup
from datetime import date

### Baseball Reference lineups

In [2]:
# Baseball Reference batting-orders page for team code CHC, season 2018.
url = 'https://www.baseball-reference.com/teams/CHC/2018-batting-orders.shtml'

In [3]:
# Fetch the page. The response status is not checked here.
r = requests.get(url)

In [4]:
# Parse the response body as HTML.
soup = BeautifulSoup(r.text, 'html.parser')

In [9]:
# Look the lineup grid up by a hard-coded element id.
# NOTE(review): the id looks page-specific; confirm it before reusing this
# cell for another team or season.
table = soup.find("table", id="grid_table_235551")

In [26]:
# Every <tr> inside the table body; each one is later passed to parse_game.
game_list = table.tbody.find_all("tr")

In [45]:
def parse_batter(cell):
    """Return the lineup label for one batting-order cell.

    Args:
        cell: Parsed table cell. Its ``a`` child carries the player's full
            name in the ``title`` attribute, and its ``small`` child holds
            text with the fielding position after a hyphen.

    Returns:
        The string ``'pitcher'`` when the position is ``'P'``; otherwise the
        last whitespace-separated word of the player's name.

    Raises:
        IndexError: If the ``small`` text contains no hyphen, or the name
            is empty.
    """
    # Only the position and the name decide the result, so the unused
    # 'data-entry-id' attribute is not read: a cell lacking it still parses.
    position = cell.small.text.split("-")[1]
    if position == 'P':
        return 'pitcher'
    return cell.a['title'].split()[-1]

In [46]:
def parse_game(game):
    """Convert one row of the lineup table into an ordered record.

    Args:
        game: Parsed table row. Its ``th`` cells are taken first, followed
            by its ``td`` cells.

    Returns:
        A ``collections.OrderedDict`` whose ``number`` key holds the
        ``name`` attribute of the first cell's anchor, followed by the keys
        ``one`` through ``nine`` holding ``parse_batter`` results for cells
        1 to 9.

    Raises:
        IndexError: If the row has fewer than ten cells.
    """
    cells = game.find_all("th") + game.find_all("td")
    slot_names = (
        'one', 'two', 'three', 'four', 'five',
        'six', 'seven', 'eight', 'nine',
    )
    record = collections.OrderedDict()
    record['number'] = cells[0].a['name']
    # Index explicitly so a short row still raises instead of being truncated.
    for offset, slot in enumerate(slot_names, start=1):
        record[slot] = parse_batter(cells[offset])
    return record

In [47]:
# Parse every row of the lineup table into an ordered record.
dict_list = [parse_game(game) for game in game_list]

In [48]:
# One DataFrame row per parsed game record.
game_df = pd.DataFrame(dict_list)

In [51]:
# Write the game number followed by the nine lineup slots, in batting order.
lineup_columns = [
    'number',
    'one', 'two', 'three',
    'four', 'five', 'six',
    'seven', 'eight', 'nine',
]
game_df[lineup_columns].to_csv(
    "./input/cubs_2018_lineup.csv",
    index=False,
    encoding="utf-8",
)

### Baseball Reference WAR archive

For pitchers

In [2]:
# Save Baseball Reference's war_daily_pitch.txt file under ./input/.
!curl -o ./input/war_daily_pitch.txt https://www.baseball-reference.com/data/war_daily_pitch.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 11.2M  100 11.2M    0     0   897k      0  0:00:12  0:00:12 --:--:-- 1883k


For hitters

In [4]:
# Save Baseball Reference's war_daily_bat.txt file under ./input/.
!curl -o ./input/war_daily_bat.txt https://www.baseball-reference.com/data/war_daily_bat.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 27.8M  100 27.8M    0     0   356k      0  0:01:19  0:01:19 --:--:--  635k


### Baseball Reference pitchers by year

In [4]:
def get_standard_player_pitching(year):
    """Scrape the league-wide standard pitching table for one season.

    Args:
        year: Season inserted into the Baseball Reference URL.

    Returns:
        A list of ``collections.OrderedDict`` records, one per player row.
        Each record starts with ``year_ID``, ``player_ID`` and
        ``name_common``, followed by one entry per remaining cell, keyed by
        the cell's ``data-stat`` attribute and holding the cell's
        ``.string`` value.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        IndexError: If the page holds fewer than 17 HTML comments.
    """
    url = "https://www.baseball-reference.com/leagues/MLB/{}-standard-pitching.shtml".format(year)
    r = requests.get(url)
    # Fail here on an error response rather than parsing an error page,
    # whose output a caller could otherwise save as if it were real data.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    # NOTE(review): the table is read from the 17th HTML comment on the
    # page; this index depends on the page layout, so confirm it still holds.
    table = BeautifulSoup(comments[16], 'html.parser')
    players = []
    for row in table.find_all("tr")[1:]:
        # Skip repeated header rows. A row without a class attribute has no
        # 'thead' class, so default to an empty tuple instead of None.
        if 'thead' in row.attrs.get("class", ()):
            continue
        # Get all the data cells
        cells = row.find_all("td")
        # A row with no data cells cannot describe a player
        if not cells:
            continue
        # If it's a footer, skip it
        if cells[0]['csk'] == 'ZZZZZZ':
            continue
        d = collections.OrderedDict((
            ('year_ID', year),
            ('player_ID', cells[0]["data-append-csv"]),
            ('name_common', cells[0].a.string),
        ))
        for stat in cells[1:]:
            d[stat['data-stat']] = stat.string
        players.append(d)
    return players

In [15]:
# Seasons to process: 1871 through 2018 (the end of the range is exclusive).
year_list = range(1871, 2019)

In [7]:
# Fetch each season's pitching table once; a season whose CSV already exists
# on disk is left untouched.
for year in year_list:
    path = "./input/standard_player_pitching_stats_{}.csv".format(year)
    if not os.path.exists(path):
        print("Downloading {}".format(year))
        df = pd.DataFrame(get_standard_player_pitching(year))
        df.to_csv(path, index=False, encoding="utf-8")

In [11]:
# Load every per-season pitching CSV and stack them into a single file.
df_list = [
    pd.read_csv("./input/standard_player_pitching_stats_{}.csv".format(year))
    for year in year_list
]
# sort=True sorts the column axis when the seasons' columns are not aligned.
df = pd.concat(df_list, sort=True)
df.to_csv(
    "./input/standard_player_pitching_stats_all.csv",
    index=False,
    encoding="utf-8",
)

### Baseball Reference batters by year

In [1]:
def get_standard_player_batting(year):
    """Scrape the league-wide standard batting table for one season.

    Args:
        year: Season inserted into the Baseball Reference URL.

    Returns:
        A list of ``collections.OrderedDict`` records, one per player row.
        Each record starts with ``year_ID``, ``player_ID`` and
        ``name_common``, followed by one entry per remaining cell, keyed by
        the cell's ``data-stat`` attribute and holding the cell's
        ``.string`` value.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        IndexError: If the page holds fewer than 17 HTML comments.
    """
    url = "https://www.baseball-reference.com/leagues/MLB/{}-standard-batting.shtml".format(year)
    r = requests.get(url)
    # Fail here on an error response rather than parsing an error page,
    # whose output a caller could otherwise save as if it were real data.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    # NOTE(review): the table is read from the 17th HTML comment on the
    # page; this index depends on the page layout, so confirm it still holds.
    table = BeautifulSoup(comments[16], 'html.parser')
    players = []
    for row in table.find_all("tr")[1:]:
        # Skip repeated header rows. A row without a class attribute has no
        # 'thead' class, so default to an empty tuple instead of None.
        if 'thead' in row.attrs.get("class", ()):
            continue
        # Get all the data cells
        cells = row.find_all("td")
        # A row with no data cells cannot describe a player
        if not cells:
            continue
        # If it's a footer, skip it
        if cells[0]['csk'] == 'ZZZZZZ':
            continue
        d = collections.OrderedDict((
            ('year_ID', year),
            ('player_ID', cells[0]["data-append-csv"]),
            ('name_common', cells[0].a.string),
        ))
        for stat in cells[1:]:
            d[stat['data-stat']] = stat.string
        players.append(d)
    return players

In [7]:
# Fetch each season's batting table once; a season whose CSV already exists
# on disk is left untouched.
for year in year_list:
    path = "./input/standard_player_batting_stats_{}.csv".format(year)
    if not os.path.exists(path):
        print("Downloading {}".format(year))
        df = pd.DataFrame(get_standard_player_batting(year))
        df.to_csv(path, index=False, encoding="utf-8")

In [16]:
# Load every per-season batting CSV and stack them into a single file.
df_list = [
    pd.read_csv("./input/standard_player_batting_stats_{}.csv".format(year))
    for year in year_list
]
# sort=True sorts the column axis when the seasons' columns are not aligned.
df = pd.concat(df_list, sort=True)
df.to_csv(
    "./input/standard_player_batting_stats_all.csv",
    index=False,
    encoding="utf-8",
)