In [1]:

from bs4 import BeautifulSoup
from pybaseball import playerid_lookup
import json
import lxml
import pandas as pd
import re, os
import requests
from time import sleep
from collections import OrderedDict, defaultdict

pd.set_option("display.max_columns", 1000)
dd = defaultdict(list)

# Scraping Each Player's Meanest Season
We're going to initialize a list of all of our players and then run each player through a scraper to pull a set of stats that describes the numbers we could expect out of them for a full, 162-game season.

For batters, this is easy. Here's the formula we can use for each stat to arrive at an expected value for that stat for a full 162-game season: 

```
Career games played / 162 = [[FACTOR]]
[[STAT]] / [[FACTOR]] = expectation
```

For pitchers, this is a little more complicated, because pitchers don't play every game. But our friends over at Baseball Reference **(BBREF)** have attempted to tackle this by treating each complete pitcher-season as:

```
games_pitched = X
games_started = X * 2
games_pitched + games_started = 68
```

So for a pitcher, either 34 starts or 68 relief appearances is worth one pitcher_season, and one pitcher_season is pegged to a value of 68. Their rationale is that, across history, full-time starters average to around 34 starts per year, and full-time relievers average about 68 relief appearances per year.

But some pitchers appear as both starters and relievers. Hence the need for normalization.

Thankfully, our friends at Baseball Reference have done all of these calculations for us, and they're available to view for free, with no login necessary.

Which mean we can scrape the hell out of it.

## Let's initialize a list of our players

In [2]:
pitchers = [
    'bartolo colon',
    'fernando rodney',
    'reyes moronta',
    'josé mijares',
    'tom healey',
    'jean machi',
    'fernando rodney',
    'angel castro',
    'edwar colina',
    'framber valdez',
    'andrew carignan',
    'keegan akin',
    'julio mateo',
    'enrique gonzalez',
]
batters = [
    'alejandro kirk',
    'brayan pena',
    'pablo sandoval',
    'tyler white',
    'josh phegley',
    'donovan solano',
    'alberto callaspo',
    'miguel tejada',
    'josh naylor',
    'dayán viciedo',
    'harold ramirez',
    'willians astudillo',
    'bob fothergill'
]


In order to scrape Baseball Reference, we'll need the unique ID that BBREF has given every player for whom they have a record.

Thankfully, the PyBaseball library has a module that'll dig out this ID for us if we feed it our player names:

```python

from pybaseball import playerid_lookup

playerid_lookup('last', 'first', fuzzy=True)['key_bbref'][0]

# e.g.:


>>> Bartolo Colon as ('colon', 'bartolo')

'colonba01'
```

But we can make ourselves a little loop that will take our list of player names, spin them around, run them through player_lookup, and return a cute little dataframe with everything we need.

And then we party.

In [3]:
names_and_ids = []
count = 0
for name in range(len(pitchers)):
    name = pitchers[count].split()
    name.append(playerid_lookup(name[1], name[0], fuzzy=True)["key_bbref"][0])
    names_and_ids.append(name)
    count += 1
    sleep(0.25)
huskies_p = pd.DataFrame(names_and_ids, columns=["first", "last", "key_bbref"])
huskies_p['pitcher'] = True

No identically matched names found! Returning the 5 most similar names.


In [4]:
names_and_ids = []
count = 0
for name in range(len(batters)):
    name = batters[count].split()
    name.append(playerid_lookup(name[1], name[0], fuzzy=True)["key_bbref"][0])
    names_and_ids.append(name)
    count += 1
    sleep(0.25)
huskies_b = pd.DataFrame(names_and_ids, columns=["first", "last", "key_bbref"])
huskies_b['pitcher'] = False

In [5]:
huskies = huskies_b.merge(huskies_p, how='outer')

## Now we run everything through our scraper

In [6]:
targets = []
count = 0
for player in range(len(huskies['key_bbref'])):
    id = huskies['key_bbref'][count]
    targets.append(id)
    count += 1

In [7]:
# Snippet from github user: BenKite
# https://github.com/BenKite/baseball_data/blob/master/baseballReferenceScrape.py

def findTables(url):
    res = requests.get(url)
    ## The next two lines get around the issue with comments breaking the parsing.
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')
    divs = soup.findAll('div', id = "content")
    divs = divs[0].findAll("div", id=re.compile("^all"))
    ids = []
    for div in divs:
        searchme = str(div.findAll("table"))
        x = searchme[searchme.find("id=") + 3: searchme.find(">")]
        x = x.replace("\"", "")
        if len(x) > 0:
            ids.append(x)
    return(ids)

In [8]:
# Snippet from github user: BenKite
# https://github.com/BenKite/baseball_data/blob/master/baseballReferenceScrape.py

def pullTable(url, tableID):
    res = requests.get(url)
    ## Work around comments
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')
    tables = soup.findAll('table', id = tableID)
    data_rows = tables[0].findAll('tr')
    data_header = tables[0].findAll('thead')
    data_header = data_header[0].findAll("tr")
    data_header = data_header[0].findAll("th")
    game_data = [[td.getText() for td in data_rows[i].findAll(['th','td'])]
        for i in range(len(data_rows))
        ]
    data = pd.DataFrame(game_data)
    header = []
    for i in range(len(data.columns)):
        header.append(data_header[i].getText())
    data.columns = header
    data = data.loc[data[header[0]] != header[0]]
    data = data.reset_index(drop = True)
    return(data)

In [9]:
root = "https://www.baseball-reference.com/players/"
suffix = ".shtml"

In [10]:
name = huskies['last'][10]
name

'ramirez'

In [55]:

count = 0
dicts = []
players = pd.DataFrame()
for target in range(len(targets)):
    target = targets[count]
    prepend = target[0] + "/"
    query = root + prepend + target + suffix
    print(count)
    if huskies.pitcher[count] == True:
        name = huskies['last'][count]
        pitching_standard = pullTable(query, "pitching_standard")
        print(type(pitching_standard))
        pitching_standard['last'] = huskies['last'][count]
        pitching_standard['index'] = count
        # pitching_standard['key_bbref'] = target
        pitching_standard = pitching_standard[pitching_standard['Year'] == '162 Game Avg.']
        # pitching_standard = pitching_standard.set_index(pitching_standard['index'])
        # print('after drop')
        # print(type(pitching_standard))
        # pitching_standard_keys = pitching_standard.columns.to_list()
        # pitching_standard_values = pitching_standard.values.tolist()[0]
        # pitching_standard = pitching_standard.set_index([count])
        # pitching_standard = pitching_standard.apply(lambda col: col.drop_duplicates().reset_index(drop=True))
        try:
            players.loc[len(players.index)] = player
        except:
            players = pitching_standard
        sleep(.25)
    else:
        print('nah')
        
    # else:
    #     batting_standard = pullTable(query, "batting_standard")
    #     sleep(1)
    #     batting_value = pullTable(query, "batting_value")
    #     batting_combined = batting_standard.append(batting_value)
    #     batting_combined = batting_combined.groupby('Year', as_index=False).first()
    #     batting_combined['first'] = huskies['first'][count]
    #     batting_combined['last'] = huskies['last'][count]
    #     batting_combined['key_bbref'] = target
    #     try:
    #         huskies_b.append(batting_combined)
    #     except:
    #         huskies_b = batting_combined
    count += 1
    sleep(.2)
players


0
nah
1
nah
2
nah
3
nah
4
nah
5
nah
6
nah
7
nah
8
nah
9
nah
10
nah
11
nah
12
nah
13
<class 'pandas.core.frame.DataFrame'>
14
<class 'pandas.core.frame.DataFrame'>
15
<class 'pandas.core.frame.DataFrame'>
16
<class 'pandas.core.frame.DataFrame'>
17
<class 'pandas.core.frame.DataFrame'>
18
<class 'pandas.core.frame.DataFrame'>
19
<class 'pandas.core.frame.DataFrame'>
20
<class 'pandas.core.frame.DataFrame'>
21
<class 'pandas.core.frame.DataFrame'>
22
<class 'pandas.core.frame.DataFrame'>
23
<class 'pandas.core.frame.DataFrame'>
24
<class 'pandas.core.frame.DataFrame'>
25
<class 'pandas.core.frame.DataFrame'>
26
<class 'pandas.core.frame.DataFrame'>


[                     Year        Age         Tm            Lg             W  \
 37          162 Game Avg.         15         11          .568          4.12   
 1   {19: '162 Game Avg.'}  {19: '4'}  {19: '7'}  {19: '.333'}  {19: '5.81'}   
 2   {19: '162 Game Avg.'}  {19: '4'}  {19: '7'}  {19: '.333'}  {19: '5.81'}   
 3   {19: '162 Game Avg.'}  {19: '4'}  {19: '7'}  {19: '.333'}  {19: '5.81'}   
 4   {19: '162 Game Avg.'}  {19: '4'}  {19: '7'}  {19: '.333'}  {19: '5.81'}   
 5   {19: '162 Game Avg.'}  {19: '4'}  {19: '7'}  {19: '.333'}  {19: '5.81'}   
 6   {19: '162 Game Avg.'}  {19: '4'}  {19: '7'}  {19: '.333'}  {19: '5.81'}   
 7   {19: '162 Game Avg.'}  {19: '4'}  {19: '7'}  {19: '.333'}  {19: '5.81'}   
 8   {19: '162 Game Avg.'}  {19: '4'}  {19: '7'}  {19: '.333'}  {19: '5.81'}   
 9   {19: '162 Game Avg.'}  {19: '4'}  {19: '7'}  {19: '.333'}  {19: '5.81'}   
 10  {19: '162 Game Avg.'}  {19: '4'}  {19: '7'}  {19: '.333'}  {19: '5.81'}   
 11  {19: '162 Game Avg.'}  {19: '4'}  {

In [56]:
players

Unnamed: 0,Year,Age,Tm,Lg,W,L,W-L%,ERA,G,GS,GF,CG,SHO,SV,IP,H,R,ER,HR,BB,IBB,SO,HBP,BK,WP,BF,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/W,Awards,last,index
37,162 Game Avg.,15,11,.568,4.12,34,34,0,2,1,0,211,219,105,96,27,58,3,154,4,0,3,892,106,4.15,1.312,9.3,1.1,2.5,6.6,2.67,,,,,colon,13
1,{19: '162 Game Avg.'},{19: '4'},{19: '7'},{19: '.333'},{19: '5.81'},{19: '51'},{19: '17'},{19: '23'},{19: '0'},{19: '0'},{19: '0'},{19: '140 '},{19: '149'},{19: '95'},{19: '90'},{19: '19'},{19: '59'},{19: '2'},{19: '78'},{19: '4'},{19: '0'},{19: '2'},{19: '612'},{19: '80'},{19: '5.08'},{19: '1.483'},{19: '9.6'},{19: '1.2'},{19: '3.8'},{19: '5.0'},{19: '1.33'},{19: ''},{19: None},{19: None},{19: None},{19: 'gonzalez'},{19: 26}
2,{19: '162 Game Avg.'},{19: '4'},{19: '7'},{19: '.333'},{19: '5.81'},{19: '51'},{19: '17'},{19: '23'},{19: '0'},{19: '0'},{19: '0'},{19: '140 '},{19: '149'},{19: '95'},{19: '90'},{19: '19'},{19: '59'},{19: '2'},{19: '78'},{19: '4'},{19: '0'},{19: '2'},{19: '612'},{19: '80'},{19: '5.08'},{19: '1.483'},{19: '9.6'},{19: '1.2'},{19: '3.8'},{19: '5.0'},{19: '1.33'},{19: ''},{19: None},{19: None},{19: None},{19: 'gonzalez'},{19: 26}
3,{19: '162 Game Avg.'},{19: '4'},{19: '7'},{19: '.333'},{19: '5.81'},{19: '51'},{19: '17'},{19: '23'},{19: '0'},{19: '0'},{19: '0'},{19: '140 '},{19: '149'},{19: '95'},{19: '90'},{19: '19'},{19: '59'},{19: '2'},{19: '78'},{19: '4'},{19: '0'},{19: '2'},{19: '612'},{19: '80'},{19: '5.08'},{19: '1.483'},{19: '9.6'},{19: '1.2'},{19: '3.8'},{19: '5.0'},{19: '1.33'},{19: ''},{19: None},{19: None},{19: None},{19: 'gonzalez'},{19: 26}
4,{19: '162 Game Avg.'},{19: '4'},{19: '7'},{19: '.333'},{19: '5.81'},{19: '51'},{19: '17'},{19: '23'},{19: '0'},{19: '0'},{19: '0'},{19: '140 '},{19: '149'},{19: '95'},{19: '90'},{19: '19'},{19: '59'},{19: '2'},{19: '78'},{19: '4'},{19: '0'},{19: '2'},{19: '612'},{19: '80'},{19: '5.08'},{19: '1.483'},{19: '9.6'},{19: '1.2'},{19: '3.8'},{19: '5.0'},{19: '1.33'},{19: ''},{19: None},{19: None},{19: None},{19: 'gonzalez'},{19: 26}
5,{19: '162 Game Avg.'},{19: '4'},{19: '7'},{19: '.333'},{19: '5.81'},{19: '51'},{19: '17'},{19: '23'},{19: '0'},{19: '0'},{19: '0'},{19: '140 '},{19: '149'},{19: '95'},{19: '90'},{19: '19'},{19: '59'},{19: '2'},{19: '78'},{19: '4'},{19: '0'},{19: '2'},{19: '612'},{19: '80'},{19: '5.08'},{19: '1.483'},{19: '9.6'},{19: '1.2'},{19: '3.8'},{19: '5.0'},{19: '1.33'},{19: ''},{19: None},{19: None},{19: None},{19: 'gonzalez'},{19: 26}
6,{19: '162 Game Avg.'},{19: '4'},{19: '7'},{19: '.333'},{19: '5.81'},{19: '51'},{19: '17'},{19: '23'},{19: '0'},{19: '0'},{19: '0'},{19: '140 '},{19: '149'},{19: '95'},{19: '90'},{19: '19'},{19: '59'},{19: '2'},{19: '78'},{19: '4'},{19: '0'},{19: '2'},{19: '612'},{19: '80'},{19: '5.08'},{19: '1.483'},{19: '9.6'},{19: '1.2'},{19: '3.8'},{19: '5.0'},{19: '1.33'},{19: ''},{19: None},{19: None},{19: None},{19: 'gonzalez'},{19: 26}
7,{19: '162 Game Avg.'},{19: '4'},{19: '7'},{19: '.333'},{19: '5.81'},{19: '51'},{19: '17'},{19: '23'},{19: '0'},{19: '0'},{19: '0'},{19: '140 '},{19: '149'},{19: '95'},{19: '90'},{19: '19'},{19: '59'},{19: '2'},{19: '78'},{19: '4'},{19: '0'},{19: '2'},{19: '612'},{19: '80'},{19: '5.08'},{19: '1.483'},{19: '9.6'},{19: '1.2'},{19: '3.8'},{19: '5.0'},{19: '1.33'},{19: ''},{19: None},{19: None},{19: None},{19: 'gonzalez'},{19: 26}
8,{19: '162 Game Avg.'},{19: '4'},{19: '7'},{19: '.333'},{19: '5.81'},{19: '51'},{19: '17'},{19: '23'},{19: '0'},{19: '0'},{19: '0'},{19: '140 '},{19: '149'},{19: '95'},{19: '90'},{19: '19'},{19: '59'},{19: '2'},{19: '78'},{19: '4'},{19: '0'},{19: '2'},{19: '612'},{19: '80'},{19: '5.08'},{19: '1.483'},{19: '9.6'},{19: '1.2'},{19: '3.8'},{19: '5.0'},{19: '1.33'},{19: ''},{19: None},{19: None},{19: None},{19: 'gonzalez'},{19: 26}
9,{19: '162 Game Avg.'},{19: '4'},{19: '7'},{19: '.333'},{19: '5.81'},{19: '51'},{19: '17'},{19: '23'},{19: '0'},{19: '0'},{19: '0'},{19: '140 '},{19: '149'},{19: '95'},{19: '90'},{19: '19'},{19: '59'},{19: '2'},{19: '78'},{19: '4'},{19: '0'},{19: '2'},{19: '612'},{19: '80'},{19: '5.08'},{19: '1.483'},{19: '9.6'},{19: '1.2'},{19: '3.8'},{19: '5.0'},{19: '1.33'},{19: ''},{19: None},{19: None},{19: None},{19: 'gonzalez'},{19: 26}


In [None]:
for target in range(len(targets)):
    target = targets[count]
    prepend = target[0] + "/"
    query = root + prepend + target + suffix
    print(count)
    
    if huskies.pitcher[count] == True:
        pitching_standard = pullTable(query, "pitching_standard")
        pitching_standard = pitching_standard.drop(pitching_standard.index[:-1])
        sleep(.2)
        pitching_value = pullTable(query, "pitching_value")
        pitching_value = pitching_value.drop(pitching_value.index[:-1])
        pitching_combined = pitching_standard.append(pitching_value)
        pitching_combined = pitching_combined.groupby('Year', as_index=False).first()
        pitching_combined['first'] = huskies['first'][count]
        pitching_combined['last'] = huskies['last'][count]
        pitching_combined['key_bbref'] = target
        huskies_p.append(pitching_combined)
    else:
        print('nah')
        
    # else:
    #     batting_standard = pullTable(query, "batting_standard")
    #     sleep(1)
    #     batting_value = pullTable(query, "batting_value")
    #     batting_combined = batting_standard.append(batting_value)
    #     batting_combined = batting_combined.groupby('Year', as_index=False).first()
    #     batting_combined['first'] = huskies['first'][count]
    #     batting_combined['last'] = huskies['last'][count]
    #     batting_combined['key_bbref'] = target
    #     try:
    #         huskies_b.append(batting_combined)
    #     except:
    #         huskies_b = batting_combined

    sleep(.2)
    count += 1


In [None]:
print(huskies_p)

Empty DataFrame
Columns: []
Index: []


In [None]:
root = "https://www.baseball-reference.com/players/"
suffix = ".shtml"
count = 0
for target in range(len(targets)):
    target = targets[count]
    prepend = target[0] + "/"
    query = root + prepend + target + suffix
    print(query)
    response = requests.get(query)
    doc = BeautifulSoup(response.text)
    if huskies.pitcher[count] == True:
        table_a = doc.select("#pitching_standard > tfoot > tr:nth-child(2) .right")
        table_b = doc.select("#pitching_value > tfoot > tr:nth-child(2) .right")

        sleep(0.25)
        ##
        metrics_a = []
        metrics_b = []
        vals_a = []
        vals_b = []
        if table_a:
            for row in range(len(table_a)):
                metrics_a.append(table_a[row]['data-stat'])
                vals_a.append(table_a[row].text)

        else:
            print("couldn't find table a")
        sleep(1)
        if table_b:
            for row in range(len(table_b)):
                metrics_b.append(table_b[row]['data-stat'])
                vals_b.append(table_b[row].text)
            else:
                print("couldn't find table_b")
            # metrics.append(page_content[row]['data-stat'])
            # vals.append(page_content[row].text)
        # print(metrics)
        # print(vals)
        # for row in range(len(table_b)):
        #     metric = []
        #     val = []
        #     stat = []
        #     metrics_b.append(table_b[row]['data-stat'])
        #     vals_b.append(table_b[row].text)
        #     # metrics.append(page_content[row]['data-stat'])
        #     # vals.append(page_content[row].text)
        print("---")
        print(metrics_a)
        print(vals_a)
        print(metrics_b)
        print(vals_b)
    else:
        print('nah')


    # stats = pd.DataFrame({
    #     'metrics': metrics,
    #     'values': vals
    # })
    # sleep(.5)
    # count += 1
    # stats.set_index('metrics').T
    count += 1


https://www.baseball-reference.com/players/k/kirkal01.shtml
nah
https://www.baseball-reference.com/players/p/penabr01.shtml
nah
https://www.baseball-reference.com/players/s/sandopa01.shtml
nah
https://www.baseball-reference.com/players/w/whitety01.shtml
nah
https://www.baseball-reference.com/players/p/phegljo01.shtml
nah
https://www.baseball-reference.com/players/s/solando01.shtml
nah
https://www.baseball-reference.com/players/c/callaal01.shtml
nah
https://www.baseball-reference.com/players/t/tejadmi01.shtml
nah
https://www.baseball-reference.com/players/n/naylojo01.shtml
nah
https://www.baseball-reference.com/players/v/vicieda01.shtml
nah
https://www.baseball-reference.com/players/r/ramirha02.shtml
nah
https://www.baseball-reference.com/players/a/astudwi01.shtml
nah
https://www.baseball-reference.com/players/f/fothebo01.shtml
nah
https://www.baseball-reference.com/players/c/colonba01.shtml
---
['W', 'L', 'win_loss_perc', 'earned_run_avg', 'G', 'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H',

KeyboardInterrupt: 

In [None]:
stats

Unnamed: 0,metrics,values


In [None]:
metrics = []
vals = []
for row in range(len(page_content)):
    metric = []
    val = []
    stat = []
    metrics.append(page_content[row]['data-stat'])
    vals.append(page_content[row].text)

stats = pd.DataFrame({
    'metrics': metrics,
    'values': vals
})

stats.set_index('metrics').T

metrics,W,L,win_loss_perc,earned_run_avg,G,GS,GF,CG,SHO,SV,...,WP,batters_faced,earned_run_avg_plus,fip,whip,hits_per_nine,home_runs_per_nine,bases_on_balls_per_nine,strikeouts_per_nine,strikeouts_per_base_on_balls
values,15,11,0.568,4.12,34,34,0,2,1,0,...,3,892,106,4.15,1.312,9.3,1.1,2.5,6.6,2.67
