In [468]:
from bs4 import BeautifulSoup
from pybaseball import playerid_lookup
import json
import lxml
import pandas as pd
import re
import requests
from time import sleep

# Scraping Each Player's Meanest Season
We're going to initialize a list of all of our players and then run each player through a scraper to pull a set of stats that describes the numbers we could expect out of them for a full, 162-game season.

For batters, this is easy. Here's the formula we can use for each stat to arrive at an expected value for that stat for a full 162-game season: 

```
Career games played / 162 = [[FACTOR]]
[[STAT]] / [[FACTOR]] = expectation
```

For pitchers, this is a little more complicated, because pitchers don't play every game. But our friends over at Baseball Reference **(BBREF)** have attempted to tackle this by treating each complete pitcher-season as:

```
games_pitched = X
games_started = X * 2
games_pitched + games_started = 68
```

So for a pitcher, either 34 starts or 68 relief appearances is worth one pitcher_season, and one pitcher_season is pegged to a value of 68. Their rationale is that, across history, full-time starters average to around 34 starts per year, and full-time relievers average about 68 relief appearances per year.

But some pitchers appear as both starters and relievers. Hence the need for normalization.

Thankfully, our friends at Baseball Reference have done all of these calculations for us, and they're available to view for free, with no login necessary.

Which mean we can scrape the hell out of it.

## Let's initialize a list of our players

In [469]:
names = [
    'bartolo colon',
    'fernando rodney',
    'reyes moronta',
    'josé mijares',
    'tom healey',
    'jean machi',
    'fernando rodney',
    'angel castro',
    'edwar colina',
    'framber valdez',
    'andrew carignan',
    'keegan akin',
    'julio mateo',
    'enrique gonzalez',
    'alejandro kirk',
    'brayan pena',
    'pablo sandoval',
    'tyler white',
    'josh phegley',
    'donovan solano',
    'alberto callaspo',
    'miguel tejada',
    'josh naylor',
    'dayán viciedo',
    'harold ramirez',
    'willians astudillo',
    'bob fothergill'
]


In order to scrape Baseball Reference, we'll need the unique ID that BBREF has given every player for whom they have a record.

Thankfully, the PyBaseball library has a module that'll dig out this ID for us if we feed it our player names:

```python

from pybaseball import playerid_lookup

playerid_lookup('last', 'first', fuzzy=True)['key_bbref'][0]

# e.g.:


>>> Bartolo Colon as ('colon', 'bartolo')

'colonba01'
```

But we can make ourselves a little loop that will take our list of player names, spin them around, run them through player_lookup, and return a cute little dataframe with everything we need.

And then we party.

In [470]:
names_and_ids = []
count = 0
for name in range(len(names)):
    name = names[count].split()
    name.append(playerid_lookup(name[1], name[0], fuzzy=True)["key_bbref"][0])
    names_and_ids.append(name)
    count += 1
    sleep(0.25)
huskies = pd.DataFrame(names_and_ids, columns=["first", "last", "key_bbref"])

No identically matched names found! Returning the 5 most similar names.


## Now we run everything through our scraper

In [474]:
huskies['key_bbref']

0     colonba01
1     rodnefe01
2     moronre01
3     mijarjo01
4     healeto01
5     machije01
6     rodnefe01
7     castran01
8     colined01
9     valdefr01
10    carigan01
11     akinke01
12    mateoju01
13    gonzaen01
14     kirkal01
15     penabr01
16    sandopa01
17    whitety01
18    phegljo01
19    solando01
20    callaal01
21    tejadmi01
22    naylojo01
23    vicieda01
24    ramirha02
25    astudwi01
26    fothebo01
Name: key_bbref, dtype: object

In [476]:
targets = []
count = 0
for player in range(len(huskies['key_bbref'])):
    id = huskies['key_bbref'][count]
    targets.append(id)
    count += 1

colonba01
rodnefe01
moronre01
mijarjo01
healeto01
machije01
rodnefe01
castran01
colined01
valdefr01
carigan01
akinke01
mateoju01
gonzaen01
kirkal01
penabr01
sandopa01
whitety01
phegljo01
solando01
callaal01
tejadmi01
naylojo01
vicieda01
ramirha02
astudwi01
fothebo01


In [490]:
root = "https://www.baseball-reference.com/players/"
suffix = ".shtml"
count = 0
for target in range(len(targets)):
    target = targets[count]
    query = root + target + suffix
    print(query)
    response = requests.get(query)
    doc = BeautifulSoup(response.text)
    page_content = doc.select("#pitching_standard > tfoot > tr:nth-child(2) .right")
    # sleep(0.5)
    # ##
    
    # metrics = []
    # vals = []
    # for row in range(len(page_content)):
    #     metric = []
    #     val = []
    #     stat = []
    #     metrics.append(page_content[row]['data-stat'])
    #     vals.append(page_content[row].text)
    # print(metrics)
    # print(vals)


    # stats = pd.DataFrame({
    #     'metrics': metrics,
    #     'values': vals
    # })
    # sleep(.5)
    # count += 1
    # stats.set_index('metrics').T



https://www.baseball-reference.com/players/colonba01.shtml
https://www.baseball-reference.com/players/rodnefe01.shtml
https://www.baseball-reference.com/players/moronre01.shtml
https://www.baseball-reference.com/players/mijarjo01.shtml
https://www.baseball-reference.com/players/healeto01.shtml
https://www.baseball-reference.com/players/machije01.shtml
https://www.baseball-reference.com/players/rodnefe01.shtml
https://www.baseball-reference.com/players/castran01.shtml
https://www.baseball-reference.com/players/colined01.shtml
https://www.baseball-reference.com/players/valdefr01.shtml
https://www.baseball-reference.com/players/carigan01.shtml
https://www.baseball-reference.com/players/akinke01.shtml
https://www.baseball-reference.com/players/mateoju01.shtml
https://www.baseball-reference.com/players/gonzaen01.shtml
https://www.baseball-reference.com/players/kirkal01.shtml
https://www.baseball-reference.com/players/penabr01.shtml
https://www.baseball-reference.com/players/sandopa01.shtml


In [482]:
stats

Unnamed: 0,metrics,values


In [240]:
metrics = []
vals = []
for row in range(len(page_content)):
    metric = []
    val = []
    stat = []
    metrics.append(page_content[row]['data-stat'])
    vals.append(page_content[row].text)

stats = pd.DataFrame({
    'metrics': metrics,
    'values': vals
})

stats.set_index('metrics').T

metrics,W,L,win_loss_perc,earned_run_avg,G,GS,GF,CG,SHO,SV,...,WP,batters_faced,earned_run_avg_plus,fip,whip,hits_per_nine,home_runs_per_nine,bases_on_balls_per_nine,strikeouts_per_nine,strikeouts_per_base_on_balls
values,15,11,0.568,4.12,34,34,0,2,1,0,...,3,892,106,4.15,1.312,9.3,1.1,2.5,6.6,2.67
