## Using BeautifulSoup/Selenium to Scrape BaseballSavant

First, importing needed packages:

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 20,10
import numpy as np
import glob
from scipy import stats
from bs4 import BeautifulSoup
import requests
import re
from IPython.core.display import display, HTML    # make sure Jupyter knows to display it as HTML

Importing selenium and setting chrome driver:

In [2]:
import time, os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

In [3]:
#Data from 2015-2020, players with at least 200 PA in that given season:
savant_url = 'https://baseballsavant.mlb.com/leaderboard/custom?year=2020,2019,2018,2017,2016,2015&type=batter&filter=&sort=16&sortDir=desc&min=200&selections=player_age,b_total_pa,b_game,exit_velocity_avg,launch_angle_avg,barrel_batted_rate,solidcontact_percent,hard_hit_percent,z_swing_percent,oz_swing_percent,pull_percent,straightaway_percent,opposite_percent,groundballs_percent,flyballs_percent,linedrives_percent,sprint_speed,&chart=true&x=player_age&y=player_age&r=no&chartType=beeswarm'

In [4]:
driver = webdriver.Chrome(chromedriver)
driver.get(savant_url)

In [5]:
driver.page_source[:1000]

'<html lang="en_US" class=" device-desktop"><head>\n    <title>Statcast Custom Leaderboards | baseballsavant.com </title>\n\n    <!-- meta meta tag -->\n    <meta charset="utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=Edge">\n    <meta http-equiv="Cache-Control" content="no-cache">\n    <meta http-equiv="Pragma" content="no-cache">\n    <meta http-equiv="Expires" content="-1">\n    <meta name="keywords" content="Baseball Savant">\n    <!-- -->\n    <meta property="og:title" content="Statcast Custom Leaderboards">\n    <meta itemprop="name" content="Statcast Custom Leaderboards baseballsavant.com">\n    \n    <meta property="og:site_name" content="baseballsavant.com">\n    <meta property="og:type" content="website">\n    \n    <meta property="og:image" content="https://baseballsavant.mlb.com/site-core/images/statcast-logo-gcp-social.png">\n    <meta itemprop="image" content="https://baseballsavant.mlb.com/site-core/images/statcast-logo-gcp-social.png">\n     <meta name="tw

In [6]:
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [7]:
headers = soup.find('div', id='sortable_stats').find_all('th')

Finding Table Rows:

In [8]:
columns = [col.get_text() for col in headers]

In [9]:
print(columns)

['Rk.', 'Player', 'Year', 'Age', 'PA', 'G', 'Avg EV (MPH)', 'Avg LA (°)', 'Barrel%', 'Solid Contact %', 'Hard Hit %', 'Zone Swing %', 'Out of Zone Swing %', 'Pull %', 'Straight Away %', 'Oppo %', 'GB%', 'FB%', 'LD %', 'Sprint Speed']


In [10]:
savant_df = pd.DataFrame(columns=columns)
savant_df.head()

Unnamed: 0,Rk.,Player,Year,Age,PA,G,Avg EV (MPH),Avg LA (°),Barrel%,Solid Contact %,Hard Hit %,Zone Swing %,Out of Zone Swing %,Pull %,Straight Away %,Oppo %,GB%,FB%,LD %,Sprint Speed


Finding Stats:

In [11]:
stats_test = soup.find('div', id='sortable_stats').find_all('tr')

In [12]:
stats = [col.get_text() for col in stats_test]

In [21]:
stat_lines = soup.find('div', id='sortable_stats').find_all('tr')
compiled_stats = []
for line_item in stat_lines:
    ind_stat_line = [item.get_text() for item in line_item.find_all('td')]
    compiled_stats.append(ind_stat_line)
    final_stats = compiled_stats[1:]

Building DataFrame:

In [28]:
savant_df = pd.DataFrame(final_stats, columns=columns)
num_cols = ['Age','G','PA','Year','Avg EV (MPH)','Avg LA (°)','Barrel%','Solid Contact %','Hard Hit %','Zone Swing %','Out of Zone Swing %','Pull %','Straight Away %','Oppo %','GB%','FB%','LD %','Sprint Speed']
savant_df[num_cols] = savant_df[num_cols].apply(pd.to_numeric, errors='coerce', axis=1)
savant_df.set_index(savant_df['Rk.'], inplace=True)
savant_df.drop(columns='Rk.',inplace=True)
savant_df.head()

Unnamed: 0_level_0,Player,Year,Age,PA,G,Avg EV (MPH),Avg LA (°),Barrel%,Solid Contact %,Hard Hit %,Zone Swing %,Out of Zone Swing %,Pull %,Straight Away %,Oppo %,GB%,FB%,LD %,Sprint Speed
Rk.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,"Saltalamacchia, Jarrod",2016.0,31.0,292.0,92.0,89.2,21.4,11.6,6.8,35.4,65.7,23.9,44.2,34.0,21.8,31.3,43.5,19.7,25.6
2,"Schimpf, Ryan",2016.0,28.0,330.0,89.0,90.3,29.7,16.7,8.0,41.4,64.7,23.4,41.4,33.9,24.7,20.1,42.0,21.8,28.2
3,"Gallo, Joey",2017.0,24.0,532.0,145.0,93.3,23.0,22.1,8.3,52.2,72.7,28.2,50.2,29.2,20.6,29.2,37.9,22.1,27.8
4,"Carpenter, Matt",2018.0,33.0,677.0,156.0,90.7,21.6,13.7,11.5,44.7,57.1,18.5,47.6,30.7,21.7,28.3,37.8,28.5,26.5
5,"Bruce, Jay",2019.0,32.0,333.0,98.0,90.2,21.4,13.4,8.2,43.7,79.6,37.0,42.9,29.9,27.3,29.4,37.7,19.9,26.7


In [29]:
savant_df.Player.value_counts()

 McCutchen, Andrew         6
 Sano, Miguel              6
 Ramirez, Jose             6
 Semien, Marcus            6
 Martinez, J.D.            6
                          ..
 Bourgeois, Jason          1
 Castillo, Rusney          1
 Morneau, Justin           1
 Navarro, Dioner           1
 Bethancourt, Christian    1
Name: Player, Length: 646, dtype: int64

In [17]:
savant_df

Unnamed: 0,Rk.,Player,Year,Age,PA,G,Avg EV (MPH),Avg LA (°),Barrel%,Solid Contact %,Hard Hit %,Zone Swing %,Out of Zone Swing %,Pull %,Straight Away %,Oppo %,GB%,FB%,LD %,Sprint Speed
0,1,"Saltalamacchia, Jarrod",2016.0,31.0,292,92,89.2,21.4,11.6,6.8,35.4,65.7,23.9,44.2,34.0,21.8,31.3,43.5,19.7,25.6
1,2,"Schimpf, Ryan",2016.0,28.0,330,89,90.3,29.7,16.7,8.0,41.4,64.7,23.4,41.4,33.9,24.7,20.1,42.0,21.8,28.2
2,3,"Gallo, Joey",2017.0,24.0,532,145,93.3,23.0,22.1,8.3,52.2,72.7,28.2,50.2,29.2,20.6,29.2,37.9,22.1,27.8
3,4,"Carpenter, Matt",2018.0,33.0,677,156,90.7,21.6,13.7,11.5,44.7,57.1,18.5,47.6,30.7,21.7,28.3,37.8,28.5,26.5
4,5,"Bruce, Jay",2019.0,32.0,333,98,90.2,21.4,13.4,8.2,43.7,79.6,37.0,42.9,29.9,27.3,29.4,37.7,19.9,26.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1888,1889,"Jankowski, Travis",2016.0,25.0,383,131,86.2,0.8,1.3,1.3,26.9,56.9,21.4,20.2,43.3,36.6,62.2,10.1,26.5,29.5
1889,1890,"LeMahieu, DJ",2016.0,28.0,635,146,91.7,5.9,4.9,5.6,47.5,59.2,23.0,20.8,42.0,37.2,52.9,10.1,33.7,27.1
1890,1891,"Jankowski, Travis",2018.0,27.0,387,117,85.6,3.3,0.7,1.1,19.3,59.2,18.7,30.1,39.9,30.1,60.1,10.1,25.4,29.0
1891,1892,"Slater, Austin",2018.0,26.0,225,74,87.3,2.6,2.3,0.8,35.1,65.9,28.0,23.7,36.6,39.7,63.4,9.2,22.9,28.0


Driver's work here is done.

In [15]:
driver.close()

Creating a list of names that can feed into the BBREF code:

In [141]:
player_column = savant_df.Player

In [142]:
player_column.drop_duplicates(inplace=True)

In [143]:
player_list = player_column.tolist()

In [144]:
player_list_split = [player.split(', ') for player in player_list]
for player in player_list_split:
    player[0] = player[0].strip()

final_player_list = [(player[1] + ' ' + player[0]) for player in player_list_split]    

In [145]:
final_player_list

['Tim Locastro',
 'Byron Buxton',
 'Adam Engel',
 'Delino DeShields',
 'Trea Turner',
 'Billy Hamilton',
 'Garrett Hampson',
 'Mallex Smith',
 'Bradley Zimmer',
 'Harrison Bader',
 'Adalberto Mondesi',
 'Keon Broxton',
 'Peter Bourjos',
 'Paulo Orlando',
 'Jon Berti',
 'Cesar Hernandez',
 'Kevin Kiermaier',
 'Rajai Davis',
 'Mike Trout',
 'Starling Marte',
 'Justin Maxwell',
 'Trevor Story',
 'Dee Strange-Gordon',
 'Ronald Acuna Jr.',
 'Oscar Mercado',
 'Lewis Brinson',
 'Richie Martin',
 'Brett Lawrie',
 'Jarrod Dyson',
 'Travis Jankowski',
 'Tommy Edman',
 'Nick Senzel',
 'Manuel Margot',
 'Jake Marisnick',
 'Jeremy Hazelbaker',
 "Shawn O'Malley",
 'Fernando Tatis Jr.',
 'Scott Kingery',
 'Amed Rosario',
 'Dansby Swanson',
 'Victor Robles',
 'Ryan Cordell',
 'Allen Cordoba',
 'Billy Burns',
 'Alen Hanson',
 'Whit Merrifield',
 'Victor Reyes',
 'Teoscar Hernandez',
 'Nick Williams',
 'Ketel Marte',
 'Wilmer Difo',
 'Avisail Garcia',
 'Cameron Maybin',
 'Aledmys Diaz',
 "Chase d'Arnaud

In [146]:
len(final_player_list)

646

In [147]:
#Checking to make sure length of unique names matches player list.
savant_df.Player.value_counts()

 Rosario, Eddie         6
 Rendon, Anthony        6
 Reddick, Josh          6
 Hicks, Aaron           6
 Betts, Mookie          6
                       ..
 Freeman, Mike          1
 Park, ByungHo          1
 Palka, Daniel          1
 Ortega, Rafael         1
 Astudillo, Willians    1
Name: Player, Length: 646, dtype: int64

## Moving in BBREF Data

Creating a Baseball reference URL based on a player list input:

In [148]:
def Player_URL_Generator(player_list):
    '''
    Takes a list of player names, with each name being an entry of first name and last name.
    Returns a URL on baseball reference to each player's original page.
    '''
    separated_names = [player.split(" ") for player in player_list]
    url_list = []
    for i, player in enumerate(separated_names):
        #Stripping punctuation from names:
        new_name = [name.replace(".","") for name in player]
        new_name = [name.replace("'","") for name in new_name]
        #print(len(new_name))
        #In other words, if a player has 3 names not two:
        if len(new_name) == 3:
            comb_name = [new_name[0], new_name[1]+new_name[2]]
            #print(comb_name)
            if len(comb_name[1]) <= 5:
                url = 'https://www.baseball-reference.com/players/{}/{}{}01.shtml'.format(comb_name[1][0].lower(), comb_name[1].lower(), comb_name[0][0:2].lower())
            else:
                url = 'https://www.baseball-reference.com/players/{}/{}{}01.shtml'.format(comb_name[1][0].lower(), comb_name[1][0:5].lower(), comb_name[0][0:2].lower())
        else:    
            if len(new_name[1]) <= 5:
                url = 'https://www.baseball-reference.com/players/{}/{}{}01.shtml'.format(new_name[1][0].lower(), new_name[1].lower(), new_name[0][0:2].lower())
            else:
                url = 'https://www.baseball-reference.com/players/{}/{}{}01.shtml'.format(new_name[1][0].lower(), new_name[1][0:5].lower(), new_name[0][0:2].lower())
        url_list.append(url)
    return url_list 

Testing URL Generator w/ edge cases (apostrophes, periods in name, spaces in name, etc.)

In [149]:
Player_URL_Generator(["Tyler O'Neill", "Chase d'Arnaud", "C.J. Cron", 'Tommy La Stella','Alejandro De Aza'])

['https://www.baseball-reference.com/players/o/oneilty01.shtml',
 'https://www.baseball-reference.com/players/d/darnach01.shtml',
 'https://www.baseball-reference.com/players/c/croncj01.shtml',
 'https://www.baseball-reference.com/players/l/lasteto01.shtml',
 'https://www.baseball-reference.com/players/d/deazaal01.shtml']

From the Player URL Function, generating a BS4 object:

In [150]:
def Player_Soup_Generator(player_list):
    '''
    Takes in a list of players and returns Beautiful Soup objects.
    '''
    url_list = Player_URL_Generator(player_list)
    response_list = [requests.get(url) for url in url_list]
    soup_list = []
    for response in response_list:
        page = response.text
        soup_object = BeautifulSoup(page,'lxml')
        name = player.title.text.split("Stats")[0]
        if player_list[i].strip() != name.strip():
            #Add in code for updating URL in this case
        soup_list.append(BeautifulSoup(page,'lxml'))
    return soup_list

Pull the years the player played from baseball reference:

In [151]:
def Pull_Player_Seasons(player_soup):
    '''
    Takes in a player's soup object.
    Returns a list of the season the player played in.
    '''
    seasons = player_soup.find_all('tr', attrs={'id':re.compile('batting_standard.')})
    #seasons = player_soup.find_all('tr', attrs={'data-stat':re.compile('')})
    seasons_played = []
    for season in seasons:
        season_item = [str(item.get_text()) for item in season.find_all('th')]
        seasons_played.append(season_item)
    return seasons_played

Pull the year by year stas from BBREF:

In [152]:
def Pull_Player_Stats(player_soup):
    '''
    Takes in a player's soup object.
    Returns a list of stat lines by season.
    '''
    stat_line = player_soup.find_all('tr', attrs={'id':re.compile('batting_standard.')})
    career_stats = []
    for season in stat_line:
        season_stat_line = [item.get_text() for item in season.find_all('td')]
        career_stats.append(season_stat_line)
    for i,year in enumerate(career_stats):
        career_stats[i].insert(0,Pull_Player_Seasons(player_soup)[i][0])
    return career_stats

From the years and stats, build a pandas dataframe:

In [153]:
def Player_Dataframe_Builder(player_soup_list):
    '''
    Takes in a list of BeautifulSoup Objects, returns a data frame of their Baseball-Reference stats.
    '''
    i=0
    
    for player in player_soup_list:
        header = player.find_all('th', attrs={'class': 'poptip'})
        columns = [col.get_text() for col in header]
        current_player_df = pd.DataFrame(Pull_Player_Stats(player), columns=columns)
        if i ==0:
            compiled_player_df = pd.DataFrame(columns=columns)
            compiled_player_df['Name'] = ''
            i += 1
        #Adding Player Name as a column:
        name = player.title.text.split("Stats")[0]
        current_player_df['Name'] = name
            
        compiled_player_df = pd.concat([compiled_player_df, current_player_df], ignore_index=True)
    num_cols = ['Age', 'G', 'PA', 'AB', 'R','H','2B','3B','HR','RBI','SB','CS','BB','SO','BA','OBP','SLG','OPS','OPS+','TB','GDP','HBP','SH','SF','IBB']
    compiled_player_df[num_cols] = compiled_player_df[num_cols].apply(pd.to_numeric, errors='coerce', axis=1)
    return compiled_player_df

Test Case, using the final_player_list from the BasebalL Savant Data:

In [154]:
my_soup_list = Player_Soup_Generator(final_player_list)

Testing for players whose URL and names don't match using the normal method:

In [160]:
for i, player in enumerate(my_soup_list):
    name = player.title.text.split("Stats")[0]
    if final_player_list[i].strip() != name.strip():
        print('WRONG!')
        print(name, final_player_list[i])

WRONG!
Mayo Smith  Mallex Smith
WRONG!
Page Not Found (404 error) | Baseball-Reference.com Adalberto Mondesi
WRONG!
Page Not Found (404 error) | Baseball-Reference.com Dee Strange-Gordon
WRONG!
Fernando Tatis  Fernando Tatis Jr.
WRONG!
Bill Burns  Billy Burns
WRONG!
Alex Diaz  Aledmys Diaz
WRONG!
Ri Jones  JaCoby Jones
WRONG!
Hanley Ramirez  Harold Ramirez
WRONG!
Chink Taylor  Chris Taylor
WRONG!
Page Not Found (404 error) | Baseball-Reference.com Michael A. Taylor
WRONG!
Page Not Found (404 error) | Baseball-Reference.com JB Shuck
WRONG!
Page Not Found (404 error) | Baseball-Reference.com Tommy Pham
WRONG!
Leo Garcia  Leury Garcia
WRONG!
Ed Nunez  Eduardo Nunez
WRONG!
Adam Russell  Addison Russell
WRONG!
José Iglesias  Jose Iglesias
WRONG!
Ronald Herrera  Rosell Herrera
WRONG!
Albert Almora  Albert Almora Jr.
WRONG!
Page Not Found (404 error) | Baseball-Reference.com Yolmer Sanchez
WRONG!
Yoenis Céspedes  Yoenis Cespedes
WRONG!
Brian Harper  Bryce Harper
WRONG!
Charlie Blackburn  Char

In [156]:
multiple_player_df = Player_Dataframe_Builder(my_soup_list)

ValueError: 34 columns passed, passed data had 30 columns

In [None]:
multiple_player_df.Name.value_counts()

In [None]:
multiple_player_df.head()

In [40]:
my_list = ['Eloy Jimenez','Jose Abreu','Mike Trout','Adam Eaton','Frank Thomas']

In [60]:
url_list = Player_URL_Generator(my_list)
response_list = [requests.get(url) for url in url_list]
soup_list = []
for response in response_list:
    page = response.text
    soup_list.append(BeautifulSoup(page,'lxml'))

<built-in method title of str object at 0x7f90e6498000>
<built-in method title of str object at 0x7f90ed79c000>
<built-in method title of str object at 0x7f90fd200000>
<built-in method title of str object at 0x7f9124e6d000>
<built-in method title of str object at 0x7f90e6498000>


In [62]:
for i, player in enumerate(soup_list):
    name = player.title.text.split("Stats")[0]
    print(name)
    print(my_list[i])
    if str(name) != str(my_list[i]):
        print('WRONG!')
    else:
        print('MATC!')

Elvio Jimenez 
Eloy Jimenez
WRONG!
Joe Abreu 
Jose Abreu
WRONG!
Mike Trout 
Mike Trout
WRONG!
Adam Eaton 
Adam Eaton
WRONG!
Frosty Thomas 
Frank Thomas
WRONG!
