## Using BeautifulSoup/Selenium to Scrape BaseballSavant

First, importing needed packages:

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 20,10
import numpy as np
import glob
from scipy import stats
from bs4 import BeautifulSoup
import requests
import re
from IPython.core.display import display, HTML    # make sure Jupyter knows to display it as HTML

Importing selenium and setting chrome driver:

In [2]:
import time, os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

In [3]:
savant_url = 'https://baseballsavant.mlb.com/leaderboard/custom?year=2020,2019,2018,2017&type=batter&filter=&sort=4&sortDir=desc&min=50&selections=player_age,exit_velocity_avg,launch_angle_avg,barrel_batted_rate,solidcontact_percent,hard_hit_percent,z_swing_percent,oz_swing_percent,pull_percent,straightaway_percent,opposite_percent,groundballs_percent,flyballs_percent,linedrives_percent,sprint_speed,&chart=true&x=player_age&y=player_age&r=no&chartType=beeswarm'

In [4]:
driver = webdriver.Chrome(chromedriver)
driver.get(savant_url)

In [5]:
driver.page_source[:1000]

'<html lang="en_US" class=" device-desktop"><head>\n    <title>Statcast Custom Leaderboards | baseballsavant.com </title>\n\n    <!-- meta meta tag -->\n    <meta charset="utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=Edge">\n    <meta http-equiv="Cache-Control" content="no-cache">\n    <meta http-equiv="Pragma" content="no-cache">\n    <meta http-equiv="Expires" content="-1">\n    <meta name="keywords" content="Baseball Savant">\n    <!-- -->\n    <meta property="og:title" content="Statcast Custom Leaderboards">\n    <meta itemprop="name" content="Statcast Custom Leaderboards baseballsavant.com">\n    \n    <meta property="og:site_name" content="baseballsavant.com">\n    <meta property="og:type" content="website">\n    \n    <meta property="og:image" content="https://baseballsavant.mlb.com/site-core/images/statcast-logo-gcp-social.png">\n    <meta itemprop="image" content="https://baseballsavant.mlb.com/site-core/images/statcast-logo-gcp-social.png">\n     <meta name="tw

In [6]:
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [7]:
headers = soup.find('div', id='sortable_stats').find_all('th')

Finding Table Rows:

In [8]:
columns = [col.get_text() for col in headers]

In [9]:
print(columns)

['Rk.', 'Player', 'Year', 'Age', 'Avg EV (MPH)', 'Avg LA (°)', 'Barrel%', 'Solid Contact %', 'Hard Hit %', 'Zone Swing %', 'Out of Zone Swing %', 'Pull %', 'Straight Away %', 'Oppo %', 'GB%', 'FB%', 'LD %', 'Sprint Speed']


In [10]:
savant_df = pd.DataFrame(columns=columns)
savant_df.head()

Unnamed: 0,Rk.,Player,Year,Age,Avg EV (MPH),Avg LA (°),Barrel%,Solid Contact %,Hard Hit %,Zone Swing %,Out of Zone Swing %,Pull %,Straight Away %,Oppo %,GB%,FB%,LD %,Sprint Speed


Finding Stats:

In [11]:
stats_test = soup.find('div', id='sortable_stats').find_all('tr')

In [12]:
stats = [col.get_text() for col in stats_test]

In [13]:
stat_lines = soup.find('div', id='sortable_stats').find_all('tr')
compiled_stats = []
for line_item in stat_lines:
    ind_stat_line = [item.get_text() for item in line_item.find_all('td')]
    compiled_stats.append(ind_stat_line)
    final_stats = compiled_stats[1:]

Building DataFrame:

In [14]:
savant_df = pd.DataFrame(final_stats, columns=columns)
num_cols = ['Age','Year','Avg EV (MPH)','Avg LA (°)','Barrel%','Solid Contact %','Hard Hit %','Zone Swing %','Out of Zone Swing %','Pull %','Straight Away %','Oppo %','GB%','FB%','LD %','Sprint Speed']
savant_df[num_cols] = savant_df[num_cols].apply(pd.to_numeric, errors='coerce', axis=1)
savant_df.head()

Unnamed: 0,Rk.,Player,Year,Age,Avg EV (MPH),Avg LA (°),Barrel%,Solid Contact %,Hard Hit %,Zone Swing %,Out of Zone Swing %,Pull %,Straight Away %,Oppo %,GB%,FB%,LD %,Sprint Speed
0,1,"Heineman, Tyler",2020.0,29.0,82.4,34.3,2.6,0.0,10.5,65.2,24.2,44.7,23.7,31.6,36.8,23.7,15.8,25.9
1,2,"Schimpf, Ryan",2017.0,29.0,86.1,29.8,16.5,4.1,30.9,59.3,18.5,44.3,29.9,25.8,20.6,42.3,17.5,27.7
2,3,"Higashioka, Kyle",2018.0,28.0,88.4,28.0,10.7,5.4,30.4,68.9,23.5,42.9,42.9,14.3,25.0,44.6,16.1,25.0
3,4,"Zunino, Mike",2020.0,29.0,87.5,26.9,15.8,7.9,36.8,66.3,31.5,44.7,34.2,21.1,21.1,31.6,28.9,25.2
4,5,"Gallo, Joey",2020.0,27.0,91.2,26.8,14.0,4.4,43.9,66.3,19.2,48.2,28.9,22.8,31.6,29.8,21.1,27.1


Driver's work here is done.

In [15]:
driver.close()

Creating a list of names that can feed into the BBREF code:

In [26]:
player_column = savant_df.Player

In [28]:
player_column.drop_duplicates(inplace=True)

In [29]:
player_list = player_column.tolist()

In [30]:
player_list_split = [player.split(', ') for player in player_list]
for player in player_list_split:
    player[0] = player[0].strip()

final_player_list = [(player[1] + ' ' + player[0]) for player in player_list_split]    

In [31]:
final_player_list

['Tyler Heineman',
 'Ryan Schimpf',
 'Kyle Higashioka',
 'Mike Zunino',
 'Joey Gallo',
 'Jett Bandy',
 'Grayson Greiner',
 'Adam Duvall',
 'Kyle Garlick',
 'Chris Young',
 'Travis Shaw',
 'Rhys Hoskins',
 'Jordan Luplow',
 'Anthony Santander',
 'Christin Stewart',
 'Carlos Perez',
 'Austin Meadows',
 'Dilson Herrera',
 'Nick Dini',
 'Will Smith',
 'Matt Chapman',
 'Cody Asche',
 'Byron Buxton',
 'Aaron Hill',
 'Brandon Belt',
 'Brian McCann',
 'Jose Ramirez',
 'Franklin Barreto',
 'Mike Trout',
 'Chris Carter',
 'Edwin Encarnacion',
 'Matt Carpenter',
 'Pete Kozma',
 'Anthony Bemboom',
 'Robel Garcia',
 'Lucas Duda',
 'Stephen Vogt',
 'John Ryan Murphy',
 'Mike Napoli',
 "Tyler O'Neill",
 'Curt Casali',
 'Robinson Chirinos',
 'Max Kepler',
 'Kaleb Cowart',
 'Austin Hedges',
 'Noah Syndergaard',
 'Paul DeJong',
 'Zack Cozart',
 'Sandy Leon',
 'Dustin Garneau',
 'Jay Bruce',
 'Matt Adams',
 'Cam Gallagher',
 'Erik Kratz',
 'Danny Jansen',
 'Renato Nunez',
 'Kyle Seager',
 "Chase d'Arnaud

In [32]:
len(final_player_list)

845

In [34]:
#Checking to make sure length of unique names matches player list.
savant_df.Player.value_counts()

 McCutchen, Andrew    4
 Gardner, Brett       4
 Devers, Rafael       4
 Lindor, Francisco    4
 Gregorius, Didi      4
                     ..
 Alcantara, Sandy     1
 Paredes, Isaac       1
 Cecchini, Gavin      1
 Robert, Luis         1
 Leake, Mike          1
Name: Player, Length: 845, dtype: int64

## Moving in BBREF Data

In [36]:
def Player_URL_Generator(player_list):
    '''
    Takes a list of player names, with each name being an entry of first name and last name.
    Returns a URL on baseball reference to each player's original page.
    '''
    separated_names = [player.split(" ") for player in player_list]
    url_list = []
    for i, player in enumerate(separated_names):
        if len(separated_names[i][1]) <= 5:
            url = 'https://www.baseball-reference.com/players/{}/{}{}01.shtml'.format(separated_names[i][1][0].lower(), separated_names[i][1].lower(), separated_names[i][0][0:2].lower())
        else:
            url = 'https://www.baseball-reference.com/players/{}/{}{}01.shtml'.format(separated_names[i][1][0].lower(), separated_names[i][1][0:5].lower(), separated_names[i][0][0:2].lower())
        url_list.append(url)
    return url_list 

In [35]:
def Player_Soup_Generator(player_list):
    '''
    Takes in a list of players and returns Beautiful Soup objects.
    '''
    url_list = Player_URL_Generator(player_list)
    response_list = [requests.get(url) for url in url_list]
    soup_list = []
    for response in response_list:
        page = response.text
        soup_list.append(BeautifulSoup(page,'lxml'))
    return soup_list

In [37]:
def Pull_Player_Seasons(player_soup):
    '''
    Takes in a player's soup object.
    Returns a list of the season the player played in.
    '''
    seasons = player_soup.find_all('tr', attrs={'id':re.compile('batting_standard.')})
    #seasons = player_soup.find_all('tr', attrs={'data-stat':re.compile('')})
    seasons_played = []
    for season in seasons:
        season_item = [str(item.get_text()) for item in season.find_all('th')]
        seasons_played.append(season_item)
    return seasons_played

In [38]:
def Pull_Player_Stats(player_soup):
    '''
    Takes in a player's soup object.
    Returns a list of stat lines by season.
    '''
    stat_line = player_soup.find_all('tr', attrs={'id':re.compile('batting_standard.')})
    career_stats = []
    for season in stat_line:
        season_stat_line = [item.get_text() for item in season.find_all('td')]
        career_stats.append(season_stat_line)
    for i,year in enumerate(career_stats):
        career_stats[i].insert(0,Pull_Player_Seasons(player_soup)[i][0])
    return career_stats

In [51]:
def Player_Dataframe_Builder(player_soup_list):
    '''
    Takes in a list of BeautifulSoup Objects, returns a data frame of their Baseball-Reference stats.
    '''
    i = 0
    for player in player_soup_list:
        header = player.find_all('th', attrs={'class': 'poptip'})
        columns = [col.get_text() for col in header]
        if i == 0:
            compiled_player_df = pd.DataFrame(columns=columns)
            compiled_player_df['Name'] = ''
            i += 1
        
        current_player_df = pd.DataFrame(Pull_Player_Stats(player), columns=columns)
  
        #Adding Player Name as a column:
        name = player.title.text.split("Stats")[0]
        current_player_df['Name'] = name
            
        compiled_player_df = pd.concat([compiled_player_df, current_player_df], ignore_index=True)
    num_cols = ['Age', 'G', 'PA', 'AB', 'R','H','2B','3B','HR','RBI','SB','CS','BB','SO','BA','OBP','SLG','OPS','OPS+','TB','GDP','HBP','SH','SF','IBB']
    compiled_player_df[num_cols] = compiled_player_df[num_cols].apply(pd.to_numeric, errors='coerce', axis=1)
    return compiled_player_df

In [55]:
my_soup_list = Player_Soup_Generator(final_player_list)

In [49]:
for player in my_soup_list:
        i = 0
        header = player.find_all('th', attrs={'class': 'poptip'})
        columns = [col.get_text() for col in header]
        if i == 0:
            compiled_player_df = pd.DataFrame(columns=columns)
            compiled_player_df['Name'] = ''
            i += 1
        current_player_df = pd.DataFrame(Pull_Player_Stats(player), columns=columns)
  
        #Adding Player Name as a column:
        name = player.title.text.split("Stats")[0]
        current_player_df['Name'] = name
            
        compiled_player_df = pd.concat([compiled_player_df, current_player_df], ignore_index=True)

In [56]:
multiple_player_df = Player_Dataframe_Builder(my_soup_list)

ValueError: 34 columns passed, passed data had 30 columns

In [None]:
multiple_player_df.Name.value_counts()

In [None]:
multiple_player_df.head()