# Pitch Stat Scraping

This notebook begins the process of scraping Baseball Savant in order to collect MLB pitch-by-pitch data.

First, importing needed packages:

In [54]:
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 20,10
import numpy as np
import glob
from scipy import stats
from bs4 import BeautifulSoup
import requests
import re
from IPython.core.display import display, HTML    # make sure Jupyter knows to display it as HTML

Importing selenium and setting chrome driver:

In [55]:
import time, os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to point at a different location without editing the notebook;
# otherwise the original default path is used.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
# Record the chosen path in the process environment under the same key as before.
os.environ["webdriver.chrome.driver"] = chromedriver

In [224]:
# Statcast search URL used as the scraping test case. Query parameters visible
# in the string include: hfPT=FF|FT|FC|SI (pipes URL-encoded as %7C),
# hfSea=2020, player_type=pitcher, game_date_gt=2020-09-01,
# game_date_lt=2020-09-05, group_by=name-event, and
# player_event_sort=api_p_release_speed with sort_order=desc.
test_url = 'https://baseballsavant.mlb.com/statcast_search?hfPT=FF%7CFT%7CFC%7CSI%7C&hfAB=&hfGT=R%7C&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2020%7C&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=2020-09-01&game_date_lt=2020-09-05&hfInfield=&team=&position=&hfOutfield=&hfRO=&home_road=&hfFlag=&hfBBT=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name-event&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc&min_pas=0&chk_pitch_type=on&chk_pitch_result=on&chk_count=on&chk_metric2_gt=on&chk_event_release_spin_rate=on&chk_event_release_speed=on&chk_event_hit_distance_sc=on&chk_event_release_pos_x=on&chk_event_release_pos_z=on&chk_event_plate_x=on&chk_event_plate_z=on#results'

In [225]:
# Start a Chrome session using the configured chromedriver binary,
# then navigate it to the Statcast search page.
driver = webdriver.Chrome(executable_path=chromedriver)
driver.get(test_url)

In [226]:
# Parse the HTML currently held by the browser with Python's built-in parser.
soup = BeautifulSoup(markup=driver.page_source, features='html.parser')

In [227]:
# The results container has to exist before it can be queried; defining it in
# this cell keeps the notebook runnable top-to-bottom on a fresh kernel.
player_table = soup.find('div', id='search-results-container')

# First ten <span> elements whose id contains "val_" (the rank cells shown below).
rank_list = player_table.find_all('span', id=re.compile('val_'))[0:10]

In [228]:
rank_list

[<span id="val_1">1</span>,
 <span id="val_2">2</span>,
 <span id="val_3">3</span>,
 <span id="val_4">4</span>,
 <span id="val_5">5</span>,
 <span id="val_6">6</span>,
 <span id="val_7">7</span>,
 <span id="val_8">8</span>,
 <span id="val_9">9</span>,
 <span id="val_10">10</span>]

In [229]:
# Grab the <div> that wraps the search results; later lookups are scoped to it.
player_table = soup.find(name='div', attrs={'id': 'search-results-container'})

In [230]:
# First ten player-name cells; each <td> carries an id attribute of the form "id_<number>".
id_list = player_table.find_all(name='td', class_='player_name')[:10]

In [231]:
id_list[1]

<td class="player_name tr-data align-left" id="id_660813">                        
                        Brusdar Graterol
                    </td>

In [232]:
# Remove only the literal "id_" prefix. str.strip('id_') would instead strip any
# of the characters 'i', 'd' and '_' from BOTH ends of the string.
re.sub(r'^id_', '', id_list[0].attrs['id'])

'622251'

Clicking each of the top 10 rows to expand its hidden pitch data (proof of concept):

In [233]:
# Click each of the collected rank cells so the page reveals the hidden
# per-pitch data for that row.
# NOTE(review): the revealed data presumably loads asynchronously after the
# click — confirm whether a wait is needed before reading driver.page_source.
for rank_span in rank_list:
    # The rank <span>'s id attribute (e.g. "val_1") identifies the element to click.
    current_rank = rank_span.attrs['id']

    # Generic find_element(by, value) is available in both Selenium 3 and 4;
    # the find_element_by_* helpers were removed in Selenium 4.3.
    expand_button = driver.find_element('id', current_rank)
    expand_button.click()

Creating a new soup object to scrape the newly revealed page data:

In [234]:
# Re-parse the page so the soup includes whatever the clicks above revealed.
new_soup = BeautifulSoup(markup=driver.page_source, features='html.parser')

Compiling the stats row by row in a list:

In [250]:
# Collect every <td colspan="100%"> cell once, instead of re-scanning the whole
# document on each loop iteration.
# Assumes the i-th such cell belongs to the i-th entry of id_list — TODO confirm.
detail_cells = new_soup.find_all('td', colspan='100%')

compiled_stats = []
for i in range(len(id_list)):
    # Positional pairing: an IndexError here means fewer detail cells were
    # found than player rows were collected.
    pitch_stats = detail_cells[i]

    # One list of cell texts per expanded row.
    ind_pitch_stats = [cell.get_text() for cell in pitch_stats.find_all('td')]
    compiled_stats.append(ind_pitch_stats)

In [251]:
compiled_stats

[['FF',
  '101.7',
  '',
  'Josh Staumont',
  'Tim Anderson',
  '',
  '2446',
  '',
  '14',
  '2020-09-05',
  '0-2',
  'Top 8',
  'ball',
  '',
  ''],
 ['SI',
  '101.4',
  '',
  'Brusdar Graterol',
  'David Peralta',
  '',
  '2005',
  '',
  '11',
  '2020-09-02',
  '1-0',
  'Top 8',
  'ball',
  '',
  ''],
 ['SI',
  '101.2',
  '98.4',
  'Josh Staumont',
  'Yasmani Grandal',
  '298',
  '2440',
  '17',
  '5',
  '2020-09-05',
  '1-2',
  'Top 8',
  'hit_into_play_no_out',
  'Yasmani Grandal singles on a line drive to left fielder Alex Gordon.  ',
  ''],
 ['FF',
  '101.2',
  '',
  'Aroldis Chapman',
  'Hanser Alberto',
  '',
  '2621',
  '',
  '13',
  '2020-09-04',
  '1-2',
  'Bot 7',
  'ball',
  '',
  ''],
 ['SI',
  '101.1',
  '',
  'Brusdar Graterol',
  'Christian Walker',
  '',
  '2125',
  '',
  '13',
  '2020-09-02',
  '1-0',
  'Top 8',
  'ball',
  '',
  ''],
 ['FF',
  '100.7',
  '76.8',
  'Jorge Alcala',
  'Tim Anderson',
  '243',
  '2475',
  '30',
  '1',
  '2020-09-02',
  '0-2',
  'Top 7'

This gives me the exact data I need per pitch. From here, I can adjust the search filters on the site to select by number of outs, score, and base runners.

### Finding Column Headers:

In [183]:
# Locate the header row of the results table, then pull its header cells.
header_row = player_table.find('tr', class_='tr-component-row')
headers = header_row.find_all('th', class_='th-component-header')

In [184]:
headers

[<th class="th-component-header align-left" title="Rank">Rk.</th>,
 <th class="th-component-header align-left">Player</th>,
 <th class="th-component-header align-left numeric">Result</th>,
 <th class="th-component-header align-left numeric">Game Date</th>,
 <th class="th-component-header align-left numeric" id="release_spin_rate">Spin Rate (RPM)</th>,
 <th class="th-component-header align-left numeric" id="release_speed">Pitch (MPH)</th>,
 <th class="th-component-header align-left numeric" id="hit_distance_sc">Dist (ft)</th>,
 <th class="th-component-header align-left numeric" id="release_pos_x">RP X (ft)</th>,
 <th class="th-component-header align-left numeric" id="release_pos_z">RP Z (ft)</th>,
 <th class="th-component-header align-left numeric" id="plate_x">PX (ft)</th>,
 <th class="th-component-header align-left numeric" id="plate_z">PZ (ft)</th>,
 <th class="th-component-header align-left">Vs.</th>,
 <th class="charts th-component-header align-left numeric" style="text-align: righ

In [185]:
# Text of every header cell, in document order.
columns = []
for header_cell in headers:
    columns.append(header_cell.get_text())

In [186]:
columns

['Rk.',
 'Player',
 'Result',
 'Game Date',
 'Spin Rate (RPM)',
 'Pitch (MPH)',
 'Dist (ft)',
 'RP X (ft)',
 'RP Z (ft)',
 'PX (ft)',
 'PZ (ft)',
 'Vs.',
 '\n\xa0\xa0\n                    \xa0\xa0\n                    \n']