# Code on Scraping Data from Fangraphs:

First, importing needed packages:

In [170]:
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 20,10
import numpy as np
import glob
from scipy import stats
from bs4 import BeautifulSoup
import requests
import re
from IPython.core.display import display, HTML    # make sure Jupyter knows to display it as HTML

Importing selenium and setting chrome driver:

In [171]:
import time, os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to override it on another machine; when it is unset, the original
# macOS location is used, so existing setups behave exactly as before.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
# Also publish the chosen path under the "webdriver.chrome.driver" environment key.
# The driver is additionally given this path explicitly when it is created.
os.environ["webdriver.chrome.driver"] = chromedriver

Next step: Try to find a WAR or wRC+ list for each season

In [172]:
# FanGraphs season stat grid for batters, wRC+ for seasons 2015 through 2020.
# Note the two thresholds: pastMinPt=200 but curMinPt=0 -- presumably the 200 PA
# minimum applies to past seasons only and the current season has no minimum
# (TODO confirm against the FanGraphs page).
wrc_url = (
    'https://www.fangraphs.com/leaders/season-stat-grid'
    '?position=B'
    '&seasonStart=2015&seasonEnd=2020'
    '&stat=wRC%2B'
    '&pastMinPt=200&curMinPt=0'
    '&mode=normal'
)

In [173]:
# Start a Chrome session using the configured chromedriver binary,
# then navigate to the wRC+ season grid.
driver = webdriver.Chrome(executable_path=chromedriver)
driver.get(wrc_url)

In [174]:
# Quick sanity check: show the first 1000 characters of the HTML the browser currently holds.
driver.page_source[:1000]

'<html xmlns="http://www.w3.org/1999/xhtml"><head id="Head1"><script src="https://js-agent.newrelic.com/nr-1194.min.js"></script><script type="text/javascript">ezstandalone.selectedPlaceholders = {};eval(ez_write_tag([[728,90],\'fangraphs_com-medrectangle-2\',\'ezslot_0\',100,\'0\',\'0\']));eval(ez_write_tag([[970,250],\'fangraphs_com-box-2\',\'ezslot_1\',104,\'0\',\'0\']));eval(ez_write_tag([[970,250],\'fangraphs_com-box-2\',\'ezslot_2\',104,\'0\',\'1\']));eval(ez_write_tag([[970,250],\'fangraphs_com-box-2\',\'ezslot_3\',104,\'0\',\'2\']));eval(ez_write_tag([[160,600],\'fangraphs_com-box-4\',\'ezslot_4\',125,\'0\',\'0\']));eval(ez_write_tag([[160,600],\'fangraphs_com-banner-1\',\'ezslot_5\',126,\'0\',\'0\']));eval(ez_write_tag([[970,250],\'fangraphs_com-medrectangle-1\',\'ezslot_6\',113,\'0\',\'0\']));eval(ez_write_tag([[970,250],\'fangraphs_com-medrectangle-1\',\'ezslot_7\',113,\'0\',\'1\']));eval(ez_write_tag([[970,250],\'fangraphs_com-medrectangle-1\',\'ezslot_8\',113,\'0\',\'2\'])

Using Selenium to set page to Infinite Rows:

In [175]:
# XPath of the rows-per-page <select> inside the season grid. It is positional,
# so it will need updating if the page layout changes.
rows_dropdown_xpath = '//*[@id="root-season-grid"]/div/div[3]/div[3]/div[1]/select'

# Wait (up to 30 s) for the dropdown to exist instead of looking it up immediately:
# right after driver.get() returns the grid may not have been rendered yet, which
# would make an immediate lookup fail when the notebook is run top to bottom.
# The By/WebDriverWait form also avoids the find_element_by_* helpers, which were
# removed in later Selenium 4 releases.
players_dropdown = WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.XPATH, rows_dropdown_xpath))
)

# Typing the option text into the <select> switches the grid to show all rows.
# NOTE(review): the table re-renders after this selection; if the next cell runs
# straight away it may capture the page before every row is present -- confirm.
players_dropdown.send_keys("Infinity")

Once the page is showing infinite rows, run the Beautiful Soup code:

In [176]:
# Snapshot the HTML the browser currently holds and parse it with the
# built-in 'html.parser' backend.
page_html = driver.page_source
soup = BeautifulSoup(page_html, 'html.parser')

In [177]:
# Find the scrollable table container first, then collect every <th> cell inside it.
table_container = soup.find('div', class_='table-scroll')
headers = table_container.find_all('th')
print(headers)

[<th class="th-rank fixed">#</th>, <th class="align-left fixed" data-col="0" data-stat="Name">Name</th>, <th class="align-right" data-col="1" data-stat="2015">2015</th>, <th class="align-right" data-col="2" data-stat="2016">2016</th>, <th class="align-right" data-col="3" data-stat="2017">2017</th>, <th class="align-right" data-col="4" data-stat="2018">2018</th>, <th class="align-right" data-col="5" data-stat="2019">2019</th>, <th class="align-right highlight-sort" data-col="6" data-stat="2020">2020</th>]


Setting up the final DataFrame and verifying that its column names are right:

In [178]:
# Column labels are the visible text of each header cell.
columns = [header_cell.get_text() for header_cell in headers]

In [179]:
# Show the extracted column labels to confirm they match the table header.
print(columns)

['#', 'Name', '2015', '2016', '2017', '2018', '2019', '2020']


In [180]:
# Empty frame that only carries the scraped column labels; head() shows the header layout.
# wrc_df is rebuilt from the scraped rows further down.
wrc_df = pd.DataFrame(columns=columns)
wrc_df.head()

Unnamed: 0,#,Name,2015,2016,2017,2018,2019,2020


Finding Stats:

In [181]:
# Every <tr> in the scrollable table: the header row followed by one row per player.
stats_test = soup.find('div', class_='table-scroll').find_all('tr')
# Preview only the header row and the first player row. Printing the whole list
# would dump the markup of every row in the table into the notebook output.
print(stats_test[:2])

[<tr><th class="th-rank fixed">#</th><th class="align-left fixed" data-col="0" data-stat="Name">Name</th><th class="align-right" data-col="1" data-stat="2015">2015</th><th class="align-right" data-col="2" data-stat="2016">2016</th><th class="align-right" data-col="3" data-stat="2017">2017</th><th class="align-right" data-col="4" data-stat="2018">2018</th><th class="align-right" data-col="5" data-stat="2019">2019</th><th class="align-right highlight-sort" data-col="6" data-stat="2020">2020</th></tr>, <tr class="is-selected__invalid"><td class="fixed">1</td><td class="align-left fixed" data-stat="Name"><a href="//www.fangraphs.com/statss.aspx?playerid=14843">Michael Lorenzen</a></td><td class="align-right" data-stat="2015"></td><td class="align-right" data-stat="2016"></td><td class="align-right" data-stat="2017"></td><td class="align-right" data-stat="2018"></td><td class="align-right" data-stat="2019"></td><td class="align-right highlight-sort" data-stat="2020">366</td></tr>, <tr class

In [182]:
# Flattened text of each table row (all of a row's cell texts run together).
# NOTE(review): this rebinds `stats`, which the imports cell bound to scipy's
# stats module (`from scipy import stats`); after this cell runs, `stats` is a
# list of strings. Nothing visible in this notebook reads either value afterwards.
stats = [table_row.get_text() for table_row in stats_test]

In [183]:
stat_lines = soup.find('div', class_='table-scroll').find_all('tr')

# One list of cell texts per <tr>. The header row contains only <th> cells,
# so its entry is an empty list.
compiled_stats = [
    [item.get_text() for item in line_item.find_all('td')]
    for line_item in stat_lines
]

# Drop the first entry (the header row). This is done once, after all rows are
# collected, so final_stats is always defined -- it is simply empty when the
# table has no rows, instead of being left unset or stale from an earlier run.
final_stats = compiled_stats[1:]

Building Dataframe with scraped data:

In [184]:
# Build the player-by-season table from the scraped rows:
#   * the '#' rank column becomes the index,
#   * empty cells (seasons with no value for that player) become the string '0'.
# All values stay strings, exactly as scraped.
# NOTE(review): '0' makes a missing season indistinguishable from a real
# wRC+ of 0 -- confirm this is intended before merging with other data.
wrc_df = (
    pd.DataFrame(final_stats, columns=columns)
    .set_index('#')
    .replace("", '0')
)
wrc_df.head()

Unnamed: 0_level_0,Name,2015,2016,2017,2018,2019,2020
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Michael Lorenzen,0,0,0,0,0,366
2,Luis Campusano,0,0,0,0,0,337
3,Billy McKinney,0,0,0,0,79,286
4,Rafael Marchan,0,0,0,0,0,277
5,Travis Blankenhorn,0,0,0,0,0,219


In [185]:
# Confirm the remaining columns: the player name plus one column per season.
wrc_df.columns

Index(['Name', '2015', '2016', '2017', '2018', '2019', '2020'], dtype='object')

In [186]:
# Display the frame (pandas truncates the middle rows) to check the first and last players.
wrc_df

Unnamed: 0_level_0,Name,2015,2016,2017,2018,2019,2020
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Michael Lorenzen,0,0,0,0,0,366
2,Luis Campusano,0,0,0,0,0,337
3,Billy McKinney,0,0,0,0,79,286
4,Rafael Marchan,0,0,0,0,0,277
5,Travis Blankenhorn,0,0,0,0,0,219
...,...,...,...,...,...,...,...
837,Richie Martin,0,0,0,0,50,0
838,Jung Ho Kang,128,132,0,0,0,0
839,Yasmany Tomás,87,109,0,0,0,0
840,Dustin Fowler,0,0,0,66,0,0


Closing driver:

In [187]:
# End the whole WebDriver session. close() only closes the current browser
# window and can leave the chromedriver process running in the background;
# quit() closes every window and shuts the driver process down.
driver.quit()

In [237]:
# Re-check the scraped table before reshaping it.
wrc_df.head()

Unnamed: 0_level_0,Name,2015,2016,2017,2018,2019,2020
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Michael Lorenzen,0,0,0,0,0,366
2,Luis Campusano,0,0,0,0,0,337
3,Billy McKinney,0,0,0,0,79,286
4,Rafael Marchan,0,0,0,0,0,277
5,Travis Blankenhorn,0,0,0,0,0,219


Next step: reshape the DataFrame so that each row holds one player's performance in one year, ready to merge with the Baseball Savant data

In [238]:
# Flip the table: one column per player (still labelled by rank '#'),
# with the 'Name' row first and one row per season below it.
wrc_pivot_df = wrc_df.transpose()
wrc_pivot_df.head()

#,1,2,3,4,5,6,7,8,9,10,...,832,833,834,835,836,837,838,839,840,841
Name,Michael Lorenzen,Luis Campusano,Billy McKinney,Rafael Marchan,Travis Blankenhorn,Yolmer Sánchez,Andrew Stevenson,Austin Dean,Juan Soto,Ke'Bryan Hayes,...,Ryan Rua,Taylor Motter,Dae-Ho Lee,Hyun Soo Kim,ByungHo Park,Richie Martin,Jung Ho Kang,Yasmany Tomás,Dustin Fowler,Rusney Castillo
2015,0,0,0,0,0,63,0,0,0,0,...,0,0,0,0,0,0,128,87,0,73
2016,0,0,0,0,0,0,0,0,0,0,...,90,0,103,121,79,0,132,109,0,0
2017,0,0,0,0,0,95,0,0,0,0,...,0,57,0,61,0,0,0,0,0,0
2018,0,0,0,0,0,87,0,0,145,0,...,0,0,0,0,0,0,0,0,66,0


In [None]:
# The first row of the transposed table holds the player names: use it as the
# column labels, then keep only the season rows below it.
player_name_row = wrc_pivot_df.iloc[0]
wrc_pivot_df.columns = player_name_row
final_wrc_pivot_df = wrc_pivot_df.iloc[1:]

In [200]:
# Check the result: columns are player names, rows are seasons.
final_wrc_pivot_df.head()

Name,Michael Lorenzen,Luis Campusano,Billy McKinney,Rafael Marchan,Travis Blankenhorn,Yolmer Sánchez,Andrew Stevenson,Austin Dean,Juan Soto,Ke'Bryan Hayes,...,Ryan Rua,Taylor Motter,Dae-Ho Lee,Hyun Soo Kim,ByungHo Park,Richie Martin,Jung Ho Kang,Yasmany Tomás,Dustin Fowler,Rusney Castillo
2015,0,0,0,0,0,63,0,0,0,0,...,0,0,0,0,0,0,128,87,0,73
2016,0,0,0,0,0,0,0,0,0,0,...,90,0,103,121,79,0,132,109,0,0
2017,0,0,0,0,0,95,0,0,0,0,...,0,57,0,61,0,0,0,0,0,0
2018,0,0,0,0,0,87,0,0,145,0,...,0,0,0,0,0,0,0,0,66,0
2019,0,0,79,0,0,74,0,0,142,0,...,0,0,0,0,0,50,0,0,0,0


### Building new dataframe with the year/player wRC+ Data:

Creating Rows: (each row should be player-year)

In [214]:
# Player names come from the column labels, season labels from the row index.
names = final_wrc_pivot_df.columns.tolist()
years = final_wrc_pivot_df.index.tolist()

In [215]:
# Show the season labels (they are strings, not integers).
print(years)    

['2015', '2016', '2017', '2018', '2019', '2020']


In [216]:
# One "<player name>-<season>" label for every player/season pair,
# ordered player by player with that player's seasons in order.
new_df_rows = [
    player + "-" + season
    for player in names
    for season in years
]


In [219]:
# Expect one label per player per season: 841 players * 6 seasons = 5046.
len(new_df_rows)

5046

In [226]:
# Long-format frame with a single column holding the player-season labels.
final_wrc_df = pd.DataFrame({'Name-Year': new_df_rows})

In [241]:
# Add an empty-string placeholder column for wRC+; the real values are written in further down.
final_wrc_df['wRC+'] = ''
final_wrc_df.head(10)

Unnamed: 0,Name-Year,wRC+
0,Michael Lorenzen-2015,
1,Michael Lorenzen-2016,
2,Michael Lorenzen-2017,
3,Michael Lorenzen-2018,
4,Michael Lorenzen-2019,
5,Michael Lorenzen-2020,
6,Luis Campusano-2015,
7,Luis Campusano-2016,
8,Luis Campusano-2017,
9,Luis Campusano-2018,


Pulling wRC+ Data:

In [240]:
# Pull each season's row out of the pivoted table as a plain list
# (one value per player, in column order). Rows 0-5 are seasons 2015-2020.
(wrc_2015, wrc_2016, wrc_2017,
 wrc_2018, wrc_2019, wrc_2020) = (
    final_wrc_pivot_df.iloc[season_pos].to_list() for season_pos in range(6)
)

In [251]:
# The long frame lists each player's seasons consecutively, so season k's values
# belong in rows k, k + 6, k + 12, ...
# Assign through a single .iloc call on the frame itself. The previous chained
# form (final_wrc_df['wRC+'].iloc[...] = ...) writes into an intermediate Series,
# which raises SettingWithCopy warnings and does not update the frame at all
# when pandas copy-on-write is enabled.
season_values = [wrc_2015, wrc_2016, wrc_2017, wrc_2018, wrc_2019, wrc_2020]
wrc_col_pos = final_wrc_df.columns.get_loc('wRC+')
for season_offset, values in enumerate(season_values):
    final_wrc_df.iloc[season_offset::len(season_values), wrc_col_pos] = values

In [254]:
#Pulling a random sample to audit:
final_wrc_df.sample(25)

Unnamed: 0,Name-Year,wRC+
4365,Brett Eibner-2018,0
522,Rowdy Tellez-2015,0
2708,Matt Adams-2017,111
1870,Matt Carpenter-2019,95
1773,Shin-Soo Choo-2018,117
793,Jonathan Davis-2016,0
4694,Trevor Plouffe-2017,59
563,Michael Brantley-2020,134
4877,Bruce Maxwell-2020,0
2667,Jaylin Davis-2018,0


Good!!!!!!