# Webscraping ESPN.com for Quarterback Rating (QBR) Statistics
## By: Nick Bruno

### Import Libraries

In [1]:
# https://news.codecademy.com/web-scraping-python-beautiful-soup-mlb-stats/
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup

### URL Information

In [2]:
url = 'http://www.espn.com/nfl/qbr'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

### Investigate Website HTML

In [3]:
# Investigate structure of the column header #
soup.find('tr', attrs={'class':'colhead'})

<tr align="right" class="colhead"><td align="left">RK</td><td align="left" width="18%">PLAYER</td><td><a href="//www.espn.com/nfl/qbr/_/sort/qbpaa" title="Number of points contributed by a quarterback over the season, accounting for QBR and how much he plays, above the level of an average quarterback.">PTS ADDED</a></td><td><a href="//www.espn.com/nfl/qbr/_/sort/cwepaPassesCondensed" title="Clutch-weighted expected points added on plays with pass attempts.">PASS</a></td><td><a href="//www.espn.com/nfl/qbr/_/sort/cwepaRuns" title="Clutch-weighted expected points added through rushes.">RUN</a></td><td><a href="//www.espn.com/nfl/qbr/_/sort/cwepaPenalties" title="Clutch-weighted expected points added on penalties.">PENALTY</a></td><td><a href="//www.espn.com/nfl/qbr/_/sort/cwepaTotal" title="Total clutch-weighted expected points added.">TOTAL EPA</a></td><td><a href="//www.espn.com/nfl/qbr/_/sort/actionPlays" title="Plays on which the QB has a non-zero expected points contribution. Includ

In [4]:
# Get column headers #
header = soup.find('tr', attrs={'class':'colhead'})
columns = [col.get_text() for col in header.find_all('td')]
columns

['RK',
 'PLAYER',
 'PTS ADDED',
 'PASS',
 'RUN',
 'PENALTY',
 'TOTAL EPA',
 'QB PLAYS',
 'RAW QBR',
 'TOTAL QBR']

In [5]:
# Set up dataframe #
df = pd.DataFrame(columns=columns)
df

Unnamed: 0,RK,PLAYER,PTS ADDED,PASS,RUN,PENALTY,TOTAL EPA,QB PLAYS,RAW QBR,TOTAL QBR


### Investigate How to Find and Scrape Data on One Player

In [15]:
# Look at Dak Prescott #
soup.find('tr', attrs = {'class':'evenrow player-28-2577417'})

<tr align="right" class="evenrow player-28-2577417"><td align="left">2</td><td align="left"><a href="http://www.espn.com/nfl/player/_/id/2577417/dak-prescott">Dak Prescott</a>, DAL</td><td>40.2</td><td>51.2</td><td>6.5</td><td>3.0</td><td>65.1</td><td>373</td><td>82.1</td><td class="sortcell">78.9</td></tr>

In [16]:
# Scrape data for one player #
row = soup.find('tr', attrs = {'class':'evenrow player-28-2577417'})
for data in row.find_all('td'):
    print(data.get_text())

# Shows statistics of Dak Prescott #

2
Dak Prescott, DAL
40.2
51.2
6.5
3.0
65.1
373
82.1
78.9


### Apply This to All 2019 QBs Listed on Site

In [12]:
players = soup.find_all('tr', attrs = {'class':re.compile('row player-28-')}) # finds all for each player
for player in players: # for every player in the players list
    stats = [stat.get_text() for stat in player.find_all('td')]
    temp_df = pd.DataFrame(stats).transpose()
    temp_df.columns = columns
    df = pd.concat([df, temp_df], ignore_index=True) # concatenates each player into once dataframe

In [13]:
df.head()

Unnamed: 0,RK,PLAYER,PTS ADDED,PASS,RUN,PENALTY,TOTAL EPA,QB PLAYS,RAW QBR,TOTAL QBR
0,1,"Russell Wilson, SEA",39.4,48.0,7.1,1.4,65.8,377,81.4,79.0
1,2,"Dak Prescott, DAL",40.2,51.2,6.5,3.0,65.1,373,82.1,78.9
2,3,"Patrick Mahomes, KC",34.7,49.7,5.9,2.2,61.7,342,80.8,78.2
3,4,"Lamar Jackson, BAL",33.0,31.0,23.4,0.4,60.6,385,77.1,76.0
4,5,"Deshaun Watson, HOU",34.7,43.2,15.6,1.0,71.3,409,76.9,75.2


In [14]:
df.shape # 32 QBs and 10 columns

(32, 10)

In [25]:
# Write results to .csv
df.to_csv('2019_season_total_qbr.csv', header=True, index=False)

### Scrape QBR Stat Leaders for One Year Available on ESPN using a function

In [29]:
# Creates dataframe of Quarterback Total QBR stats over one season
def one_year_qbr(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    header = soup.find('tr', attrs={'class':'colhead'})
    columns = [col.get_text() for col in header.find_all('td')]
    df = pd.DataFrame(columns=columns) # empty df with column names
    players = soup.find_all('tr', attrs = {'class':re.compile('row player-28-')}) # finds all for each player
    for player in players: # for every player in the players list
        stats = [stat.get_text() for stat in player.find_all('td')]
        temp_df = pd.DataFrame(stats).transpose()
        temp_df.columns = columns
        temp_df['YEAR'] = url[-4:] # Creates a column 'YEAR' for the season
        df = pd.concat([df, temp_df], ignore_index=True) # concatenates each player to the dataframe
    return df

# https://stackoverflow.com/questions/31062435/how-can-i-loop-scraping-data-for-multiple-pages-in-a-website-using-python-and-be

In [30]:
qbr_2010 = one_year_qbr('http://www.espn.com/nfl/qbr/_/year/2010') # works
qbr_2010.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,PASS,PENALTY,PLAYER,PTS ADDED,QB PLAYS,RAW QBR,RK,RUN,TOTAL EPA,TOTAL QBR,YEAR
0,66.7,2.8,"Tom Brady, NE",52.0,574,78.3,1,0.8,79.1,79.1,2010
1,96.3,4.9,"Peyton Manning, IND",60.3,749,75.7,2,1.3,108.8,71.6,2010
2,64.1,6.4,"Matt Ryan, ATL",39.6,668,69.8,3,2.2,83.8,71.2,2010
3,80.3,0.8,"Drew Brees, NO",45.0,726,70.6,4,-1.2,91.9,70.8,2010
4,26.6,1.7,"Mike Vick, PHI",32.8,523,70.8,5,20.1,63.1,70.4,2010


In [28]:
qbr_2010.shape[0] # 31 QBs

31

### Function that creates a dataframe of a QB's season QBR for a range of seasons

In [31]:
def all_years_qbr(years_list):
    final_df_try = pd.DataFrame() # empty df
    for i in years_list:
        url = 'http://www.espn.com/nfl/qbr/_/year/' + str(i)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        header = soup.find('tr', attrs={'class':'colhead'})
        columns = [col.get_text() for col in header.find_all('td')]
        df = pd.DataFrame(columns=columns)
        players = soup.find_all('tr', attrs = {'class':re.compile('row player-28-')}) # finds all for each player
        for player in players: # for every player in the players list
            stats = [stat.get_text() for stat in player.find_all('td')]
            temp_df = pd.DataFrame(stats).transpose()
            temp_df.columns = columns
            temp_df['YEAR'] = int(i)
            df = pd.concat([df, temp_df], ignore_index=True)
        
        final_df_try = final_df_try.append(df)
    return final_df_try

In [32]:
final_qbr_df = all_years_qbr(range(2006,2020))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  app.launch_new_instance()


In [33]:
final_qbr_df.head()

Unnamed: 0,PASS,PENALTY,PLAYER,PTS ADDED,QB PLAYS,RAW QBR,RK,RUN,TOTAL EPA,TOTAL QBR,YEAR
0,96.0,1.1,"Peyton Manning, IND",85.5,624,87.4,1,6.8,108.8,86.4,2006.0
1,38.8,2.8,"Tom Brady, NE",30.9,610,67.2,2,4.3,57.9,68.6,2006.0
2,43.1,0.3,"Philip Rivers, LAC",28.2,542,67.6,3,-0.9,53.0,67.4,2006.0
3,43.2,2.5,"Carson Palmer, CIN",29.9,623,66.4,4,-0.3,58.3,67.1,2006.0
4,61.0,0.6,"Drew Brees, NO",36.7,631,69.5,5,-5.2,64.2,66.7,2006.0


In [36]:
len(final_qbr_df.PLAYER.unique()) # stats on 158 unique quarterbacks

158

In [121]:
# Write to .csv 
final_qbr_df.to_csv('all_years_qbr.csv', header=True, index=False)

### Create a function to scrape stats for each week

I have created functions that takes a Quarterback's yearly QBR over a season. QBR is a statistic that can be calculated over a season, but it is also calculated after every game. I am also interested in obtaining data concerning the QBR rating of a Quarterback for every game played.

In [53]:
def all_weeks_qbr(years_list):
    final_weeks_df = pd.DataFrame()
    for i in years_list:
        url_year = 'http://www.espn.com/nfl/qbr/_/year/' + str(i)
        second_df = pd.DataFrame()
        for week in range(1, 18):
            url = url_year + '/type/player-week/week/' + str(week)
            page = requests.get(url)
            soup = BeautifulSoup(page.text, 'html.parser')
            header = soup.find('tr', attrs={'class':'colhead'})
            columns = [col.get_text() for col in header.find_all('td')]
            df = pd.DataFrame(columns=columns)
            players = soup.find_all('tr', attrs = {'class':re.compile('row player-28-')}) # finds all for each player
            for player in players: # for every player in the players list
                stats = [stat.get_text() for stat in player.find_all('td')]
                temp_df = pd.DataFrame(stats).transpose()
                temp_df.columns = columns
                temp_df['WEEK'] = int(week)
                temp_df['YEAR'] = int(i)
                df = pd.concat([df, temp_df], ignore_index=True)
            second_df = second_df.append(df)
        final_weeks_df = final_weeks_df.append(second_df)
    return final_weeks_df

In [68]:
# Create a list of years QBR has been recorded (2006 - 2018)
lst = range(2006,2019)
year_list = ["{:02d}".format(x) for x in lst]
# https://stackoverflow.com/questions/17577797/convert-ranger-to-list-of-strings-of-length-2-in-python

I chose to exclude QBR statistics from the 2019 season, since the 2019 NFL season is currently happening and there is not a full 17 weeks of 2019 QBR statistics.

In [69]:
all_weeks_reg_season_df = all_weeks_qbr(year_list)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [70]:
all_weeks_reg_season_df.shape

(6537, 13)

In [71]:
all_weeks_reg_season_df.head()

Unnamed: 0,PASS,PENALTY,PLAYER,PTS ADDED,QB PLAYS,RAW QBR,RESULT,RK,RUN,TOTAL EPA,TOTAL QBR,WEEK,YEAR
0,4.3,0.1,"Rex Grossman, CHI",5.5,31,92.4,W 26-0 @ GB in Wk 1,1,0.6,5.4,96.4,1.0,2006.0
1,8.6,-0.1,"Donovan McNabb, PHI",8.1,38,95.4,W 24-10 @ HOU in Wk 1,2,0.1,8.9,92.5,1.0,2006.0
2,6.5,0.2,"Chad Pennington, NYJ",6.1,43,88.1,W 23-16 @ TEN in Wk 1,3,-0.3,7.5,90.1,1.0,2006.0
3,8.4,-0.3,"Peyton Manning, IND",5.8,45,86.0,W 26-21 @ NYG in Wk 1,4,0.0,8.3,86.0,1.0,2006.0
4,1.3,0.4,"Mike Vick, ATL",1.8,30,70.5,W 20-6 @ CAR in Wk 1,5,1.1,3.4,79.4,1.0,2006.0


In [73]:
len(all_weeks_reg_season_df.PLAYER.unique()) # 334 unique Quarterbacks with QBR rankings from 2006 - 2018

334

In [74]:
all_weeks_reg_season_df.tail()

Unnamed: 0,PASS,PENALTY,PLAYER,PTS ADDED,QB PLAYS,RAW QBR,RESULT,RK,RUN,TOTAL EPA,TOTAL QBR,WEEK,YEAR
27,-0.6,-0.1,"Blaine Gabbert, TEN",-4.0,31,13.7,L 33-17 vs IND in Wk 17,28,0.0,-0.8,11.3,17.0,2018.0
28,-4.9,0.3,"Russell Wilson, SEA",-4.4,29,10.3,W 27-24 vs ARI in Wk 17,29,-0.2,-1.6,10.9,17.0,2018.0
29,-5.4,-0.1,"Ryan Tannehill, MIA",-7.9,38,5.1,L 42-17 @ BUF in Wk 17,30,0.3,-3.4,8.2,17.0,2018.0
30,-2.0,-0.3,"Blake Bortles, JAX",-7.1,38,6.7,L 20-3 @ HOU in Wk 17,31,-0.8,-2.4,7.5,17.0,2018.0
31,-3.7,0.1,"Josh Johnson, WSH",-8.3,36,3.7,L 24-0 vs PHI in Wk 17,32,-0.8,-3.0,3.7,17.0,2018.0


In [76]:
# Write to .csv
all_weeks_reg_season_df.to_csv('regular_season_qbr_2006_to_2018.csv', header=True)

### Summary

This Jupyter Notebook has showed how to scrape NFL Quarterback QBR statistics from ESPN.com. I have presented multiple functions that scrape season total QBR stats as well as weekly QBR stats. This scraping was done in an effort to compare QBR stats with the Passer Rating metric, and this analysis will be conducted in a future blog post on btc.bashingbitcoin.com. Gathering the data is the first step in this analysis process. The scraped ESPN data will be combined with the Quarterback Rating statistics scraped from pro-football-reference.com that was accomplished in an earlier blog post.