In [1]:
import pandas as pd
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment
import requests

In [2]:
# Firstly we need to collect all links that we intend to scrape from
# the data needed is in the format of:
# https://www.pro-football-reference.com/years/2018/week_21.htm
# We need to set up a dictionary containing all weeks and years: seasonDict

# First setup years and weeks we are interested in. Note that the
# postseason starts in week 18 and the superbowl is week 21
# We will collect dating back to 2004
yearNum = [year for year in range(2004,2019)]
weekNum = [week for week in range(1,22)]
seasonDict = {i : [j for j in weekNum] for i in yearNum}
print(seasonDict)

{2016: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2017: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2018: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2004: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2005: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2006: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2007: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2008: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2009: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2010: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2011: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2012: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 201

In [3]:
# Now we can build the links we need, start with empty list: seasonLinks
seasonLinks = []

# Now iterate through seasonDict and put together all needed links
# remeber the format of the links is: https://www.pro-football-reference.com/years/2018/week_21.htm
for year in seasonDict.keys():
    for week in seasonDict[year]:
        seasonLinks.append('https://www.pro-football-reference.com/years/' + str(year) + '/week_' + str(week) + '.htm')

print(seasonLinks)

['https://www.pro-football-reference.com/years/2016/week_1.htm', 'https://www.pro-football-reference.com/years/2016/week_2.htm', 'https://www.pro-football-reference.com/years/2016/week_3.htm', 'https://www.pro-football-reference.com/years/2016/week_4.htm', 'https://www.pro-football-reference.com/years/2016/week_5.htm', 'https://www.pro-football-reference.com/years/2016/week_6.htm', 'https://www.pro-football-reference.com/years/2016/week_7.htm', 'https://www.pro-football-reference.com/years/2016/week_8.htm', 'https://www.pro-football-reference.com/years/2016/week_9.htm', 'https://www.pro-football-reference.com/years/2016/week_10.htm', 'https://www.pro-football-reference.com/years/2016/week_11.htm', 'https://www.pro-football-reference.com/years/2016/week_12.htm', 'https://www.pro-football-reference.com/years/2016/week_13.htm', 'https://www.pro-football-reference.com/years/2016/week_14.htm', 'https://www.pro-football-reference.com/years/2016/week_15.htm', 'https://www.pro-football-referen

In [4]:
# Our next step is to collect the game ID's from each week from each season
# each individual game comes in link form: https://www.pro-football-reference.com/boxscores/201712140clt.htm
gameLinks = []

for week in seasonLinks:
    page = requests.get(week)
    soup = BeautifulSoup(page.text, "html.parser")



    links = soup.select("td.right.gamelink a")
    for a in links:
        gameLinks.append('https://www.pro-football-reference.com'+ str(a["href"]))
    
#in the above code, td.right.gamelink a is a CSS selector which matches
#all a elements inside td elements with classes of right and gamelink.

print(len(gameLinks))


3991


In [5]:
print(gameLinks[:20])

['https://www.pro-football-reference.com/boxscores/201609080den.htm', 'https://www.pro-football-reference.com/boxscores/201609110jax.htm', 'https://www.pro-football-reference.com/boxscores/201609110rav.htm', 'https://www.pro-football-reference.com/boxscores/201609110kan.htm', 'https://www.pro-football-reference.com/boxscores/201609110htx.htm', 'https://www.pro-football-reference.com/boxscores/201609110atl.htm', 'https://www.pro-football-reference.com/boxscores/201609110phi.htm', 'https://www.pro-football-reference.com/boxscores/201609110oti.htm', 'https://www.pro-football-reference.com/boxscores/201609110nyj.htm', 'https://www.pro-football-reference.com/boxscores/201609110nor.htm', 'https://www.pro-football-reference.com/boxscores/201609110sea.htm', 'https://www.pro-football-reference.com/boxscores/201609110dal.htm', 'https://www.pro-football-reference.com/boxscores/201609110clt.htm', 'https://www.pro-football-reference.com/boxscores/201609110crd.htm', 'https://www.pro-football-referen

In [41]:
# We need to setup a csv file with column names to hold our data
csvFile = open("./csvFiles/gameData.csv", 'w+')
writer = csv.writer(csvFile)
writer.writerow(('gameId','date','awayTeam','awayScore','homeTeam','homeScore', 'totalScore','stadium','roof','surface','vegasLine','overUnder','referee','umpire','headLinesman','lineJudge','backJudge','sideJudge','fieldJudge'))


172

In [144]:
# Now that we have all the links needed to scrape game data, we will aim to
# collect the following variables from each page: homeTeam, awayTeam, stadium,
# homeScore, awayScore, roof, surface, vegasLine, overUnder, referee, upmire,
# headLinesman lineJudge, backJudge, sideJudge, fieldJudge

url = 'https://www.pro-football-reference.com/boxscores/201609120was.htm'
page = requests.get(url)
soup = BeautifulSoup(page.text, "html.parser")

#slice the game_id from url
gameId = url[-16:-4]

# First let's grab the stadium and date
date = soup.select("div.scorebox_meta div")[0].text
stadium = soup.select("div.scorebox_meta a")[0].text


# Now grab a list of tags from the scorebox
teamTags = soup.select("table.linescore tbody td")

# Index 1: away team, index 8: home team,
# index 6: away team score, index 13: home team score
awayTeam = teamTags[1].text
awayScore = teamTags[6].text
homeTeam = teamTags[8].text
homeScore = teamTags[13].text
totalScore = int(awayScore) + int(homeScore)

# print(gameId)
# print(date)
# print(stadium)
# print(awayTeam)
# print(awayScore)
# print(homeTeam)
# print(homeScore)
# print(totalScore)


# Our next step is to grab data from the 'Game Info' table. We will collect the
#following variables: roof, surface, weather, vegasLine, overUnder
#may need to consider using regex to clean the string data after collecting it...


In [145]:
# Our next step is to grab data from the 'Game Info' table. We will collect the
#following variables: roof, surface, weather, vegasLine, overUnder

#may need to consider using regex to clean the string data after collecting it...

html = urlopen('https://www.pro-football-reference.com/boxscores/201609120was.htm')
soup = BeautifulSoup(html, 'html.parser')
gameDataComment = soup.find('div',{'id':'all_game_info'}).find(string=lambda text: isinstance(text, Comment))

gameDataComment = BeautifulSoup(gameDataComment, "html5lib")
roof = gameDataComment.select('tr td')[2].text
surface = gameDataComment.select('tr td')[3].text
weather = gameDataComment.select('tr td')[4].text
vegasLine = gameDataComment.select('tr td')[5].text
overUnder = gameDataComment.select('tr td')[6].text

# l1 = [roof,surface,weather,vegasLine,overUnder]
# for i in l1:
#     print(i)

In [146]:
# Now we will use a similar process for extracting referee data

refereeDataComment = soup.find('div',{'id':'all_officials'}).find(string=lambda text: isinstance(text, Comment))

refereeDataComment = BeautifulSoup(refereeDataComment, "html5lib")
referee = refereeDataComment.select('tr td')[1].text
umpire = refereeDataComment.select('tr td')[2].text
headLinesman = refereeDataComment.select('tr td')[3].text
lineJudge = refereeDataComment.select('tr td')[4].text
backJudge = refereeDataComment.select('tr td')[5].text
sideJudge = refereeDataComment.select('tr td')[6].text
fieldJudge = refereeDataComment.select('tr td')[7].text

# l1 = [referee,umpire,headLinesman,lineJudge,backJudge,sideJudge,fieldJudge]
# for i in l1:
#     print(i)

# Now write all collected data from this page into our csv file as one row:
writer.writerow((gameId,date,awayTeam,awayScore,homeTeam,homeScore,totalScore,stadium,roof,surface,weather,vegasLine,overUnder,referee,umpire,headLinesman,lineJudge,backJudge,sideJudge,fieldJudge))



301

In [147]:
# Our next step is to gather referee data for every game

In [10]:
#setup the above as one clean script for one page

url = 'https://www.pro-football-reference.com/boxscores/201609120was.htm'
page = requests.get(url)
soup = BeautifulSoup(page.text, "html.parser")

gameId = url[-16:-4]

date = soup.select("div.scorebox_meta div")[0].text
stadium = soup.select("div.scorebox_meta a")[0].text


teamTags = soup.select("table.linescore tbody td")
awayTeam = teamTags[1].text
awayScore = teamTags[6].text
homeTeam = teamTags[8].text
homeScore = teamTags[13].text
totalScore = int(awayScore) + int(homeScore)

html = urlopen('https://www.pro-football-reference.com/boxscores/201609120was.htm')
soup = BeautifulSoup(html, 'html.parser')
gameDataComment = soup.find('div',{'id':'all_game_info'}).find(string=lambda text: isinstance(text, Comment))
gameDataComment = BeautifulSoup(gameDataComment, "html5lib")
roof = gameDataComment.select('tr td')[2].text
surface = gameDataComment.select('tr td')[3].text
weather = gameDataComment.select('tr td')[4].text
vegasLine = gameDataComment.select('tr td')[5].text
overUnder = gameDataComment.select('tr td')[6].text

refereeDataComment = soup.find('div',{'id':'all_officials'}).find(string=lambda text: isinstance(text, Comment))
refereeDataComment = BeautifulSoup(refereeDataComment, "html5lib")
referee = refereeDataComment.select('tr td')[1].text
umpire = refereeDataComment.select('tr td')[2].text
headLinesman = refereeDataComment.select('tr td')[3].text
lineJudge = refereeDataComment.select('tr td')[4].text
backJudge = refereeDataComment.select('tr td')[5].text
sideJudge = refereeDataComment.select('tr td')[6].text
fieldJudge = refereeDataComment.select('tr td')[7].text

writer.writerow((gameId,date,awayTeam,awayScore,homeTeam,homeScore,totalScore,stadium,roof,surface,weather,vegasLine,overUnder,referee,umpire,headLinesman,lineJudge,backJudge,sideJudge,fieldJudge))


301

In [4]:
#setup the above script as one clean script for entire list of games using a loop

import pandas as pd
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment
import requests
yearNum = [year for year in range(2004,2019)]
weekNum = [week for week in range(1,22)]
seasonDict = {i : [j for j in weekNum] for i in yearNum}




In [5]:
seasonLinks = []

for year in seasonDict.keys():
    for week in seasonDict[year]:
        seasonLinks.append('https://www.pro-football-reference.com/years/' + str(year) + '/week_' + str(week) + '.htm')

gameLinks = []

for week in seasonLinks:
    page = requests.get(week)
    soup = BeautifulSoup(page.text, "html.parser")
    links = soup.select("td.right.gamelink a")
    for a in links:
        gameLinks.append('https://www.pro-football-reference.com'+ str(a["href"]))


csvFile = open("./csvFiles/gameData.csv", 'w+')
writer = csv.writer(csvFile)
writer.writerow(('gameId','date','awayTeam','awayScore','homeTeam','homeScore', 'totalScore','stadium','roof','surface','vegasLine','overUnder','referee','umpire','headLinesman','lineJudge','backJudge','sideJudge','fieldJudge'))


172

In [7]:
csvFile2 = open("./csvFiles/gameData2.csv", 'w+')
writer = csv.writer(csvFile2)
writer.writerow(('gameId','date','awayTeam','awayScore','homeTeam','homeScore', 'totalScore','stadium','roof','surface','vegasLine','overUnder','referee','umpire','headLinesman','lineJudge','backJudge','sideJudge','fieldJudge'))


172

In [8]:
for game in gameLinks:

    url = game
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")

    gameId = url[-16:-4]

    date = soup.select("div.scorebox_meta div")[0].text
    stadium = soup.select("div.scorebox_meta a")[0].text


    teamTags = soup.select("table.linescore tbody td")
    awayTeam = soup.select("div.scorebox div strong a")[1].text
    awayScore = soup.select("div.score")[1].text
    homeTeam = soup.select("div.scorebox div strong a")[0].text
    homeScore = soup.select("div.score")[0].text
    totalScore = int(awayScore) + int(homeScore)

    html = urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')
    gameDataComment = soup.find('div',{'id':'all_game_info'}).find(string=lambda text: isinstance(text, Comment))
    gameDataComment = BeautifulSoup(gameDataComment, "html5lib")
    roof = gameDataComment.select('tr td')[-5].text
    surface = gameDataComment.select('tr td')[-4].text
    #weather = gameDataComment.select('tr td')[-3].text
    vegasLine = gameDataComment.select('tr td')[-2].text
    overUnder = gameDataComment.select('tr td')[-1].text

    refereeDataComment = soup.find('div',{'id':'all_officials'}).find(string=lambda text: isinstance(text, Comment))
    refereeDataComment = BeautifulSoup(refereeDataComment, "html5lib")
    referee = refereeDataComment.select('tr td')[1].text
    umpire = refereeDataComment.select('tr td')[2].text
    headLinesman = refereeDataComment.select('tr td')[-5].text
    lineJudge = refereeDataComment.select('tr td')[-4].text
    backJudge = refereeDataComment.select('tr td')[-3].text
    sideJudge = refereeDataComment.select('tr td')[-2].text
    fieldJudge = refereeDataComment.select('tr td')[-1].text

    writer.writerow((gameId,date,awayTeam,awayScore,homeTeam,homeScore,totalScore,stadium,roof,surface,vegasLine,overUnder,referee,umpire,headLinesman,lineJudge,backJudge,sideJudge,fieldJudge))
