In [7]:
import csv
import os
import re
from urllib.request import urlopen

import html5lib
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment

In [2]:
# Step one: enumerate every season/week page we intend to scrape.
# Weekly pages follow this URL pattern:
# https://www.pro-football-reference.com/years/2018/week_21.htm
# seasonDict maps each season year to the list of its week numbers.

# Seasons 2010 through 2018, weeks 1 through 21. The postseason starts
# in week 18 and the Super Bowl is week 21.
yearNum = list(range(2010, 2019))
weekNum = list(range(1, 22))

# Give every season its own copy of the week list so the values are
# independent list objects.
seasonDict = dict((season, list(weekNum)) for season in yearNum)
print(seasonDict)

{2010: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2011: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2012: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2013: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2014: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2015: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2016: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2017: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2018: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]}


In [4]:
# Expand seasonDict into the flat list of weekly page URLs: seasonLinks
# Remember the format of each link is:
# https://www.pro-football-reference.com/years/2018/week_21.htm
# Seasons are visited in dictionary order, weeks in list order.
seasonLinks = [
    'https://www.pro-football-reference.com/years/' + str(season) + '/week_' + str(weekNo) + '.htm'
    for season, weeks in seasonDict.items()
    for weekNo in weeks
]

# Peek at the first few links to confirm the format
print(seasonLinks[:5])

['https://www.pro-football-reference.com/years/2010/week_1.htm', 'https://www.pro-football-reference.com/years/2010/week_2.htm', 'https://www.pro-football-reference.com/years/2010/week_3.htm', 'https://www.pro-football-reference.com/years/2010/week_4.htm', 'https://www.pro-football-reference.com/years/2010/week_5.htm']


In [5]:
# Our next step is to collect the game links from each week of each season.
# Each individual game comes in link form:
# https://www.pro-football-reference.com/boxscores/201712140clt.htm
gameLinks = []

# Weekly pages that could not be downloaded, as (url, error) pairs. Without
# this, a failed or rate-limited request would silently contribute zero games.
failedWeeks = []

# One Session reuses the underlying connection across all weekly requests.
with requests.Session() as session:
    for week in seasonLinks:
        try:
            # A timeout keeps one stalled request from hanging the whole cell;
            # raise_for_status turns HTTP error responses into exceptions
            # instead of letting an error page be parsed as if it were data.
            page = session.get(week, timeout=30)
            page.raise_for_status()
        except requests.RequestException as err:
            failedWeeks.append((week, repr(err)))
            continue

        soup = BeautifulSoup(page.text, "html.parser")

        # "td.right.gamelink a" is a CSS selector which matches all <a>
        # elements inside <td> elements with classes "right" and "gamelink".
        for a in soup.select("td.right.gamelink a"):
            gameLinks.append('https://www.pro-football-reference.com' + str(a["href"]))

print(len(gameLinks))

# Only speak up when something went wrong, so a clean run prints just the count.
if failedWeeks:
    print('WARNING: {} weekly pages could not be fetched; see failedWeeks'.format(len(failedWeeks)))

2403


In [119]:
# Spot-check a single collected link to confirm the box-score URL format.
print(gameLinks[409])

https://www.pro-football-reference.com/boxscores/201111130sfo.htm


In [120]:
# Now that we have all the links needed to scrape game data, collect the
# 'Expected Points Summary' and 'Team Stats' tables from each box-score page.
# The result is teamGameData: two rows per game (away team, then home team).

# Column layout of every row: 3 identifying fields (gameId, date, team),
# then the 15 expected-points values, then the 12 team-stats values.
cols = ['gameId','date','team','totExp','oTot',
        'oPass','oRush','oTov','dTot','dPass','dRush','dTov',
        'stTot','stKo','stKr','stP','stPr','fgxp','firstDowns',
        'rushesYdsTds','cmpAttYdsTdsInts','sackedYds','netPyds',
        'totYds','fumLost','to','penYds','3dConv','4dConv','timePoss']


def _commentedTable(soup, divId):
    """Parse the markup stored inside an HTML comment of the div with id ``divId``.

    Both tables we need are embedded in the page as HTML comments, so they
    are invisible to selectors on the main document; the first comment node
    found under the div is re-parsed with html5lib.

    Raises AttributeError if the div is missing and TypeError if the div
    contains no comment; the caller treats either as a failed game.
    """
    comment = soup.find('div', {'id': divId}).find(string=lambda text: isinstance(text, Comment))
    return BeautifulSoup(comment, "html5lib")


def _scrapeGame(session, link):
    """Download one box-score page and return its two-row DataFrame.

    Parameters
    ----------
    session : requests.Session
        Session used to perform the GET request.
    link : str
        Box-score URL ending in ``<12-character game id>.htm``.

    Returns
    -------
    pandas.DataFrame
        Row 0 is the away team, row 1 the home team, with columns ``cols``.

    Raises
    ------
    requests.RequestException
        On network errors, timeouts, or HTTP error statuses.
    AttributeError, TypeError, IndexError, ValueError
        When the page does not have the expected structure (missing div,
        missing comment, too few cells, or a row whose length does not
        match ``cols``).
    """
    page = session.get(link, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html.parser")

    # slice the game id from the url: the 12 characters before '.htm'
    gameId = link[-16:-4]

    # grab date
    date = soup.select("div.scorebox_meta div")[0].text

    # Expected points table: the row headers hold the team names, away first.
    expectedPoints = _commentedTable(soup, 'all_expected_points')
    teamNames = expectedPoints.select('tbody tr th')
    awayTeam = teamNames[0].text
    homeTeam = teamNames[1].text

    # First 15 cells belong to the away team, the remainder to the home team.
    expCells = [cell.text for cell in expectedPoints.select('tbody tr td')]
    awayVars = [gameId, date, awayTeam] + expCells[:15]
    homeVars = [gameId, date, homeTeam] + expCells[15:]

    # Team stats table: cells alternate away, home, away, home, ...
    # Select once and slice, rather than re-running the selector per cell.
    statCells = [cell.text for cell in _commentedTable(soup, 'all_team_stats').select('tbody td')]
    awayVars.extend(statCells[0::2])
    homeVars.extend(statCells[1::2])

    # pandas raises ValueError here if either row does not have len(cols) items.
    return pd.DataFrame([awayVars, homeVars], columns=cols)


# Per-game frames are gathered in a list and concatenated once at the end;
# growing a DataFrame inside the loop re-copies all earlier rows every time
# and relies on DataFrame.append, which pandas 2.0 removed.
gameFrames = []

# Games that were skipped, as (url, error) pairs, so failures are visible
# instead of being silently discarded.
failedGames = []

with requests.Session() as session:
    for link in gameLinks:
        try:
            gameFrames.append(_scrapeGame(session, link))
        except Exception as err:
            # Best effort: one bad page must not abort the whole scrape.
            # Catching Exception (not a bare except) still lets
            # KeyboardInterrupt stop the loop.
            failedGames.append((link, repr(err)))

# Plain concat keeps each game's own 0/1 row index, as the row-by-row
# version did. With no successful games, fall back to an empty frame.
teamGameData = pd.concat(gameFrames) if gameFrames else pd.DataFrame(columns=cols)

if failedGames:
    print('WARNING: skipped {} of {} games; see failedGames'.format(len(failedGames), len(gameLinks)))

In [121]:
# Preview the first ten rows rather than rendering the entire frame.
teamGameData.head(10)

Unnamed: 0,gameId,date,team,totExp,oTot,oPass,oRush,oTov,dTot,dPass,...,cmpAttYdsTdsInts,sackedYds,netPyds,totYds,fumLost,to,penYds,3dConv,4dConv,timePoss
0,201009090nor,"Thursday Sep 9, 2010",Vikings,-5.00,-6.45,-5.54,-0.91,-4.17,-2.86,-4.60,...,15-27-171-1-1,1-9,162,253,1-0,1,6-60,5-13,0-0,26:17
1,201009090nor,"Thursday Sep 9, 2010",Saints,5.00,2.86,4.60,-1.42,0.00,6.45,5.54,...,27-36-237-1-0,1-8,229,308,1-0,0,3-20,3-11,0-0,33:43
0,201009120pit,"Sunday Sep 12, 2010",Falcons,-6.00,-8.90,-4.99,-3.91,-3.92,3.01,0.92,...,27-44-252-0-1,2-15,237,295,0-0,1,3-24,6-16,0-0,30:29
1,201009120pit,"Sunday Sep 12, 2010",Steelers,6.00,-3.01,-0.92,-1.43,-3.79,8.90,4.99,...,18-26-236-0-1,3-25,211,354,0-0,1,4-25,4-14,0-0,32:06
0,201009120oti,"Sunday Sep 12, 2010",Titans,25.00,10.04,6.91,3.79,-4.66,14.46,12.28,...,22-37-180-1-1,4-30,150,286,4-1,2,10-77,3-14,0-2,28:56
1,201009120oti,"Sunday Sep 12, 2010",Raiders,-25.00,-14.46,-12.28,1.81,-11.46,-10.04,-6.91,...,13-17-154-2-0,2-14,140,345,2-1,1,8-81,8-15,0-0,31:04
0,201009120nwe,"Sunday Sep 12, 2010",Bengals,-14.00,10.45,10.95,-0.17,-14.30,-15.94,-13.31,...,34-50-345-2-1,1-4,341,428,2-1,2,2-5,8-15,3-3,31:50
1,201009120nwe,"Sunday Sep 12, 2010",Patriots,14.00,15.94,13.31,3.56,0.00,-10.45,-10.95,...,25-35-258-3-0,0-0,258,376,0-0,0,6-30,9-13,0-1,28:10
0,201009120htx,"Sunday Sep 12, 2010",Colts,-10.00,16.62,18.14,-1.50,-1.57,-17.43,-2.04,...,40-57-433-3-0,2-14,419,463,1-1,1,5-73,4-12,1-1,29:07
1,201009120htx,"Sunday Sep 12, 2010",Texans,10.00,17.43,2.04,16.06,-4.25,-16.62,-18.14,...,9-17-107-1-1,2-9,98,355,0-0,1,7-50,6-11,1-1,30:53


In [122]:
# Persist teamGameData with IPython's %store magic so another kernel
# session can restore it with `%store -r teamGameData`.
%store teamGameData

Stored 'teamGameData' (DataFrame)


In [123]:
# Write the scraped data to CSV. The destination defaults to the path used
# originally; set the TEAM_GAME_DATA_CSV environment variable to write
# somewhere else without editing the notebook.
# The frame's index is written as the first column (to_csv default), as before.
csvPath = os.environ.get(
    'TEAM_GAME_DATA_CSV',
    'C:\\Users\\Michael\\Desktop\\Springboard\\Capstone2\\csvFiles\\teamGameData.csv',
)
teamGameData.to_csv(csvPath)