In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

## Function that gets all playoff game url for a given year
Note: pre_merger is a boolean that help selects the appropriate table

In [2]:
def getPlayoffURLs(url, pre_merger):
    reqs = requests.get(url) 
    cleaned = reqs.text.replace("<!--","").replace("--!>","") # remove HTML comments to access tables
    soup = BeautifulSoup(cleaned, 'html.parser')
    table = soup.find_all('table')
    
    if(pre_merger):
        tbody = table[1].find('tbody')
    else:
        tbody = table[2].find('tbody') 
    
    tr_body = tbody.find_all('tr')
    url_list = []
    prefix = "https://www.pro-football-reference.com"
    
    """box score is the 6th td element out of 8 for each table row
    Get first box score lined up to be #8 and then skip 8 td elements to get the next box score
    If the 6th td element is #8, then 5th=7, 4th=6, 3rd=5, 2nd=4, 1st=3"""
    counter = 3
    for trb in tr_body:
        for td in trb.find_all('td'):
            if(counter % 8 == 0):
                elem = td.find(href=True)
                url_list.append(prefix + elem['href'])
            counter += 1 
    
    return(url_list)

## Function to go through url and return points for, points against, yards for, yards against, final point spread
Use points for, points against, and final point spread to create a new column called "Covered"

In [3]:
def scrapeURL(URL):
    page = requests.get(URL)
    df_arr = pd.read_html(page.text) # create a list from the webpage
    df = df_arr[0]
    
    # get the team names
    away_team = df["Unnamed: 1"][0]
    home_team = df["Unnamed: 1"][1]
    teams = [away_team, home_team]
    
    # get the final scores
    away_score = df["Final"][0]
    home_score = df["Final"][1]
    scores = [away_score, home_score]
    
    """ Remove HTML comment tags to access hidden tables """
    cleaned = page.text.replace("<!--","").replace("--!>","")
    df_list = pd.read_html(cleaned)
    
    """determine which table number "Game Info" is"""
    game_info = -1
    for i in range(len(df_list)):
        try:
            if(df_list[i][0][0] == "Game Info"):
                game_info = i
        except:
            pass
        
    # get the betting odds
    vegas = df_list[game_info]
    """Not all games have the same table row order - so we need to determine indicies"""
    vl_index = -1
    ou_index = -1
    for i in range(len(vegas)):
        if(vegas[0][i] == "Vegas Line"):
            vl_index = i
        if(vegas[0][i] == "Over/Under"):
            ou_index = i
    
    if(vl_index == -1): # vegas line not available for that game
        vegas_line = -1
    else:
        vegas_line = vegas[1][vl_index]
    
    if(ou_index == -1): # over/under not available for that game
        over_under = -1
    else:
        over_under = vegas[1][ou_index]
    
    odds = [vegas_line, over_under]
    
    
    # get the rushing yards
    """Not all playoff games have the same tables after the Game Info table.
    However, all Team Stats tables have "First Downs" as the first element in the first column
    """
    team_stats = -1
    for i in range(len(df_list)):
        if(df_list[i].loc[0][0] == "First Downs"):
            team_stats = i
    
    box_stats = df_list[team_stats]
    """Catch negative yard performances"""
    try:
        away_rushing = int(box_stats.loc[1][1].split("-")[1])
    except:
        reg_ex_split = re.split(r'\D+', box_stats.loc[1][1])[1]
        away_rushing = int(reg_ex_split)*-1
    
    try:
        home_rushing = int(box_stats.loc[1][2].split("-")[1])
    except:
        reg_ex_split = re.split(r'\D+', box_stats.loc[1][2])[1]
        home_rushing = int(reg_ex_split)*-1
        
    rushing = [away_rushing, home_rushing]
    
    #get the passing yards
    try:
        away_passing = int(box_stats.loc[2][1].split("-")[1])
    except:
        reg_ex_split = re.split(r'\D+', box_stats.loc[2][1])[1]
        away_passing = int(reg_ex_split)*-1
    
    try:
        home_passing = int(box_stats.loc[2][2].split("-")[1])
    except:
        reg_ex_split = re.split(r'\D+', box_stats.loc[2][2])[1]
        home_passing = int(reg_ex_split)*-1
    
    passing = [away_passing, home_passing]
    
    
    return([teams, scores, odds, rushing, passing])

## Function to call scrapeURL for each game url

In [4]:
def iterateGames(data):
    all_teams = []
    all_scores = []
    all_odds = []
    all_rushing = []
    all_passing = []
    for year in range(len(data)):
        for row in range(len(data[year])):
            teams, scores, odds, rushing, passing = scrapeURL(data[year][row])
            all_teams.append(teams)
            all_scores.append(scores)
            all_odds.append(odds)
            all_rushing.append(rushing)
            all_passing.append(passing)
            
    return([all_teams, all_scores, all_odds, all_rushing, all_passing])

# Driver

## Create a list of urls for each year in the Super Bowl era
Note: AFL and NFL for pre-merger years both include the Super Bowl in their data, so there are 4 duplicates in the data. They will be removed in pandas later.

In [5]:
root = "https://www.pro-football-reference.com/years/"
num_sbs = 55 # SB 55 was 2020 nfl season, sb1 was  1966 nfl season
afl = "_AFL"
pre_merger = []
post_merger = []
for i in range(num_sbs):
    if(i < 4):
        pre_merger.append(root + str(i+1966))
        pre_merger.append(root + str(i+1966) + afl)
    else:
        post_merger.append(root + str(i+1966))

In [6]:
url_list = []
for j in range(len(pre_merger)):
    url_list.append(getPlayoffURLs(pre_merger[j], True))
for k in range(len(post_merger)):
    url_list.append(getPlayoffURLs(post_merger[k], False))

## Go through each game in url_list and scrape data, then combine into final DataFrame that has duplicates removed

In [7]:
teams, scores, odds, rushing, passing = iterateGames(url_list)

teams_df = pd.DataFrame(teams, columns=["Away", "Home"])
scores_df = pd.DataFrame(scores, columns=["Away Score", "Home Score"])
odds_df = pd.DataFrame(odds, columns=["Line", "Over/Under"])
rushing_df = pd.DataFrame(rushing, columns=["Away Rushing", "Home Rushing"])
passing_df = pd.DataFrame(passing, columns=["Away Passing", "Home Passing"])

In [8]:
teams_df.head(5)

Unnamed: 0,Away,Home
0,Green Bay Packers,Dallas Cowboys
1,Kansas City Chiefs,Green Bay Packers
2,Kansas City Chiefs,Buffalo Bills
3,Kansas City Chiefs,Green Bay Packers
4,Los Angeles Rams,Green Bay Packers


In [9]:
scores_df.head(5)

Unnamed: 0,Away Score,Home Score
0,34,27
1,10,35
2,31,7
3,10,35
4,7,28


In [10]:
odds_df.head(5)

Unnamed: 0,Line,Over/Under
0,Green Bay Packers -7.0,-1
1,Green Bay Packers -14.0,0.0 (over)
2,Kansas City Chiefs -3.5,-1
3,Green Bay Packers -14.0,0.0 (over)
4,Los Angeles Rams -3.0,-1


In [11]:
rushing_df.head(5)

Unnamed: 0,Away Rushing,Home Rushing
0,102,187
1,72,130
2,113,40
3,72,130
4,75,163


In [12]:
passing_df.head(5)

Unnamed: 0,Away Passing,Home Passing
0,28,31
1,32,24
2,24,27
3,32,24
4,31,23


In [13]:
# Merge dataframes
merge1_df = teams_df.join(scores_df, how="inner")
merge1_df

Unnamed: 0,Away,Home,Away Score,Home Score
0,Green Bay Packers,Dallas Cowboys,34,27
1,Kansas City Chiefs,Green Bay Packers,10,35
2,Kansas City Chiefs,Buffalo Bills,31,7
3,Kansas City Chiefs,Green Bay Packers,10,35
4,Los Angeles Rams,Green Bay Packers,7,28
...,...,...,...,...
533,Cleveland Browns,Kansas City Chiefs,17,22
534,Tampa Bay Buccaneers,New Orleans Saints,30,20
535,Buffalo Bills,Kansas City Chiefs,24,38
536,Tampa Bay Buccaneers,Green Bay Packers,31,26


In [14]:
merge2_df = merge1_df.join(odds_df, how="inner")
merge2_df

Unnamed: 0,Away,Home,Away Score,Home Score,Line,Over/Under
0,Green Bay Packers,Dallas Cowboys,34,27,Green Bay Packers -7.0,-1
1,Kansas City Chiefs,Green Bay Packers,10,35,Green Bay Packers -14.0,0.0 (over)
2,Kansas City Chiefs,Buffalo Bills,31,7,Kansas City Chiefs -3.5,-1
3,Kansas City Chiefs,Green Bay Packers,10,35,Green Bay Packers -14.0,0.0 (over)
4,Los Angeles Rams,Green Bay Packers,7,28,Los Angeles Rams -3.0,-1
...,...,...,...,...,...,...
533,Cleveland Browns,Kansas City Chiefs,17,22,Kansas City Chiefs -7.5,55.5 (under)
534,Tampa Bay Buccaneers,New Orleans Saints,30,20,New Orleans Saints -2.5,53.0 (under)
535,Buffalo Bills,Kansas City Chiefs,24,38,Kansas City Chiefs -3.0,55.0 (over)
536,Tampa Bay Buccaneers,Green Bay Packers,31,26,Green Bay Packers -3.0,53.0 (over)


In [15]:
merge3_df = merge2_df.join(rushing_df, how="inner")
merge3_df

Unnamed: 0,Away,Home,Away Score,Home Score,Line,Over/Under,Away Rushing,Home Rushing
0,Green Bay Packers,Dallas Cowboys,34,27,Green Bay Packers -7.0,-1,102,187
1,Kansas City Chiefs,Green Bay Packers,10,35,Green Bay Packers -14.0,0.0 (over),72,130
2,Kansas City Chiefs,Buffalo Bills,31,7,Kansas City Chiefs -3.5,-1,113,40
3,Kansas City Chiefs,Green Bay Packers,10,35,Green Bay Packers -14.0,0.0 (over),72,130
4,Los Angeles Rams,Green Bay Packers,7,28,Los Angeles Rams -3.0,-1,75,163
...,...,...,...,...,...,...,...,...
533,Cleveland Browns,Kansas City Chiefs,17,22,Kansas City Chiefs -7.5,55.5 (under),112,123
534,Tampa Bay Buccaneers,New Orleans Saints,30,20,New Orleans Saints -2.5,53.0 (under),127,104
535,Buffalo Bills,Kansas City Chiefs,24,38,Kansas City Chiefs -3.0,55.0 (over),129,114
536,Tampa Bay Buccaneers,Green Bay Packers,31,26,Green Bay Packers -3.0,53.0 (over),76,67


In [16]:
merge4_df = merge3_df.join(passing_df, how="inner")
merge4_df

Unnamed: 0,Away,Home,Away Score,Home Score,Line,Over/Under,Away Rushing,Home Rushing,Away Passing,Home Passing
0,Green Bay Packers,Dallas Cowboys,34,27,Green Bay Packers -7.0,-1,102,187,28,31
1,Kansas City Chiefs,Green Bay Packers,10,35,Green Bay Packers -14.0,0.0 (over),72,130,32,24
2,Kansas City Chiefs,Buffalo Bills,31,7,Kansas City Chiefs -3.5,-1,113,40,24,27
3,Kansas City Chiefs,Green Bay Packers,10,35,Green Bay Packers -14.0,0.0 (over),72,130,32,24
4,Los Angeles Rams,Green Bay Packers,7,28,Los Angeles Rams -3.0,-1,75,163,31,23
...,...,...,...,...,...,...,...,...,...,...
533,Cleveland Browns,Kansas City Chiefs,17,22,Kansas City Chiefs -7.5,55.5 (under),112,123,37,38
534,Tampa Bay Buccaneers,New Orleans Saints,30,20,New Orleans Saints -2.5,53.0 (under),127,104,33,35
535,Buffalo Bills,Kansas City Chiefs,24,38,Kansas City Chiefs -3.0,55.0 (over),129,114,48,38
536,Tampa Bay Buccaneers,Green Bay Packers,31,26,Green Bay Packers -3.0,53.0 (over),76,67,36,48


In [17]:
final_df = merge4_df.drop_duplicates()
final_df

Unnamed: 0,Away,Home,Away Score,Home Score,Line,Over/Under,Away Rushing,Home Rushing,Away Passing,Home Passing
0,Green Bay Packers,Dallas Cowboys,34,27,Green Bay Packers -7.0,-1,102,187,28,31
1,Kansas City Chiefs,Green Bay Packers,10,35,Green Bay Packers -14.0,0.0 (over),72,130,32,24
2,Kansas City Chiefs,Buffalo Bills,31,7,Kansas City Chiefs -3.5,-1,113,40,24,27
4,Los Angeles Rams,Green Bay Packers,7,28,Los Angeles Rams -3.0,-1,75,163,31,23
5,Cleveland Browns,Dallas Cowboys,14,52,Dallas Cowboys -5.0,-1,159,178,30,15
...,...,...,...,...,...,...,...,...,...,...
533,Cleveland Browns,Kansas City Chiefs,17,22,Kansas City Chiefs -7.5,55.5 (under),112,123,37,38
534,Tampa Bay Buccaneers,New Orleans Saints,30,20,New Orleans Saints -2.5,53.0 (under),127,104,33,35
535,Buffalo Bills,Kansas City Chiefs,24,38,Kansas City Chiefs -3.0,55.0 (over),129,114,48,38
536,Tampa Bay Buccaneers,Green Bay Packers,31,26,Green Bay Packers -3.0,53.0 (over),76,67,36,48


### Save DF

In [18]:
final_df.to_csv("playoff_data.csv", index=False)