In [None]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Web-design
(https://www.pro-football-reference.com/years/) lists the yearly results under the "Year" column.

(https://www.pro-football-reference.com/years/2020/) is the page with the 2020 yearly data; it has a "Playoff Results" table that includes a box score for each game.

"Playoff Results" is the 3rd table on each yearly page from 1970 onward. For 1966 through 1969, "Playoff Results" is the 2nd table on both the AFL and the NFL page for that year.

## Function that gets all playoff game URLs for a given year
Note: pre_merger is a boolean that helps select the appropriate table

In [None]:
def getPlayoffURLs(url, pre_merger):
    """Collect the box-score URL of every game in a season page's playoff table.

    Parameters
    ----------
    url : str
        Address of the season page to download.
    pre_merger : bool
        True selects the 2nd table on the page, False selects the 3rd.

    Returns
    -------
    list of str
        Absolute box-score URLs, in the order the rows appear in the table.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page has fewer tables than the selected position requires.
    ValueError
        If the selected table has no <tbody>.
    """
    reqs = requests.get(url, timeout=30)
    reqs.raise_for_status()

    # Strip the HTML comment delimiters ("<!--" and "-->") so tables that are
    # wrapped in comments become visible to the parser.
    cleaned = reqs.text.replace("<!--", "").replace("-->", "")
    soup = BeautifulSoup(cleaned, 'html.parser')
    tables = soup.find_all('table')

    table_index = 1 if pre_merger else 2
    if len(tables) <= table_index:
        raise IndexError(
            "expected at least %d tables at %s, found %d"
            % (table_index + 1, url, len(tables))
        )

    tbody = tables[table_index].find('tbody')
    if tbody is None:
        raise ValueError("table %d at %s has no <tbody>" % (table_index + 1, url))

    prefix = "https://www.pro-football-reference.com"
    # The box score link sits in the 6th <td> of each row. Index it per row
    # rather than counting cells across the whole table, so one short or
    # irregular row cannot shift the position for every row after it.
    box_score_position = 5
    url_list = []
    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) <= box_score_position:
            continue  # not a game row (e.g. a spacer or header row)
        link = cells[box_score_position].find(href=True)
        if link is None:
            continue  # cell without a link: nothing to collect
        url_list.append(prefix + link['href'])

    return url_list

## Function that scrapes a single game URL and returns [teams, scores]

In [None]:
def scrapeURL(URL):
    """Download one game page and read team names and quarter scores from its first table.

    Row 0 of the table is treated as the away team and row 1 as the home team.
    Only the columns labelled "1" through "4" are read, so any other scoring
    column the table may have is ignored.

    Parameters
    ----------
    URL : str
        Address of the game page.

    Returns
    -------
    list
        ``[[away_team, home_team], [away_scores, home_scores]]`` where each
        ``*_scores`` entry is a list of four ints (points scored per quarter).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    page = requests.get(URL, timeout=30)
    page.raise_for_status()

    # read_html expects a path, URL or file-like object; passing the raw HTML
    # string directly is deprecated, so wrap it in StringIO.
    df_list = pd.read_html(StringIO(page.text))
    df = df_list[0]

    # get the team names
    team_names = df["Unnamed: 1"]
    teams = [team_names[0], team_names[1]]

    # get the scores
    quarter_columns = ["1", "2", "3", "4"]
    away_scores = [int(df[quarter][0]) for quarter in quarter_columns]
    home_scores = [int(df[quarter][1]) for quarter in quarter_columns]
    scores = [away_scores, home_scores]

    return [teams, scores]

## Function to call scrapeURL for each game url

In [None]:
def iterateGames(data):
    """Run scrapeURL on every game URL and gather the results.

    Parameters
    ----------
    data : list of list of str
        One inner list of game URLs per season page.

    Returns
    -------
    list
        ``[all_teams, all_scores]``: two parallel lists holding, for each game
        in traversal order, the teams and the scores returned by scrapeURL.
    """
    team_rows, score_rows = [], []
    for season_urls in data:
        for game_url in season_urls:
            matchup, line_score = scrapeURL(game_url)
            team_rows.append(matchup)
            score_rows.append(line_score)

    return [team_rows, score_rows]

## Function to create columns for the final digit of the score for each quarter for both home and away teams

In [None]:
def quarterScores(df):
    """Replace the per-quarter score lists with last-digit-of-running-score columns.

    For each side, the cumulative score at the end of quarters 1-4 is computed
    from the points scored in each quarter, and only its final digit is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Away score" and "Home score", each cell
        holding a sequence of four per-quarter point totals.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place: the columns
        "Away Q1".."Away Q4" and "Home Q1".."Home Q4" are added and the two
        score columns are dropped.
    """
    for side in ("Away", "Home"):
        digits_by_quarter = [[], [], [], []]
        # Iterate over the frame that was passed in, not a notebook global.
        for quarter_points in df[side + " score"]:
            running_total = 0
            for quarter in range(4):
                running_total += quarter_points[quarter]
                digits_by_quarter[quarter].append(running_total % 10)

        # Assign whole columns at once; element-wise chained assignment
        # (df[col].iloc[i] = ...) is not guaranteed to write back to df.
        for quarter, digits in enumerate(digits_by_quarter, start=1):
            df[side + " Q" + str(quarter)] = pd.Series(
                digits, index=df.index, dtype="int64"
            )

    df.drop(['Away score', 'Home score'], axis=1, inplace=True)

    return df

# Driver

## Create a list of urls for each year in the Super Bowl era
Note: AFL and NFL for pre-merger years both include the Super Bowl in their data, so there are 4 duplicates in the data. They will be removed in pandas later.

In [None]:
root = "https://www.pro-football-reference.com/years/"
num_sbs = 55  # Super Bowl 1 followed the 1966 season, Super Bowl 55 the 2020 season
afl = "_AFL"

pre_merger = []
post_merger = []
for offset in range(num_sbs):
    season_url = root + str(1966 + offset)
    if offset >= 4:
        # 1970 onward: a single page per season
        post_merger.append(season_url)
        continue
    # 1966-1969: one page for the NFL and one for the AFL
    pre_merger.extend([season_url, season_url + afl])

## Create a list of lists of playoff games

In [None]:
# One inner list of box-score URLs per season page: pre-merger pages first
# (playoff table in 2nd position), then post-merger pages (3rd position).
url_list = [getPlayoffURLs(season_page, True) for season_page in pre_merger]
url_list += [getPlayoffURLs(season_page, False) for season_page in post_merger]

## Go through each game in url_list and scrape data, then combine into final DataFrame that has duplicates removed

In [None]:
# Scrape every game, then line the results up side by side.
teams, scores = iterateGames(url_list)

team_columns = ["Away", "Home"]
score_columns = ["Away score", "Home score"]
teams_df = pd.DataFrame(data=teams, columns=team_columns)
scores_df = pd.DataFrame(data=scores, columns=score_columns)
combined_df = teams_df.join(scores_df, how="inner")

# Reduce the scores to last-digit columns, then drop the rows that repeat.
final_df = quarterScores(combined_df)
final_df.drop_duplicates(inplace=True)
final_df.head()

## Export Data

In [None]:
#final_df.to_csv('playoff_data.csv', index=False)