Naturally, we must begin with collecting the data. This will involve scraping PFR.

Let's define some helper functions to keep our code organized.

In [49]:
def get_passing_url(year):
    return f"https://www.pro-football-reference.com/years/{year}/passing.htm"


def get_rushing_url(year):
    return f"https://www.pro-football-reference.com/years/{year}/rushing.htm"


def get_receiving_url(year):
    return f"https://www.pro-football-reference.com/years/{year}/receiving.htm"


def get_soup(url):
    import requests
    from bs4 import BeautifulSoup
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup


def get_valid_subdirectories(soup):
    html_subdirectories = soup.find_all("a")
    valid_subdirectories = [link for link in html_subdirectories if '/players/' in str(link)]
    subdirectories = [link.get('href') for link in valid_subdirectories]
    return subdirectories

Great. Now our first step will involve iterating through each season of interest and getting the webpages for that season's passing, rushing, and receiving stats. 

From each of those webpages, we will extract every link that leads to a player's individual stats page. We will store all of these links in a list called `player_links`, which we will then make unique.

In [51]:
player_links = []
for year in range(2012, 2014):
    # Passing
    passing_url = get_passing_url(year)
    passing_soup = get_soup(passing_url)
    passing_table = passing_soup.find(id="passing")
    passing_subdirectories = get_valid_subdirectories(passing_table)
    player_links.extend(passing_subdirectories)
    # Rushing
    rushing_url = get_rushing_url(year)
    rushing_soup = get_soup(rushing_url)
    rushing_table = rushing_soup.find(id="rushing")
    rushing_subdirectories = get_valid_subdirectories(rushing_table)
    player_links.extend(rushing_subdirectories)
    # Receiving
    receiving_url = get_receiving_url(year)
    receiving_soup = get_soup(receiving_url)
    receiving_table = receiving_soup.find(id="receiving")
    receiving_subdirectories = get_valid_subdirectories(receiving_table)
    player_links.extend(receiving_subdirectories)
# Make unique
player_links = list(set(player_links))

In [52]:
player_links

['/players/C/CromAn20.htm',
 '/players/W/WaynRe00.htm',
 '/players/L/LewiTh01.htm',
 '/players/J/JohnMi23.htm',
 '/players/M/McInCo00.htm',
 '/players/C/ColeDe00.htm',
 '/players/T/TurbRo00.htm',
 '/players/T/TennMa20.htm',
 '/players/T/TolbMi00.htm',
 '/players/M/MoorLa00.htm',
 '/players/T/TurnPa00.htm',
 '/players/H/HarkCo00.htm',
 '/players/A/AngeBr00.htm',
 '/players/G/GateCl00.htm',
 '/players/P/PettAu00.htm',
 '/players/H/HillJo02.htm',
 '/players/S/SalaGr00.htm',
 '/players/R/RainCh00.htm',
 '/players/F/FasaAn00.htm',
 '/players/P/PopeLe00.htm',
 '/players/B/BranDe00.htm',
 '/players/S/SmitAl02.htm',
 '/players/H/HillRo00.htm',
 '/players/C/ClemTo00.htm',
 '/players/D/DixoAn00.htm',
 '/players/C/ConeKe00.htm',
 '/players/M/MannMa01.htm',
 '/players/S/StarMa21.htm',
 '/players/J/JackVi00.htm',
 '/players/M/McBrMa20.htm',
 '/players/M/MattRi00.htm',
 '/players/D/DoylJa00.htm',
 '/players/C/CunnBe01.htm',
 '/players/W/WingRo00.htm',
 '/players/L/LyncMa00.htm',
 '/players/S/ShiaVi0