In [1069]:
from urllib.request import urlopen
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import re
from scipy.spatial.distance import cdist
from datetime import date
from numpy import asarray
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Turn off pandas' chained-assignment check for the whole notebook. Cells below
# assign into frames that were derived from slices of other frames (see
# draft_class_scraper), which would otherwise emit a warning.
pd.options.mode.chained_assignment = None  # default='warn'

## Scraper and Cleaner Functions for Each Statistical Category

In [480]:
def rushing_scraper(url):
    """Download a rushing stats page and return its table as raw strings.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` from the third ``<tr>`` onward, holding the text
        of that row's ``<td>`` cells. Column names come from the ``<th>``
        cells of the second ``<tr>``, minus the first header (presumably it
        labels a leading ``<th>`` cell that the data rows do not expose as
        ``<td>`` - confirm against the page). Nothing is converted to numbers.
    """
    # Close the HTTP response as soon as the page has been parsed.
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; pin one once the intended parser is confirmed.
    with urlopen(url) as response:
        stats_page = BeautifulSoup(response)
    # Walk the <tr> elements once; index 1 is the header row used for names
    table_rows = stats_page.find_all('tr')
    column_headers = [th.get_text() for th in table_rows[1].find_all('th')]
    # Rows without any <td> become empty lists here
    stats = [[td.get_text() for td in row.find_all('td')] for row in table_rows[2:]]
    return pd.DataFrame(stats, columns=column_headers[1:])

In [468]:
def receiving_scraper(url):
    """Download a receiving stats page and return its table as raw strings.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` after the first, holding the text of that row's
        ``<td>`` cells. Column names come from the ``<th>`` cells of the first
        ``<tr>``, minus the first header (presumably it labels a leading
        ``<th>`` cell that the data rows do not expose as ``<td>`` - confirm
        against the page). Nothing is converted to numbers.
    """
    # Close the HTTP response as soon as the page has been parsed.
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; pin one once the intended parser is confirmed.
    with urlopen(url) as response:
        stats_page = BeautifulSoup(response)
    # Walk the <tr> elements once; index 0 is the header row used for names
    table_rows = stats_page.find_all('tr')
    column_headers = [th.get_text() for th in table_rows[0].find_all('th')]
    # Rows without any <td> become empty lists here
    stats = [[td.get_text() for td in row.find_all('td')] for row in table_rows[1:]]
    return pd.DataFrame(stats, columns=column_headers[1:])

In [634]:
def rushing_cleaner(data):
    """Turn a scraped rushing table into one numeric row per player.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table with at least the columns Player, Age, Pos, Tm, G, Att,
        Yds, TD and Fmb.

    Returns
    -------
    pandas.DataFrame
        Columns Player, Pos, Age, G, Rush_Att, Rush_Yds, Rush_TD, Fmb plus
        min-max scaled copies of the three Rush_* columns, ordered by
        Rush_Yds from highest to lowest.
    """
    # Keep QB/WR/RB/TE rows only and discard rows with missing values
    skill_rows = data.loc[data.Pos.isin(['QB', 'WR', 'RB', 'TE'])].dropna()

    # Text columns stay as they are; stat columns are parsed to float
    stat_cols = ['Age', 'G', 'Att', 'Yds', 'TD', 'Fmb']
    stats = skill_rows[stat_cols].astype(str).astype(float)
    rush = skill_rows[['Player', 'Pos', 'Tm']].merge(stats, left_index=True, right_index=True)

    # Strip the trailing '*' / '+' markers from the player names
    rush['Player'] = [re.sub(r"[*+]+$", "", raw_name) for raw_name in rush['Player']]
    rush = rush.rename(columns={'Att': 'Rush_Att', 'Yds': 'Rush_Yds', 'TD': 'Rush_TD'})

    # Sum the rows that share Player, Pos and Age (e.g. a player listed for
    # more than one team); the non-numeric Tm column is dropped by the sum
    rush = rush.groupby(by=['Player', 'Pos', 'Age'], as_index=False).sum(numeric_only=True)

    # When a name still appears more than once, keep the row with the most
    # rushing yards
    rush = (
        rush
        .sort_values(by='Rush_Yds', ascending=False)
        .drop_duplicates(subset=['Player'], keep='first')
    )

    # Min-max scale the volume stats across the rows of this table
    to_scale = rush[['Rush_Att', 'Rush_Yds', 'Rush_TD']]
    rush[['Rush_Att_Scaled', 'Rush_Yds_Scaled', 'Rush_TD_Scaled']] = MinMaxScaler().fit_transform(to_scale)
    return rush

In [647]:
def receiving_cleaner(data):
    """Turn a scraped receiving table into one numeric row per player.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table with at least the columns Player, Age, Tm, Pos, G, Tgt,
        Rec, Yds and TD, all still holding text.

    Returns
    -------
    pandas.DataFrame
        Columns Player, Pos, Age, G, Tgt, Rec, Rec_Yds, Rec_TD plus min-max
        scaled copies of Rec, Rec_Yds and Rec_TD, ordered by Rec_Yds from
        highest to lowest.
    """
    # Row filters, all applied to the raw text values:
    #   - skill positions only
    #   - at least one reception and a non-empty target count
    #   - no placeholder 'None' names
    #   - nobody listed exactly as 'Mike Williams' (presumably because more
    #     than one player carries that name - confirm)
    keep = (
        data.Pos.isin(['QB', 'WR', 'RB', 'TE'])
        & (data.Rec != '0')
        & (data.Player != 'Mike Williams')
        & (data.Tgt != '')
        & (data.Player != 'None')
    )
    kept_rows = data.loc[keep].dropna()

    # Text columns stay as they are; stat columns are parsed to float
    stat_cols = ['Age', 'G', 'Tgt', 'Rec', 'Yds', 'TD']
    stats = kept_rows[stat_cols].astype(str).astype(float)
    catch = kept_rows[['Player', 'Pos', 'Tm']].merge(stats, left_index=True, right_index=True)

    # Strip the trailing '*' / '+' markers from the player names
    catch['Player'] = [re.sub(r"[*+]+$", "", raw_name) for raw_name in catch['Player']]
    catch = catch.rename(columns={'Yds': 'Rec_Yds', 'TD': 'Rec_TD'})

    # Sum the rows that share Player, Pos and Age (e.g. a player listed for
    # more than one team); the non-numeric Tm column is dropped by the sum
    catch = catch.groupby(by=['Player', 'Pos', 'Age'], as_index=False).sum(numeric_only=True)

    # When a name still appears more than once, keep the row with the most
    # receiving yards
    catch = (
        catch
        .sort_values(by='Rec_Yds', ascending=False)
        .drop_duplicates(subset=['Player'], keep='first')
    )

    # Min-max scale the volume stats across the rows of this table
    to_scale = catch[['Rec', 'Rec_Yds', 'Rec_TD']]
    catch[['Rec_Scaled', 'Rec_Yds_Scaled', 'Rec_TD_Scaled']] = MinMaxScaler().fit_transform(to_scale)
    return catch

In [471]:
def passing_scraper(url):
    """Download a passing stats page and return its table as raw strings.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` after the first, holding the text of that row's
        ``<td>`` cells. Column names come from the ``<th>`` cells of the first
        ``<tr>``, minus the first header (presumably it labels a leading
        ``<th>`` cell that the data rows do not expose as ``<td>`` - confirm
        against the page). Nothing is converted to numbers.
    """
    # Close the HTTP response as soon as the page has been parsed.
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; pin one once the intended parser is confirmed.
    with urlopen(url) as response:
        stats_page = BeautifulSoup(response)
    # Walk the <tr> elements once; index 0 is the header row used for names
    table_rows = stats_page.find_all('tr')
    column_headers = [th.get_text() for th in table_rows[0].find_all('th')]
    # Relabel the sixth-from-last header; presumably this is the sack-yardage
    # column, whose label would otherwise collide with passing 'Yds' - confirm
    column_headers[-6] = 'Sk_Yds'
    # Rows without any <td> become empty lists here
    qb_stats = [[td.get_text() for td in row.find_all('td')] for row in table_rows[1:]]
    return pd.DataFrame(qb_stats, columns=column_headers[1:])

In [646]:
def passing_cleaner(data):
    """Turn a scraped passing table into numeric quarterback rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table with at least the columns Player, Age, Pos, G, Cmp, Att,
        Yds, TD and Int.

    Returns
    -------
    pandas.DataFrame
        One row per QB row of the input (original index kept) with columns
        Player, Pos, Age, G, Cmp, Pass_Att, Pass_Yds, Pass_TD, Int plus
        min-max scaled copies of the three Pass_* columns.
    """
    # Quarterbacks only, without incomplete rows
    qb_rows = data.loc[data.Pos == 'QB'].dropna()

    # Text columns stay as they are; stat columns are parsed to float
    stat_cols = ['Age', 'G', 'Cmp', 'Att', 'Yds', 'TD', 'Int']
    stats = qb_rows[stat_cols].astype(str).astype(float)
    qb = qb_rows[['Player', 'Pos']].merge(stats, left_index=True, right_index=True)

    # Strip the trailing '*' / '+' markers from every row's name. Cleaning
    # per row (not per unique name) keeps the list as long as the frame even
    # when the same name appears on more than one row.
    qb['Player'] = [re.sub(r"[*+]+$", "", raw_name) for raw_name in qb['Player']]
    qb = qb.rename(columns={'Att': 'Pass_Att', 'Yds': 'Pass_Yds', 'TD': 'Pass_TD'})

    # Min-max scale the volume stats across the rows of this table
    to_scale = qb[['Pass_Att', 'Pass_Yds', 'Pass_TD']]
    qb[['Pass_Att_Scaled', 'Pass_Yds_Scaled', 'Pass_TD_Scaled']] = MinMaxScaler().fit_transform(to_scale)
    return qb

## Organize Players' Seasons

In [709]:
#Indicate Start and End Years for Season
# Only the .year of each date is read below; month and day are not used.
start_dt = date(2013, 1, 1)
end_dt = date(2022, 1, 11)

# Every season from start_dt.year through end_dt.year, both included
year_range = [year for year in range(start_dt.year, end_dt.year +1)]

#For Loop to Iterate through Rushing, Passing and Receiving Seasons
# Each category name doubles as the prefix of its `<category>_scraper` and
# `<category>_cleaner` function, which the loop looks up by name.
categories = ['rushing', 'receiving', 'passing']

for i in year_range:
    for x in categories:
        url = 'https://www.pro-football-reference.com/years/{}/{}.htm'.format(i, x)
        scraper = x + '_scraper'
        cleaner = x + '_cleaner'
        # Resolve both functions by name in the notebook namespace and run them
        data = vars()[scraper](url=url)
        df = vars()[cleaner](data)
        # Tag the season; the new column lands last, so move it to position 2
        df['Season'] = i
        last_col = df.columns[-1]
        col_to_move = df.pop(last_col)
        df.insert(2, last_col, col_to_move)
        # Publish the frame as a notebook variable named `<category>_df_<year>`
        name = str(x)+'_df_' +str(i)
        vars()[name] = df
        # Pause after each page request
        time.sleep(1)
    print('Compiled stats from the {} season'.format(i))
    # Extra pause between seasons
    time.sleep(2)

Compiled stats from the 2013 season
Compiled stats from the 2014 season
Compiled stats from the 2015 season
Compiled stats from the 2016 season
Compiled stats from the 2017 season
Compiled stats from the 2018 season
Compiled stats from the 2019 season
Compiled stats from the 2020 season
Compiled stats from the 2021 season
Compiled stats from the 2022 season


In [710]:
#Concat Data
# Stack the per-season frames of every category into one `<category>_df`
# variable (rushing_df, receiving_df, passing_df), renumbering rows from 0.
x = min(year_range)
range_mod = year_range[1:]
for i in categories:
    # Earliest season first, then the remaining seasons in year_range order.
    # globals() is used for the lookup because it also works inside the
    # comprehension; at cell level it is the same namespace the frames were
    # published to.
    season_frames = [globals()[str(i) + '_df_' + str(y)] for y in [x] + range_mod]
    # One concat per category avoids re-copying the growing frame for each
    # season, and assigning here also defines `<category>_df` when
    # year_range holds a single season.
    globals()[str(i) + '_df'] = pd.concat(season_frames, ignore_index=True)

In [711]:
#Join the 3 categories
# Outer joins keep players that appear in only some of the tables; the stats
# they lack are filled with 0 so the points formula below stays defined.
merge_keys = ['Player', 'Pos', 'Age', 'Season', 'G']
df = passing_df.merge(rushing_df, how='outer', on=merge_keys)
df = df.merge(receiving_df, how='outer', on=merge_keys)
df = df.fillna(0)

#Calculate Fantasy Points
# Weights per stat, rounded to whole points
fantasy_points = (
    df['Pass_Yds'] * .04
    + df['Pass_TD'] * 4
    + df['Int'] * (-2)
    + df['Rush_Yds'] * .1
    + df['Rush_TD'] * 6
    + df['Rec'] * .5
    + df['Rec_Yds'] * .1
    + df['Rec_TD'] * 6
    + df['Fmb'] * (-2)
)
df['Fantasy_Points'] = fantasy_points.round(0)

# The new column was appended last; move it to position 3
df.insert(3, 'Fantasy_Points', df.pop('Fantasy_Points'))

#Sort DF by Fantasy Points
df = df.sort_values(by='Fantasy_Points', ascending=False)
df.loc[df.Pos != 'QB'].head()

Unnamed: 0,Player,Pos,Season,Fantasy_Points,Age,G,Cmp,Pass_Att,Pass_Yds,Pass_TD,...,Rush_Att_Scaled,Rush_Yds_Scaled,Rush_TD_Scaled,Tgt,Rec,Rec_Yds,Rec_TD,Rec_Scaled,Rec_Yds_Scaled,Rec_TD_Scaled
2128,Christian McCaffrey,RB,2019,409.0,23.0,16.0,0.0,0.0,0.0,0.0,...,0.94702,0.901418,0.9375,142.0,116.0,1005.0,4.0,0.777027,0.583574,0.363636
2821,Cooper Kupp,WR,2021,365.0,28.0,17.0,0.0,0.0,0.0,0.0,...,0.009063,0.016996,0.0,191.0,145.0,1947.0,16.0,1.0,1.0,1.0
1408,David Johnson,RB,2016,362.0,25.0,16.0,0.0,0.0,0.0,0.0,...,0.909657,0.761412,0.888889,120.0,80.0,879.0,4.0,0.745283,0.611339,0.285714
2648,Jonathan Taylor,RB,2021,349.0,22.0,17.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,51.0,40.0,360.0,2.0,0.270833,0.186571,0.125
1648,Todd Gurley,RB,2017,345.0,23.0,15.0,0.0,0.0,0.0,0.0,...,0.86875,0.983594,1.0,87.0,64.0,788.0,6.0,0.567568,0.515919,0.461538


In [713]:
# NOTE(review): disabled cell. When enabled it builds `season_df` from
# `season_00_12` and `season_13_22` and writes it to a CSV file; no visible
# cell of this notebook defines those two frames.
#season_df = pd.concat([season_00_12, season_13_22], axis=0).sort_values(by = 'Fantasy_Points', ascending = False).reset_index(drop= True)
#season_df.to_csv('/Users/yushunli/Documents/Data Science/jupyter_notebook/Fantasy Football/season2000_to_2022_data.csv', index=False)

In [868]:
# Preview the top non-QB rows of season_df.
# NOTE(review): the only assignment to `season_df` in the visible notebook is
# inside a commented-out line, so this relies on state created elsewhere.
season_df.loc[season_df.Pos != 'QB'].head()

Unnamed: 0,Player,Pos,Season,Fantasy_Points,Age,G,Cmp,Pass_Att,Pass_Yds,Pass_TD,...,Rush_Att_Scaled,Rush_Yds_Scaled,Rush_TD_Scaled,Tgt,Rec,Rec_Yds,Rec_TD,Rec_Scaled,Rec_Yds_Scaled,Rec_TD_Scaled
0,LaDainian Tomlinson,RB,2006,442.0,27.0,16.0,0.0,0.0,0.0,0.0,...,0.836145,1.0,1.0,80.0,56.0,508.0,3.0,0.539216,0.373818,0.230769
1,Marshall Faulk,RB,2000,415.0,27.0,14.0,0.0,0.0,0.0,0.0,...,0.626866,0.796037,1.0,113.0,81.0,830.0,8.0,0.792079,0.509445,0.533333
2,Christian McCaffrey,RB,2019,409.0,23.0,16.0,0.0,0.0,0.0,0.0,...,0.94702,0.901418,0.9375,142.0,116.0,1005.0,4.0,0.777027,0.583574,0.363636
3,Priest Holmes,RB,2003,408.0,30.0,16.0,0.0,0.0,0.0,0.0,...,0.815857,0.689124,1.0,90.0,74.0,690.0,0.0,0.62931,0.409278,0.0
4,Priest Holmes,RB,2002,406.0,29.0,14.0,0.0,0.0,0.0,0.0,...,0.816754,0.872591,1.0,81.0,70.0,672.0,3.0,0.485915,0.394813,0.230769


## Draft Classes

In [783]:
def draft_class_scraper(season):
    """Scrape one draft class and keep its QB, WR, RB and TE picks.

    Parameters
    ----------
    season : int
        Draft year; it is inserted into the page URL and stored in the
        Season column.

    Returns
    -------
    pandas.DataFrame
        Columns Player, Pos, Age, Pick, Season for the QB/WR/RB/TE rows.
        Player, Pos and Age are the scraped text. Pick is the 1-based
        position of the row among all rows that survived cleaning, counted
        before the position filter.
    """
    url = 'https://www.pro-football-reference.com/years/{}/draft.htm'.format(season)
    # Close the HTTP response as soon as the page has been parsed.
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; pin one once the intended parser is confirmed.
    with urlopen(url) as response:
        stats_page = BeautifulSoup(response)
    # Walk the <tr> elements once; index 1 is the header row used for names
    table_rows = stats_page.find_all('tr')
    column_headers = [th.get_text() for th in table_rows[1].find_all('th')]
    # Rows without any <td> become empty lists here
    stats = [[td.get_text() for td in row.find_all('td')] for row in table_rows[2:]]
    df = pd.DataFrame(stats, columns=column_headers[1:])

    # Drop rows with missing cells and rows whose name is the text 'None'.
    # Once dropna() has run no missing Player/Pick is left, so no separate
    # comparison against None is needed.
    df = df.dropna()
    df = df.loc[df.Player != 'None']

    # Selecting and re-indexing yields a fresh frame, so the assignments
    # below do not write into a slice of `df`
    draft = df[['Player', 'Pos', 'Age', 'Pick']].reset_index(drop=True)
    draft['Season'] = season
    # Replace the scraped Pick text by the row's 1-based position.
    # NOTE(review): this equals the overall pick number only if every pick
    # of the class has a surviving row - confirm.
    draft['Pick'] = draft.index + 1
    return draft.loc[draft.Pos.isin(['QB', 'WR', 'RB', 'TE'])]

In [784]:
#Indicate Start and End Years for Season
# Only the .year of each date is read below. NOTE: this reassigns start_dt,
# end_dt and year_range, which the season-stats cells above also define with
# a different range; re-running those cells after this one would pick up
# the range set here.
start_dt = date(1994, 1, 1)
end_dt = date(2022, 1, 11)

# Every draft year from start_dt.year through end_dt.year, both included
year_range = [year for year in range(start_dt.year, end_dt.year +1)]

for i in year_range:
    df = draft_class_scraper(season = i)
    # Publish the class as a notebook variable named `draftclass_<year>`
    name = 'draftclass_' +str(i)
    vars()[name] = df
    print('Completed the {} Draft Class'.format(i))
    # Pause between page requests
    time.sleep(2)

Completed the 1994 Draft Class
Completed the 1995 Draft Class
Completed the 1996 Draft Class
Completed the 1997 Draft Class
Completed the 1998 Draft Class
Completed the 1999 Draft Class
Completed the 2000 Draft Class
Completed the 2001 Draft Class
Completed the 2002 Draft Class
Completed the 2003 Draft Class
Completed the 2004 Draft Class
Completed the 2005 Draft Class
Completed the 2006 Draft Class
Completed the 2007 Draft Class
Completed the 2008 Draft Class
Completed the 2009 Draft Class
Completed the 2010 Draft Class
Completed the 2011 Draft Class
Completed the 2012 Draft Class
Completed the 2013 Draft Class
Completed the 2014 Draft Class
Completed the 2015 Draft Class
Completed the 2016 Draft Class
Completed the 2017 Draft Class
Completed the 2018 Draft Class
Completed the 2019 Draft Class
Completed the 2020 Draft Class
Completed the 2021 Draft Class
Completed the 2022 Draft Class


In [942]:
# Only the .year of each date is read below
start_dt = date(1994, 1, 1)
end_dt = date(2022, 1, 11)

# Draft years that follow the first class
draft_range = list(range(start_dt.year + 1, end_dt.year + 1))

# Stack the first class and every later `draftclass_<year>` frame, keeping
# each frame's own index
yearly_classes = [draftclass_1994] + [globals()['draftclass_' + str(i)] for i in draft_range]
draft_class = pd.concat(yearly_classes)

# Remove a trailing ' HOF' marker from the player names
draft_class['Player'] = [re.sub(r" HOF$", "", raw_name) for raw_name in draft_class.Player]

# 1-based running count of the rows sharing a position within each season
draft_class["Position_Pick"] = draft_class.groupby(['Pos', 'Season']).cumcount() + 1

# Keep the four skill positions and save the result
is_skill_position = draft_class.Pos.isin(['QB', 'WR', 'RB', 'TE'])
draft_class = draft_class.loc[is_skill_position]
draft_class.to_csv('/Users/yushunli/Documents/Data Science/jupyter_notebook/Fantasy Football/1994_to_2022_draftclass.csv', index=False)
draft_class

Unnamed: 0,Player,Pos,Age,Pick,Season,Position_Pick
1,Marshall Faulk,RB,21,2,1994,1
2,Heath Shuler,QB,22,3,1994,1
5,Trent Dilfer,QB,22,6,1994,2
16,Charles Johnson,WR,22,17,1994,1
20,Johnnie Morton,WR,22,21,1994,2
...,...,...,...,...,...,...
249,Brittain Brown,RB,24,250,2022,21
250,Isiah Pacheco,RB,22,251,2022,22
257,Samori Toure,WR,24,258,2022,28
259,Zander Horvath,RB,23,260,2022,23


In [892]:
# Per (Player, Pos): the minimum of every numeric column of season_df.
# numeric_only is spelled out; the earlier `.min('Age')` passed 'Age' in that
# same positional slot.
first_season = season_df.groupby(by=['Player', 'Pos'], as_index=False).min(numeric_only=True)

# Attach draft information by name and position. Both frames carry a Season
# column, so the draft year arrives as 'Season_y'.
# NOTE(review): players sharing both name and position are matched to every
# draft row with that name - confirm this is acceptable.
draft_df = first_season.merge(draft_class, on=['Player', 'Pos'])

# Rename before selecting so the cell works whether draft_class names the
# column 'Position_Pick' (as built in this notebook) or
# 'Position Pick Number'.
draft_df = draft_df.rename(columns={'Season_y': 'Draft_Year', 'Position Pick Number': 'Position_Pick'})
draft_df = (
    draft_df[['Player', 'Pos', 'Pick', 'Position_Pick', 'Draft_Year']]
    .sort_values(by=['Draft_Year', 'Pick'], ascending=True)
    .reset_index(drop=True)
)
draft_df

Unnamed: 0,Player,Pos,Pick,Position_Pick,Draft_Year
0,Marshall Faulk,RB,2,1,1994
1,Trent Dilfer,QB,6,2,1994
2,Charles Johnson,WR,17,1,1994
3,Johnnie Morton,WR,21,2,1994
4,Derrick Alexander,WR,29,4,1994
...,...,...,...,...,...
1769,Dareke Young,WR,233,27,2022
1770,Skylar Thompson,QB,247,8,2022
1771,Isiah Pacheco,RB,251,22,2022
1772,Samori Toure,WR,258,28,2022
