# Multi-Season NFL Fantasy Analysis

## Setup

### Import Packages

In [2]:
%pip install pandas==1.4.1

# Import packages (standard library first, then third-party)
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Set theme for seaborn
sns.set_theme()

# Turn off warning messages.
# `warnings` is imported above so this call also works on a freshly restarted kernel.
warnings.filterwarnings("ignore")

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


### User Parameters

In [3]:
# Seasons to analyze, as a list (supported range: 2012 through 2022 inclusive)
years = list(range(2012, 2023))

# Plot color for each position (in line with Sleeper colors)
color_dict = dict(QB='red', RB='green', WR='blue', TE='orange')

### Define Functions

In [4]:
def remove_fuzzy(df, name_value, column_to_remove):
    """
    Blank out one column for every row belonging to a given player.

    Assigns NaN to `column_to_remove` on every row of `df` whose 'player_name'
    column equals `name_value`. The DataFrame is modified in place; rows that
    do not match are left untouched.

    Args:
        df: the DataFrame to modify (must contain a 'player_name' column)
        name_value: the value to match in the 'player_name' column
        column_to_remove: the name of the column to assign NaN to

    Returns the same DataFrame object, with the matching values removed
    """
    df.loc[df['player_name'] == name_value, column_to_remove] = np.nan
    # Hand the (mutated) frame back so the call matches its documented contract
    # and can be used in an expression; callers that ignore the result still work.
    return df

def points_histogram(df, x='fantasy_points_ppr'):
    """
    Draw a histogram of one column, colored by position, and display it.

    Args:
        df: the DataFrame to plot (its 'position' column drives the coloring)
        x: name of the column to plot on the x-axis (default is PPR points)

    Displays the figure; nothing is returned
    """
    # Colors come from the notebook-level `color_dict` mapping.
    histogram_options = dict(
        nbins=10,
        labels={'x': x},
        title=f'Histogram of {x}',
        color='position',
        color_discrete_map=color_dict,
        opacity=.5,
    )
    px.histogram(df, x, **histogram_options).show()

def points_scatter(df, x, y='fantasy_points_ppr', hoverdata=None):
    """
    Draw a scatterplot of two columns, colored by position, and display it.

    Histograms of each axis are drawn in the margins, and each point is
    labelled on hover with its 'player_season' value.

    Args:
        df: the DataFrame to plot (needs 'position' and 'player_season' columns)
        x: name of the column to plot on the x-axis
        y: name of the column to plot on the y-axis (default is PPR points)
        hoverdata: optional extra columns to show on hover (passed to plotly as hover_data)

    Displays the figure; nothing is returned
    """
    # Colors come from the notebook-level `color_dict` mapping.
    scatter_options = dict(
        x=x,
        y=y,
        hover_name='player_season',
        hover_data=hoverdata,
        title=f'{y} vs. {x}',
        marginal_x='histogram',
        marginal_y='histogram',
        color='position',
        color_discrete_map=color_dict,
        opacity=0.25,
    )
    figure = px.scatter(df, **scatter_options)
    figure.show()

### Data Preparation

In [5]:
# Load the prepared dataset, then keep only the seasons listed in `years`.
# NOTE(review): unpickling can execute arbitrary code, so only load a trusted 'final_df.pkl'.
final_df = pd.read_pickle('final_df.pkl')
final_df = final_df.loc[final_df['season'].isin(years)]

# Create positional dataframes (one filtered view per position code)
qb_df, rb_df, wr_df, te_df = (
    final_df[final_df['position'] == position_code]
    for position_code in ('QB', 'RB', 'WR', 'TE')
)

In [6]:
# Display the season-filtered dataset (pandas truncates the rendered rows and columns)
final_df

Unnamed: 0,season,team,position,depth_chart_position,jersey_number,status,player_name,first_name,last_name,birth_date,...,40_yard_dash,dash_year,player,school,bmi,player_season,draft_number_filled,draft_capital,bmi_over_30,sub_4.6_40
880,2019,CAR,RB,RB,22.0,Active,Christian McCaffrey,Christian,McCaffrey,1996-06-07,...,4.48,2017.0,Christian McCaffrey,Stanford,28.588574,"Christian McCaffrey, 2019",8.0,Round 1.0,0,1
1008,2021,LA,WR,WR,10.0,Active,Cooper Kupp,Cooper,Kupp,1993-06-15,...,4.62,2017.0,Cooper Kupp,Eastern Washington,26.702703,"Cooper Kupp, 2021",69.0,Round 3.0,0,0
1747,2022,KC,QB,QB,15.0,Active,Patrick Mahomes,Patrick,Mahomes,1995-09-17,...,4.80,2017.0,Patrick Mahomes,Texas Tech,29.527027,"Patrick Mahomes, 2022",10.0,Round 1.0,0,0
1743,2018,KC,QB,QB,15.0,Active,Patrick Mahomes,Patrick,Mahomes,1995-09-17,...,4.80,2017.0,Patrick Mahomes,Texas Tech,28.744889,"Patrick Mahomes, 2018",10.0,Round 1.0,0,0
822,2019,BAL,QB,QB,8.0,Active,Lamar Jackson,Lamar,Jackson,1997-01-07,...,,,,,27.216216,"Lamar Jackson, 2019",32.0,Round 1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1491,2017,KC,QB,QB,9.0,Active,Tyler Bray,Tyler,Bray,1991-12-27,...,5.05,2013.0,Tyler Bray,Tennessee,24.843031,"Tyler Bray, 2017",225.0,Undrafted,0,0
1085,2018,NYG,QB,QB,17.0,Active,Kyle Lauletta,Kyle,Lauletta,1995-03-17,...,4.81,2018.0,Kyle Lauletta,Richmond,26.870222,"Kyle Lauletta, 2018",108.0,Round 4.0,0,0
576,2017,MIN,QB,QB,5.0,Active,Teddy Bridgewater,Theodore,Bridgewater,1992-11-10,...,,,,,27.601351,"Teddy Bridgewater, 2017",32.0,Round 1.0,0,0
1910,2020,DEN,WR,WR,2,Practice Squad,Kendall Hinton,Kendall,Hinton,1997-02-19,...,,,,,26.443866,"Kendall Hinton, 2020",225.0,Undrafted,0,0


### Examine Fuzzy Matching Output

In [7]:
# View fuzzy matching output: only rows that have a fuzzy match,
# restricted to the comparison columns and ordered by score (highest first)
fuzzy_name_comp = (
    final_df.loc[
        final_df['fuzzy_matches'].notnull(),
        ['player_name', 'fuzzy_matches', 'fuzzy_score', 'avg_adp', '40_yard_dash'],
    ]
    .sort_values('fuzzy_score', ascending=False)
)
fuzzy_name_comp

Unnamed: 0,player_name,fuzzy_matches,fuzzy_score,avg_adp,40_yard_dash
1045,Curtis Painter,Curtis Painter,100.0,,
113,Devin Hester,Devin Hester,100.0,172.0,
1743,Ray-Ray McCloud,Ray-Ray Mccloud,100.0,,4.53
140,Dwayne Bowe,Dwayne Bowe,100.0,217.0,
1098,Jim Dray,Jim Dray,100.0,,
...,...,...,...,...,...
1184,William Powell,William Bell,85.0,,
265,Brandon Myers,Brandon James,85.0,127.5,
133,Pierre Thomas,Derrek Thomas,85.0,151.7,
659,Jalen Richard,Ricardo Allen,85.0,184.0,


In [8]:
# Spot-check one fuzzy match: show every row whose matched name is 'Patrick Mahomes II'
final_df[final_df['fuzzy_matches'] == 'Patrick Mahomes II']

Unnamed: 0,season,team,position,depth_chart_position,jersey_number,status,player_name,first_name,last_name,birth_date,...,40_yard_dash,dash_year,player,school,bmi,player_season,draft_number_filled,draft_capital,bmi_over_30,sub_4.6_40
1747,2022,KC,QB,QB,15.0,Active,Patrick Mahomes,Patrick,Mahomes,1995-09-17,...,4.8,2017.0,Patrick Mahomes,Texas Tech,29.527027,"Patrick Mahomes, 2022",10.0,Round 1.0,0,0
1743,2018,KC,QB,QB,15.0,Active,Patrick Mahomes,Patrick,Mahomes,1995-09-17,...,4.8,2017.0,Patrick Mahomes,Texas Tech,28.744889,"Patrick Mahomes, 2018",10.0,Round 1.0,0,0
1745,2020,KC,QB,QB,15.0,Active,Patrick Mahomes,Patrick,Mahomes,1995-09-17,...,4.8,2017.0,Patrick Mahomes,Texas Tech,28.744889,"Patrick Mahomes, 2020",10.0,Round 1.0,0,0
1746,2021,KC,QB,QB,15.0,Active,Patrick Mahomes,Patrick,Mahomes,1995-09-17,...,4.8,2017.0,Patrick Mahomes,Texas Tech,28.744889,"Patrick Mahomes, 2021",10.0,Round 1.0,0,0
1744,2019,KC,QB,QB,15.0,Active,Patrick Mahomes,Patrick,Mahomes,1995-09-17,...,4.8,2017.0,Patrick Mahomes,Texas Tech,28.744889,"Patrick Mahomes, 2019",10.0,Round 1.0,0,0
1742,2017,KC,QB,QB,15.0,Active,Patrick Mahomes,Patrick,Mahomes,1995-09-17,...,4.8,2017.0,Patrick Mahomes,Texas Tech,29.527027,"Patrick Mahomes, 2017",10.0,Round 1.0,0,0


In [9]:
# NOTE: this clean-up step is currently disabled. The code below is wrapped in a
# triple-quoted string, so it is never executed; the cell only echoes the text back.
'''
# Manually remove fuzzy matching mistakes
bad_matches = ['Anthony Brown', 'Deon Jackson', 'Jake Ferguson', 'DeAndre Washington', 'Malik Davis']

for name in bad_matches:
    remove_fuzzy(final_df, name, 'avg_adp')
'''

"\n# Manually remove fuzzy matching mistakes\nbad_matches = ['Anthony Brown', 'Deon Jackson', 'Jake Ferguson', 'DeAndre Washington', 'Malik Davis']\n\nfor name in bad_matches:\n    remove_fuzzy(final_df, name, 'avg_adp')\n"

In [10]:
# Count how many rows were assigned each fuzzy-matched name (most frequent first)
final_df['fuzzy_matches'].value_counts()

Mike Williams       8
Rory Anderson       7
Marvin Jones Jr.    7
Mark Ingram II      7
Duke Johnson Jr.    7
                   ..
DaRick Rogers       1
Charlie Batch       1
Lavelle Hawkins     1
Nick Toon           1
Jordan Palmer       1
Name: fuzzy_matches, Length: 493, dtype: int64

In [11]:
# View fuzzy matching output again (the manual clean-up step is currently disabled):
# rows with a fuzzy match, comparison columns only, highest score first
fuzzy_name_comp = (
    final_df.loc[
        final_df['fuzzy_matches'].notnull(),
        ['player_name', 'fuzzy_matches', 'fuzzy_score', 'avg_adp', '40_yard_dash'],
    ]
    .sort_values('fuzzy_score', ascending=False)
)
fuzzy_name_comp

Unnamed: 0,player_name,fuzzy_matches,fuzzy_score,avg_adp,40_yard_dash
1045,Curtis Painter,Curtis Painter,100.0,,
113,Devin Hester,Devin Hester,100.0,172.0,
1743,Ray-Ray McCloud,Ray-Ray Mccloud,100.0,,4.53
140,Dwayne Bowe,Dwayne Bowe,100.0,217.0,
1098,Jim Dray,Jim Dray,100.0,,
...,...,...,...,...,...
1184,William Powell,William Bell,85.0,,
265,Brandon Myers,Brandon James,85.0,127.5,
133,Pierre Thomas,Derrek Thomas,85.0,151.7,
659,Jalen Richard,Ricardo Allen,85.0,184.0,


## EDA

### All Positions

In [12]:
# Display the dataset used for the all-positions plots (pandas truncates the rendered output)
final_df

Unnamed: 0,season,team,position,depth_chart_position,jersey_number,status,player_name,first_name,last_name,birth_date,...,40_yard_dash,dash_year,player,school,bmi,player_season,draft_number_filled,draft_capital,bmi_over_30,sub_4.6_40
880,2019,CAR,RB,RB,22.0,Active,Christian McCaffrey,Christian,McCaffrey,1996-06-07,...,4.48,2017.0,Christian McCaffrey,Stanford,28.588574,"Christian McCaffrey, 2019",8.0,Round 1.0,0,1
1008,2021,LA,WR,WR,10.0,Active,Cooper Kupp,Cooper,Kupp,1993-06-15,...,4.62,2017.0,Cooper Kupp,Eastern Washington,26.702703,"Cooper Kupp, 2021",69.0,Round 3.0,0,0
1747,2022,KC,QB,QB,15.0,Active,Patrick Mahomes,Patrick,Mahomes,1995-09-17,...,4.80,2017.0,Patrick Mahomes,Texas Tech,29.527027,"Patrick Mahomes, 2022",10.0,Round 1.0,0,0
1743,2018,KC,QB,QB,15.0,Active,Patrick Mahomes,Patrick,Mahomes,1995-09-17,...,4.80,2017.0,Patrick Mahomes,Texas Tech,28.744889,"Patrick Mahomes, 2018",10.0,Round 1.0,0,0
822,2019,BAL,QB,QB,8.0,Active,Lamar Jackson,Lamar,Jackson,1997-01-07,...,,,,,27.216216,"Lamar Jackson, 2019",32.0,Round 1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1491,2017,KC,QB,QB,9.0,Active,Tyler Bray,Tyler,Bray,1991-12-27,...,5.05,2013.0,Tyler Bray,Tennessee,24.843031,"Tyler Bray, 2017",225.0,Undrafted,0,0
1085,2018,NYG,QB,QB,17.0,Active,Kyle Lauletta,Kyle,Lauletta,1995-03-17,...,4.81,2018.0,Kyle Lauletta,Richmond,26.870222,"Kyle Lauletta, 2018",108.0,Round 4.0,0,0
576,2017,MIN,QB,QB,5.0,Active,Teddy Bridgewater,Theodore,Bridgewater,1992-11-10,...,,,,,27.601351,"Teddy Bridgewater, 2017",32.0,Round 1.0,0,0
1910,2020,DEN,WR,WR,2,Practice Squad,Kendall Hinton,Kendall,Hinton,1997-02-19,...,,,,,26.443866,"Kendall Hinton, 2020",225.0,Undrafted,0,0


In [13]:
# Distribution of PPR fantasy points for every row in final_df, colored by position
points_histogram(final_df, x='fantasy_points_ppr')

In [14]:
# PPR fantasy points (the default y) against the 'draft_number' column
points_scatter(final_df, x='draft_number')

In [15]:
# PPR fantasy points (the default y) against the 'avg_adp' column
points_scatter(final_df, x='avg_adp')

### QB Analysis

In [16]:
# QB overview: points distribution, then PPR points against each feature of interest
points_histogram(qb_df, x='fantasy_points_ppr')
for x_col in ('avg_adp', 'draft_number', '40_yard_dash'):
    points_scatter(qb_df, x=x_col)

### RB Analysis

In [17]:
# RB overview: points distribution, then PPR points against each feature of interest.
# Each entry is (x-axis column, extra hover columns); only the BMI plot adds hover data.
points_histogram(rb_df, x='fantasy_points_ppr')
for x_col, hover_cols in (
    ('avg_adp', None),
    ('draft_number', None),
    ('40_yard_dash', None),
    ('bmi', ['height', 'weight']),
    ('receptions', None),
    ('carries', None),
):
    points_scatter(rb_df, x=x_col, hoverdata=hover_cols)

### WR Analysis

In [18]:
# WR overview: points distribution, then PPR points against each feature of interest
points_histogram(wr_df, x='fantasy_points_ppr')
for x_col in ('avg_adp', 'draft_number', '40_yard_dash', 'targets', 'receptions'):
    points_scatter(wr_df, x=x_col)

### TE Analysis

In [19]:
# TE overview: points distribution, then PPR points against each feature of interest
points_histogram(te_df, x='fantasy_points_ppr')
for x_col in ('avg_adp', 'draft_number', '40_yard_dash', 'targets', 'receptions'):
    points_scatter(te_df, x=x_col)

## Modeling

### RB Model

In [20]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import warnings

# Keep only first-year (years_exp == 0) RB seasons; copy so the slice can be extended safely
rookie_rb_df = rb_df.loc[rb_df['years_exp'] == 0].copy()

# Model inputs (features) and the target to predict
X = rookie_rb_df[['draft_number_filled', 'bmi', '40_yard_dash']]
y = rookie_rb_df['fantasy_points_ppr']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Mean imputation for missing feature values.
# The column means are learned from the training rows only and then reused everywhere.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_imputed = pd.DataFrame(
    imputer.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Fit an ordinary linear regression on the imputed training data
regressor = LinearRegression()
regressor.fit(X_train_imputed, y_train)

# Predict the held-out rows and keep the predictions next to their (imputed) inputs
y_pred = regressor.predict(X_test_imputed)
X_test_df = X_test_imputed.assign(predicted_fantasy_points_ppr=y_pred)

# Predict every rookie RB row (train and test alike) and attach the result as a new column
X_imputed = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)
rookie_rb_df = rookie_rb_df.assign(
    predicted_fantasy_points_ppr=regressor.predict(X_imputed)
)

# R-squared on the training rows and on the held-out rows
train_score = regressor.score(X_train_imputed, y_train)
test_score = regressor.score(X_test_imputed, y_test)

print("Train R-squared:", train_score)
print("Test R-squared:", test_score)

Train R-squared: 0.31325456253396544
Test R-squared: 0.3143389637045362


In [21]:
# Inspect the rookie RB frame, which now carries the 'predicted_fantasy_points_ppr' column
rookie_rb_df

Unnamed: 0,season,team,position,depth_chart_position,jersey_number,status,player_name,first_name,last_name,birth_date,...,dash_year,player,school,bmi,player_season,draft_number_filled,draft_capital,bmi_over_30,sub_4.6_40,predicted_fantasy_points_ppr
1162,2018,NYG,RB,RB,26.0,Active,Saquon Barkley,Saquon,Barkley,1997-02-07,...,2018.0,Saquon Barkley,Penn State,30.240934,"Saquon Barkley, 2018",2.0,Round 1.0,1,1,171.863700
838,2016,DAL,RB,RB,21.0,Active,Ezekiel Elliott,Ezekiel,Elliott,1995-07-22,...,2016.0,Ezekiel Elliott,Ohio State,30.512153,"Ezekiel Elliott, 2016",4.0,Round 1.0,1,1,161.870552
998,2017,NO,RB,RB,41.0,Active,Alvin Kamara,Alvin,Kamara,1995-07-25,...,2017.0,Alvin Kamara,Tennessee,30.845918,"Alvin Kamara, 2017",67.0,Round 3.0,1,1,116.153884
903,2021,PIT,RB,RB,22.0,Active,Najee Harris,Najee,Harris,1998-03-09,...,,,,30.341527,"Najee Harris, 2021",24.0,Round 1.0,1,0,140.739421
1019,2017,KC,RB,RB,27.0,Active,Kareem Hunt,Kareem,Hunt,1995-08-06,...,2017.0,Kareem Hunt,Toledo,29.841633,"Kareem Hunt, 2017",86.0,Round 3.0,0,0,91.874657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1929,2020,PHI,RB,RB,46,Practice Squad,Adrian Killins,Adrian,Killins,1998-01-02,...,,,,24.933391,"Adrian Killins, 2020",225.0,Undrafted,0,0,3.970624
1689,2017,CLE,RB,FB,42.0,Active,Marquez Williams,Marquez,Williams,1994-07-16,...,,,,39.047808,"Marquez Williams, 2017",240.0,Undrafted,1,0,68.111498
1493,2016,GB,RB,RB,40.0,,Joe Kerridge,Joe,Kerridge,1992-09-17,...,,,,33.748463,"Joe Kerridge, 2016",225.0,Undrafted,1,0,49.113076
1921,2021,BAL,RB,RB,47.0,Prac Sq.; Inj,Nate McCrary,Nate,McCrary,1999-04-09,...,2021.0,Nate McCrary,Saginaw Valley State (MI),30.512153,"Nate McCrary, 2021",225.0,Undrafted,1,1,34.589198


In [22]:
# Actual PPR points against the model's predictions for rookie RBs.
# NOTE(review): hover shows 'draft_number', while the model was fit on 'draft_number_filled' — confirm this is intended.
points_scatter(rookie_rb_df, x = 'predicted_fantasy_points_ppr', y = 'fantasy_points_ppr', hoverdata=['draft_number', 'bmi', '40_yard_dash'])

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=37ec1120-bd1c-442f-b6e8-aab8eb5fc09e' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>