In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import math

# Training model with all seasons data - Optimized
### Run training/testing against all seasons through 2023 to get preliminary validation of the model

In [2]:
# Load the complete training set: one row per player per season, all players, all years
training_data_csv = Path('../resources/training_data.csv')
all_seasons_data = pd.read_csv(training_data_csv)
all_seasons_data

Unnamed: 0,name,team,position,season,games,receptions,targets,receiving_yards,rec_ypg,receiving_tds,...,sacks,sack_fumbles,offense_snaps,teams_offense_snaps,round,overall,stadium_name,stadium_weather_type,stadium_surface,fantasy_points_ppr
0,A.J. Brown,TEN,WR,2019,16,52,84,1051.0,65.69,8,...,0.0,0,678.0,997.0,2.0,51.0,Nissan Stadium,moderate,Grass,217.10
1,A.J. Brown,TEN,WR,2020,14,70,106,1075.0,76.79,11,...,0.0,0,760.0,945.0,2.0,51.0,Nissan Stadium,moderate,Grass,247.50
2,A.J. Brown,TEN,WR,2021,13,63,105,869.0,66.85,5,...,0.0,0,604.0,876.0,2.0,51.0,Nissan Stadium,moderate,Grass,180.90
3,A.J. Brown,PHI,WR,2022,17,88,145,1496.0,88.00,11,...,0.0,0,1004.0,1189.0,2.0,51.0,Lincoln Financial Field,cold,Grass,299.60
4,A.J. Brown,PHI,WR,2023,17,106,158,1456.0,85.65,7,...,0.0,0,1019.0,1154.0,2.0,51.0,Lincoln Financial Field,cold,Grass,289.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2101,Zay Jones,BUF,WR,2018,16,56,102,652.0,40.75,7,...,0.0,0,941.0,1063.0,2.0,37.0,New Era Field,cold,FieldTurf,165.20
2102,Zay Jones,LV,WR,2020,11,14,20,154.0,14.00,1,...,0.0,0,248.0,771.0,2.0,37.0,Allegiant Stadium,indoor,Grass,36.56
2103,Zay Jones,LV,WR,2021,15,47,70,546.0,36.40,1,...,0.0,0,614.0,1026.0,2.0,37.0,Allegiant Stadium,indoor,Grass,105.90
2104,Zay Jones,JAX,WR,2022,16,82,121,823.0,51.44,5,...,0.0,0,930.0,1082.0,2.0,37.0,TIAA Bank Field,warm,Grass,198.10


In [3]:
# Adding new features to training data (part of optimization)

# Per-game rate features: new column name -> season-total column it is derived from.
# Each feature is listed (and computed) exactly once.
PER_GAME_FEATURES = {
    'target_per_game': 'targets',
    'carries_per_game': 'carries',
    'team_off_snaps_per_game': 'teams_offense_snaps',
    'off_snaps_per_game': 'offense_snaps',
    'attempts_per_game': 'attempts',
}

# We'll start with a copy of all_seasons_data so the raw frame stays untouched
# and this cell gives the same result every time it is re-run.
opt_model_all_seasons_data = all_seasons_data.copy()
games_played = opt_model_all_seasons_data['games']
for feature_name, total_column in PER_GAME_FEATURES.items():
    per_game = opt_model_all_seasons_data[total_column] / games_played
    # A row with games == 0 divides to +/-inf, which fillna() does not catch and
    # StandardScaler rejects; map it to NaN so it is zero-filled like any other gap.
    per_game = per_game.replace([np.inf, -np.inf], np.nan)
    opt_model_all_seasons_data[feature_name] = per_game.round(2)

# Missing values (in any column) are treated as 0.
# Reassign instead of inplace=True; both names below refer to the same filled frame.
opt_model_all_seasons_data = opt_model_all_seasons_data.fillna(0)
new_features_training_data_all_seasons = opt_model_all_seasons_data
new_features_training_data_all_seasons.head()

Unnamed: 0,name,team,position,season,games,receptions,targets,receiving_yards,rec_ypg,receiving_tds,...,overall,stadium_name,stadium_weather_type,stadium_surface,fantasy_points_ppr,target_per_game,carries_per_game,team_off_snaps_per_game,off_snaps_per_game,attempts_per_game
0,A.J. Brown,TEN,WR,2019,16,52,84,1051.0,65.69,8,...,51.0,Nissan Stadium,moderate,Grass,217.1,5.25,0.19,62.31,42.38,0.0
1,A.J. Brown,TEN,WR,2020,14,70,106,1075.0,76.79,11,...,51.0,Nissan Stadium,moderate,Grass,247.5,7.57,0.0,67.5,54.29,0.0
2,A.J. Brown,TEN,WR,2021,13,63,105,869.0,66.85,5,...,51.0,Nissan Stadium,moderate,Grass,180.9,8.08,0.15,67.38,46.46,0.15
3,A.J. Brown,PHI,WR,2022,17,88,145,1496.0,88.0,11,...,51.0,Lincoln Financial Field,cold,Grass,299.6,8.53,0.0,69.94,59.06,0.0
4,A.J. Brown,PHI,WR,2023,17,106,158,1456.0,85.65,7,...,51.0,Lincoln Financial Field,cold,Grass,289.6,9.29,0.0,67.88,59.94,0.0


In [4]:
# Check to see that the appropriate columns were created: the per-game features
# (target_per_game, carries_per_game, team_off_snaps_per_game, off_snaps_per_game,
# attempts_per_game) should appear at the end of the column index.

new_features_training_data_all_seasons.columns

Index(['name', 'team', 'position', 'season', 'games', 'receptions', 'targets',
       'receiving_yards', 'rec_ypg', 'receiving_tds', 'ypr',
       'receiving_fumbles', 'receiving_2pt_conversions', 'target_share',
       'air_yards_share', 'carries', 'rushing_yards', 'rush_ypg',
       'rushing_tds', 'rush_td_percentage', 'rushing_fumbles',
       'rushing_2pt_conversions', 'completions', 'attempts', 'comp_percentage',
       'passing_yards', 'pass_ypg', 'passing_tds', 'passing_2pt_conversions',
       'td_percentage', 'interceptions', 'sacks', 'sack_fumbles',
       'offense_snaps', 'teams_offense_snaps', 'round', 'overall',
       'stadium_name', 'stadium_weather_type', 'stadium_surface',
       'fantasy_points_ppr', 'target_per_game', 'carries_per_game',
       'team_off_snaps_per_game', 'off_snaps_per_game', 'attempts_per_game'],
      dtype='object')

In [5]:
# Cleaning up all seasons data columns.
# Two groups of columns are removed before modelling:
#   1. columns that identify an individual player record
#   2. season totals that feed directly into the target (td's, receiving yards, etc.)
# NOTE(review): per-game versions of some dropped totals (rec_ypg, rush_ypg, pass_ypg) are kept,
# and passing_2pt_conversions is kept while the receiving/rushing 2pt columns are dropped --
# confirm this is intended, since these columns may still leak the target.
identifier_columns = ['name', 'season', 'team']
target_component_columns = [
    'receptions', 'receiving_yards', 'receiving_tds', 'receiving_2pt_conversions',
    'rushing_yards', 'rushing_tds', 'rushing_2pt_conversions',
    'completions', 'passing_yards', 'passing_tds', 'interceptions',
]

all_seasons_data_cleaned = new_features_training_data_all_seasons.drop(
    columns=identifier_columns + target_component_columns
)
all_seasons_data_cleaned.head()

Unnamed: 0,position,games,targets,rec_ypg,ypr,receiving_fumbles,target_share,air_yards_share,carries,rush_ypg,...,overall,stadium_name,stadium_weather_type,stadium_surface,fantasy_points_ppr,target_per_game,carries_per_game,team_off_snaps_per_game,off_snaps_per_game,attempts_per_game
0,WR,16,84,65.69,20.21,1.0,0.2,0.29,3,3.75,...,51.0,Nissan Stadium,moderate,Grass,217.1,5.25,0.19,62.31,42.38,0.0
1,WR,14,106,76.79,15.36,2.0,0.27,0.35,0,0.0,...,51.0,Nissan Stadium,moderate,Grass,247.5,7.57,0.0,67.5,54.29,0.0
2,WR,13,105,66.85,13.79,0.0,0.28,0.45,2,0.77,...,51.0,Nissan Stadium,moderate,Grass,180.9,8.08,0.15,67.38,46.46,0.15
3,WR,17,145,88.0,17.0,2.0,0.29,0.41,0,0.0,...,51.0,Lincoln Financial Field,cold,Grass,299.6,8.53,0.0,69.94,59.06,0.0
4,WR,17,158,85.65,13.74,2.0,0.3,0.42,0,0.0,...,51.0,Lincoln Financial Field,cold,Grass,289.6,9.29,0.0,67.88,59.94,0.0


In [6]:
# Check to see that the appropriate columns were dropped: the identifier columns
# (name, season, team) and the raw season totals listed in the drop() call above
# should no longer be present.

all_seasons_data_cleaned.columns

Index(['position', 'games', 'targets', 'rec_ypg', 'ypr', 'receiving_fumbles',
       'target_share', 'air_yards_share', 'carries', 'rush_ypg',
       'rush_td_percentage', 'rushing_fumbles', 'attempts', 'comp_percentage',
       'pass_ypg', 'passing_2pt_conversions', 'td_percentage', 'sacks',
       'sack_fumbles', 'offense_snaps', 'teams_offense_snaps', 'round',
       'overall', 'stadium_name', 'stadium_weather_type', 'stadium_surface',
       'fantasy_points_ppr', 'target_per_game', 'carries_per_game',
       'team_off_snaps_per_game', 'off_snaps_per_game', 'attempts_per_game'],
      dtype='object')

In [7]:
# Encoding categorical columns for model.
# get_dummies is called without a `columns` argument, so the text columns are expanded
# into one boolean indicator column per category value (e.g. stadium_surface_Grass),
# while the numeric columns are passed through unchanged.
encoded_training_data_all_seasons = pd.get_dummies(all_seasons_data_cleaned)
encoded_training_data_all_seasons

Unnamed: 0,games,targets,rec_ypg,ypr,receiving_fumbles,target_share,air_yards_share,carries,rush_ypg,rush_td_percentage,...,stadium_name_Soldier Field,stadium_name_State Farm Stadium,stadium_name_TIAA Bank Field,stadium_name_U.S. Bank Stadium,stadium_weather_type_cold,stadium_weather_type_indoor,stadium_weather_type_moderate,stadium_weather_type_warm,stadium_surface_FieldTurf,stadium_surface_Grass
0,16,84,65.69,20.21,1.0,0.20,0.29,3,3.75,0.33,...,False,False,False,False,False,False,True,False,False,True
1,14,106,76.79,15.36,2.0,0.27,0.35,0,0.00,0.00,...,False,False,False,False,False,False,True,False,False,True
2,13,105,66.85,13.79,0.0,0.28,0.45,2,0.77,0.00,...,False,False,False,False,False,False,True,False,False,True
3,17,145,88.00,17.00,2.0,0.29,0.41,0,0.00,0.00,...,False,False,False,False,True,False,False,False,False,True
4,17,158,85.65,13.74,2.0,0.30,0.42,0,0.00,0.00,...,False,False,False,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2101,16,102,40.75,11.64,1.0,0.21,0.26,1,0.00,0.00,...,False,False,False,False,True,False,False,False,True,False
2102,11,20,14.00,11.00,0.0,0.05,0.04,0,0.00,0.00,...,False,False,False,False,False,True,False,False,False,True
2103,15,70,36.40,11.62,1.0,0.14,0.24,2,0.20,0.00,...,False,False,False,False,False,True,False,False,False,True
2104,16,121,51.44,10.04,0.0,0.22,0.26,4,1.12,0.00,...,False,False,True,False,False,False,False,True,False,True


In [8]:
# Column the model is trained to predict
target_column = 'fantasy_points_ppr'

# X (features): every encoded column except the target
X = encoded_training_data_all_seasons.drop(columns=[target_column])

# y (target variable)
y = encoded_training_data_all_seasons[target_column]

In [9]:
# Splitting the data into training and testing sets.
# random_state pins the shuffle so the split is reproducible; test_size=0.25 is
# scikit-learn's default, spelled out here so the 75/25 split is visible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [10]:
# Standardize the features. The scaler is fitted on the training split only, so nothing
# from the test split influences the scaling parameters.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() returns the scaler itself

# Apply the same fitted transformation to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [26]:
# Support vector regressor with a linear kernel.
# gamma is only used by the 'rbf', 'poly' and 'sigmoid' kernels, so it is not passed here:
# supplying gamma='scale' to a linear kernel has no effect on the fitted model.
svr_linear = SVR(kernel='linear', C=10.0, epsilon=0.01)
svr_linear.fit(X_train_scaled, y_train)

In [29]:
# Run the fitted linear SVR against the held-out test data.
# NOTE: despite the "training_" prefix in the variable name, these are predictions for X_test_scaled.
training_predictions_all_seasons = svr_linear.predict(X_test_scaled)

In [30]:
# R2 of the linear SVR on the held-out test set (closer to 1.0 is better)
training_all_seasons_data_r2 = r2_score(
    y_true=y_test,
    y_pred=training_predictions_all_seasons,
)
training_all_seasons_data_r2

0.9751228078636596

In [32]:
# Root mean squared error of the linear SVR on the held-out test set,
# expressed in the same units as fantasy_points_ppr
training_all_seasons_data_mse = mean_squared_error(
    y_true=y_test,
    y_pred=training_predictions_all_seasons,
)
training_all_seasons_data_rmse = math.sqrt(training_all_seasons_data_mse)
training_all_seasons_data_rmse

14.877657953245409

In [None]:
# This result is essentially the same as the one obtained with linear regression (the difference
# is less than 0.1%), so there is no reason to choose the SVM (linear SVR) over linear regression.

### Support vector regressor with Gaussian (radial basis function) kernel

In [33]:
# Support vector regressor with an RBF kernel, using the same C and epsilon as the
# linear SVR so the two kernels are compared on equal settings
svr_rbf = SVR(
    kernel='rbf',
    gamma='scale',
    C=10.0,
    epsilon=0.01,
)
svr_rbf.fit(X_train_scaled, y_train)

In [34]:
# R2 of the RBF model on the held-out test set
# (for scikit-learn regressors, score() returns the coefficient of determination)
svr_rbf.score(X_test_scaled,y_test)

0.9023429482291252

In [35]:
# Predict on the held-out test data with the RBF model.
# NOTE: this reuses (and overwrites) the variable that held the linear SVR predictions.
training_predictions_all_seasons = svr_rbf.predict(X_test_scaled)

In [36]:
# Root mean squared error of the RBF model on the held-out test set.
# These names were previously bound to the linear SVR's MSE/RMSE and are overwritten here.
training_all_seasons_data_mse = mean_squared_error(
    y_true=y_test,
    y_pred=training_predictions_all_seasons,
)
training_all_seasons_data_rmse = math.sqrt(training_all_seasons_data_mse)
training_all_seasons_data_rmse

29.477163753417756

In [37]:
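# This result is not nearly as good a fit as the two linear models (linear regression and linear SVR):
# R2 is about 0.90 versus about 0.975 for the linear SVR, and RMSE is about 29.5 versus about 14.9.
# Note that the RBF kernel was only tried with a single, untuned hyperparameter setting.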

### It appears that a linear model is the best fit, and SVM offers no significant advantage over linear regression, so we will stick with the more efficient linear regression model