In [1]:
# import necessary packages
import nfl_data_py as nfl
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# load weekly player stats (no play-by-play data is loaded here):
# 2019-2022 form the training set, 2023 is held out as the test set.
# Seasons are listed oldest-first so that, if the loader keeps the requested
# order when combining seasons, each player's rows run forward in time; the
# sequence-building step later treats a player's last row as the latest week.
TRAIN_SEASONS = [2019, 2020, 2021, 2022]
TEST_SEASONS = [2023]
weekly_data_test = nfl.import_weekly_data(TEST_SEASONS)
weekly_data_train = nfl.import_weekly_data(TRAIN_SEASONS)

Downcasting floats.
Downcasting floats.


In [3]:
# preview the first rows of the 2023 (test) weekly data
weekly_data_test.head()

Unnamed: 0,player_id,player_name,player_display_name,position,position_group,headshot_url,recent_team,season,week,season_type,...,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,special_teams_tds,fantasy_points,fantasy_points_ppr
0,00-0023459,A.Rodgers,Aaron Rodgers,QB,QB,https://static.www.nfl.com/image/private/f_aut...,NYJ,2023,1,REG,...,0.0,,0,,,,,0.0,0.0,0.0
1,00-0024243,M.Lewis,Marcedes Lewis,TE,TE,https://static.www.nfl.com/image/private/f_aut...,CHI,2023,4,REG,...,0.0,0.483465,0,0.0,0.03125,-0.012397,0.038197,0.0,0.8,1.8
2,00-0024243,M.Lewis,Marcedes Lewis,TE,TE,https://static.www.nfl.com/image/private/f_aut...,CHI,2023,7,REG,...,1.0,1.437224,0,3.2,0.034483,0.09434,0.117762,0.0,1.6,2.6
3,00-0024243,M.Lewis,Marcedes Lewis,TE,TE,https://static.www.nfl.com/image/private/f_aut...,CHI,2023,11,REG,...,0.0,-0.547367,0,0.0,0.045455,0.036885,0.094001,0.0,0.0,0.0
4,00-0026158,J.Flacco,Joe Flacco,QB,QB,https://static.www.nfl.com/image/private/f_aut...,CLE,2023,13,REG,...,0.0,,0,,,,,0.0,16.16,16.16


In [4]:
# list the distinct position labels present in the test data
weekly_data_test['position'].unique()

array(['QB', 'TE', 'P', 'WR', 'FB', 'RB', 'T', 'OLB', 'ILB', 'CB', 'FS',
       'DT', 'G', 'SS', 'MLB'], dtype=object)

In [5]:
# convert player ids such as "00-0023459" to ints by removing the hyphen.
# Casting to str first makes the cell safe to re-run: ids that are already
# ints pass through unchanged instead of failing on a missing .split().
for weekly_df in (weekly_data_test, weekly_data_train):
    weekly_df['player_id'] = (
        weekly_df['player_id']
        .astype(str)
        .str.replace('-', '', regex=False)
        .astype('int64')
    )

In [6]:
# show the columns that nfl_data_py reports for the weekly data
nfl.see_weekly_cols()

Index(['player_id', 'player_name', 'player_display_name', 'position',
       'position_group', 'headshot_url', 'recent_team', 'season', 'week',
       'season_type', 'completions', 'attempts', 'passing_yards',
       'passing_tds', 'interceptions', 'sacks', 'sack_yards', 'sack_fumbles',
       'sack_fumbles_lost', 'passing_air_yards', 'passing_yards_after_catch',
       'passing_first_downs', 'passing_epa', 'passing_2pt_conversions', 'pacr',
       'dakota', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles',
       'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa',
       'rushing_2pt_conversions', 'receptions', 'targets', 'receiving_yards',
       'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost',
       'receiving_air_yards', 'receiving_yards_after_catch',
       'receiving_first_downs', 'receiving_epa', 'receiving_2pt_conversions',
       'racr', 'target_share', 'air_yards_share', 'wopr', 'special_teams_tds',
       'fantasy_points', 'fantasy_point

# Filter Variables from Entire Data

In [7]:
# drop cols that are not needed in any dataframe
cols_to_drop = [
                'player_display_name',
                'position_group',
                'headshot_url',
                'season_type',
                'receiving_2pt_conversions',
                'sacks',
                'sack_yards',
                'sack_fumbles',
                'sack_fumbles_lost',
                'receiving_fumbles_lost',
                'receiving_fumbles',
                'rushing_fumbles',
                'rushing_fumbles_lost',
                'special_teams_tds',
                ]
# Reassign rather than mutate in place, and skip columns that are already
# gone, so re-running this cell does not raise a KeyError.
weekly_data_test = weekly_data_test.drop(columns=cols_to_drop, errors='ignore')
weekly_data_train = weekly_data_train.drop(columns=cols_to_drop, errors='ignore')

In [8]:
# confirm the unwanted columns are gone by listing what remains
weekly_data_test.columns

Index(['player_id', 'player_name', 'position', 'recent_team', 'season', 'week',
       'opponent_team', 'completions', 'attempts', 'passing_yards',
       'passing_tds', 'interceptions', 'passing_air_yards',
       'passing_yards_after_catch', 'passing_first_downs', 'passing_epa',
       'passing_2pt_conversions', 'pacr', 'dakota', 'carries', 'rushing_yards',
       'rushing_tds', 'rushing_first_downs', 'rushing_epa',
       'rushing_2pt_conversions', 'receptions', 'targets', 'receiving_yards',
       'receiving_tds', 'receiving_air_yards', 'receiving_yards_after_catch',
       'receiving_first_downs', 'receiving_epa', 'racr', 'target_share',
       'air_yards_share', 'wopr', 'fantasy_points', 'fantasy_points_ppr'],
      dtype='object')

## Filter by Position

In [9]:
def _rows_for_positions(frame, positions):
    """Return the rows of `frame` whose 'position' value is in `positions`."""
    return frame[frame["position"].isin(positions)]


# FLEX: any of RB / WR / TE
flex_list = ["RB", "WR", "TE"]

# QB
qb_train = _rows_for_positions(weekly_data_train, ["QB"])
qb_test = _rows_for_positions(weekly_data_test, ["QB"])

# RB
rb_train = _rows_for_positions(weekly_data_train, ["RB"])
rb_test = _rows_for_positions(weekly_data_test, ["RB"])

# WR
wr_train = _rows_for_positions(weekly_data_train, ["WR"])
wr_test = _rows_for_positions(weekly_data_test, ["WR"])

# TE
te_train = _rows_for_positions(weekly_data_train, ["TE"])
te_test = _rows_for_positions(weekly_data_test, ["TE"])

# FLEX
flex_train = _rows_for_positions(weekly_data_train, flex_list)
flex_test = _rows_for_positions(weekly_data_test, flex_list)

In [10]:
# sanity check: the flex frame should contain only the positions in flex_list
flex_train["position"].unique()

array(['TE', 'WR', 'RB'], dtype=object)

## Drop Cols by Position

In [11]:
# define qb cols to drop (position label plus all receiving-related cols)
qb_cols_to_drop = ['position',
                   'receptions',
                   'targets',
                   'receiving_yards',
                   'receiving_tds',
                   'receiving_air_yards',
                   'receiving_yards_after_catch',
                   'receiving_first_downs',
                   'receiving_epa',
                   'racr',
                   'target_share',
                   'air_yards_share',
                   'wopr'
                   ]
# opponent_team is removed from BOTH splits (and skipped where it is absent)
# so train and test keep the same feature columns; errors='ignore' also makes
# the cell safe to re-run.
qb_train = qb_train.drop(columns=qb_cols_to_drop + ['opponent_team'], errors='ignore')
qb_test = qb_test.drop(columns=qb_cols_to_drop + ['opponent_team'], errors='ignore')

In [12]:
# define rb cols to drop (position label plus all passing-related cols)
rb_cols_to_drop = ['position',
                   'completions',
                   'attempts',
                   'passing_yards',
                   'passing_tds',
                   'passing_air_yards',
                   'passing_yards_after_catch',
                   'passing_first_downs',
                   'passing_epa',
                   'passing_2pt_conversions',
                   'pacr',
                   'dakota',
                   'interceptions'
                   ]
# opponent_team is removed from BOTH splits (and skipped where it is absent)
# so train and test keep the same feature columns; errors='ignore' also makes
# the cell safe to re-run.
rb_train = rb_train.drop(columns=rb_cols_to_drop + ['opponent_team'], errors='ignore')
rb_test = rb_test.drop(columns=rb_cols_to_drop + ['opponent_team'], errors='ignore')

In [13]:
# define wr cols to drop (position label plus all passing- and rushing-related cols)
wr_cols_to_drop = ['position',
                   'completions',
                   'attempts',
                   'passing_yards',
                   'passing_tds',
                   'passing_air_yards',
                   'passing_yards_after_catch',
                   'passing_first_downs',
                   'passing_epa',
                   'passing_2pt_conversions',
                   'pacr',
                   'dakota',
                   'carries',
                   'rushing_yards',
                   'rushing_tds',
                   'rushing_first_downs',
                   'rushing_epa',
                   'rushing_2pt_conversions',
                   'interceptions'
                   ]
# opponent_team is removed from BOTH splits (and skipped where it is absent)
# so train and test keep the same feature columns; errors='ignore' also makes
# the cell safe to re-run.
wr_train = wr_train.drop(columns=wr_cols_to_drop + ['opponent_team'], errors='ignore')
wr_test = wr_test.drop(columns=wr_cols_to_drop + ['opponent_team'], errors='ignore')

In [14]:
# define te cols to drop (position label plus all passing- and rushing-related cols)
te_cols_to_drop = ['position',
                   'completions',
                   'attempts',
                   'passing_yards',
                   'passing_tds',
                   'passing_air_yards',
                   'passing_yards_after_catch',
                   'passing_first_downs',
                   'passing_epa',
                   'passing_2pt_conversions',
                   'pacr',
                   'dakota',
                   'carries',
                   'rushing_yards',
                   'rushing_tds',
                   'rushing_first_downs',
                   'rushing_epa',
                   'rushing_2pt_conversions',
                   'interceptions'
                   ]
# opponent_team is removed from BOTH splits (and skipped where it is absent)
# so train and test keep the same feature columns; errors='ignore' also makes
# the cell safe to re-run.
te_train = te_train.drop(columns=te_cols_to_drop + ['opponent_team'], errors='ignore')
te_test = te_test.drop(columns=te_cols_to_drop + ['opponent_team'], errors='ignore')

In [15]:
# define flex cols to drop (position label plus all passing-related cols)
flex_cols_to_drop = ['position',
                     'completions',
                     'attempts',
                     'passing_yards',
                     'passing_tds',
                     'passing_air_yards',
                     'passing_yards_after_catch',
                     'passing_first_downs',
                     'passing_epa',
                     'passing_2pt_conversions',
                     'pacr',
                     'dakota',
                     'interceptions'
                     ]
# opponent_team is removed from BOTH splits (and skipped where it is absent)
# so train and test keep the same feature columns; errors='ignore' also makes
# the cell safe to re-run.
flex_train = flex_train.drop(columns=flex_cols_to_drop + ['opponent_team'], errors='ignore')
flex_test = flex_test.drop(columns=flex_cols_to_drop + ['opponent_team'], errors='ignore')

# Other Preprocessing

In [16]:
# one-hot encode the QB team column. The test frame is then re-indexed to the
# training columns so both splits expose identical features in the same order:
# columns that exist only in test are dropped, and columns that exist only in
# train are added to test filled with 0.
qb_train = pd.get_dummies(qb_train, columns=['recent_team'])
qb_test = (
    pd.get_dummies(qb_test, columns=['recent_team'])
    .reindex(columns=qb_train.columns, fill_value=0)
)

In [17]:
# one-hot encode the RB team column. The test frame is then re-indexed to the
# training columns so both splits expose identical features in the same order:
# columns that exist only in test are dropped, and columns that exist only in
# train are added to test filled with 0.
rb_train = pd.get_dummies(rb_train, columns=['recent_team'])
rb_test = (
    pd.get_dummies(rb_test, columns=['recent_team'])
    .reindex(columns=rb_train.columns, fill_value=0)
)

In [18]:
# one-hot encode the WR team column. The test frame is then re-indexed to the
# training columns so both splits expose identical features in the same order:
# columns that exist only in test are dropped, and columns that exist only in
# train are added to test filled with 0.
wr_train = pd.get_dummies(wr_train, columns=['recent_team'])
wr_test = (
    pd.get_dummies(wr_test, columns=['recent_team'])
    .reindex(columns=wr_train.columns, fill_value=0)
)

In [19]:
# one-hot encode the TE team column. The test frame is then re-indexed to the
# training columns so both splits expose identical features in the same order:
# columns that exist only in test are dropped, and columns that exist only in
# train are added to test filled with 0.
te_train = pd.get_dummies(te_train, columns=['recent_team'])
te_test = (
    pd.get_dummies(te_test, columns=['recent_team'])
    .reindex(columns=te_train.columns, fill_value=0)
)

In [20]:
# one-hot encode the FLEX team column. The test frame is then re-indexed to the
# training columns so both splits expose identical features in the same order:
# columns that exist only in test are dropped, and columns that exist only in
# train are added to test filled with 0.
flex_train = pd.get_dummies(flex_train, columns=['recent_team'])
flex_test = (
    pd.get_dummies(flex_test, columns=['recent_team'])
    .reindex(columns=flex_train.columns, fill_value=0)
)

- Use RMSE as the evaluation metric so that errors are expressed in the same units as the target (PPR fantasy points)

# Preprocess Data For LSTM
- Input will be a (Num. players x Timesteps x Num. Features) shaped array
- Output will be a (Num. players,) shaped array where each entry is the player's PPR for their final week, which is held out of the input

In [21]:
import torch

In [22]:
# split the QB columns into static features (fixed per player) and dynamic
# features (everything else)
qb_static_fts = ['player_id']
qb_dynamic_fts = list(
    filter(lambda col: col not in qb_static_fts, qb_train.columns)
)

In [57]:
def create_model_input(input_df):
    """Turn per-week player rows into padded sequences and last-week PPR targets.

    Rows are grouped per player, each player's weeks are left-padded with
    zeros up to the longest history found in ``input_df``, and the final week
    is split off: its ``fantasy_points_ppr`` becomes the target and the
    remaining weeks become the input sequence.

    Players are identified by ``(player_name, player_id)`` rather than by
    ``player_name`` alone, so two different players who share a name are not
    merged into one sequence. ``player_id`` is still kept as a feature column.

    Parameters
    ----------
    input_df : pandas.DataFrame
        One row per player-week. Must contain ``player_name``, ``player_id``
        and ``fantasy_points_ppr``. Every column except ``player_name`` is
        used as a feature and must be convertible to float32. When both
        ``season`` and ``week`` are present the rows are ordered by them
        first, so the "last" week is the most recent one regardless of the
        incoming row order.

    Returns
    -------
    X : numpy.ndarray, shape (num_players, max_weeks - 1, num_features)
        Feature sequences without each player's final week. Players are
        ordered by ``(player_name, player_id)``; features follow the column
        order of ``input_df`` (minus ``player_name``).
    Y : numpy.ndarray, shape (num_players,)
        ``fantasy_points_ppr`` of each player's final week.

    Both arrays are float64 holding values that were converted through
    float32. NaN feature values are passed through unchanged.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If there are no player rows, or a feature cannot be cast to float32.
    """
    # make "last week" mean the latest week, independent of incoming row order;
    # the stable sort keeps the existing order for rows with equal season/week
    ordered = input_df
    if {"season", "week"}.issubset(ordered.columns):
        ordered = ordered.sort_values(["season", "week"], kind="stable")

    # every column except the display name is a feature (player_id included)
    feature_cols = ordered.columns.drop("player_name")
    fantasy_ppr_idx = feature_cols.get_loc("fantasy_points_ppr")

    # one (weeks, features) float32 block per distinct player
    sequences = [
        player_rows.loc[:, feature_cols].to_numpy(dtype="float32")
        for _, player_rows in ordered.groupby(["player_name", "player_id"], sort=True)
    ]
    if not sequences:
        raise ValueError("input_df contains no player rows to build sequences from")

    # longest history across all players determines the padded length
    max_weeks = max(len(seq) for seq in sequences)

    # left-pad with zeros: a player's real weeks occupy the END of the time axis
    X = np.zeros((len(sequences), max_weeks, len(feature_cols)))
    for row, seq in enumerate(sequences):
        X[row, max_weeks - len(seq):, :] = seq

    # extract last week of fantasy ppr for each player as targets
    Y = X[:, -1, fantasy_ppr_idx]

    # drop last week of data from X so the target week is not part of the input
    X = X[:, :-1, :]

    return X, Y

In [58]:
# create training and testing data for qbs:
# X holds every week except each player's last one,
# Y holds the fantasy_points_ppr of that held-out last week
qb_train_X, qb_train_Y = create_model_input(qb_train)
qb_test_X, qb_test_Y = create_model_input(qb_test)

In [None]:
# create training and testing data for rbs:
# X holds every week except each player's last one,
# Y holds the fantasy_points_ppr of that held-out last week
rb_train_X, rb_train_Y = create_model_input(rb_train)
rb_test_X, rb_test_Y = create_model_input(rb_test)

In [None]:
# create training and testing data for wrs:
# X holds every week except each player's last one,
# Y holds the fantasy_points_ppr of that held-out last week
wr_train_X, wr_train_Y = create_model_input(wr_train)
wr_test_X, wr_test_Y = create_model_input(wr_test)

In [None]:
# create training and testing data for tes:
# X holds every week except each player's last one,
# Y holds the fantasy_points_ppr of that held-out last week
te_train_X, te_train_Y = create_model_input(te_train)
te_test_X, te_test_Y = create_model_input(te_test)

In [None]:
# create training and testing data for flex:
# X holds every week except each player's last one,
# Y holds the fantasy_points_ppr of that held-out last week
flex_train_X, flex_train_Y = create_model_input(flex_train)
flex_test_X, flex_test_Y = create_model_input(flex_test)