<a href="https://colab.research.google.com/github/ojh485/fantasy/blob/main/Fantasy_Football_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##**License**

**Web scraping code copied and modified from: https://github.com/logan-lauton/nfl_webscrape/tree/main
See the license below:**

MIT License

Copyright (c) 2023 Logan Lauton

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

##Web Scraping

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time

In [None]:
## (See License)

##code to scrape a single seasons data
def single(season):
    """Scrape one season of fantasy stats from Pro Football Reference.

    Args:
        season: Season year; it is interpolated into the page URL.

    Returns:
        DataFrame built from the first table on the page, with flat column
        names, the 'Rk' column removed, a leading 'Season' column, and every
        value that parses as a number converted to numeric (other values are
        kept as they were).
    """
    from io import StringIO  # local import so the cell stays self-contained

    url = f'https://www.pro-football-reference.com/years/{season}/fantasy.htm'
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(url) as response:
        table_html = BeautifulSoup(response, 'html.parser').find_all('table')
    # read_html wants a file-like object; passing literal HTML is deprecated.
    df = pd.read_html(StringIO(str(table_html)))[0]
    df.columns = ['Rk', 'Player', 'Tm', 'FantPos','Age','G','GS','Cmp','P_Att','P_Yds', \
                  'P_TD','Int','R_Att','R_Yds','R_Y/A','R_TD','Re_Tgt','Rec','Re_Y' \
                  , 'Y/R', 'Re_TD','Fmb', 'FL','TD','2PM','2PP', 'FantPt', 'PPR', \
                  'DkPt', 'FDPt', 'VBD', 'PosRank','OvRank'] #renaming columns bc of multi-index

    df = df.drop(columns='Rk') # drop Rk columns
    # '*' and '+' are regex metacharacters, so strip them as literal text
    # (regex=False) instead of relying on the pandas-version default.
    df['Player'] = (
        df['Player']
        .str.replace('*', '', regex=False)  # remove asterisk on player's name
        .str.replace('+', '', regex=False)  # remove plus on player's name
    )
    df.insert(0, 'Season', season) # insert season column
    # coerce turns unparsable values into NaN; fillna(df) restores the originals
    df = df.apply(pd.to_numeric, errors='coerce').fillna(df)
    return df

##function to scrape multiple seasons of data at a time
def multiple(start_year,end_year):
    """Scrape every season from start_year through end_year (inclusive).

    Args:
        start_year: First season to scrape (always scraped).
        end_year: Last season to scrape; if it is not greater than
            start_year only start_year is scraped.

    Returns:
        One DataFrame with the per-season frames stacked in season order.
        Each season keeps its own row index, so index values repeat.
    """
    # Collect the frames and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole accumulated frame on every call.
    frames = [single(start_year)]
    for season in range(start_year + 1, end_year + 1):
        # Sleep 4 seconds between calls: 20 requests per minute are allowed,
        # so this keeps the scraper at 15 requests per minute.
        time.sleep(4)
        frames.append(single(season))
    return pd.concat(frames)

In [None]:
##calling for all NFL seasons I deemed 'relevant' for my purposes. (2000-2001 – 2022-23)
##NOTE: multiple() sleeps 4 seconds between seasons, so this cell takes over a minute to run
df = multiple(2000,2022)

In [None]:
##quick check to ensure both 2000 and 2022 were included in the table
##(a bare expression at the end of a cell is rendered as the cell's output)
df

Unnamed: 0,Season,Player,Tm,FantPos,Age,G,GS,Cmp,P_Att,P_Yds,...,TD,2PM,2PP,FantPt,PPR,DkPt,FDPt,VBD,PosRank,OvRank
0,2000,Marshall Faulk,STL,RB,27.0,14.0,14.0,0.0,0.0,0.0,...,26.0,2.0,,379.0,459.9,465.9,419.4,228.0,1.0,1.0
1,2000,Edgerrin James,IND,RB,22.0,16.0,16.0,0.0,0.0,0.0,...,18.0,1.0,,332.0,395.3,405.3,363.8,181.0,2.0,2.0
2,2000,Jeff Garcia,SFO,QB,30.0,16.0,16.0,355.0,561.0,4278.0,...,4.0,,1.0,341.0,340.5,357.5,350.5,149.0,1.0,3.0
3,2000,Daunte Culpepper,MIN,QB,23.0,16.0,16.0,297.0,474.0,3937.0,...,7.0,,2.0,338.0,338.5,366.5,354.5,147.0,2.0,4.0
4,2000,Eddie George,TEN,RB,27.0,16.0,16.0,0.0,0.0,0.0,...,16.0,,,284.0,334.2,344.2,309.2,133.0,3.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,2022,Ihmir Smith-Marsette,2TM,WR,23.0,8.0,0.0,0.0,0.0,0.0,...,0.0,,,-1.0,0.4,1.4,-0.1,,252.0,
667,2022,KaVontae Turpin,DAL,WR,26.0,17.0,0.0,0.0,0.0,0.0,...,0.0,,,-1.0,-0.4,1.6,-0.9,,254.0,
668,2022,Taiwan Jones,BUF,,34.0,16.0,0.0,0.0,0.0,0.0,...,0.0,,,-2.0,-2.0,-1.0,-2.0,,173.0,
669,2022,Chester Rogers,CLE,,28.0,3.0,0.0,0.0,0.0,0.0,...,0.0,,,-2.0,-2.0,-1.0,-2.0,,255.0,


## Data Wrangling and Download

In [None]:
##drop every row that merely repeats the column names
##(in those rows the Player column holds the literal string 'Player')
is_header_row = df['Player'].eq('Player')
df = df.loc[~is_header_row]

In [None]:
#Removing irrelevant columns for fantasy:
drop = ['FL', 'PosRank', 'OvRank']
#columns= keyword: the positional axis form df.drop(drop, 1) was removed in pandas 2.0
df = df.drop(columns = drop)

  df = df.drop(drop, 1)


KeyError: ignored

In [None]:
#changing to appropriate datatypes
#infer_objects() tries to give object-dtype columns a more specific dtype;
#info() then prints the per-column dtypes and non-null counts for inspection
df = df.infer_objects()
df.info()

In [None]:
#adding relevant column(s): FPP/Game, PPR/Game, HalfPPR/Game
#per-game values are only defined for players with at least one game played;
#everyone else gets NaN (np.nan: the np.NaN alias was removed in NumPy 2.0)
played = df['G'] >= 1
df['FPP/G'] = np.where(played, df['FantPt'] / df['G'], np.nan)
df['PPR/G'] = np.where(played, df['PPR'] / df['G'], np.nan)

#half-PPR total: standard points plus half a point per reception
halfPPR = (df['FantPt'] + 0.5*df['Rec']) / df['G']
df['HalfPPR/G'] = np.where(played, halfPPR, np.nan)
#player id: rows sharing a name and the same (Age - Season) offset get the same group number
df['id'] = df.groupby(['Player', (df['Age'] - df['Season'])]).ngroup()
#displayed for inspection only; the sorted frame is not assigned back to df
df.sort_values('id')

In [None]:
#Removing all players that only have one season (besides 2022 rookies)
#mask marks rows whose id occurs exactly once and whose season is not 2022
mask = ~(df.duplicated('id', keep = False)) & (df.Season != 2022)
#filter directly instead of anti-joining df against the masked rows with an outer merge:
#the same rows survive, the original row order is kept regardless of pandas version,
#and the index is renumbered from 0 as the merge result was
df = df[~mask].reset_index(drop = True)

In [None]:
#Persist the cleaned table to Google Drive so the cells above need not be rerun
from google.colab import drive

drive.mount('/content/drive')
path = '/content/drive/MyDrive/Coding Projects/ML Fantasy Predictions/NFLstats.csv'
#utf-8-sig writes a byte-order mark at the start of the file
with open(path, mode = 'w', encoding = 'utf-8-sig') as csv_file:
  df.to_csv(csv_file)


In [None]:
#sanity check for the id assignment: list every row that shares name, season and age
#with another row (two such players would be given the same id);
#an empty result means the assignment is safe
identity_cols = ['Player', 'Season', 'Age']
mask = df.duplicated(subset = identity_cols, keep = False)
df.loc[mask]

## Access Downloaded File (Start Here After Running Previous Once)

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import tensorflow as tf

In [None]:
#mount Google Drive under /content/drive so the saved CSV can be read from there
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
path = "/content/drive/MyDrive/Coding Projects/ML Fantasy Predictions/NFLstats.csv"
df = pd.read_csv(path)
#the CSV was written with its index, which read_csv loads back as the column 'Unnamed: 0'
#(columns= keyword: the positional axis form df.drop(label, 1) was removed in pandas 2.0)
df = df.drop(columns = 'Unnamed: 0')

  df = df.drop('Unnamed: 0', 1)


In [None]:
#Creating dataframes based on position:
#.copy() makes each frame independent of df, so modifying it later
#does not trigger pandas' SettingWithCopyWarning
QB = df.loc[df['FantPos'] == 'QB'].copy()
RB = df.loc[df['FantPos'] == 'RB'].copy()
WR = df.loc[df['FantPos'] == 'WR'].copy()
TE = df.loc[df['FantPos'] == 'TE'].copy()

In [None]:
#training data: every row of every player except the last row recorded for that id
#(duplicated(keep='last') is False only for the final occurrence of each id)
has_later_row = df.duplicated(subset = 'id', keep = 'last')
training = (
    df.loc[has_later_row]
      .sort_values(by = ['id', 'Season'])
      .reset_index()   #the previous index is kept as an 'index' column
)
training

Unnamed: 0,index,Season,Player,Tm,FantPos,Age,G,GS,Cmp,P_Att,...,2PP,FantPt,PPR,DkPt,FDPt,VBD,FPP/G,PPR/G,HalfPPR/G,id
0,10471,2019.0,A.J. Brown,TEN,WR,22.0,16.0,11.0,0.0,0.0,...,,165.0,217.1,220.1,191.1,36.0,10.312500,13.568750,11.937500,0
1,11059,2020.0,A.J. Brown,TEN,WR,23.0,14.0,12.0,0.0,0.0,...,,178.0,247.5,251.5,212.5,52.0,12.714286,17.678571,15.214286,0
2,11760,2021.0,A.J. Brown,TEN,WR,24.0,13.0,13.0,0.0,2.0,...,,118.0,180.9,183.9,149.4,,9.076923,13.915385,11.500000,0
3,9080,2016.0,A.J. Derby,2TM,TE,25.0,10.0,3.0,0.0,0.0,...,,14.0,30.0,34.0,22.0,,1.400000,3.000000,2.200000,1
4,9551,2017.0,A.J. Derby,2TM,TE,26.0,11.0,1.0,0.0,0.0,...,,34.0,55.4,59.4,44.9,,3.090909,5.036364,4.045455,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10255,358,2000.0,Zeron Flemister,WAS,TE,24.0,5.0,0.0,0.0,0.0,...,,1.0,1.8,1.8,1.3,,0.200000,0.360000,0.300000,3419
10256,656,2001.0,Zeron Flemister,WAS,TE,25.0,16.0,1.0,0.0,0.0,...,,32.0,49.6,52.6,40.6,,2.000000,3.100000,2.562500,3419
10257,1218,2002.0,Zeron Flemister,WAS,TE,26.0,15.0,7.0,0.0,0.0,...,,27.0,36.6,39.6,31.6,,1.800000,2.440000,2.133333,3419
10258,1874,2003.0,Zeron Flemister,WAS,TE,27.0,12.0,9.0,0.0,0.0,...,,7.0,15.9,16.9,11.4,,0.583333,1.325000,0.958333,3419


In [None]:
#Dropping Unnecessary QB data
drop = ['Season','Tm', 'FantPos', 'Re_Tgt', 'Rec', 'Re_Y', 'Y/R', 'Re_TD', 'VBD','2PM', '2PP']
#reassign instead of inplace=True: an in-place drop on a frame sliced from df
#raises SettingWithCopyWarning
QB = QB.drop(columns = drop)
QB.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  QB.drop(drop, axis = 1, inplace = True)


Index(['Player', 'Age', 'G', 'GS', 'Cmp', 'P_Att', 'P_Yds', 'P_TD', 'Int',
       'R_Att', 'R_Yds', 'R_Y/A', 'R_TD', 'Fmb', 'TD', 'FantPt', 'PPR', 'DkPt',
       'FDPt', 'FPP/G', 'PPR/G', 'HalfPPR/G', 'id'],
      dtype='object')

In [None]:
#Dropping Unnecessary RB data
drop = ['Season','Tm', 'FantPos', 'Cmp', 'P_Att',
       'P_Yds', 'P_TD', 'Int', 'VBD','2PM', '2PP']
#reassign instead of inplace=True: an in-place drop on a frame sliced from df
#raises SettingWithCopyWarning
RB = RB.drop(columns = drop)
RB.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RB.drop(drop, axis = 1, inplace = True)


Index(['Player', 'Age', 'G', 'GS', 'R_Att', 'R_Yds', 'R_Y/A', 'R_TD', 'Re_Tgt',
       'Rec', 'Re_Y', 'Y/R', 'Re_TD', 'Fmb', 'TD', 'FantPt', 'PPR', 'DkPt',
       'FDPt', 'FPP/G', 'PPR/G', 'HalfPPR/G', 'id'],
      dtype='object')

In [None]:
#Dropping Unnecessary WR data
drop = ['Season','Tm', 'FantPos', 'Cmp', 'P_Att',
       'P_Yds', 'P_TD', 'Int', 'VBD','2PM', '2PP']
#reassign instead of inplace=True: an in-place drop on a frame sliced from df
#raises SettingWithCopyWarning
WR = WR.drop(columns = drop)
WR.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WR.drop(drop, axis = 1, inplace = True)


Index(['Player', 'Age', 'G', 'GS', 'R_Att', 'R_Yds', 'R_Y/A', 'R_TD', 'Re_Tgt',
       'Rec', 'Re_Y', 'Y/R', 'Re_TD', 'Fmb', 'TD', 'FantPt', 'PPR', 'DkPt',
       'FDPt', 'FPP/G', 'PPR/G', 'HalfPPR/G', 'id'],
      dtype='object')

In [None]:
#Dropping Unnecessary TE data
drop = ['Season','Tm', 'FantPos', 'Cmp', 'P_Att',
       'P_Yds', 'P_TD', 'Int', 'VBD','2PM', '2PP']
#reassign instead of inplace=True: an in-place drop on a frame sliced from df
#raises SettingWithCopyWarning
TE = TE.drop(columns = drop)
TE.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TE.drop(drop, axis = 1, inplace = True)


Index(['Player', 'Age', 'G', 'GS', 'R_Att', 'R_Yds', 'R_Y/A', 'R_TD', 'Re_Tgt',
       'Rec', 'Re_Y', 'Y/R', 'Re_TD', 'Fmb', 'TD', 'FantPt', 'PPR', 'DkPt',
       'FDPt', 'FPP/G', 'PPR/G', 'HalfPPR/G', 'id'],
      dtype='object')

In [None]:
#order each QB's rows by age within their id, renumber the rows 0..n-1 and
#replace missing stats with 0; reassigned rather than done in place so that
#pandas does not raise SettingWithCopyWarning on a frame sliced from df
QB = QB.sort_values(['id','Age']).reset_index(drop = True).fillna(0)
QB

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  QB.sort_values(['id','Age'], inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  QB.fillna(0, inplace = True)


Unnamed: 0,Player,Age,G,GS,Cmp,P_Att,P_Yds,P_TD,Int,R_Att,...,Fmb,TD,FantPt,PPR,DkPt,FDPt,FPP/G,PPR/G,HalfPPR/G,id
0,A.J. Feeley,24.0,1.0,0.0,10.0,14.0,143.0,2.0,1.0,0.0,...,0.0,0.0,12.0,11.7,12.7,12.7,12.000000,11.700000,12.000000,2
1,A.J. Feeley,25.0,6.0,5.0,86.0,154.0,1011.0,6.0,5.0,12.0,...,2.0,0.0,53.0,53.0,62.0,58.0,8.833333,8.833333,8.833333,2
2,A.J. Feeley,27.0,11.0,8.0,191.0,356.0,1893.0,11.0,15.0,14.0,...,10.0,1.0,91.0,91.0,114.0,106.0,8.272727,8.272727,8.272727,2
3,A.J. Feeley,29.0,2.0,0.0,26.0,38.0,342.0,3.0,0.0,1.0,...,1.0,0.0,24.0,24.0,28.0,24.0,12.000000,12.000000,12.000000,2
4,A.J. Feeley,30.0,3.0,2.0,59.0,103.0,681.0,5.0,8.0,7.0,...,1.0,0.0,34.0,33.5,44.5,41.5,11.333333,11.166667,11.333333,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1680,Vinny Testaverde,44.0,7.0,6.0,94.0,172.0,952.0,5.0,6.0,9.0,...,3.0,0.0,46.0,46.3,56.3,52.3,6.571429,6.614286,6.571429,3342
1681,Zach Mettenberger,23.0,7.0,6.0,107.0,179.0,1412.0,8.0,7.0,5.0,...,4.0,0.0,71.0,70.9,82.9,77.9,10.142857,10.128571,10.142857,3404
1682,Zach Mettenberger,24.0,7.0,4.0,101.0,166.0,935.0,4.0,7.0,9.0,...,4.0,1.0,42.0,42.2,54.2,49.2,6.000000,6.028571,6.000000,3404
1683,Zach Wilson,22.0,13.0,13.0,213.0,383.0,2334.0,9.0,11.0,29.0,...,5.0,4.0,152.0,151.9,169.9,162.9,11.692308,11.684615,11.692308,3410


##**QB 1 Year NN**

###Data Work

In [None]:
#input rows: every QB row that is followed by a later row of the same id
#(i.e. all years from all QBs except their last season)
has_next_season = QB.duplicated(subset = 'id', keep = 'last')
eligible_QBs = QB.loc[has_next_season]
eligible_QBs

Unnamed: 0,Player,Age,G,GS,Cmp,P_Att,P_Yds,P_TD,Int,R_Att,...,Fmb,TD,FantPt,PPR,DkPt,FDPt,FPP/G,PPR/G,HalfPPR/G,id
0,A.J. Feeley,24.0,1.0,0.0,10.0,14.0,143.0,2.0,1.0,0.0,...,0.0,0.0,12.0,11.7,12.7,12.7,12.000000,11.700000,12.000000,2
1,A.J. Feeley,25.0,6.0,5.0,86.0,154.0,1011.0,6.0,5.0,12.0,...,2.0,0.0,53.0,53.0,62.0,58.0,8.833333,8.833333,8.833333,2
2,A.J. Feeley,27.0,11.0,8.0,191.0,356.0,1893.0,11.0,15.0,14.0,...,10.0,1.0,91.0,91.0,114.0,106.0,8.272727,8.272727,8.272727,2
3,A.J. Feeley,29.0,2.0,0.0,26.0,38.0,342.0,3.0,0.0,1.0,...,1.0,0.0,24.0,24.0,28.0,24.0,12.000000,12.000000,12.000000,2
4,A.J. Feeley,30.0,3.0,2.0,59.0,103.0,681.0,5.0,8.0,7.0,...,1.0,0.0,34.0,33.5,44.5,41.5,11.333333,11.166667,11.333333,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,Vinny Testaverde,41.0,16.0,15.0,297.0,495.0,3532.0,17.0,20.0,21.0,...,8.0,1.0,175.0,175.1,201.1,195.1,10.937500,10.943750,10.937500,3342
1678,Vinny Testaverde,42.0,6.0,4.0,60.0,106.0,777.0,1.0,6.0,7.0,...,8.0,2.0,23.0,23.5,38.5,29.5,3.833333,3.916667,3.833333,3342
1679,Vinny Testaverde,43.0,3.0,0.0,2.0,3.0,29.0,1.0,0.0,8.0,...,0.0,0.0,4.0,4.4,4.4,4.4,1.333333,1.466667,1.333333,3342
1681,Zach Mettenberger,23.0,7.0,6.0,107.0,179.0,1412.0,8.0,7.0,5.0,...,4.0,0.0,71.0,70.9,82.9,77.9,10.142857,10.128571,10.142857,3404


In [None]:
#distinct ids of the QBs that contribute at least one input row
eligible_ids = eligible_QBs['id'].unique()
eligible_ids

array([   2,    5,    7,   17,   43,   64,   66,   68,  119,  123,  124,
        149,  189,  211,  229,  256,  260,  264,  286,  296,  302,  331,
        346,  347,  349,  353,  357,  360,  374,  384,  387,  389,  392,
        396,  398,  409,  417,  422,  434,  441,  459,  460,  461,  481,
        482,  491,  511,  512,  516,  524,  542,  590,  593,  594,  601,
        608,  619,  633,  635,  646,  651,  659,  688,  703,  730,  744,
        757,  769,  780,  787,  792,  843,  854,  856,  861,  863,  892,
        893,  919,  943,  959,  960,  989, 1053, 1067, 1069, 1072, 1076,
       1077, 1081, 1082, 1084, 1100, 1109, 1131, 1138, 1220, 1224, 1241,
       1292, 1351, 1369, 1371, 1378, 1389, 1394, 1410, 1416, 1421, 1434,
       1459, 1500, 1511, 1512, 1515, 1521, 1530, 1562, 1563, 1578, 1581,
       1583, 1584, 1658, 1664, 1665, 1666, 1668, 1677, 1680, 1693, 1699,
       1707, 1727, 1732, 1737, 1746, 1755, 1766, 1782, 1786, 1805, 1812,
       1818, 1820, 1829, 1833, 1855, 1864, 1936, 19

In [None]:
# number of QBs we have to train/test with
# (each of them has at least two rows in QB, since their last row is excluded from the inputs)
eligible_ids.size

278

In [None]:
#Randomly selecting 80% of QBs for training, leaving 20% for testing
#(the generator is seeded so the same split is drawn on every run)
rng = np.random.default_rng(seed = 1)
num_training = np.round(0.8 * eligible_ids.size).astype(int)
#sampling without replacement: no QB id is picked twice
train_ids = rng.choice(a = eligible_ids, size = num_training, replace = False)

In [None]:
def _qb_inputs_and_labels(qb_rows):
  """Build NN inputs and next-season labels for a subset of the QB rows.

  Returns the filtered frame, its feature array, the positions of the label
  rows in QB, and the label array (FantPt and PPR).
  """
  #getting rid of last season for each QB
  inputs_df = qb_rows[qb_rows.duplicated('id', keep = 'last')]
  #converting to numpy array so it can be passed into NN
  features = inputs_df.drop(['Player', 'id'], axis = 1).to_numpy()
  #creating labels: the row directly after each input row holds the same QB's next season
  label_rows = inputs_df.index + 1
  labels = QB.iloc[label_rows, :].loc[:, ['FantPt', 'PPR']].to_numpy()
  return inputs_df, features, label_rows, labels

in_training = QB.id.isin(train_ids)

#QBs drawn for training
QB_train_df, QB_train_data, train_indices, QB_train_labels = \
    _qb_inputs_and_labels(QB[in_training])

#every remaining QB is used for testing
QB_test_df, QB_test_data, test_indices, QB_test_labels = \
    _qb_inputs_and_labels(QB[~in_training])

###Model Building

Following NN "rules of thumb" from [this article](https://towardsdatascience.com/17-rules-of-thumb-for-building-a-neural-network-93356f9930af#:~:text=The%20first%20layer%20should%20be,is%20the%20number%20of%20classes.)

In [None]:
def build_model(n_features = 21, n_outputs = 2, hidden_units = (256, 128), dropout_rate = 0.3):
  """Build the feed-forward regression network.

  Args:
    n_features: Number of input features per example.
    n_outputs: Number of values predicted per example.
    hidden_units: Width of each hidden Dense layer, in order.
    dropout_rate: Dropout rate applied after every hidden layer.

  Returns:
    An uncompiled tf.keras.Sequential model. With the default arguments it
    is Input(21) -> Dense(256) -> Dropout(0.3) -> Dense(128) -> Dropout(0.3)
    -> Dense(2, linear).
  """
  layers = [tf.keras.Input(shape = (n_features,))]
  for units in hidden_units:
    layers.append(tf.keras.layers.Dense(units = units, activation = 'relu'))
    layers.append(tf.keras.layers.Dropout(rate = dropout_rate))
  #linear output: the targets are unbounded point totals
  layers.append(tf.keras.layers.Dense(units = n_outputs, activation = 'linear'))
  return tf.keras.Sequential(layers)
model = build_model()
model.summary()

Model: "sequential_33"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_100 (Dense)           (None, 256)               5632      
                                                                 
 dropout_35 (Dropout)        (None, 256)               0         
                                                                 
 dense_101 (Dense)           (None, 128)               32896     
                                                                 
 dropout_36 (Dropout)        (None, 128)               0         
                                                                 
 dense_102 (Dense)           (None, 2)                 258       
                                                                 
Total params: 38,786
Trainable params: 38,786
Non-trainable params: 0
_________________________________________________________________


In [None]:
#loss is mean squared error; root mean squared error is tracked as an extra metric
#(metrics is passed as a list, which is the form compile() documents)
model.compile(optimizer = tf.keras.optimizers.Adam(),
              loss = 'mse',
              metrics = [tf.keras.metrics.RootMeanSquaredError()]
)

In [None]:
#training hyperparameters
BATCH_SIZE = 128
EPOCHS = 100

#fit on the training QBs; the returned History object is shown as the cell output
model.fit(
    x = QB_train_data,
    y = QB_train_labels,
    batch_size = BATCH_SIZE,
    epochs = EPOCHS,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7faf1c6e7250>

In [None]:
#Evaluating on the test dataset:
#evaluate() returns the loss followed by the compiled metrics, so with loss='mse' and
#the RootMeanSquaredError metric the two values are MSE and RMSE, in that order
test_loss, test_rmse = model.evaluate(x = QB_test_data, y = QB_test_labels)
#the second value is the RMSE metric, not the MSE loss
print('RMSE:', test_rmse)

MSE: 73.64630126953125


#### Hyperparameter Notes
* Seemed to perform better with a low (or no) dropout
* More than 3 layers (2 hidden, 1 output) was not better than 3 layers
* Diminishing returns on number of epochs after 100. 200 was *slightly* better than 100
* Default learning rate of 0.001 was best
* Adam and Nadam have very similar performance
* Larger number of nodes in a layer seemed to be better when there was dropout




###Predictions

In [None]:
#rows of the full table that are QB seasons from 2022
is_qb = df['FantPos'] == 'QB'
is_2022 = df['Season'] == 2022
QB_2022 = df[is_qb & is_2022]

In [None]:
#list the columns of the 2022 QB rows
QB_2022.columns

Index(['Season', 'Player', 'Tm', 'FantPos', 'Age', 'G', 'GS', 'Cmp', 'P_Att',
       'P_Yds', 'P_TD', 'Int', 'R_Att', 'R_Yds', 'R_Y/A', 'R_TD', 'Re_Tgt',
       'Rec', 'Re_Y', 'Y/R', 'Re_TD', 'Fmb', 'TD', '2PM', '2PP', 'FantPt',
       'PPR', 'DkPt', 'FDPt', 'VBD', 'FPP/G', 'PPR/G', 'HalfPPR/G', 'id'],
      dtype='object')

In [None]:
#drop everything that is not one of the 21 numeric features the network takes as input
drop = ['Season', 'Player', 'Tm', 'FantPos', 'Re_Tgt',
       'Rec', 'Re_Y', 'Y/R', 'Re_TD', '2PM', '2PP', 'VBD','id']
#fillna(0) mirrors the training data, where missing stats were replaced with 0;
#without it a single missing stat makes that player's prediction NaN.
#(named qb_2022_features so the builtin input() is not shadowed)
qb_2022_features = QB_2022.drop(drop, axis = 1).infer_objects().fillna(0).to_numpy()
predictions = model.predict(qb_2022_features)
predictions

In [None]:
#one row per 2022 QB: the two predicted totals plus the player's name
#(prediction rows are in the same order as the rows of QB_2022)
prediction_df = pd.DataFrame(data = predictions, columns = ['FantPt', 'PPR'])
prediction_df['Player'] = QB_2022['Player'].to_numpy()
prediction_df.head(20)

Unnamed: 0,FantPt,PPR,Player
0,322.16333,298.315674,Patrick Mahomes
1,322.868286,302.024261,Josh Allen
2,310.816193,293.314789,Jalen Hurts
3,277.989594,257.988464,Joe Burrow
4,250.368973,230.853607,Geno Smith
5,362.359222,359.087036,Justin Fields
6,239.358627,220.698059,Trevor Lawrence
7,240.893326,221.368439,Kirk Cousins
8,250.83461,235.102142,Daniel Jones
9,230.839432,211.783905,Jared Goff


##QB 3 Year NN
A network that learns from QBs with more than three seasons of data.

In [None]:
#Only looking at QBs with more than 3 years of data
num_seasons = QB['id'].value_counts()   #number of rows (seasons) per QB id
veteran_ids = num_seasons[num_seasons > 3].index
vetQB = QB.loc[QB['id'].isin(veteran_ids)]

## RNN Model (Not Finished)

In [None]:
#hyper parameters
#batch_size is the fixed batch dimension the RNN model is built with
batch_size = 100

In [None]:
#RNN feature frame: the label-based slice keeps every column from 'Age' through 'id', both inclusive
#NOTE(review): `input` shadows the Python builtin; other cells read this name, so rename them together
input = training.loc[:, 'Age':'id']
input

Unnamed: 0,Age,G,GS,Cmp,P_Att,P_Yds,P_TD,Int,R_Att,R_Yds,...,2PP,FantPt,PPR,DkPt,FDPt,VBD,FPP/G,PPR/G,HalfPPR/G,id
0,22.0,16.0,11.0,0.0,0.0,0.0,0.0,0.0,3.0,60.0,...,,165.0,217.1,220.1,191.1,36.0,10.312500,13.568750,11.937500,0
1,23.0,14.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,178.0,247.5,251.5,212.5,52.0,12.714286,17.678571,15.214286,0
2,24.0,13.0,13.0,0.0,2.0,0.0,0.0,0.0,2.0,10.0,...,,118.0,180.9,183.9,149.4,,9.076923,13.915385,11.500000,0
3,25.0,10.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,14.0,30.0,34.0,22.0,,1.400000,3.000000,2.200000,1
4,26.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,34.0,55.4,59.4,44.9,,3.090909,5.036364,4.045455,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10255,24.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,1.0,1.8,1.8,1.3,,0.200000,0.360000,0.300000,3419
10256,25.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,32.0,49.6,52.6,40.6,,2.000000,3.100000,2.562500,3419
10257,26.0,15.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,27.0,36.6,39.6,31.6,,1.800000,2.440000,2.133333,3419
10258,27.0,12.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,7.0,15.9,16.9,11.4,,0.583333,1.325000,0.958333,3419


In [None]:
def LSTM(rnn_units):
  """Return a stateful Keras LSTM layer with `rnn_units` units that outputs the full sequence."""
  return tf.keras.layers.LSTM(
    rnn_units,
    return_sequences=True, #returns the output for every timestep (the full sequence), not only the last output
    recurrent_initializer='glorot_uniform', #initializer for recurrent_kernel weights matrix
    recurrent_activation='sigmoid',
    stateful=True, #last state at index i in a batch is used as first state for index i in following batch
  )

In [None]:
### Defining the RNN Model ###

model = tf.keras.Sequential([
    # Recurrent layer: LSTM with 64 units.
    LSTM(64),
    #tf.keras.layers.BatchNormalization(),
    # Output layer: Dense (fully-connected) layer that transforms the LSTM
    # output into a points prediction. 3 output units for three different
    # predictions: Regular, PPR, Half PPR
    tf.keras.layers.Dense(3),
])

In [None]:
#input shape = [batch_size, timesteps, input_dim]
#timesteps is None (sequence length left unspecified); input_dim is the number of columns in `input`
#NOTE(review): `input` still includes the 'id' column, which get_player_batch drops - confirm the intended input_dim
model.build([batch_size, None, input.iloc[0].shape[0]])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (100, None, 64)           24064     
                                                                 
 dense (Dense)               (100, None, 3)            195       
                                                                 
Total params: 24,259
Trainable params: 24,259
Non-trainable params: 0
_________________________________________________________________


In [None]:
### Batch definition to create training examples ###
# player_id is the first player in the new batch
# NOTE(review): unfinished. The second half of the body references vectorized_songs,
# seq_length and idx, none of which is defined in the visible notebook, and the
# x / y computed in the first half are never used afterwards.
def get_player_batch(player_id):
  # all rows of `input` that belong to this player, then without the id column
  x = input[input.id == player_id]
  x.drop('id', axis = 1, inplace = True)
  # NOTE(review): x.iloc[-1,:] selects a single row (a Series), so the two-axis
  # .loc[:, ...] that follows looks wrong - presumably the last row's targets are meant; confirm
  y = x.iloc[-1,:].loc[:, 'FantPt':'PPR']
  # NOTE(review): presumably meant to remove the last row from x; drop() with axis = 0
  # expects index labels, not a DataFrame - confirm
  x.drop(x.iloc[-1:], axis = 0, inplace = True)


  '''TODO: construct a list of input sequences for the training batch'''
  input_batch = [vectorized_songs[i:i + seq_length] for i in idx]
  '''TODO: construct a list of output sequences for the training batch'''
  output_batch = [vectorized_songs[i + 1:i + seq_length + 1] for i in idx]

  # x_batch, y_batch provide the true inputs and targets for network training
  x_batch = np.reshape(input_batch, [batch_size, seq_length])
  y_batch = np.reshape(output_batch, [batch_size, seq_length])
  return x_batch, y_batch



In [None]:
#raw contents of the full table as a NumPy array, shown for inspection
df.values

array([[2000.0, 'Marshall Faulk', 'STL', ..., 32.85, 29.964285714285715,
        2290],
       [2000.0, 'Edgerrin James', 'IND', ..., 24.70625, 22.71875, 1128],
       [2000.0, 'Jeff Garcia', 'SFO', ..., 21.28125, 21.3125, 1583],
       ...,
       [2022.0, 'Taiwan Jones', 'BUF', ..., -0.125, -0.125, 3049],
       [2022.0, 'Chester Rogers', 'CLE', ..., -0.6666666666666666,
        -0.6666666666666666, 530],
       [2022.0, 'Tim Boyle', 'CHI', ..., -2.9, -3.0, 3128]], dtype=object)