<a href="https://colab.research.google.com/github/ojh485/fantasy/blob/main/Fantasy_Football_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## License

**Web scraping code copied and modified from: https://github.com/logan-lauton/nfl_webscrape/tree/main
See the license below:**

MIT License

Copyright (c) 2023 Logan Lauton

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

## Web Scraping

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time

In [None]:
## (See License)

##code to scrape a single seasons data
def single(season):
    """Scrape one season of fantasy stats from Pro-Football-Reference.

    Args:
        season: Season year interpolated into the page URL (e.g. 2022).

    Returns:
        DataFrame built from the first table on the page, with flat column
        names, the 'Rk' column removed, a leading 'Season' column, and every
        value that parses as a number converted to a numeric type.
    """
    from io import StringIO  # local import so this cell stays self-contained

    url = f'https://www.pro-football-reference.com/years/{season}/fantasy.htm'
    with urlopen(url) as response:  # release the connection once the page is parsed
        table_html = BeautifulSoup(response, 'html.parser').find_all('table')
    # newer pandas deprecates literal HTML strings, so hand read_html a buffer
    df = pd.read_html(StringIO(str(table_html)))[0]
    # renaming columns bc of multi-index
    df.columns = ['Rk', 'Player', 'Tm', 'FantPos', 'Age', 'G', 'GS', 'Cmp', 'P_Att', 'P_Yds',
                  'P_TD', 'Int', 'R_Att', 'R_Yds', 'R_Y/A', 'R_TD', 'Re_Tgt', 'Rec', 'Re_Y',
                  'Y/R', 'Re_TD', 'Fmb', 'FL', 'TD', '2PM', '2PP', 'FantPt', 'PPR',
                  'DkPt', 'FDPt', 'VBD', 'PosRank', 'OvRank']

    df = df.drop('Rk', axis=1)  # drop Rk column
    # '*' and '+' are regex metacharacters; regex=False strips them as literal
    # text regardless of the installed pandas version's default
    df['Player'] = (df['Player'].str.replace('*', '', regex=False)
                                .str.replace('+', '', regex=False))
    df.insert(0, 'Season', season)  # insert season column
    # values that fail numeric parsing become NaN and are then refilled from the
    # original frame, so text columns survive unchanged
    df = df.apply(pd.to_numeric, errors='coerce').fillna(df)
    return df

##function to scrape multiple seasons of data at a time
def multiple(start_year,end_year):
    """Scrape every season from start_year through end_year, inclusive.

    Args:
        start_year: First season to scrape; always fetched.
        end_year: Last season to scrape. If it is not greater than
            start_year, only start_year is fetched.

    Returns:
        DataFrame with the per-season frames from single() stacked in order.
        Each season keeps its own row index (no re-indexing is done here).
    """
    frames = [single(start_year)]
    for season in range(start_year + 1, end_year + 1):
        # per the original author's note the site allows 20 requests per
        # minute; a 4 second pause keeps this at 15 per minute at most
        time.sleep(4)
        frames.append(single(season))
    # DataFrame.append was removed in pandas 2.0; concat once at the end instead
    return pd.concat(frames)

In [None]:
##calling for all NFL seasons I deemed 'relevant' for my purposes. (2000-2001 – 2022-23)
df = multiple(2000,2022)

In [None]:
##quick check to ensure both 2000 and 2022 were included in the table
df

## Data Wrangling and Download

In [None]:
##removal of any rows containing the column names
df = df[df['Player']!='Player']

In [None]:
#Removing irrelevant columns for fantasy:
drop = ['FL', 'PosRank', 'OvRank']
# name the axis via `columns=`: pandas 2.0 no longer accepts it positionally
df = df.drop(columns=drop)

In [None]:
#changing to appropriate datatypes
df = df.infer_objects()
df.info()

In [None]:
#adding relevant column(s): FPP/Game, PPR/Game, HalfPPR/Game
# per-game rates are only kept for rows with at least one game played
played = df['G'] >= 1
# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0
df['FPP/G'] = np.where(played, df['FantPt'] / df['G'], np.nan)
df['PPR/G'] = np.where(played, df['PPR'] / df['G'], np.nan)

# half-PPR adds half a point per reception to the standard score
halfPPR = (df['FantPt'] + 0.5*df['Rec']) / df['G']
df['HalfPPR/G'] = np.where(played, halfPPR, np.nan)
# one id per (name, Age - Season) pair; Age - Season is presumably constant
# across a player's seasons, which is what makes it usable as a key
df['id'] = df.groupby(['Player', (df['Age'] - df['Season'])]).ngroup()
df.sort_values('id')

In [None]:
#Removing all players that only have one season (besides 2022 rookies)
# True for rows whose id appears exactly once and whose season is not 2022
mask = ~(df.duplicated('id', keep = False)) & (df.Season != 2022)
# keep the complement of the mask directly; this replaces an outer merge of df
# against its own subset on every column, which selected the same rows
df = df[~mask].reset_index(drop = True)

In [None]:
#Saved to drive so no longer need to rerun everything
from google.colab import drive

drive.mount('/content/drive')
# NOTE(review): the later read cell loads NFLstats.csv from a folder under
# 'MyDrive/Personal Coding ML/...', not from this path - confirm which one is current
path = '/content/drive/MyDrive/Coding Projects/ML Fantasy Predictions/NFLstats.csv'
# to_csv is called without index=False, so the row index is written as an
# unnamed first column (the read cell drops it as 'Unnamed: 0')
with open(path, 'w', encoding = 'utf-8-sig') as f:
  df.to_csv(f)


In [None]:
#checking to verify that no two players were the same age, with the same name
#in the same season (if this were the case, my ID assignment wouldn't work)
mask = (df.duplicated(['Player', 'Season', 'Age'], keep = False))
df[mask]

## Access Downloaded File (Start Here After Running Previous Once)

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import tensorflow as tf

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
path = "/content/drive/MyDrive/Personal Coding ML/Coding Projects/ML Fantasy Predictions/NFLstats.csv"
df = pd.read_csv(path)
# discard the index column that to_csv wrote; the axis is named via `columns=`
# because pandas 2.0 no longer accepts it positionally
df = df.drop(columns='Unnamed: 0')

In [None]:
#Creating dataframes based on position:
# .copy() gives each position its own frame; later cells modify these in place,
# which on a bare selection of df would raise SettingWithCopyWarning
QB = df.loc[df['FantPos'] == 'QB'].copy()
RB = df.loc[df['FantPos'] == 'RB'].copy()
WR = df.loc[df['FantPos'] == 'WR'].copy()
TE = df.loc[df['FantPos'] == 'TE'].copy()

In [None]:
#training data: every season of every player except that player's last one
# (duplicated(..., keep='last') is False only for the final occurrence of an id)
training = (
    df[df.duplicated('id', keep = 'last')]
    .sort_values(['id','Season'])
    .reset_index()  # the previous index is kept as an 'index' column
)
training

In [None]:
#Dropping Unnecessary QB data
drop = ['Season','Tm', 'FantPos', 'Re_Tgt', 'Rec', 'Re_Y', 'Y/R', 'Re_TD', 'VBD','2PM', '2PP']
# rebind rather than drop in place: QB is a filtered selection of df, and
# in-place edits on such a selection raise SettingWithCopyWarning
QB = QB.drop(columns = drop)
QB.columns

In [None]:
#Dropping Unnecessary RB data
drop = ['Season','Tm', 'FantPos', 'Cmp', 'P_Att',
       'P_Yds', 'P_TD', 'Int', 'VBD','2PM', '2PP']
# rebind rather than drop in place: RB is a filtered selection of df, and
# in-place edits on such a selection raise SettingWithCopyWarning
RB = RB.drop(columns = drop)
RB.columns

In [None]:
#Dropping Unnecessary WR data
drop = ['Season','Tm', 'FantPos', 'Cmp', 'P_Att',
       'P_Yds', 'P_TD', 'Int', 'VBD','2PM', '2PP']
# rebind rather than drop in place: WR is a filtered selection of df, and
# in-place edits on such a selection raise SettingWithCopyWarning
WR = WR.drop(columns = drop)
WR.columns

In [None]:
#Dropping Unnecessary TE data
drop = ['Season','Tm', 'FantPos', 'Cmp', 'P_Att',
       'P_Yds', 'P_TD', 'Int', 'VBD','2PM', '2PP']
# rebind rather than drop in place: TE is a filtered selection of df, and
# in-place edits on such a selection raise SettingWithCopyWarning
TE = TE.drop(columns = drop)
TE.columns

In [None]:
# Order each QB's seasons consecutively, renumber rows 0..n-1 and zero-fill gaps.
# The label lookup in the training cells pairs every row with the row at
# index + 1, so both the ordering and the fresh index are required.
# Rebinding (instead of inplace=True) avoids mutating a selection of df.
QB = QB.sort_values(['id','Age']).reset_index(drop = True).fillna(0)
QB

## **QB 1 Year NN**

### Data Work

In [None]:
#creating input data: all years from all QBs except their last season
eligible_QBs = QB[QB.duplicated('id', keep = 'last')]
eligible_QBs

In [None]:
eligible_ids = eligible_QBs.id.unique()
eligible_ids

In [None]:
# number of QBs we have to train/test with
eligible_ids.size

In [None]:
#Randomly selecting 80% of QBs for training, leaving 20% for testing
rng = np.random.default_rng(1)
num_training = np.round(eligible_ids.size * 0.8).astype('int')
train_ids = rng.choice(eligible_ids, size = num_training, replace = False)

In [None]:
# Build feature and label arrays for the one-year model: each input row is one
# QB season and its label is that same QB's FantPt/PPR in the following row.
QB_train_df = QB[QB.id.isin(train_ids)]
#getting rid of last season for each QB
QB_train_df = QB_train_df[QB_train_df.duplicated('id', keep = 'last')]
#converting to numpy array so it can be passed into NN
QB_train_data = QB_train_df.drop(['Player', 'id'], axis =1).to_numpy()
#creating labels
# index + 1 is the same player's next season only because QB was sorted by
# ['id', 'Age'] and re-indexed 0..n-1 earlier; it always exists because each
# player's last season was removed from QB_train_df above
train_indices = QB_train_df.index + 1
QB_train_labels = QB.iloc[train_indices,:].loc[:,['FantPt', 'PPR']].to_numpy()

# every QB id not drawn into train_ids forms the test set
QB_test_df = QB[~(QB.id.isin(train_ids))]
#getting rid of last season for each QB
QB_test_df = QB_test_df[QB_test_df.duplicated('id', keep = 'last')]
#converting to numpy array so it can be passed into NN
QB_test_data = QB_test_df.drop(['Player', 'id'], axis =1).to_numpy()
#creating labels
# same next-row lookup as for the training labels
test_indices = QB_test_df.index + 1
QB_test_labels = QB.iloc[test_indices,:].loc[:,['FantPt', 'PPR']].to_numpy()

### Model Building

Following NN "rules of thumb" from [this article](https://towardsdatascience.com/17-rules-of-thumb-for-building-a-neural-network-93356f9930af#:~:text=The%20first%20layer%20should%20be,is%20the%20number%20of%20classes.)

In [None]:
def build_model(input_dim = 21, dropout_rate = 0.3):
  """Build the uncompiled feed-forward regression network.

  Args:
    input_dim: number of feature values per input row. The default, 21, is
      the width that was previously hard-coded.
    dropout_rate: dropout fraction applied after each hidden layer. The
      default, 0.3, is the value that was previously hard-coded.

  Returns:
    A tf.keras.Sequential model: Dense(256, relu) -> Dropout ->
    Dense(128, relu) -> Dropout -> Dense(2, linear). The two linear outputs
    line up with the two label columns ('FantPt', 'PPR') built in the data cells.
  """
  model = tf.keras.Sequential([
      tf.keras.Input(shape = (input_dim,)),
      tf.keras.layers.Dense(units = 256, activation = 'relu'),
      tf.keras.layers.Dropout(rate = dropout_rate),
      tf.keras.layers.Dense(units = 128, activation = 'relu'),
      tf.keras.layers.Dropout(rate = dropout_rate),
      tf.keras.layers.Dense(units = 2, activation = 'linear')
  ])
  return model
model = build_model()
model.summary()

In [None]:
# loss is mean squared error; RMSE is tracked as an extra metric.
# `metrics` is documented as a list, so the single metric is wrapped in one.
model.compile(optimizer = tf.keras.optimizers.Adam(),
              loss = 'mse',
              metrics = [tf.keras.metrics.RootMeanSquaredError()]
)

In [None]:
BATCH_SIZE = 128
EPOCHS = 100

model.fit(QB_train_data, QB_train_labels, batch_size=BATCH_SIZE, epochs=EPOCHS)

In [None]:
#Evaluating Accuracy on Test Dataset:
# evaluate returns the loss followed by the compiled metrics: the loss is the
# MSE and the single metric is the RMSE, so each is printed under its own name
test_loss, test_rmse = model.evaluate(x = QB_test_data, y = QB_test_labels)
print('MSE:', test_loss)
print('RMSE:', test_rmse)

#### Hyperparameter Notes
* Seemed to perform better with a low (or no) dropout
* More than 3 layers (2 hidden, 1 output) was not better than 3 layers
* Diminishing returns on number of epochs after 100. 200 was *slightly* better than 100
* Default learning rate of 0.001 was best
* Adam and Nadam had very similar performance
* Larger number of nodes in a layer seemed to be better when there was dropout




### Predictions

In [None]:
QB_2022 = df[(df.FantPos == 'QB') & (df.Season == 2022)]

In [None]:
QB_2022.columns

In [None]:
drop = ['Season', 'Player', 'Tm', 'FantPos', 'Re_Tgt',
       'Rec', 'Re_Y', 'Y/R', 'Re_TD', '2PM', '2PP', 'VBD','id']
# The training rows came from QB, whose NaNs were replaced with 0; QB_2022 is cut
# straight from df, so it gets the same zero-fill before reaching the network.
# Named `features` so the builtin input() is not shadowed.
features = QB_2022.drop(drop, axis = 1).infer_objects().fillna(0).to_numpy()
predictions = model.predict(features)
predictions

In [None]:
prediction_df = pd.DataFrame(predictions, columns = ['FantPt', 'PPR'])
prediction_df['Player'] = QB_2022.reset_index(drop = True).Player
prediction_df.head(20)

## QB 3 Year NN
A network that learns from

In [None]:
#Only looking at QBs with more than 3 years of data
num_seasons = QB.id.value_counts()
veteran_ids = num_seasons.index[num_seasons.gt(3)]
vetQB = QB[QB.id.isin(veteran_ids)]

## RNN Model (Not Finished)

In [None]:
#hyper parameters
batch_size = 100

In [None]:
input = training.loc[:, 'Age':'id']
input

In [None]:
def LSTM(rnn_units):
  """Return a tf.keras LSTM layer with `rnn_units` units, configured as below."""
  return tf.keras.layers.LSTM(
    rnn_units,
    return_sequences=True, #returns the output at every timestep (False would return only the last output)
    recurrent_initializer='glorot_uniform', #initializer for recurrent_kernel weights matrix
    recurrent_activation='sigmoid',
    stateful=True, #last state at index i in a batch is used as first state for index i in following batch
  )

In [None]:
### Defining the RNN Model ###

model = tf.keras.Sequential()
# Layer 1: LSTM with 64 units.
model.add(LSTM(64))
#model.add(tf.keras.layers.BatchNormalization())
# Layer 2: Dense (fully-connected) layer that transforms the LSTM output
# into a points prediction. 3 output units, one per prediction:
# Regular, PPR, Half PPR
model.add(tf.keras.layers.Dense(3))

In [None]:
#input shape = [batch_size, timesteps, input_dim]
model.build([batch_size, None, input.iloc[0].shape[0]])
model.summary()

In [None]:
### Batch definition to create training examples ###
# player_id is the first player in the new batch
def get_player_batch(player_id):
  """Unfinished: meant to build one training batch from a single player's rows.

  NOTE(review): this cannot run as written. The batch-building statements
  refer to vectorized_songs, idx and seq_length, none of which is defined in
  this function - presumably left over from a template; TODO replace them
  with logic based on x and y.
  """
  # rows of the module-level `input` frame belonging to this player
  x = input[input.id == player_id]
  # NOTE(review): x is a filtered selection of `input`; an in-place drop on it may warn
  x.drop('id', axis = 1, inplace = True)
  # NOTE(review): x.iloc[-1,:] selects a single row, so the two-axis .loc that
  # follows looks wrong - confirm the intended selection of the last row's FantPt..PPR
  y = x.iloc[-1,:].loc[:, 'FantPt':'PPR']
  # NOTE(review): passes a one-row frame as the labels to drop; dropping the last
  # row by its index label was probably intended - confirm
  x.drop(x.iloc[-1:], axis = 0, inplace = True)


  '''TODO: construct a list of input sequences for the training batch'''
  input_batch = [vectorized_songs[i:i + seq_length] for i in idx]
  '''TODO: construct a list of output sequences for the training batch'''
  output_batch = [vectorized_songs[i + 1:i + seq_length + 1] for i in idx]

  # x_batch, y_batch provide the true inputs and targets for network training
  x_batch = np.reshape(input_batch, [batch_size, seq_length])
  y_batch = np.reshape(output_batch, [batch_size, seq_length])
  return x_batch, y_batch



In [None]:
df.values

## RNN Model Attempt #2

### Data Work

In [None]:
#creating input data: all years from all QBs except their last season
eligible_QBs = QB[QB.duplicated('id', keep = 'last')]
eligible_QBs

In [None]:
eligible_ids = eligible_QBs.id.unique()
eligible_ids

In [None]:
# number of QBs we have to train/test with
eligible_ids.size

In [None]:
#Randomly selecting 80% of QBs for training, leaving 20% for testing
rng = np.random.default_rng(1)
num_training = np.round(eligible_ids.size * 0.8).astype('int')
train_ids = rng.choice(eligible_ids, size = num_training, replace = False)

In [None]:
# Build feature and label arrays: each input row is one QB season and its
# label is that same QB's FantPt/PPR in the following row.
QB_train_df = QB[QB.id.isin(train_ids)]
#getting rid of last season for each QB
QB_train_df = QB_train_df[QB_train_df.duplicated('id', keep = 'last')]
#converting to numpy array so it can be passed into NN
QB_train_data = QB_train_df.drop(['Player', 'id'], axis =1).to_numpy()
#creating labels
# index + 1 is the same player's next season only because QB was sorted by
# ['id', 'Age'] and re-indexed 0..n-1 earlier; it always exists because each
# player's last season was removed from QB_train_df above
train_indices = QB_train_df.index + 1
QB_train_labels = QB.iloc[train_indices,:].loc[:,['FantPt', 'PPR']].to_numpy()

# every QB id not drawn into train_ids forms the test set
QB_test_df = QB[~(QB.id.isin(train_ids))]
#getting rid of last season for each QB
QB_test_df = QB_test_df[QB_test_df.duplicated('id', keep = 'last')]
#converting to numpy array so it can be passed into NN
QB_test_data = QB_test_df.drop(['Player', 'id'], axis =1).to_numpy()
#creating labels
# same next-row lookup as for the training labels
test_indices = QB_test_df.index + 1
QB_test_labels = QB.iloc[test_indices,:].loc[:,['FantPt', 'PPR']].to_numpy()

#### Padding
Goal: make one row = one player's career. Add padding to the end of each player's career so that every row has the same length.

In [None]:
QB_np.shape

In [None]:
# Group consecutive rows of QB into one list per player (one entry = one career).
# Assumes QB is sorted by id and that id is the last column - TODO confirm.
QB_np = QB.to_numpy()
num_rows, num_cols = QB_np.shape
# `[]*400` was just an empty list, so indexing it raised IndexError, and
# np.append returns a new array instead of growing its argument; collect the
# season rows in plain Python lists instead
QB_career = [[]] if num_rows else []
QB_counter = 0
for idx,qb in enumerate(QB_np):
  QB_career[QB_counter].append(qb)
  if idx < num_rows - 1:
    #if next QB is different from current, then we open a new career and move the career idx to it
    if qb[-1] != QB_np[idx + 1][-1]:
      QB_career.append([])
      QB_counter += 1
# number of distinct careers collected
print(len(QB_career))




### RNN Model

In [None]:
def build_model(input_dim = 21, dropout_rate = 0.3):
  """Build the uncompiled feed-forward regression network.

  NOTE(review): this contains only Dense and Dropout layers - no recurrent
  layer - although it sits in the RNN section of the notebook.

  Args:
    input_dim: number of feature values per input row. The default, 21, is
      the width that was previously hard-coded.
    dropout_rate: dropout fraction applied after each hidden layer. The
      default, 0.3, is the value that was previously hard-coded.

  Returns:
    A tf.keras.Sequential model: Dense(256, relu) -> Dropout ->
    Dense(128, relu) -> Dropout -> Dense(2, linear).
  """
  model = tf.keras.Sequential([
      tf.keras.Input(shape = (input_dim,)),
      tf.keras.layers.Dense(units = 256, activation = 'relu'),
      tf.keras.layers.Dropout(rate = dropout_rate),
      tf.keras.layers.Dense(units = 128, activation = 'relu'),
      tf.keras.layers.Dropout(rate = dropout_rate),
      tf.keras.layers.Dense(units = 2, activation = 'linear')
  ])
  return model
model = build_model()
model.summary()

In [None]:
# loss is mean squared error; RMSE is tracked as an extra metric.
# `metrics` is documented as a list, so the single metric is wrapped in one.
model.compile(optimizer = tf.keras.optimizers.Adam(),
              loss = 'mse',
              metrics = [tf.keras.metrics.RootMeanSquaredError()]
)

In [None]:
BATCH_SIZE = 128
EPOCHS = 100

model.fit(QB_train_data, QB_train_labels, batch_size=BATCH_SIZE, epochs=EPOCHS)

In [None]:
#Evaluating Accuracy on Test Dataset:
# evaluate returns the loss followed by the compiled metrics: the loss is the
# MSE and the single metric is the RMSE, so each is printed under its own name
test_loss, test_rmse = model.evaluate(x = QB_test_data, y = QB_test_labels)
print('MSE:', test_loss)
print('RMSE:', test_rmse)