# QB vs RB Model

## Import Packages

In [46]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np

## Import and Preprocess Data

In [22]:
input_df = pd.read_csv('Heisman_Winner_QB_vs_RB_Import.csv')
#print(input_data_cv.head())

#Standardize stats by # of games played
input_df['Pass_Cmp_per_Game'] = input_df['Pass_Cmp']/input_df['QB_Games']
input_df['Pass_Att_per_Game'] = input_df['Pass_Att']/input_df['QB_Games']
input_df['Pass_TD_per_Game'] = input_df['Pass_TD']/input_df['QB_Games']
input_df['Int_per_Game'] = input_df['Int']/input_df['QB_Games']

input_df['Rush_Att_per_Game'] = input_df['Rush_Att']/input_df['RB_Games']
input_df['Rush_Yds_per_Game'] = input_df['Rush_Yds']/input_df['RB_Games']
input_df['Rush_TD_per_Game'] = input_df['Rush_TD']/input_df['RB_Games']
input_df['Rec_per_Game'] = input_df['Rec']/input_df['RB_Games']
input_df['Rcv_Yds_per_Game'] = input_df['Rcv_Yds']/input_df['RB_Games']
input_df['Rcv_TD_per_Game'] = input_df['Rcv_TD']/input_df['RB_Games']
input_df['ScrmgPlays_per_Game'] = input_df['ScrmgPlays']/input_df['RB_Games']
input_df['Scrmg_Yds_per_Game'] = input_df['Scrmg_Yds']/input_df['RB_Games']
input_df['Scrmg_TD_per_Game'] = input_df['Scrmg_TD']/input_df['RB_Games']
print(input_df.head())

   Year          Best_QB  QB_Rank  QB_Games  Pass_Cmp  Pass_Att  Cmp%  \
0  2023  Jayden Daniels*        5        12       236       327  72.2   
1  2022  Caleb Williams*        3        14       333       500  66.6   
2  2021      Bryce Young        2        15       366       547  66.9   
3  2020  Trevor Lawrence        8        10       231       334  69.2   
4  2019      Joe Burrow*        1        15       402       527  76.3   

   Pass_Yds  Pass_TD  Pass_TD%  ...  Int_per_Game  Rush_Att_per_Game  \
0      3812       40      12.2  ...      0.333333          20.357143   
1      4537       42       8.4  ...      0.357143          20.583333   
2      4872       47       8.6  ...      0.466667          21.916667   
3      3153       24       7.2  ...      0.500000          19.307692   
4      5671       60      11.4  ...      0.400000          22.857143   

   Rush_Yds_per_Game  Rush_TD_per_Game  Rec_per_Game  Rcv_Yds_per_Game  \
0         123.714286               1.5      2.785714  

Unlike the quarterback and running back datasets, which have 1 entry per player per year, the Heisman winner dataset is much smaller, with only 30 entries of data. Due to the limited dataset size, instead of creating a traditional train and test set, I will be evaluating the performance of my model using leave-one-out cross-validation. This method is most commonly used on very small datasets.

In this process, I will create 30 models, each one leaving exactly one year of data out for testing. After training each model, I will record the error (cross-entropy) of the one test record, then average all 30 errors to estimate the test error of a model trained on all 30 records.

In [23]:
#Drop non-standardized variables
vars_final = input_df.drop(columns = ['Pass_Cmp','Pass_Att','Pass_Yds', 'Pass_TD', 'Int', 'Rush_Att', 'Rush_Yds', 'Rush_TD',
                                      'Rec','Rcv_Yds','Rcv_TD','ScrmgPlays','Scrmg_Yds','Scrmg_TD'])

#Drop variables we won't be using to predict
vars_final.drop(columns = ['Year','Best_QB','QB_Games', 'Best_RB', 'RB_Games'], inplace=True)

In [107]:
#Create X and y tensors
X = vars_final.drop(columns = ['Winner'])
y = pd.Series([1 if winner == 'QB' else 0 for winner in vars_final['Winner']])

In [108]:
#x = np.array([[1,2,3,4,5], [1, 7, 2, 8, 5]])
#normal_layer = layers.Normalization(axis=-1)
#normal_layer.adapt(x)
#y = normal_layer(x)

#Create model
def build_network(X):
    normal_layer = layers.Normalization(axis=-1)
    normal_layer.adapt(np.array(X.values))
    model = tf.keras.Sequential()
    model.add(normal_layer)
    model.add(layers.Dense(10, activation = 'relu'))
    model.add(layers.Dense(1, activation = 'sigmoid'))
    
    model.compile(loss = 'binary_crossentropy',
                           optimizer = 'rmsprop',
                           metrics = ['accuracy'])

    return model


In [109]:
my_first_model = build_network(X)

In [110]:
def leave_one_out_train(X, y):
    callbacks =  [
    tf.keras.callbacks.EarlyStopping(
        monitor = 'val_loss', 
        patience=3
    ),
    tf.keras.callbacks.TensorBoard()
    ]

    errors = []
    for i in range(len(y)):
        y_train = y.copy()
        y_test = y_train.pop(i)
        X_train = X.copy()
        X_test = X[i]
        X_train.drop(i)

        my_first_model.fit(X_train, y_train, epochs = 10, callbacks = callbacks)
        error = my_first_model.evaluate(X_test, y_test)
        errors.append(error)
        print(f'Finished round {i}')
    return errors


In [111]:
error_list = leave_one_out_train(X, y)

KeyError: 0