In [26]:
import os
import pickle
from utils.b2 import B2
from utils.modeling import *
import streamlit as st
from dotenv import load_dotenv
import numpy as np
from implicit.als import AlternatingLeastSquares
from sklearn.metrics import mean_squared_error
from scipy.sparse import coo_matrix

In [27]:
# ------------------------------------------------------
#                      APP CONSTANTS
# ------------------------------------------------------
# Name of the remote file that get_data() requests through b2.get_df().
REMOTE_DATA = 'pbp-2023.csv'

In [28]:
# ------------------------------------------------------
#                        CONFIG
# ------------------------------------------------------
# Load variables from a .env file into the process environment; the B2_*
# values read from os.environ in the following cells are expected to come
# from there (or from the real environment).
load_dotenv()

True

In [29]:
# Open the Backblaze connection; endpoint and both keys are read from the
# environment, so a missing variable fails here with a KeyError.
b2 = B2(
    endpoint=os.environ['B2_ENDPOINT'],
    key_id=os.environ['B2_KEYID'],
    secret_key=os.environ['B2_APPKEY'],
)

In [30]:
# ------------------------------------------------------
#                        CACHING
# ------------------------------------------------------
@st.cache_data
def get_data():
    """Fetch the play-by-play data frame from the configured B2 bucket.

    Points the shared ``b2`` client at the bucket named by the
    ``B2_BUCKETNAME`` environment variable, then returns whatever
    ``b2.get_df`` yields for ``REMOTE_DATA``. The call is wrapped in
    ``st.cache_data``.
    """
    bucket_name = os.environ['B2_BUCKETNAME']
    b2.set_bucket(bucket_name)
    return b2.get_df(REMOTE_DATA)

2024-03-27 00:09:58.366 No runtime found, using MemoryCacheStorageManager


In [31]:
# Fetch the play-by-play frame via get_data() and display it.
data = get_data()
data

Unnamed: 0,GameId,Quarter,Minute,Second,OffenseTeam,DefenseTeam,Down,ToGo,YardLine,SeriesFirstDown,...,IsChallenge,IsInterception,IsFumble,IsPenalty,IsTwoPointConversion,IsTwoPointConversionSuccessful,RushDirection,SitID,TeamID,PlayID
0,2023121101,3,1,35,NYG,GB,3,7,92,1,...,0,0,0,0,0,0,,130792,1,201
1,2023121101,3,2,19,NYG,GB,2,11,88,0,...,0,0,0,0,0,0,RIGHT GUARD,121188,1,102
2,2023121101,3,2,56,NYG,GB,1,10,89,0,...,0,0,0,0,0,0,CENTER,111089,1,103
3,2023121101,3,3,43,NYG,GB,1,10,64,1,...,0,0,0,0,0,0,,111064,1,203
4,2023121101,3,4,29,NYG,GB,2,3,55,1,...,0,0,0,0,0,0,RIGHT GUARD,120355,1,102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19711,2023091011,4,3,43,NE,PHI,1,10,89,1,...,0,0,0,0,0,0,,2811089,28,201
19712,2023091012,4,3,16,LA,SEA,1,10,42,0,...,0,0,0,0,0,0,RIGHT TACKLE,2611042,26,107
19713,2023091012,4,9,9,LA,SEA,1,10,25,0,...,0,0,0,0,0,0,RIGHT TACKLE,2611025,26,107
19714,2023091004,3,15,0,TB,MIN,1,10,25,0,...,0,0,0,0,0,0,LEFT TACKLE,2311025,23,108


In [32]:
# Split the plays by type using the IsRush / IsPass indicator columns.
rush_plays = data.loc[data['IsRush'].eq(1)]
pass_plays = data.loc[data['IsPass'].eq(1)]

## Model for rush plays

In [33]:
# Train ALS model for rush plays
# NOTE(review): train_als_model is not defined in this notebook; presumably it
# arrives through the `from utils.modeling import *` star import — confirm.
rush_model = train_als_model(rush_plays)

100%|██████████| 5/5 [00:02<00:00,  2.19it/s]


In [34]:
# Calculate RMSE for rush plays
# NOTE(review): the model is scored on the same rush_plays frame it was
# trained on, so unless calculate_rmse holds data out internally this is
# not a held-out error estimate — confirm.
rush_rmse = calculate_rmse(rush_model, rush_plays)
rush_rmse

3.4636428171726896

## Model for pass plays

In [35]:
# Train ALS model for pass plays (same helper as the rush model above)
pass_model = train_als_model(pass_plays)

100%|██████████| 5/5 [00:02<00:00,  1.85it/s]


In [36]:
# Calculate RMSE for pass plays
# NOTE(review): the model is scored on the same pass_plays frame it was
# trained on, so unless calculate_rmse holds data out internally this is
# not a held-out error estimate — confirm.
pass_rmse = calculate_rmse(pass_model, pass_plays)
pass_rmse

7.380806343220903

## Initial Recommender

In [37]:
import pandas as pd

In [38]:
# Build the prediction matrix for each model.
# NOTE(review): predict_ratings is not defined in this notebook; the cells
# below treat its result as a 2-D array with one row per SitID and one
# column per PlayID — confirm against utils.modeling.
pass_prediction = predict_ratings(pass_model)
rush_prediction = predict_ratings(rush_model)

In [39]:
# Define row and column names
# NOTE(review): labels are taken in first-appearance order (`unique()`). They
# only line up with the prediction matrix if train_als_model indexed SitID
# and PlayID in that same order — confirm, otherwise rows/columns are mislabeled.
row_names = pass_plays['SitID'].unique()
col_names = pass_plays['PlayID'].unique()

# Create DataFrame
pass_pred_df = pd.DataFrame(pass_prediction, index=row_names, columns=col_names)

# Display DataFrame
print(pass_pred_df.head())

                 201           203           204           205           206  \
130792 -1.376475e-02 -5.292662e-03 -2.636018e-03  7.493966e-01 -1.439670e-02   
111064 -6.114335e-11  1.617473e-10  1.647742e-11  1.612517e-10 -2.983586e-11   
220756  8.433750e-01 -2.822801e-03 -3.706921e-03 -1.275725e-02 -1.242319e-02   
111069  1.002178e+00 -3.357262e-03 -4.396595e-03 -1.517411e-02 -1.476048e-02   
511052 -2.312991e-02  1.014072e+00 -1.827671e-03 -3.402809e-02 -2.224168e-02   

                 207  
130792  2.903337e-04  
111064 -6.152442e-11  
220756 -4.686197e-04  
111069 -5.704103e-04  
511052  8.460073e-03  


In [40]:
# Define row and column names
# NOTE(review): this rebinds row_names / col_names from the pass cell, and the
# labels are taken in first-appearance order (`unique()`). They only line up
# with the prediction matrix if train_als_model indexed SitID and PlayID in
# that same order — confirm, otherwise rows/columns are mislabeled.
row_names = rush_plays['SitID'].unique()
col_names = rush_plays['PlayID'].unique()

# Create DataFrame
rush_pred_df = pd.DataFrame(rush_prediction, index=row_names, columns=col_names)

# Display DataFrame
print(rush_pred_df.head())

                 102           103           104           105           106  \
121188 -8.032175e-03 -1.314418e-02 -5.370577e-03 -4.674552e-03 -5.897369e-03   
111089  2.638367e-11  7.492065e-11 -8.800072e-11 -1.686921e-11 -2.393626e-11   
120355 -1.510533e-02 -1.341257e-02  9.454886e-01 -8.480371e-03 -1.088741e-02   
231565 -8.240388e-03 -7.501468e-03 -1.819033e-02 -6.685348e-03 -1.452834e-02   
211053 -8.294441e-03 -1.878190e-02 -7.721137e-03 -6.668874e-03 -8.368893e-03   

                 107           108  
121188  7.151322e-01 -8.203869e-03  
111089  5.290677e-11  2.034956e-11  
120355 -5.954822e-03 -2.925484e-02  
231565 -8.497369e-03  8.420964e-01  
211053  1.018074e+00 -1.172264e-02  


In [46]:
# Spot check: the two highest-scoring pass plays for one hard-coded
# situation (SitID 111064).
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; re-run the notebook top to bottom before trusting the output.
best_pass = pass_pred_df.loc[111064].nlargest(2)
best_pass

203    1.617473e-10
205    1.612517e-10
Name: 111064, dtype: float32

In [41]:
def _print_top_plays(kind, sit_id, pred_df, plays_df, label_col):
    """Print the two best-scoring plays of one kind for a situation.

    Parameters
    ----------
    kind : str
        Word used in the printed heading ("Pass" or "Run").
    sit_id : int
        Situation id to look up in ``pred_df.index``.
    pred_df : DataFrame
        Scores with situation ids as the index and play ids as the columns.
    plays_df : DataFrame
        Play-level frame with a ``PlayID`` column and the ``label_col`` column.
    label_col : str
        Column of ``plays_df`` holding the human-readable play description.
    """
    # Use the situation's own row when it is known; otherwise fall back to
    # the column-wise mean over every situation in pred_df.
    scores = pred_df.loc[sit_id] if sit_id in pred_df.index else pred_df.mean()
    best = scores.nlargest(2)

    # zip() stops early if fewer than two plays are available.
    for rank, (play_id, score) in zip(("Top", "Second"), best.items()):
        # The label is taken from the first row of plays_df with this PlayID.
        label = plays_df.loc[plays_df['PlayID'] == int(play_id), label_col].values[0]
        print(f"{rank} {kind} Choice:", label)
        print("Predicted Gain:", round(score), "Yards")


def get_play(team, down, distance, yardline, pass_pred_df, rush_pred_df, plays_df):
    """Print the top two pass and top two run recommendations for a situation.

    The situation id is built as
    ``yardline + 100 * distance + 10000 * down + 100000 * team_id`` and looked
    up in each prediction frame. When the id is absent from a frame, the
    column-wise mean of that frame is used instead.

    Parameters
    ----------
    team : str
        Offense team code, matched against ``plays_df['OffenseTeam']``.
    down, distance, yardline : int
        Components of the situation id.
    pass_pred_df, rush_pred_df : DataFrame
        Score frames indexed by situation id with play ids as columns.
    plays_df : DataFrame
        Play-level frame providing ``OffenseTeam``, ``TeamID``, ``PlayID``,
        ``PassType`` and ``RushDirection``.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    IndexError
        If ``team`` has no rows in ``plays_df`` (same exception type the
        bare ``.values[0]`` lookup raised before), or if a recommended play
        id has no row in ``plays_df``.

    Notes
    -----
    NOTE(review): the printed "Predicted Gain ... Yards" assumes the scores in
    the prediction frames are expressed in yards — confirm against the model.
    """
    # Resolve the team id from the frame that was passed in, not from a
    # notebook-level global, so the function works with any plays frame.
    team_ids = plays_df.loc[plays_df['OffenseTeam'] == team, 'TeamID'].values
    if len(team_ids) == 0:
        raise IndexError(f"no plays found for offense team {team!r}")

    sit_id = int(yardline + 100 * distance + 10000 * down + 100000 * team_ids[0])

    _print_top_plays("Pass", sit_id, pass_pred_df, plays_df, 'PassType')
    _print_top_plays("Run", sit_id, rush_pred_df, plays_df, 'RushDirection')

In [42]:
# Example: recommendations for NE with down=1, distance=10, yardline=20,
# using the full play-by-play frame for team and play-label lookups.
get_play('NE', 1, 10, 20, pass_pred_df, rush_pred_df, data)

Top Pass Choice: DEEP MIDDLE
Predicted Gain: 0 Yards
Second Pass Choice: SHORT RIGHT
Predicted Gain: 0 Yards
Top Run Choice: CENTER
Predicted Gain: 1 Yards
Second Run Choice: LEFT TACKLE
Predicted Gain: 0 Yards
