In [22]:
import os
import pickle

In [23]:
import streamlit as st
from dotenv import load_dotenv

In [24]:
from utils.b2 import B2
from utils.modeling import *

In [25]:
# ------------------------------------------------------
#                      APP CONSTANTS
# ------------------------------------------------------
# File name that get_data() passes to b2.get_df() to fetch the 2023 play data CSV.
REMOTE_DATA = 'pbp-2023.csv'

In [26]:
# ------------------------------------------------------
#                        CONFIG
# ------------------------------------------------------
# Load settings from a .env file (python-dotenv) so that the B2_* values
# read through os.environ further down are available.
load_dotenv()

True

In [27]:
# Backblaze B2 connection, configured entirely from environment variables.
# A missing variable raises KeyError here rather than failing later on first use.
b2 = B2(
    endpoint=os.environ['B2_ENDPOINT'],
    key_id=os.environ['B2_KEYID'],
    secret_key=os.environ['B2_APPKEY'],
)

In [28]:
# ------------------------------------------------------
#                        CACHING
# ------------------------------------------------------
@st.cache_data
def get_data():
    """Return the play data frame stored in the B2 bucket.

    Points the module-level ``b2`` connection at the bucket named by the
    ``B2_BUCKETNAME`` environment variable, then fetches ``REMOTE_DATA``
    from it. The function is wrapped in ``st.cache_data``, so Streamlit can
    reuse a previous result instead of downloading the file again.

    Returns:
        The data frame produced by ``b2.get_df(REMOTE_DATA)``.

    Raises:
        KeyError: if ``B2_BUCKETNAME`` is not set in the environment.
    """
    bucket_name = os.environ['B2_BUCKETNAME']
    b2.set_bucket(bucket_name)
    return b2.get_df(REMOTE_DATA)

2024-03-21 01:07:07.124 No runtime found, using MemoryCacheStorageManager


In [29]:
# Fetch the play data through the cached loader, then display the frame
# (the bare name on the last line is what renders the table).
data = get_data()
data

Unnamed: 0,GameId,Quarter,Minute,Second,OffenseTeam,DefenseTeam,Down,ToGo,YardLine,SeriesFirstDown,...,IsSack,IsChallenge,IsInterception,IsFumble,IsPenalty,IsTwoPointConversion,IsTwoPointConversionSuccessful,RushDirection,SitID,TeamID
0,2023121101,3,1,35,NYG,GB,3,7,92,1,...,0,0,0,0,0,0,0,,130792,1
1,2023121101,3,2,19,NYG,GB,2,11,88,0,...,0,0,0,0,0,0,0,RIGHT GUARD,121188,1
2,2023121101,3,2,56,NYG,GB,1,10,89,0,...,0,0,0,0,0,0,0,CENTER,111089,1
3,2023121101,3,3,43,NYG,GB,1,10,64,1,...,0,0,0,0,0,0,0,,111064,1
4,2023121101,3,4,29,NYG,GB,2,3,55,1,...,0,0,0,0,0,0,0,RIGHT GUARD,120355,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19711,2023091011,4,3,43,NE,PHI,1,10,89,1,...,0,0,0,0,0,0,0,,2811089,28
19712,2023091012,4,3,16,LA,SEA,1,10,42,0,...,0,0,0,0,0,0,0,RIGHT TACKLE,2611042,26
19713,2023091012,4,9,9,LA,SEA,1,10,25,0,...,0,0,0,0,0,0,0,RIGHT TACKLE,2611025,26
19714,2023091004,3,15,0,TB,MIN,1,10,25,0,...,0,0,0,0,0,0,0,LEFT TACKLE,2311025,23


In [31]:
# Summarize the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19716 entries, 0 to 19715
Data columns (total 29 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   GameId                          19716 non-null  int64 
 1   Quarter                         19716 non-null  int64 
 2   Minute                          19716 non-null  int64 
 3   Second                          19716 non-null  int64 
 4   OffenseTeam                     19716 non-null  object
 5   DefenseTeam                     19716 non-null  object
 6   Down                            19716 non-null  int64 
 7   ToGo                            19716 non-null  int64 
 8   YardLine                        19716 non-null  int64 
 9   SeriesFirstDown                 19716 non-null  int64 
 10  Description                     19716 non-null  object
 11  Yards                           19716 non-null  int64 
 12  Formation                       19716 non-null

In [32]:
# Build the working frame in one step:
#   * drop incompletions so the remaining data looks a bit more normal
#   * keep only rows whose 'PlayType' is either 'PASS' or 'RUSH'
# .copy() makes filtered_data an independent frame. The cells that follow add
# columns to it ('TeamID', 'SitID', 'PlayID'); without the copy those
# assignments would be made on a slice of `data`, which is the pattern
# pandas reports with SettingWithCopyWarning.
filtered_data = data[
    (data['IsIncomplete'] != 1)
    & data['PlayType'].isin(['PASS', 'RUSH'])
].copy()

In [33]:
# Give every offense team an integer id (1, 2, 3, ...) in order of first appearance
team_to_teamid = {
    abbr: team_no
    for team_no, abbr in enumerate(filtered_data['OffenseTeam'].unique(), 1)
}
# Fill the 'TeamID' column by looking up each row's offense team
filtered_data['TeamID'] = filtered_data['OffenseTeam'].map(team_to_teamid)
# Pack team, down, distance and field position into a single integer situation id.
# Read in decimal, the layout is <TeamID><Down><ToGo: 2 digits><YardLine: 2 digits>.
# NOTE(review): the fields only stay separable while YardLine < 100, ToGo < 100
# and Down < 10 - confirm against the data.
filtered_data['SitID'] = (
    100000 * filtered_data['TeamID']
    + 10000 * filtered_data['Down']
    + 100 * filtered_data['ToGo']
    + filtered_data['YardLine']
)

In [34]:
#csv_file_path = 'filtered_data.csv'
#filtered_data.to_csv(csv_file_path, index=False)

In [51]:
# Number the distinct values of each column 1, 2, 3, ... in order of first appearance.
# unique() also returns NaN when the column has missing values, so NaN gets a code too.
pass_type_mapping = {
    label: code
    for code, label in enumerate(filtered_data['PassType'].unique(), 1)
}
rush_direction_mapping = {
    label: code
    for code, label in enumerate(filtered_data['RushDirection'].unique(), 1)
}

In [53]:
# Create the 'PlayID' column: a play-type base plus the code looked up in the
# rush-direction / pass-type mappings.
# Step 1 runs for every row: IsRush * 100 + mapped RushDirection code
# (assumes IsRush is a 0/1 flag, giving a base of 100 for rushes - TODO confirm).
filtered_data['PlayID'] = filtered_data['IsRush'] * 100 + filtered_data['RushDirection'].map(rush_direction_mapping)
# Step 2 overwrites PlayID on rows where IsPass == 1 with 200 + mapped PassType code.
# The right-hand side is computed for all rows; the .loc assignment aligns it on the
# index, so only the selected rows change. Order matters: step 1 must run first,
# otherwise it would replace the pass ids written here.
filtered_data.loc[filtered_data['IsPass'] == 1, 'PlayID'] = 200 + filtered_data['PassType'].map(pass_type_mapping)

In [54]:
filtered_data.head()

Unnamed: 0,GameId,Quarter,Minute,Second,OffenseTeam,DefenseTeam,Down,ToGo,YardLine,SeriesFirstDown,...,IsChallenge,IsInterception,IsFumble,IsPenalty,IsTwoPointConversion,IsTwoPointConversionSuccessful,RushDirection,SitID,TeamID,PlayID
0,2023121101,3,1,35,NYG,GB,3,7,92,1,...,0,0,0,0,0,0,,130792,1,201
1,2023121101,3,2,19,NYG,GB,2,11,88,0,...,0,0,0,0,0,0,RIGHT GUARD,121188,1,102
2,2023121101,3,2,56,NYG,GB,1,10,89,0,...,0,0,0,0,0,0,CENTER,111089,1,103
3,2023121101,3,3,43,NYG,GB,1,10,64,1,...,0,0,0,0,0,0,,111064,1,203
4,2023121101,3,4,29,NYG,GB,2,3,55,1,...,0,0,0,0,0,0,RIGHT GUARD,120355,1,102


In [57]:
# Destination of the exported CSV (a relative path, resolved against the working directory)
csv_file_path = 'pbp-2023.csv'
# Write the frame without its index; mode='w' replaces any file already at that path.
# NOTE(review): this is the same file name as REMOTE_DATA, but the cell only writes a
# local file - nothing here uploads it to the bucket.
filtered_data.to_csv(path_or_buf=csv_file_path, mode='w', index=False)