In [58]:
import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib
import matplotlib.pyplot as plt
import random

from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

Load in the Data

In [102]:
games_df = pd.read_csv("C:/Users/jesse/Desktop/CSE141L/COGS118BProject/Data/nfl-big-data-bowl-2024/games.csv")
games_df

Unnamed: 0,gameId,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore
0,2022090800,2022,1,09/08/2022,20:20:00,LA,BUF,10,31
1,2022091100,2022,1,09/11/2022,13:00:00,ATL,NO,26,27
2,2022091101,2022,1,09/11/2022,13:00:00,CAR,CLE,24,26
3,2022091102,2022,1,09/11/2022,13:00:00,CHI,SF,19,10
4,2022091103,2022,1,09/11/2022,13:00:00,CIN,PIT,20,23
...,...,...,...,...,...,...,...,...,...
131,2022110607,2022,9,11/06/2022,13:00:00,WAS,MIN,17,20
132,2022110608,2022,9,11/06/2022,16:05:00,ARI,SEA,21,31
133,2022110609,2022,9,11/06/2022,16:25:00,TB,LA,16,13
134,2022110610,2022,9,11/06/2022,20:20:00,KC,TEN,20,17


In [21]:
plays_df = pd.read_csv("C:/Users/jesse/Desktop/CSE141L/COGS118BProject/Data/nfl-big-data-bowl-2024/plays.csv")
plays_df = plays_df[["quarter", "down", "yardsToGo", "possessionTeam","defensiveTeam","absoluteYardlineNumber", "gameClock", "preSnapHomeScore","preSnapVisitorScore","passResult", "offenseFormation", "defendersInTheBox"]]
#makes pass result binary(0 run, 1 pass)
plays_df['passResult'] = plays_df['passResult'].fillna(0).where(plays_df['passResult'].isna(),1)
#assigns each offensive formation a unique number classifier
offense_formations = plays_df['offenseFormation'].unique()
offense_formations_num = {value: i for i, value in enumerate(offense_formations)}
plays_df['offenseFormation'] = plays_df['offenseFormation'].map(offense_formations_num)
#assigns each possession team a unique number classifier
teams = plays_df.possessionTeam.unique()
teams_num = {value: i for i, value in enumerate(teams)}
plays_df['possessionTeam'] = plays_df['possessionTeam'].map(teams_num)
#assigns each defensive team a unique number classifier
plays_df['defensiveTeam'] = plays_df['defensiveTeam'].map(teams_num)
#make all game times purly numbers (probably not necessary - most models can handle datetime or can convert to int differently)
plays_df['gameClock'] = plays_df['gameClock'].replace(':', "", regex =True).astype(int)

teams_num

{'ATL': 0,
 'PIT': 1,
 'LV': 2,
 'DEN': 3,
 'BUF': 4,
 'TEN': 5,
 'SF': 6,
 'KC': 7,
 'SEA': 8,
 'GB': 9,
 'NYJ': 10,
 'LA': 11,
 'JAX': 12,
 'DAL': 13,
 'DET': 14,
 'CAR': 15,
 'IND': 16,
 'MIN': 17,
 'CLE': 18,
 'WAS': 19,
 'CIN': 20,
 'BAL': 21,
 'PHI': 22,
 'TB': 23,
 'NO': 24,
 'ARI': 25,
 'NE': 26,
 'NYG': 27,
 'MIA': 28,
 'CHI': 29,
 'HOU': 30,
 'LAC': 31}

Seperating plays by team and creating a test and train dataset for each team(later we can do analysis on which teams we can predict the best)

In [41]:
teams = plays_df.possessionTeam.unique()
#empty dictionary of team names
plays_df_by_team_test = {x : pd.DataFrame() for x in teams}
plays_df_by_team_train = {x : pd.DataFrame() for x in teams}
plays_df_test = pd.DataFrame()
plays_df_train = pd.DataFrame()
#fills dictionary with all plays according to possesion team
for team in plays_df_by_team_test.keys():
    plays_df_by_team_test[team] = plays_df[:][plays_df.possessionTeam == team]
    #seperate into train and test sets seperated by team 
    rows = len(plays_df_by_team_test[team])
    rand_idx = np.random.randint(0, rows,size = int(rows/10))
    plays_df_by_team_train[team] = plays_df_by_team_test[team].drop(plays_df_by_team_test[team].index[rand_idx])
    plays_df_by_team_test[team] = plays_df_by_team_test[team].iloc[rand_idx]
    #create a joined train and test set
    plays_df_test = pd.concat([plays_df_test, plays_df_by_team_test[team]], ignore_index = True)
    plays_df_train = pd.concat([plays_df_train, plays_df_by_team_train[team]], ignore_index = True)

#Randomize final sets to mix teams
plays_df_test = plays_df_test.sample(frac=1)
plays_df_train = plays_df_test.sample(frac=1)
#split datasets into parameters and result
plays_df_test_res = plays_df_test["passResult"]
plays_df_test.drop("passResult",axis =1, inplace=True)
plays_df_train_res = plays_df_train["passResult"]
plays_df_train.drop("passResult",axis =1, inplace=True)

#print to show
print("Pass Results")
print(plays_df_test_res)
print("Test Data")
plays_df_test

Pass Results
34      1
211     0
996     1
953     0
344     1
       ..
878     1
1028    1
1056    1
866     1
287     1
Name: passResult, Length: 1233, dtype: object
Test Data


Unnamed: 0,quarter,down,yardsToGo,possessionTeam,defensiveTeam,absoluteYardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,offenseFormation,defendersInTheBox
34,2,1,10,0,24,92,347,10,7,4,8.0
211,1,2,10,5,4,99,632,7,0,1,8.0
996,3,2,5,25,15,54,242,10,10,0,5.0
953,2,3,1,24,21,44,136,0,14,2,7.0
344,2,2,8,9,23,74,1021,3,7,2,7.0
...,...,...,...,...,...,...,...,...,...,...,...
878,4,3,10,23,11,73,919,6,13,0,5.0
1028,2,1,10,26,10,50,42,10,3,4,6.0
1056,2,1,15,27,8,67,820,7,0,2,6.0
866,4,1,10,22,17,50,1430,24,7,3,6.0


In [56]:
plays_test_res = np.ravel(np.array(plays_df_test_res))
plays_test_param = np.array(plays_df_test)
plays_train_res = np.ravel(np.array(plays_df_train_res))
plays_train_param = np.array(plays_df_train)

Testing Out Sklearn Logistic Regression to see how good the Data is

In [68]:
model = LinearRegression()
model.fit(plays_train_param, plays_train_res)
pass_pred = model.predict(plays_test_param)
pass_pred = np.where(pass_pred>.5,1,0)
matching = np.sum(pass_pred == plays_test_res)
#percentage accuarcy
matching/len(pass_pred) * 100

71.37064071370641

In [None]:
tackles_df = pd.read_csv("C:/Users/jesse/Desktop/CSE141L/COGS118BProject/Data/nfl-big-data-bowl-2024/tackles.csv")
tackles_df

Unnamed: 0,gameId,playId,nflId,tackle,assist,forcedFumble,pff_missedTackle
0,2022090800,101,42816,1,0,0,0
1,2022090800,393,46232,1,0,0,0
2,2022090800,486,40166,1,0,0,0
3,2022090800,646,47939,1,0,0,0
4,2022090800,818,40107,1,0,0,0
...,...,...,...,...,...,...,...
17421,2022091113,2494,43533,0,0,0,1
17422,2022092502,3510,42406,0,0,0,1
17423,2022091113,3642,43478,0,0,0,1
17424,2022091901,3578,42431,0,0,0,1
