In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors, LocalOutlierFactor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
import nfl_data_py as nfl
import datetime as dt
import warnings
warnings.filterwarnings('ignore')
today = dt.date.today()
year = today.year
pd.set_option("display.max_columns", None)
pd.set_option('display.max_colwidth', None)

In [17]:
df = nfl.import_schedules(years=range(2000,year+1))
currSeason = df[df.season == year]
predWeek = currSeason[['week', 'total_line']].dropna()
predWeek = predWeek.week.max()

In [18]:
df.drop(columns=['game_id', 'season', 'home_score', 'away_score', 'result', 'overtime', 'old_game_id', 'gsis', 'nfl_detail_id', 'pfr', 'pff', 'espn', 'ftn', 'away_rest', 'home_rest', 'away_qb_name', 'home_qb_name', 'stadium'], inplace=True)
df['Under'] = np.where(df['total'] < df['total_line'], 1, 0)
df['Push'] = np.where(df['total'] == df['total_line'], 1, 0)
df = df[df.Push != 1]
df.drop(columns=['total', 'Push'] , inplace=True)

def date_to_month(time_str):
    year, month, day = map(int, time_str.split('-'))
    return month
df['month'] = df['gameday'].apply(date_to_month)
# Function to convert time to seconds
def time_to_seconds(time_str):
    hours, minutes = map(int, time_str.split(':'))
    return hours * 3600 + minutes * 60
# Apply the function to the 'time' column
df['gametime'] = df['gametime'].apply(time_to_seconds)

In [19]:
dict_day = {"weekday": {"Sunday": 0, "Monday": 1, "Tuesday": 2, "Wednesday": 3, "Thursday": 4, "Friday": 5, "Saturday": 6}}
df.replace(dict_day, inplace=True)
dict_roof = {"roof": {"outdoors": 0, "dome": 1, "closed": 2, "open": 3}}
df.replace(dict_roof, inplace=True)
dict_surface = {"surface": {"grass": 0, "grass ": 0, "fieldturf": 1, "astroturf": 2, "sportturf": 3, "matrixturf": 4, "astroplay": 5, "a_turf": 6, "dessograss": 7}}
df.replace(dict_surface, inplace=True)

In [20]:
df = pd.get_dummies(df, drop_first=True, columns=['game_type', 'away_team', 'home_team', 'location', 'temp', 'away_qb_id', 'home_qb_id', 'away_coach', 'home_coach', 'referee', 'stadium_id'])
df.reset_index(drop=True, inplace=True)
df = df.dropna()

In [21]:
X_variables = df.drop(['Under', 'gameday'], axis=1).copy()
y_variable = df['Under'].copy()

selected_X = SelectKBest(f_classif, k=12)
selected_X.fit(X_variables, y_variable)

indices = selected_X.get_support(indices=True)
selected_features = X_variables.columns[indices]
print(selected_features)

Index(['wind', 'away_team_STL', 'temp_36.0', 'temp_39.0', 'temp_46.0',
       'temp_59.0', 'away_qb_id_00-0032950', 'home_qb_id_00-0019596',
       'home_qb_id_00-0032950', 'away_coach_Norv Turner',
       'home_coach_Joe Judge', 'referee_John Parry'],
      dtype='object')


In [22]:
nfl.import_contracts()

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)>

In [None]:
# Model building
feats = df.drop(columns=['home_team', 'away_team', 'season', 'gameday', 'Over', 'Under', 'Push'])
features = feats.columns
target = 'Under'

train_df = df[df.season < 2019]
test_df = df[df.season == 2019]
X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]

# model = KNeighborsClassifier(n_neighbors=13)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', KNeighborsClassifier(n_neighbors=7))
])

classif = pipe.fit(X_train, y_train)

pipe2 = Pipeline([
    ('scaler', StandardScaler()),
    ('lof', LocalOutlierFactor(novelty=True))
])

pipe2.fit(X_train)
y_test_nov = pipe2.predict(X_test)

mask = [y == 1 for y in y_test_nov]

X_test = X_test[mask]
y_test = y_test[mask]
y_pred = classif.predict(X_test)
y_true = y_test

print(f'Total accuracy score={accuracy_score(y_true, y_pred):.2%}')
print(f'\nClassification Report:')
print(classification_report(y_true, y_pred, target_names=['Over', 'Under']))

cm = confusion_matrix(y_true, y_pred)
display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Over', 'Under'])
display.plot()
plt.grid(False)
plt.show()

KeyError: "['home_team', 'away_team', 'season', 'Over', 'Push'] not found in axis"

In [None]:
train_df = df[(df.season < year) & (df.week < predWeek) | (df.season < year)]
test_df = df[(df.season == year) & (df.week == predWeek)]
X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', KNeighborsClassifier(n_neighbors=7))
])

classif = pipe.fit(X_train, y_train)

pipe2 = Pipeline([
    ('scaler', StandardScaler()),
    ('lof', LocalOutlierFactor(novelty=True))
])

pipe2.fit(X_train)
y_test_nov = pipe2.predict(X_test)

mask = [y == 1 for y in y_test_nov]
X_test = X_test[mask]
y_test = y_test[mask]
y_pred = classif.predict(X_test)
y_true = y_test

# Predicted Plays log
nextPlays = currSeason[currSeason.week == predWeek]
nextPlays['Predicted Outcome'] = y_pred
nextPlays = nextPlays[nextPlays['Predicted Outcome'] == 1]
nextPlays = nextPlays[['game_id', 'season', 'week', 'home_team', 'away_team', 'gametime', 'weekday', 'total_line', 'under_odds']]
nextPlays.columns = ['Game ID', 'Season', 'Week', 'Home', 'Away', 'Start Time', 'Day', 'Total Line', 'Under Odds']

ValueError: Length of values (14) does not match length of index (16)

In [None]:
nextPlays

Unnamed: 0,Game ID,Season,Week,Home,Away,Start Time,Day,Total Line,Under Odds
6708,2024_01_PIT_ATL,2024,1,ATL,PIT,13:00,Sunday,42.0,-112.0
6709,2024_01_ARI_BUF,2024,1,BUF,ARI,13:00,Sunday,48.0,-108.0
6710,2024_01_TEN_CHI,2024,1,CHI,TEN,13:00,Sunday,43.0,-108.0
6713,2024_01_JAX_MIA,2024,1,MIA,JAX,13:00,Sunday,49.0,-110.0
6715,2024_01_MIN_NYG,2024,1,NYG,MIN,13:00,Sunday,41.5,-112.0
6718,2024_01_DAL_CLE,2024,1,CLE,DAL,16:25,Sunday,43.0,-110.0
6719,2024_01_WAS_TB,2024,1,TB,WAS,16:25,Sunday,42.0,-112.0
6720,2024_01_LA_DET,2024,1,DET,LA,20:20,Sunday,51.0,-110.0
