In [599]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel
import nfl_data_py as nfl
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
import datetime as dt
import warnings
warnings.filterwarnings('ignore')
today = dt.date.today()
year = today.year

In [600]:
df = nfl.import_schedules(years=range(year-5,year+1))
currSeason = df[df.season == year]
predWeek = currSeason[['week', 'total']].dropna()
if np.isnan(predWeek.week.max()):
    predWeek = 1
else:
    predWeek = predWeek.week.max() + 1
predWeek=3

In [601]:
# Prepare dataframe by dropping irrelevant predictors and formatting columns for KNN
df = df[df.result != 0]
df['Home'] = np.where(df['result'] > 0, 1, 0)

def date_to_month(time_str):
    year, month, day = map(int, time_str.split('-'))
    return month
df['month'] = df['gameday'].apply(date_to_month)
# Function to convert time to seconds
def time_to_seconds(time_str):
    hours, minutes = map(int, time_str.split(':'))
    return hours * 3600 + minutes * 60
# Apply the function to the 'time' column
df['gametime'] = df['gametime'].apply(time_to_seconds)

dict_day = {"weekday": {"Sunday": 0, "Monday": 1, "Tuesday": 2, "Wednesday": 3, "Thursday": 4, "Friday": 5, "Saturday": 6}}
df.replace(dict_day, inplace=True)
dict_roof = {"roof": {"outdoors": 0, "dome": 1, "closed": 2, "open": 3}}
df.replace(dict_roof, inplace=True)
dict_surface = {"surface": {"grass": 0, "grass ": 0, "fieldturf": 1, "astroturf": 2, "sportturf": 3, "matrixturf": 4, "astroplay": 5, "a_turf": 6, "dessograss": 7}}
df.replace(dict_surface, inplace=True)

df_dummy = pd.get_dummies(df, drop_first=True, columns=['game_type', 'location', 'stadium_id', 'home_team', 'away_team', 'home_qb_id', 'away_qb_id', 'home_coach', 'away_coach'])

In [602]:
features = df_dummy.drop(['Home', 'home_moneyline', 'away_moneyline', 'gameday', 'surface', 'game_id', 'home_score', 'away_score', 'result', 'total', 'overtime', 'old_game_id', 'gsis', 'nfl_detail_id', 'pfr', 'pff', 'espn', 'ftn', 'away_qb_name', 'home_qb_name', 'referee', 'stadium', 'wind', 'temp'], axis=1).columns

# Accuracy Testing

In [603]:
# Model building
df_acc = df_dummy.dropna()
y = df_acc.Home
X = df_acc[features]

precis_array = []
acc_array = []
for i in range(1, 26):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=i)

    model = xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        max_depth=6,  # Limit the depth of the trees
        learning_rate=0.1,
        n_estimators=1000,  # Use a large number of trees
        reg_alpha=0.1,  # L1 regularization term on weights
        reg_lambda=0.1,  # L2 regularization term on weights
        early_stopping_rounds=10  # Stop early if validation score doesn't improve
    )
    # Evaluation set
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=i)

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Predict probabilities and classes on selected features
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    precis = precision_score(y_test, y_pred)
    precis_array.append(precis)
    acc_array.append(acc)

In [604]:
def find_mean(array):
    if len(array) == 0:
        return 0  # To handle empty arrays
    return sum(array) / len(array)

In [605]:
print("This model predicts the home team correctly", round(find_mean(precis_array)*100, 2), "% of the time.")

This model predicts the home team correctly 67.47 % of the time.


In [606]:
print("This model predicts the winner correctly", round(find_mean(acc_array)*100, 2), "% of the time.")

This model predicts the winner correctly 60.96 % of the time.


# Weekly Plays

In [607]:
train_df = df_dummy[(df_dummy.season < year) | ((df_dummy.season == year) & (df_dummy.week < predWeek))]
test_df = df_dummy[(df_dummy.season == year) & (df_dummy.week == predWeek)]
train_df.dropna(inplace=True)
X_train = train_df[features]
y_train = train_df.Home
X_test = test_df[features]
y_test = test_df.Home

model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    max_depth=6,  # Limit the depth of the trees
    learning_rate=0.1,
    n_estimators=1000,  # Use a large number of trees
    reg_alpha=0.1,  # L1 regularization term on weights
    reg_lambda=0.1,  # L2 regularization term on weights
    early_stopping_rounds=10  # Stop early if validation score doesn't improve
)

# Evaluation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Predict probabilities and classes on selected features
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Append predictions and probabilities to X_test
X_test['Prediction'] = y_pred
X_test['Home Probability'] = y_pred_proba

# Predicted Plays log
nextPlays = pd.merge(right=X_test, left=currSeason, right_index=True, left_index=True, how='left')
nextPlays = nextPlays[['game_id', 'season_x', 'week_x', 'home_team', 'away_team', 'gametime_x', 'weekday_x', 'spread_line_x', 'home_moneyline', 'Prediction', 'Home Probability']]
nextPlays.columns = ['Game ID', 'Season', 'Week', 'Home', 'Away', 'Start Time', 'Day', 'Spread Line', 'Home Moneyline', 'Prediction', 'Home Probability']
nextPlays = nextPlays[nextPlays.Week == predWeek]
nextPlays['Home Implied Odds'] = np.where(nextPlays['Home Moneyline'] < 0, (abs(nextPlays['Home Moneyline'])/(abs(nextPlays['Home Moneyline'])+100)), (100/(nextPlays['Home Moneyline']+100)))
nextPlays = nextPlays[nextPlays.Prediction == 1]
# Value cleanup
dict_day = {"Day": {0: "Sunday", 1: "Monday", 2: "Tuesday", 3: "Wednesday", 4: "Thursday", 5: "Friday", 6: "Saturday"}}
nextPlays.replace(dict_day, inplace=True)
dict_pred = {"Prediction": {1: "Home"}}
nextPlays.replace(dict_pred, inplace=True)
nextPlays

Unnamed: 0,Game ID,Season,Week,Home,Away,Start Time,Day,Spread Line,Home Moneyline,Away Moneyline,Prediction,Home Probability,Home Implied Odds
6738,2024_03_NE_NYJ,2024,3,NYJ,NE,20:15,Thursday,6.5,-285.0,230.0,Home,0.744163,0.74026
6740,2024_03_CHI_IND,2024,3,IND,CHI,13:00,Sunday,1.5,-125.0,105.0,Home,0.570002,0.555556
6741,2024_03_HOU_MIN,2024,3,MIN,HOU,13:00,Sunday,-2.0,114.0,-135.0,Home,0.551984,0.46729
6747,2024_03_MIA_SEA,2024,3,SEA,MIA,16:05,Sunday,4.0,-205.0,170.0,Home,0.516471,0.672131
6748,2024_03_DET_ARI,2024,3,ARI,DET,16:25,Sunday,-3.0,124.0,-148.0,Home,0.508686,0.446429
6749,2024_03_BAL_DAL,2024,3,DAL,BAL,16:25,Sunday,-1.5,100.0,-120.0,Home,0.542394,0.5
6750,2024_03_SF_LA,2024,3,LA,SF,16:25,Sunday,-6.0,220.0,-270.0,Home,0.601034,0.3125
6751,2024_03_KC_ATL,2024,3,ATL,KC,20:20,Sunday,-3.0,140.0,-166.0,Home,0.744163,0.416667
6752,2024_03_JAX_BUF,2024,3,BUF,JAX,19:30,Monday,5.5,-238.0,195.0,Home,0.545563,0.704142
6753,2024_03_WAS_CIN,2024,3,CIN,WAS,20:15,Monday,7.0,-340.0,270.0,Home,0.718593,0.772727


In [608]:
pos_EV_home = nextPlays.query('`Home Probability` > `Home Implied Odds`')
pos_EV_home

Unnamed: 0,Game ID,Season,Week,Home,Away,Start Time,Day,Spread Line,Home Moneyline,Away Moneyline,Prediction,Home Probability,Home Implied Odds
6738,2024_03_NE_NYJ,2024,3,NYJ,NE,20:15,Thursday,6.5,-285.0,230.0,Home,0.744163,0.74026
6740,2024_03_CHI_IND,2024,3,IND,CHI,13:00,Sunday,1.5,-125.0,105.0,Home,0.570002,0.555556
6741,2024_03_HOU_MIN,2024,3,MIN,HOU,13:00,Sunday,-2.0,114.0,-135.0,Home,0.551984,0.46729
6748,2024_03_DET_ARI,2024,3,ARI,DET,16:25,Sunday,-3.0,124.0,-148.0,Home,0.508686,0.446429
6749,2024_03_BAL_DAL,2024,3,DAL,BAL,16:25,Sunday,-1.5,100.0,-120.0,Home,0.542394,0.5
6750,2024_03_SF_LA,2024,3,LA,SF,16:25,Sunday,-6.0,220.0,-270.0,Home,0.601034,0.3125
6751,2024_03_KC_ATL,2024,3,ATL,KC,20:20,Sunday,-3.0,140.0,-166.0,Home,0.744163,0.416667


In [609]:
parlay = nextPlays.sort_values(by='Home Probability', ascending=False).query('`Home Probability` > .7')
parlay

Unnamed: 0,Game ID,Season,Week,Home,Away,Start Time,Day,Spread Line,Home Moneyline,Away Moneyline,Prediction,Home Probability,Home Implied Odds
6738,2024_03_NE_NYJ,2024,3,NYJ,NE,20:15,Thursday,6.5,-285.0,230.0,Home,0.744163,0.74026
6751,2024_03_KC_ATL,2024,3,ATL,KC,20:20,Sunday,-3.0,140.0,-166.0,Home,0.744163,0.416667
6753,2024_03_WAS_CIN,2024,3,CIN,WAS,20:15,Monday,7.0,-340.0,270.0,Home,0.718593,0.772727
