In [77]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel
import nfl_data_py as nfl
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import GridSearchCV
import datetime as dt
import warnings
warnings.filterwarnings('ignore')
today = dt.date.today()
year = today.year

In [78]:
df = nfl.import_schedules(years=range(year-5,year+1))
currSeason = df[df.season == year]
predWeek = currSeason[['week', 'total']].dropna()
if np.isnan(predWeek.week.max()):
    predWeek = 1
else:
    predWeek = predWeek.week.max() + 1
predWeek=4

In [79]:
# Prepare dataframe by dropping irrelevant predictors and formatting columns for KNN
df = df[df.result != 0]
df['Home'] = np.where(df['result'] > 0, 1, 0)

def date_to_month(time_str):
    year, month, day = map(int, time_str.split('-'))
    return month
df['month'] = df['gameday'].apply(date_to_month)
# Function to convert time to seconds
def time_to_seconds(time_str):
    hours, minutes = map(int, time_str.split(':'))
    return hours * 3600 + minutes * 60
# Apply the function to the 'time' column
df['gametime'] = df['gametime'].apply(time_to_seconds)

dict_day = {"weekday": {"Sunday": 0, "Monday": 1, "Tuesday": 2, "Wednesday": 3, "Thursday": 4, "Friday": 5, "Saturday": 6}}
df.replace(dict_day, inplace=True)
dict_roof = {"roof": {"outdoors": 0, "dome": 1, "closed": 2, "open": 3}}
df.replace(dict_roof, inplace=True)
dict_surface = {"surface": {"grass": 0, "grass ": 0, "fieldturf": 1, "astroturf": 2, "sportturf": 3, "matrixturf": 4, "astroplay": 5, "a_turf": 6, "dessograss": 7}}
df.replace(dict_surface, inplace=True)

df_dummy = pd.get_dummies(df, drop_first=True, columns=['home_team', 'away_team', 'home_qb_id', 'away_qb_id', 'home_coach', 'away_coach'])

In [80]:
features = df_dummy.drop(['Home', 'season', 'away_spread_odds', 'home_spread_odds', 'under_odds', 'over_odds', 'game_type', 'location', 'stadium_id', 'home_moneyline', 'away_moneyline', 'gameday', 'surface', 'game_id', 'home_score', 'away_score', 'result', 'total', 'overtime', 'old_game_id', 'gsis', 'nfl_detail_id', 'pfr', 'pff', 'espn', 'ftn', 'away_qb_name', 'home_qb_name', 'referee', 'stadium', 'wind', 'temp'], axis=1).columns

In [81]:
print(*features)

week weekday gametime away_rest home_rest spread_line total_line div_game roof month home_team_ATL home_team_BAL home_team_BUF home_team_CAR home_team_CHI home_team_CIN home_team_CLE home_team_DAL home_team_DEN home_team_DET home_team_GB home_team_HOU home_team_IND home_team_JAX home_team_KC home_team_LA home_team_LAC home_team_LV home_team_MIA home_team_MIN home_team_NE home_team_NO home_team_NYG home_team_NYJ home_team_OAK home_team_PHI home_team_PIT home_team_SEA home_team_SF home_team_TB home_team_TEN home_team_WAS away_team_ATL away_team_BAL away_team_BUF away_team_CAR away_team_CHI away_team_CIN away_team_CLE away_team_DAL away_team_DEN away_team_DET away_team_GB away_team_HOU away_team_IND away_team_JAX away_team_KC away_team_LA away_team_LAC away_team_LV away_team_MIA away_team_MIN away_team_NE away_team_NO away_team_NYG away_team_NYJ away_team_OAK away_team_PHI away_team_PIT away_team_SEA away_team_SF away_team_TB away_team_TEN away_team_WAS home_qb_id_00-0020531 home_qb_id_00

# Model Tuning

In [82]:
df_acc = df_dummy.dropna()
y = df_acc.Home
X = df_acc[features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=i)

param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 500, 1000],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0.1, 0.5, 1.0]
}

xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=3, scoring='roc_auc', verbose=1)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best parameters found:  {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0.5, 'reg_lambda': 0.1}
Best cross-validation score:  0.774703557312253


# Accuracy Testing

In [88]:
# Model building
precis_array = []
acc_array = []
for i in range(1, 26):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=i)

    model = xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        max_depth=3,  # Limit the depth of the trees
        learning_rate=0.01,
        n_estimators=100,  # Use a large number of trees
        reg_alpha=0.5,  # L1 regularization term on weights
        reg_lambda=.1,  # L2 regularization term on weights
        early_stopping_rounds=10  # Stop early if validation score doesn't improve
    )
    # Evaluation set
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=i)

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Predict probabilities and classes on selected features
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    precis = precision_score(y_test, y_pred)
    precis_array.append(precis)
    acc_array.append(acc)

In [89]:
def find_mean(array):
    if len(array) == 0:
        return 0  # To handle empty arrays
    return sum(array) / len(array)

In [90]:
print("This model predicts the home team correctly", round(find_mean(precis_array)*100, 2), "% of the time.")

This model predicts the home team correctly 70.66 % of the time.


In [91]:
print("This model predicts the winner correctly", round(find_mean(acc_array)*100, 2), "% of the time.")

This model predicts the winner correctly 60.96 % of the time.


# Weekly Plays

In [92]:
train_df = df_dummy[(df_dummy.season < year) | ((df_dummy.season == year) & (df_dummy.week < predWeek))]
test_df = df_dummy[(df_dummy.season == year) & (df_dummy.week == predWeek)]
train_df.dropna(inplace=True)
X_train = train_df[features]
y_train = train_df.Home
X_test = test_df[features]
y_test = test_df.Home

model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    max_depth=3,  # Limit the depth of the trees
    learning_rate=0.01,
    n_estimators=100,  # Use a large number of trees
    reg_alpha=0.5,  # L1 regularization term on weights
    reg_lambda=0.1,  # L2 regularization term on weights
    early_stopping_rounds=10  # Stop early if validation score doesn't improve
)

# Evaluation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Predict probabilities and classes on selected features
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Append predictions and probabilities to X_test
X_test['Prediction'] = y_pred
X_test['Home Probability'] = y_pred_proba

# Predicted Plays log
nextPlays = pd.merge(right=X_test, left=currSeason, right_index=True, left_index=True, how='left')
nextPlays = nextPlays[['game_id', 'season', 'week_x', 'home_team', 'away_team', 'gametime_x', 'weekday_x', 'spread_line_x', 'home_moneyline', 'Prediction', 'Home Probability']]
nextPlays.columns = ['Game ID', 'Season', 'Week', 'Home', 'Away', 'Start Time', 'Day', 'Spread Line', 'Home Moneyline', 'Prediction', 'Home Probability']
nextPlays = nextPlays[nextPlays.Week == predWeek]
nextPlays['Home Implied Odds'] = np.where(nextPlays['Home Moneyline'] < 0, (abs(nextPlays['Home Moneyline'])/(abs(nextPlays['Home Moneyline'])+100)), (100/(nextPlays['Home Moneyline']+100)))
nextPlays = nextPlays[nextPlays.Prediction == 1]
# Value cleanup
dict_day = {"Day": {0: "Sunday", 1: "Monday", 2: "Tuesday", 3: "Wednesday", 4: "Thursday", 5: "Friday", 6: "Saturday"}}
nextPlays.replace(dict_day, inplace=True)
dict_pred = {"Prediction": {1: "Home"}}
nextPlays.replace(dict_pred, inplace=True)
nextPlays

Unnamed: 0,Game ID,Season,Week,Home,Away,Start Time,Day,Spread Line,Home Moneyline,Prediction,Home Probability,Home Implied Odds
6755,2024_04_NO_ATL,2024,4,ATL,NO,13:00,Sunday,2.5,-135.0,Home,0.52096,0.574468
6757,2024_04_LA_CHI,2024,4,CHI,LA,13:00,Sunday,3.0,-155.0,Home,0.523896,0.607843
6758,2024_04_MIN_GB,2024,4,GB,MIN,13:00,Sunday,3.0,-148.0,Home,0.523896,0.596774
6759,2024_04_JAX_HOU,2024,4,HOU,JAX,13:00,Sunday,6.5,-278.0,Home,0.523896,0.73545
6760,2024_04_PIT_IND,2024,4,IND,PIT,13:00,Sunday,-1.5,105.0,Home,0.52096,0.487805
6761,2024_04_DEN_NYJ,2024,4,NYJ,DEN,13:00,Sunday,7.5,-425.0,Home,0.809644,0.809524
6762,2024_04_PHI_TB,2024,4,TB,PHI,13:00,Sunday,-1.5,110.0,Home,0.52096,0.47619
6764,2024_04_NE_SF,2024,4,SF,NE,16:05,Sunday,10.5,-575.0,Home,0.800065,0.851852
6766,2024_04_CLE_LV,2024,4,LV,CLE,16:25,Sunday,2.0,-125.0,Home,0.522375,0.555556
6767,2024_04_BUF_BAL,2024,4,BAL,BUF,20:20,Sunday,2.5,-130.0,Home,0.522375,0.565217


In [93]:
pos_EV_home = nextPlays.query('`Home Probability` > `Home Implied Odds`')
pos_EV_home

Unnamed: 0,Game ID,Season,Week,Home,Away,Start Time,Day,Spread Line,Home Moneyline,Prediction,Home Probability,Home Implied Odds
6760,2024_04_PIT_IND,2024,4,IND,PIT,13:00,Sunday,-1.5,105.0,Home,0.52096,0.487805
6761,2024_04_DEN_NYJ,2024,4,NYJ,DEN,13:00,Sunday,7.5,-425.0,Home,0.809644,0.809524
6762,2024_04_PHI_TB,2024,4,TB,PHI,13:00,Sunday,-1.5,110.0,Home,0.52096,0.47619


In [94]:
parlay = nextPlays.sort_values(by='Home Probability', ascending=False).query('`Home Probability` > .7')
parlay

Unnamed: 0,Game ID,Season,Week,Home,Away,Start Time,Day,Spread Line,Home Moneyline,Prediction,Home Probability,Home Implied Odds
6761,2024_04_DEN_NYJ,2024,4,NYJ,DEN,13:00,Sunday,7.5,-425.0,Home,0.809644,0.809524
6764,2024_04_NE_SF,2024,4,SF,NE,16:05,Sunday,10.5,-575.0,Home,0.800065,0.851852
