In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import warnings

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('getting_there.csv', parse_dates=['date'], date_format='%Y-%m-%d')

In [3]:
df.shape

(6175, 70)

In [4]:
df.head(3)

Unnamed: 0,game_id,home_team,away_team,date,season,month,week,day_of_week,overtime,home_pts,...,away_4datt,home_top,away_top,pts_diff_home,pass_yds_diff_home,rush_yds_diff_home,top_diff_home,home_total_yds,away_total_yds,home_results
0,2000_1_New York Giants_Arizona Cardinals,New York Giants,Arizona Cardinals,2000-09-03,2000,9,1,6,0,21,...,2,1890,1710,5,-140,180,180,395,355,1
1,2000_1_New England Patriots_Tampa Bay Buccaneers,New England Patriots,Tampa Bay Buccaneers,2000-09-03,2000,9,1,6,0,16,...,0,1705,1895,-5,34,-52,-190,278,296,0
2,2000_1_Pittsburgh Steelers_Baltimore Ravens,Pittsburgh Steelers,Baltimore Ravens,2000-09-03,2000,9,1,6,0,0,...,1,1493,2107,-16,-3,-110,-614,223,336,0


## Perform One-Hot Encoding

In [17]:
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Combine home and away teams for encoding
teams = pd.concat([df['home_team'], df['away_team']]).unique()
encoder.fit(teams.reshape(-1, 1))

# Encode home teams
home_encoded = encoder.transform(df[['home_team']])
home_feature_names = encoder.get_feature_names_out(['home_team'])
home_encoded_df = pd.DataFrame(home_encoded, columns=home_feature_names)

# Encode away teams
away_encoded = encoder.transform(df[['away_team']])
away_feature_names = encoder.get_feature_names_out(['away_team'])
away_encoded_df = pd.DataFrame(away_encoded, columns=away_feature_names)

# Concatenate with original DataFrame
df_encoded = pd.concat([df, home_encoded_df, away_encoded_df], axis=1)

# Drop original 'home_team' and 'away_team' columns
df_encoded = df_encoded.drop(['home_team', 'away_team'], axis=1)

In [19]:
df_encoded.shape, df.shape

((6175, 130), (6175, 70))

## Prepare Your Features And Target

In [24]:
# Assuming 'home_results' is your target variable
X = df_encoded.drop(['game_id', 'date','home_results'], axis=1)
y = df_encoded['home_results']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Create And Train Logistic Regression Model

In [25]:
# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the model
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

## Make Predictions

In [26]:
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]  # Probability of home team winning

## Evaluate The Model

In [32]:
print("Accuracy:", f'{accuracy_score(y_test, y_pred):.4%}')
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", f'{roc_auc_score(y_test, y_pred_proba):%}')

Accuracy: 97.8947%

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       542
           1       0.98      0.98      0.98       693

    accuracy                           0.98      1235
   macro avg       0.98      0.98      0.98      1235
weighted avg       0.98      0.98      0.98      1235

ROC AUC Score: 98.469939%


## Future Predictions

In [None]:
# make dummies or get the average of the last season to predict in the first week of games then be able to 
# make predictions, then take each week's stats and plug in to the week to best predict

In [36]:
future_games = pd.read_csv('NFL_2024_schedule.csv')
future_games.head(2)

Unnamed: 0,home_team,away_team,season,month,week,day_of_week
0,Kansas City Chiefs,Baltimore Ravens,2024,9,1,3
1,Philadelphia Eagles,Green Bay Packers,2024,9,1,4


In [49]:
y_predictions = np.round(y_pred_proba * 100, 2)
y_predictions = pd.DataFrame(y_predictions, columns= ['home_win_prob'])
y_predictions # predictions are on X_test dataset subset

Unnamed: 0,home_win_prob
0,0.00
1,9.28
2,100.00
3,100.00
4,100.00
...,...
1230,0.00
1231,100.00
1232,100.00
1233,100.00


In [62]:
X_test

Unnamed: 0,season,month,week,day_of_week,overtime,home_pts,away_pts,home_pass_cmp,away_pass_cmp,home_pass_att,...,away_team_New Orleans Saints,away_team_New York Giants,away_team_New York Jets,away_team_Philadelphia Eagles,away_team_Pittsburgh Steelers,away_team_San Francisco 49ers,away_team_Seattle Seahawks,away_team_Tampa Bay Buccaneers,away_team_Tennessee Titans,away_team_Washington Commanders
2402,2009,11,8,6,0,10,31,15,25,23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2662,2010,11,9,6,1,20,23,20,22,37,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6102,2023,12,14,6,0,33,13,24,19,39,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
346,2001,10,7,0,0,34,7,14,14,24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4681,2018,10,6,6,0,27,3,17,23,23,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5908,2023,9,1,6,0,20,38,24,15,37,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3846,2015,9,2,6,0,43,18,21,33,27,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3008,2011,12,15,3,0,41,14,24,12,32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2223,2008,12,13,0,0,30,17,14,25,25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
