In [1]:
import pandas as pd

df = pd.read_csv('nfl-cleaned.csv')

In [2]:
df.sort_values('Date')
# df.drop(columns=['Unnamed: 0'])
df = df.reset_index()

**Training and Testing Our Model: Expanding Mean Statistics**  
To effectively train and test our model, we must compute expanding averages over the statistics from previous games. This approach ensures that the predictions for an upcoming game are based solely on data available prior to that game. By using expanding averages rather than rolling averages, we capture the cumulative mean of all prior games, rather than limiting the calculation to a fixed sliding window. This method provides a more comprehensive view of a team’s performance trajectory throughout the season, offering a robust foundation for predicting the outcome of the next game.

In [3]:
df[df['Team1'] == 'DAL'].head()

In [4]:
# Compute the expanding averages over previous games. 

numeric_columns = ['Home',
				'Team1Pts',    
				'Team2Pts',    
				'Team1PtDiff',
				'Team2PtDiff', 
				'Team1TM',    
				'Team2TM',     
				'Team1Rating', 
				'Team2Rating', 
				'Team1Sks',    
				'Team2Sks',    
				'Team1SkYds',  
				'Team2SkYds',  
				'Team1RushAtt',
				'Team2RushAtt',
				'Team1RushYds',
				'Team2RushYds',
				'Team1RYM',    
				'Team2RYM',    
				'Team1PYM',    
				'Team2PYM',    
				'Team1YM',     
				'Team2YM']

for column in numeric_columns:
	avg_col_name = column + '_avg'
	df[avg_col_name] = (
		df.groupby('Team1', group_keys=False)[column]
		.apply(lambda group: group.expanding().mean().shift(1))
		.reset_index(drop=True)
	)

In [None]:
# Verify that expanding mean statistics have been computed.

df[df['Team1'] == 'DAL'].head()

In [6]:
# TODO: Should we drop the first column or impute with its original values?

for column in numeric_columns:
	avg_col_name = column + '_avg'
	df[avg_col_name] = df[avg_col_name].fillna(df[column])

df[df['Team1'] == 'DAL'].head()

To ensure the integrity of our model evaluation, we will hold out a test set and set it aside for later use. Since the dataframe has already been sorted in chronological order, we can simply split it without worrying about data leakage. This chronological sorting ensures that the time series logic is preserved, allowing us to test the model on future data points that were not part of the training process.

In [7]:
import numpy as np

train_set, test_set = np.split(df, [int(0.8 * len(df))])

Let's do cross validation and training!

This code defines a function, `prep_data_for_fold`, to prepare training and testing datasets for each fold in a time series cross-validation split. It starts by identifying relevant feature columns for the training and testing sets. The `pre_game_cols` represent pre-game data, while `train_post_game_cols` capture various post-game statistics (e.g., points, rushing yards, sacks, ratings). For testing, the equivalent columns are suffixed with `_avg` to indicate they contain average statistics up to the current game. The function takes a specific fold (a tuple of training and testing indices) and a dataframe as inputs. It splits the data into training and testing subsets based on these indices, extracts the specified features (`X_train` and `X_test`), and retrieves the corresponding outcome labels (`y_train` and `y_test`). These prepared datasets are then returned, ready for use in training and evaluating the model during each fold of the time series split.

In [8]:
# Prepare train and test sets for each fold of TimeSeriesSplit

pre_game_cols = ['Team1', 'Team2', 'Home']
train_post_game_cols = ['Team1Pts', 'Team2Pts', 'Team1RushYds', 'Team2RushYds', 'Team1SkYds', 'Team2SkYds',
                  'Team1Sks', 'Team2Sks', 'Team1RushAtt', 'Team2RushAtt', 'Team1RYM', 'Team2RYM', 
                  'Team1PYM', 'Team2PYM', 'Team1YM', 'Team2YM', 'Team1Rating', 'Team2Rating']

test_post_game_cols = [col + '_avg' for col in train_post_game_cols]

outcome_col = 'Team1Won'

def prep_data_for_fold(fold, df):
  
  # Split data into training and testing based on the fold
  train_indices, test_indices = fold
  train_data = df.iloc[train_indices]
  test_data = df.iloc[test_indices]

  # Extract features that will be trained and tested on
  X_train = train_data[train_post_game_cols]
  X_test = test_data[test_post_game_cols]
  
  # Class labels from fold split
  y_train = train_data[outcome_col]
  y_test = test_data[outcome_col]
  
  return X_train, X_test, y_train, y_test

In [9]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# TODO: PCA???
# TODO: Run SequentialFeatureSelector to only use best features?

# Nested cross-validation loop that computes how well a model does.
# Return an accuracy score averaged over all folds of the TimeSeriesSplit.

def get_model_accuracy(model, params, df):

  tscv = TimeSeriesSplit(n_splits=5)

  accuracies = []

  # Outer loop: find average accuracy over all time series splits

  for train_indices, test_indices in tscv.split(df):

    # Prepare the data for this fold
    X_train, X_test, y_train, y_test = prep_data_for_fold((train_indices, test_indices), df)

    # Rename the <col>_avg columns to just <col> so GridSearchCV doesn't complain
    X_test = X_test.rename(columns=lambda x: x[:-4] if x.endswith('_avg') else x)

    # Scale the data (fit scaler on training data and transform both train and test)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Inner loop: find the best hyperparameters for this split
    grid_search = GridSearchCV(estimator=model, param_grid=params, cv=tscv)
    grid_search.fit(X_train, y_train)

    # Get the best model from grid search
    best_model = grid_search.best_estimator_

    # Analyze how well model does by comparing its predictions to actual class labels
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
  
  print(best_model)

  # Return the average accuracy across all outer folds
  return sum(accuracies) / len(accuracies)

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import warnings

warnings.filterwarnings("ignore")

models_dict = {}

dtc = DecisionTreeClassifier()
dtc_params = {
  'max_depth': [5, 10, 15, 20],
  'max_features': [5, 10, 15],
  'min_samples_leaf': [5, 10, 15, 20]
}

lr = LogisticRegression()
lr_params = {
	'penalty': ['l1', 'l2', 'elasticnet', None],
	'C': [0.1, 1, 10],
	'solver': ['liblinear', 'saga'],
	'max_iter': [100, 200]
}

rfc = RandomForestClassifier()
rfc_params = {
  'n_estimators': [50, 100, 200],
  'max_depth': [None, 10, 20],
  'min_samples_split': [2, 5, 10],
  'min_samples_leaf': [1, 2, 4]
}

gbc = GradientBoostingClassifier()
gbc_params = {
  'n_estimators': [50, 100, 200],
  'learning_rate': [0.01, 0.1, 0.2],
  'max_depth': [3, 5, 7],
  'subsample': [0.8, 1.0]
}

knn = KNeighborsClassifier()
knn_params = {
  'n_neighbors': [3, 5, 10],
  'weights': ['uniform', 'distance'],
  'p': [1, 2]  # 1 = Manhattan distance, 2 = Euclidean distance
}

svc = SVC()
svc_params = {
  'C': [0.1, 1, 10],
  'kernel': ['linear', 'rbf', 'poly'],
  'gamma': ['scale', 'auto'],
  'degree': [2, 3, 4]  # Only for 'poly' kernel
}

models_dict[dtc] = dtc_params
models_dict[lr] = lr_params
# models_dict[rfc] = rfc_params
# models_dict[gbc] = gbc_params
models_dict[knn] = knn_params
models_dict[svc] = svc_params

for model in models_dict.keys():
  score = get_model_accuracy(model, models_dict[model], df)
  print(score)

# TODO: Handle <col>_avg NaN for each team's first game of the season
# LogisticRegression() does not accept missing values encoded as NaN natively.

DecisionTreeClassifier(max_depth=20, max_features=15, min_samples_leaf=15)
0.5853658536585366
LogisticRegression(C=0.1, penalty='l1', solver='saga')
0.6097560975609756
KNeighborsClassifier(n_neighbors=10, weights='distance')
0.5772357723577235
SVC(C=1, degree=2, kernel='linear')
0.6032520325203252


Let's do a final evaluation of the model on the held out test set.

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Let's say that LogisticRegression(C=0.1, penalty='l1', solver='saga') is the best model.

lr = LogisticRegression(C=0.1, solver='saga', penalty='l1')

X_train = train_set[train_post_game_cols]
X_test = test_set[test_post_game_cols].rename(columns=lambda x: x[:-4] if x.endswith('_avg') else x)
y_train = train_set[outcome_col]
y_test = test_set[outcome_col]

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(accuracy)

0.581081081081081


In [12]:
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(y_test, y_pred)
TP = conf_matrix[0, 0]
FN = conf_matrix[0, 1]
FP = conf_matrix[1, 0]
TN = conf_matrix[1, 1]

In [13]:
def print_confusion_matrix(TP, FN, FP, TN):
    table_data = [[TP,FN],[FP,TN]]
    df = pd.DataFrame(table_data, columns =['Predicted 1','Predicted 0'])
    df = df.rename(index={0: 'Actual 1', 1: 'Actual 0'})
    display(df)



In [14]:
print_confusion_matrix(TP, FN, FP, TN)

Unnamed: 0,Predicted 1,Predicted 0
Actual 1,43,24
Actual 0,38,43
