In [1]:
# basic python packages
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, accuracy_score,  r2_score
from sklearn.preprocessing import StandardScaler
from scipy.stats.mstats import winsorize

In [2]:
# regression model packages
import xgboost as xgb
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor

In [3]:
#import data
df_results = pd.read_csv('../data/nfl_games_pfr_features.csv')
df_rankings = pd.read_csv('../data/nfl_rankings_combined.csv')

print(df_results.columns)
print(df_rankings.columns)

Index(['Unnamed: 0', 'Day', 'Date', 'Time', 'Winner', 'LoserIsHome', 'Loser',
       'PtsW', 'PtsL', 'YdsW', 'TOW', 'YdsL', 'TOL', 'season', 'week',
       'Winner Abbr', 'Loser Abbr', 'Home Team', 'Away Team', 'Winner Yds',
       'Loser Yds', 'Margin Yds', 'Winner TO', 'Loser TO', 'Margin TO',
       'Margin Pts', 'Total Pts', 'Margin Pct', 'home_team_winner',
       'away_win_bonus', 'Home_DefenseRank', 'Home_OffenseRank',
       'Home_ScoreRank', 'Week', 'Away_DefenseRank', 'Away_OffenseRank',
       'Away_ScoreRank', 'Spread', 'Delta_OffenseRank', 'Ratio_OffenseRank',
       'Delta_DefenseRank', 'Ratio_DefenseRank', 'Delta_ScoreRank',
       'Ratio_ScoreRank', 'intTerm1', 'intTerm2', 'intTerm3', 'intTerm4',
       'quadTerm1', 'quadTerm2', 'quadTerm3', 'quadTerm4', 'HomeStrength',
       'AwayStrength', 'GameType'],
      dtype='object')
Index(['Unnamed: 0', 'Team', 'Ranking', 'SeasonWeek', 'Season', 'Week',
       'Type'],
      dtype='object')


In [4]:
#Separate played and upcoming games
merged_played = df_results[~df_results['Margin Yds'].isna()]
merged_upcoming = df_results[df_results['Margin Yds'].isna()]
print(len(merged_played),len(merged_upcoming))

2111 13


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb
from sklearn.neural_network import MLPClassifier

# Function to train the model
def train_model(model, X_train, y_train):
    model.fit(X_train, y_train)
    return model

scaler = StandardScaler()

# Define features and target

filtered_played = merged_played[merged_played['Delta_ScoreRank'].abs() > 0.02]

feature_sets = [
    filtered_played[['Home_ScoreRank','Away_ScoreRank']], 
    filtered_played[['Delta_ScoreRank']],
    filtered_played[['Ratio_ScoreRank']],
    filtered_played[['Home_OffenseRank', 'Home_DefenseRank','Home_ScoreRank', 'Away_OffenseRank', 'Away_DefenseRank','Away_ScoreRank']],
    filtered_played[['Delta_ScoreRank','Delta_OffenseRank','Delta_DefenseRank']],
    filtered_played[['Ratio_ScoreRank','Ratio_OffenseRank','Ratio_DefenseRank']],
    filtered_played[['Home_OffenseRank', 'Home_DefenseRank','Away_OffenseRank', 'Away_DefenseRank']],
    filtered_played[['Delta_OffenseRank','Delta_DefenseRank']],
    filtered_played[['Ratio_OffenseRank','Ratio_DefenseRank']],
    filtered_played[['Home_OffenseRank', 'Home_DefenseRank','Home_ScoreRank', 'Away_OffenseRank',\
               'Away_DefenseRank','Away_ScoreRank', 'Delta_ScoreRank','Delta_OffenseRank',\
                'Delta_DefenseRank', 'Ratio_ScoreRank', 'Ratio_OffenseRank', 'Ratio_DefenseRank',\
                'intTerm1', 'intTerm2','intTerm3', 'intTerm4', \
                'quadTerm1','quadTerm2', 'quadTerm3', 'quadTerm4', \
                'HomeStrength', 'AwayStrength']],
    pd.DataFrame(scaler.fit_transform(filtered_played[['Home_OffenseRank', 'Home_DefenseRank','Home_ScoreRank', 'Away_OffenseRank',\
               'Away_DefenseRank','Away_ScoreRank', 'Delta_ScoreRank','Delta_OffenseRank',\
                'Delta_DefenseRank', 'Ratio_ScoreRank', 'Ratio_OffenseRank', 'Ratio_DefenseRank',\
                'intTerm1', 'intTerm2','intTerm3', 'intTerm4', \
                'quadTerm1','quadTerm2', 'quadTerm3', 'quadTerm4', \
                'HomeStrength', 'AwayStrength']]))

]

y = (filtered_played['Spread'] > 0).astype(int)  # 1 if home team wins, 0 otherwise

# Define models
models = [
    ('LogisticReg', LogisticRegression(solver='liblinear', random_state=42)),
    ('RF50', RandomForestClassifier(n_estimators=50, max_depth=3, random_state=42)),
    ('RF10', RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)),
    ('XGBoostBase', xgb.XGBClassifier(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)),
    ('XGBoost150', xgb.XGBClassifier(n_estimators=150, learning_rate=0.1, max_depth=3, random_state=42)),
    ('XGBoost5', xgb.XGBClassifier(n_estimators=50, learning_rate=0.1, max_depth=5, random_state=42)),
    ('SVC', SVC(kernel='rbf', C=1.0, probability=True)),
    ('SVClinear', SVC(kernel='linear', C=1.0, probability=True)),
    ('kNN5', KNeighborsClassifier(n_neighbors=5)),
    ('kNN17', KNeighborsClassifier(n_neighbors=17)),
    ('DecisionTree', DecisionTreeClassifier(max_depth=3, random_state=42)),
    ('GradientBoost', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)),
    ('Bagging', BaggingClassifier(n_estimators=100, random_state=42)),
    ('MLP', MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)),
    ('NaiveBayes', GaussianNB()),
    ('Baseline', DummyClassifier(strategy='most_frequent'))
]

# DataFrame to store results
results = pd.DataFrame(columns=['Model', 'Feature_Set', 'Fold', 'Accuracy', 'AUC'])

# Define cross-validation strategy (5 folds in this case)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over feature sets and models
for i, X in enumerate(feature_sets):
    for model_name, model in models:
        print(f"Evaluating model: {model_name} with feature set {i+1}")
        
        fold_number = 1
        # Cross-validation loop
        for train_index, test_index in kf.split(X):
            # Use iloc to select rows by integer index
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            
            # Train the model
            model = train_model(model, X_train, y_train)

            # Make predictions on the test set
            y_pred = model.predict(X_test)
            y_pred_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

            # Evaluate the model
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_prob) if y_pred_prob is not None else None

            # Store the results for each fold
            new_result = pd.DataFrame({
                'Model': [model_name],
                'Feature_Set': [f'Set {i+1}'],
                'Fold': [fold_number],
                'Accuracy': [accuracy * 100],
                'AUC': [auc]
            })
            results = pd.concat([results, new_result], ignore_index=True)
            
            fold_number += 1

# Print the results DataFrame
#print(results)


Evaluating model: LogisticReg with feature set 1
Evaluating model: RF50 with feature set 1


  results = pd.concat([results, new_result], ignore_index=True)


Evaluating model: RF10 with feature set 1
Evaluating model: XGBoostBase with feature set 1
Evaluating model: XGBoost150 with feature set 1
Evaluating model: XGBoost5 with feature set 1
Evaluating model: SVC with feature set 1
Evaluating model: SVClinear with feature set 1
Evaluating model: kNN5 with feature set 1
Evaluating model: kNN17 with feature set 1
Evaluating model: DecisionTree with feature set 1
Evaluating model: GradientBoost with feature set 1
Evaluating model: Bagging with feature set 1
Evaluating model: CatBoost with feature set 1
Evaluating model: MLP with feature set 1
Evaluating model: AdaBoost with feature set 1




Evaluating model: NaiveBayes with feature set 1
Evaluating model: Baseline with feature set 1
Evaluating model: LogisticReg with feature set 2
Evaluating model: RF50 with feature set 2
Evaluating model: RF10 with feature set 2
Evaluating model: XGBoostBase with feature set 2
Evaluating model: XGBoost150 with feature set 2
Evaluating model: XGBoost5 with feature set 2
Evaluating model: SVC with feature set 2
Evaluating model: SVClinear with feature set 2
Evaluating model: kNN5 with feature set 2
Evaluating model: kNN17 with feature set 2
Evaluating model: DecisionTree with feature set 2
Evaluating model: GradientBoost with feature set 2
Evaluating model: Bagging with feature set 2
Evaluating model: CatBoost with feature set 2
Evaluating model: MLP with feature set 2
Evaluating model: AdaBoost with feature set 2




Evaluating model: NaiveBayes with feature set 2
Evaluating model: Baseline with feature set 2
Evaluating model: LogisticReg with feature set 3
Evaluating model: RF50 with feature set 3
Evaluating model: RF10 with feature set 3
Evaluating model: XGBoostBase with feature set 3
Evaluating model: XGBoost150 with feature set 3
Evaluating model: XGBoost5 with feature set 3
Evaluating model: SVC with feature set 3
Evaluating model: SVClinear with feature set 3
Evaluating model: kNN5 with feature set 3
Evaluating model: kNN17 with feature set 3
Evaluating model: DecisionTree with feature set 3
Evaluating model: GradientBoost with feature set 3
Evaluating model: Bagging with feature set 3
Evaluating model: CatBoost with feature set 3
Evaluating model: MLP with feature set 3
Evaluating model: AdaBoost with feature set 3




Evaluating model: NaiveBayes with feature set 3
Evaluating model: Baseline with feature set 3
Evaluating model: LogisticReg with feature set 4
Evaluating model: RF50 with feature set 4
Evaluating model: RF10 with feature set 4
Evaluating model: XGBoostBase with feature set 4
Evaluating model: XGBoost150 with feature set 4
Evaluating model: XGBoost5 with feature set 4
Evaluating model: SVC with feature set 4
Evaluating model: SVClinear with feature set 4
Evaluating model: kNN5 with feature set 4
Evaluating model: kNN17 with feature set 4
Evaluating model: DecisionTree with feature set 4
Evaluating model: GradientBoost with feature set 4
Evaluating model: Bagging with feature set 4
Evaluating model: CatBoost with feature set 4
Evaluating model: MLP with feature set 4
Evaluating model: AdaBoost with feature set 4




Evaluating model: NaiveBayes with feature set 4
Evaluating model: Baseline with feature set 4
Evaluating model: LogisticReg with feature set 5
Evaluating model: RF50 with feature set 5
Evaluating model: RF10 with feature set 5
Evaluating model: XGBoostBase with feature set 5
Evaluating model: XGBoost150 with feature set 5
Evaluating model: XGBoost5 with feature set 5
Evaluating model: SVC with feature set 5
Evaluating model: SVClinear with feature set 5
Evaluating model: kNN5 with feature set 5
Evaluating model: kNN17 with feature set 5
Evaluating model: DecisionTree with feature set 5
Evaluating model: GradientBoost with feature set 5
Evaluating model: Bagging with feature set 5
Evaluating model: CatBoost with feature set 5
Evaluating model: MLP with feature set 5
Evaluating model: AdaBoost with feature set 5




Evaluating model: NaiveBayes with feature set 5
Evaluating model: Baseline with feature set 5
Evaluating model: LogisticReg with feature set 6
Evaluating model: RF50 with feature set 6
Evaluating model: RF10 with feature set 6
Evaluating model: XGBoostBase with feature set 6
Evaluating model: XGBoost150 with feature set 6
Evaluating model: XGBoost5 with feature set 6
Evaluating model: SVC with feature set 6
Evaluating model: SVClinear with feature set 6
Evaluating model: kNN5 with feature set 6
Evaluating model: kNN17 with feature set 6
Evaluating model: DecisionTree with feature set 6
Evaluating model: GradientBoost with feature set 6
Evaluating model: Bagging with feature set 6
Evaluating model: CatBoost with feature set 6
Evaluating model: MLP with feature set 6
Evaluating model: AdaBoost with feature set 6




Evaluating model: NaiveBayes with feature set 6
Evaluating model: Baseline with feature set 6
Evaluating model: LogisticReg with feature set 7
Evaluating model: RF50 with feature set 7
Evaluating model: RF10 with feature set 7
Evaluating model: XGBoostBase with feature set 7
Evaluating model: XGBoost150 with feature set 7
Evaluating model: XGBoost5 with feature set 7
Evaluating model: SVC with feature set 7
Evaluating model: SVClinear with feature set 7
Evaluating model: kNN5 with feature set 7
Evaluating model: kNN17 with feature set 7
Evaluating model: DecisionTree with feature set 7
Evaluating model: GradientBoost with feature set 7
Evaluating model: Bagging with feature set 7
Evaluating model: CatBoost with feature set 7
Evaluating model: MLP with feature set 7
Evaluating model: AdaBoost with feature set 7




Evaluating model: NaiveBayes with feature set 7
Evaluating model: Baseline with feature set 7
Evaluating model: LogisticReg with feature set 8
Evaluating model: RF50 with feature set 8
Evaluating model: RF10 with feature set 8
Evaluating model: XGBoostBase with feature set 8
Evaluating model: XGBoost150 with feature set 8
Evaluating model: XGBoost5 with feature set 8
Evaluating model: SVC with feature set 8
Evaluating model: SVClinear with feature set 8
Evaluating model: kNN5 with feature set 8
Evaluating model: kNN17 with feature set 8
Evaluating model: DecisionTree with feature set 8
Evaluating model: GradientBoost with feature set 8
Evaluating model: Bagging with feature set 8
Evaluating model: CatBoost with feature set 8
Evaluating model: MLP with feature set 8
Evaluating model: AdaBoost with feature set 8




Evaluating model: NaiveBayes with feature set 8
Evaluating model: Baseline with feature set 8
Evaluating model: LogisticReg with feature set 9
Evaluating model: RF50 with feature set 9
Evaluating model: RF10 with feature set 9
Evaluating model: XGBoostBase with feature set 9
Evaluating model: XGBoost150 with feature set 9
Evaluating model: XGBoost5 with feature set 9
Evaluating model: SVC with feature set 9
Evaluating model: SVClinear with feature set 9
Evaluating model: kNN5 with feature set 9
Evaluating model: kNN17 with feature set 9
Evaluating model: DecisionTree with feature set 9
Evaluating model: GradientBoost with feature set 9
Evaluating model: Bagging with feature set 9
Evaluating model: CatBoost with feature set 9
Evaluating model: MLP with feature set 9
Evaluating model: AdaBoost with feature set 9




Evaluating model: NaiveBayes with feature set 9
Evaluating model: Baseline with feature set 9
Evaluating model: LogisticReg with feature set 10
Evaluating model: RF50 with feature set 10
Evaluating model: RF10 with feature set 10
Evaluating model: XGBoostBase with feature set 10
Evaluating model: XGBoost150 with feature set 10
Evaluating model: XGBoost5 with feature set 10
Evaluating model: SVC with feature set 10
Evaluating model: SVClinear with feature set 10
Evaluating model: kNN5 with feature set 10
Evaluating model: kNN17 with feature set 10
Evaluating model: DecisionTree with feature set 10
Evaluating model: GradientBoost with feature set 10
Evaluating model: Bagging with feature set 10
Evaluating model: CatBoost with feature set 10
Evaluating model: MLP with feature set 10




Evaluating model: AdaBoost with feature set 10




Evaluating model: NaiveBayes with feature set 10
Evaluating model: Baseline with feature set 10
Evaluating model: LogisticReg with feature set 11
Evaluating model: RF50 with feature set 11
Evaluating model: RF10 with feature set 11
Evaluating model: XGBoostBase with feature set 11
Evaluating model: XGBoost150 with feature set 11
Evaluating model: XGBoost5 with feature set 11
Evaluating model: SVC with feature set 11
Evaluating model: SVClinear with feature set 11
Evaluating model: kNN5 with feature set 11
Evaluating model: kNN17 with feature set 11
Evaluating model: DecisionTree with feature set 11
Evaluating model: GradientBoost with feature set 11
Evaluating model: Bagging with feature set 11
Evaluating model: CatBoost with feature set 11
Evaluating model: MLP with feature set 11




Evaluating model: AdaBoost with feature set 11




Evaluating model: NaiveBayes with feature set 11
Evaluating model: Baseline with feature set 11


In [10]:
# Print the results DataFrame
print(results.to_csv(index=False))

Model,Feature_Set,Fold,Accuracy,AUC
LogisticReg,Set 1,1,57.14285714285714,0.639429175475687
LogisticReg,Set 1,2,51.53061224489795,0.5868520066889631
LogisticReg,Set 1,3,56.63265306122449,0.6480978260869565
LogisticReg,Set 1,4,59.183673469387756,0.6605639730639731
LogisticReg,Set 1,5,57.6530612244898,0.5974842767295597
RF50,Set 1,1,63.775510204081634,0.6451902748414376
RF50,Set 1,2,64.79591836734694,0.6256270903010035
RF50,Set 1,3,63.26530612244898,0.6554138795986623
RF50,Set 1,4,62.755102040816325,0.6406776094276094
RF50,Set 1,5,62.755102040816325,0.614832285115304
RF10,Set 1,1,63.775510204081634,0.6449788583509514
RF10,Set 1,2,59.693877551020414,0.6174749163879599
RF10,Set 1,3,61.73469387755102,0.6383779264214047
RF10,Set 1,4,62.244897959183675,0.653672138047138
RF10,Set 1,5,61.73469387755102,0.6342767295597483
XGBoostBase,Set 1,1,62.755102040816325,0.6301797040169134
XGBoostBase,Set 1,2,57.6530612244898,0.6063963210702341
XGBoostBase,Set 1,3,61.224489795918366,0.6293896321070235
XGBo

In [11]:
X = filtered_played[['Home_OffenseRank', 'Home_DefenseRank','Home_ScoreRank', 'Away_OffenseRank', 'Away_DefenseRank','Away_ScoreRank']]
y = (filtered_played['Spread'] > 0).astype(int)
model = RandomForestRegressor(n_estimators=50, max_depth=3, min_samples_split=2,min_samples_leaf=1,max_features='sqrt', random_state=42)

model.fit(X, y)

# Make predictions on the test set
X_new = merged_upcoming[['Home_OffenseRank', 'Home_DefenseRank','Home_ScoreRank', 'Away_OffenseRank', 'Away_DefenseRank','Away_ScoreRank']]

y_pred_new = model.predict(X_new)
merged_upcoming.loc[:, 'Predicted Winner'] = y_pred_new
print(merged_upcoming[['Home Team','Away Team','Delta_ScoreRank','Predicted Winner']])

   Home Team Away Team  Delta_ScoreRank  Predicted Winner
15       CHI       JAC         -0.00386          0.472599
16       PHI       CLE          0.04646          0.692462
17       TEN       IND         -0.03403          0.441097
18        GB       ARI          0.02059          0.580230
19        NE       HOU         -0.01445          0.509864
20        NO        TB         -0.05418          0.475988
21       BAL       WAS          0.00806          0.544053
22        LV       PIT         -0.07049          0.410363
23       DEN       LAC          0.07975          0.633347
24       CAR       ATL         -0.05177          0.410849
25       DAL       DET          0.01047          0.467094
26       NYG       CIN          0.01597          0.550455
27       NYJ       BUF         -0.00428          0.540574


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_upcoming.loc[:, 'Predicted Winner'] = y_pred_new
