<a href="https://colab.research.google.com/github/nickbohall/NFL_Betting_Model/blob/main/NFL_Totals_Model_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports & Housekeeping

In [None]:
# !pip install --upgrade pip setuptools==57.5.0 
# !pip install regressors

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from functools import reduce
from datetime import datetime as dt
import statsmodels.api as sm
from regressors import stats

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from google.colab import drive
# Mount Google Drive (Colab only) so the /content/drive/... CSV paths read later in this notebook resolve.
drive.mount('/content/drive')

In [None]:
# Notebook-wide display and plotting defaults.

# Show floats with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old names, so use whichever name this
# installation actually provides instead of hard-coding 'seaborn'.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Set after the style sheet so this figure size is the one that sticks.
plt.rcParams['figure.figsize'] = (12,8)

# Limit how much of a DataFrame is rendered in cell output.
pd.set_option('display.max_columns', 15)
pd.set_option('display.max_rows', 50)

# Let's get some damn data. 

The data comes from a Python API. All of the API work and data manipulation was done in PyCharm and can be found on GitHub.

In [None]:
# Locations of the two CSV inputs on the mounted Google Drive.
MODEL_DATA_CSV = "/content/drive/MyDrive/Colab Notebooks/NFL Model/Data/Model Data/final_data.csv"
SCHEDULE_CSV = "/content/drive/MyDrive/Colab Notebooks/NFL Model/Data/API Data/schedule_2002_to_2022.csv"

# Both files store their row index in the first column.
df = pd.read_csv(MODEL_DATA_CSV, index_col=0)
schedule = pd.read_csv(SCHEDULE_CSV, index_col=0)

OK, let's define the target and features and create a train/test split.

In [None]:
# Keep only complete rows, then separate the response from the predictors.
df = df.dropna()
target = df["total_score"]

# Every remaining column is a predictor once the score and line columns are removed.
features = df.drop(columns=["score_diff", "spread_line", "total_line", "total_score"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=10,
)
print(X_train.shape, y_train.shape)

Let's define a linear regression object and fit it to our training X and y.

In [None]:
# Build the regressor and train it on the training partition in one step
# (fit() returns the fitted estimator itself).
clf = LinearRegression().fit(X_train, y_train)

# Score the held-out 20% (X_test) with the fitted model.
y_pred = clf.predict(X_test)

In [None]:
# Refit the regression in statsmodels to get a coefficient table.
# sm.OLS does not add an intercept on its own, whereas the LinearRegression
# model that produced y_pred fits one by default. Add the constant column
# explicitly so the summary and R^2 describe the same model as the predictions.
# has_constant="skip" leaves the design matrix untouched when it already
# contains a constant column.
mod = sm.OLS(y_train, sm.add_constant(X_train, has_constant="skip"))
fii = mod.fit()

# Values annotated on the scatter plot in the next cell.
r2 = fii.rsquared  # in-sample R^2 of the statsmodels fit (training rows)
mse = mean_squared_error(y_test, y_pred)  # hold-out MSE, arguments in (y_true, y_pred) order

fii.summary()

Not bad. Let's see what it looks like, and then we can tweak.

Current $R^2$: 0.142

In [None]:
# Scatter the hold-out predictions against the actual totals and overlay a
# least-squares trendline.
#
# The plotting style is already applied once in the housekeeping cell, so it is
# not re-applied here: the bare 'seaborn' style name is rejected by recent
# Matplotlib releases, and calling plt.style.use again would overwrite any
# rcParams the style sheet defines that were customised after it.

# set x and y
x = y_pred
y = y_test

# Degree-1 fit of actual on predicted: z holds [slope, intercept], p evaluates the line.
z = np.polyfit(x, y, 1)
p = np.poly1d(z)

fig, ax = plt.subplots()

# Points, then the trendline on the same axes.
ax.scatter(x, y)
ax.plot(x, p(x), color="red")

# Titles and axes
ax.set_title("NFL Linear Regression totals predictions - Model")
ax.set_xlabel("Predicted Point Total")
ax.set_ylabel("Actual Point Total")

# Annotate in axes-fraction coordinates so the text stays inside the plot
# whatever the data range is (data coordinates can land outside the axes).
ax.text(0.02, 0.95, 'R-squared = %0.4f' % r2, transform=ax.transAxes)
ax.text(0.02, 0.90, "y = %.2fx + %.2f"%(z[0],z[1]), transform=ax.transAxes)
ax.text(0.02, 0.85, f"MSE = {mse: .3f}", transform=ax.transAxes)

plt.show()


Okay, let's do the same thing, but instead of our own features, let's use the Vegas closing line as the only predictor. This will give us an idea of how close we are getting to the Vegas models.

In [None]:
# Baseline inputs: total_score as the response, total_line as the sole predictor.
df = df.dropna()
target = df["total_score"]
features = df["total_line"]

# Same 80/20 split and seed as the feature-based model.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=10,
)

# A single Series is 1-D; reshape each partition into an (n, 1) array.
X_train = X_train.to_numpy().reshape(-1, 1)
X_test = X_test.to_numpy().reshape(-1, 1)
print(X_train.shape, y_train.shape)

In [None]:
# Build the regressor and train it on the training partition in one step
# (fit() returns the fitted estimator itself).
clf = LinearRegression().fit(X_train, y_train)

# Score the held-out 20% (X_test) with the fitted model.
y_pred = clf.predict(X_test)


In [None]:
# Refit the single-predictor regression in statsmodels to get a coefficient table.
# X_train is the bare (n, 1) array built above, and sm.OLS does not add an
# intercept on its own, so without a constant the fit is forced through the
# origin and R^2 is not comparable with the intercept-fitting LinearRegression
# that produced y_pred. Add the constant column explicitly.
mod = sm.OLS(y_train, sm.add_constant(X_train, has_constant="skip"))
fii = mod.fit()

# Values annotated on the scatter plot in the next cell.
r2 = fii.rsquared  # in-sample R^2 of the statsmodels fit (training rows)
mse = mean_squared_error(y_test, y_pred)  # hold-out MSE, arguments in (y_true, y_pred) order

fii.summary()

In [None]:
# Scatter the hold-out predictions of the Vegas-line model against the actual
# totals and overlay a least-squares trendline.
#
# The plotting style is already applied once in the housekeeping cell, so it is
# not re-applied here: the bare 'seaborn' style name is rejected by recent
# Matplotlib releases, and calling plt.style.use again would overwrite any
# rcParams the style sheet defines that were customised after it.

# set x and y
x = y_pred
y = y_test

# Degree-1 fit of actual on predicted: z holds [slope, intercept], p evaluates the line.
z = np.polyfit(x, y, 1)
p = np.poly1d(z)

fig, ax = plt.subplots()

# Points, then the trendline on the same axes.
ax.scatter(x, y)
ax.plot(x, p(x), color="red")

# Titles and axes
ax.set_title("NFL Linear Regression totals predictions - Vegas")
ax.set_xlabel("Predicted Point Total")
ax.set_ylabel("Actual Point Total")

# Annotate in axes-fraction coordinates so the text stays inside the plot
# whatever the data range is (data coordinates can land outside the axes).
ax.text(0.02, 0.95, 'R-squared = %0.4f' % r2, transform=ax.transAxes)
ax.text(0.02, 0.90, "y = %.2fx + %.2f"%(z[0],z[1]), transform=ax.transAxes)
ax.text(0.02, 0.85, f"MSE = {mse: .3f}", transform=ax.transAxes)

plt.show()

In [None]:
# Side-by-side table of actual and predicted values for the hold-out rows,
# plus their difference (actual minus predicted).
pred_y_df = pd.DataFrame(
    {
        "Actual Value": y_test,
        "Predicted Value": y_pred,
        "model_difference": y_test - y_pred,
    }
)
pred_y_df

OK, let's try a random forest.

In [None]:
#Random forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# Random-forest inputs: score_diff is the response; the predictors are the
# columns whose names contain both "ewma" and "dynamic".
df = df.dropna()
target = df["score_diff"]
feature_cols = [name for name in df.columns if "ewma" in name and "dynamic" in name]
features = df.loc[:, feature_cols]

In [None]:
#Data preprocessing
X = features.values
y = df['score_diff'].values.reshape(-1, 1)

# Seed the split (same seed as the earlier cells) so the 90/10 partition, and
# therefore the reported error, is reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=10)

# Seed the forest too; n_jobs=-1 builds the 2000 trees on all available cores.
RF_model = RandomForestRegressor(n_estimators=2000, random_state=10, n_jobs=-1)

# y_train is an (n, 1) column vector. Flatten it for fit(), which expects a
# 1-D target and otherwise emits a DataConversionWarning. y_train itself is
# left as (n, 1) so later cells see the same shape as before.
RF_model.fit(X_train, y_train.ravel())

y_pred = RF_model.predict(X_test)

# Mean absolute error on the held-out 10%.
print(mean_absolute_error(y_test, y_pred))

  RF_model.fit(X_train, y_train)


11.066862344582594


In [None]:
# Pair each feature name with the fitted forest's feature_importances_ value
# and rank them, largest importance first.
importance = RF_model.feature_importances_
feature_names = feature_cols

dfRF = (
    pd.DataFrame({'Var_Name': feature_names, 'Imp': importance})
    .sort_values(by='Imp', ascending=False)
    .reset_index(drop=True)
)
dfRF.head(10)

Unnamed: 0,Var_Name,Imp
0,ewma_dynamic_window_passing_offense_home,0.16
1,ewma_dynamic_window_passing_offense_away,0.15
2,ewma_dynamic_window_passing_defense_home,0.13
3,ewma_dynamic_window_passing_defense_away,0.12
4,ewma_dynamic_window_rushing_offense_away,0.11
5,ewma_dynamic_window_rushing_offense_home,0.11
6,ewma_dynamic_window_rushing_defense_home,0.11
7,ewma_dynamic_window_rushing_defense_away,0.11


In [None]:
# Print the shape of the train and test target arrays, one per line,
# followed by a blank line.
for split_target in (y_train, y_test):
    print(split_target.shape)

print()

(5065, 1)
(563, 1)



In [None]:
#LOGISTIC
import statsmodels.api as sm

# sm.Logit needs a response confined to [0, 1]. y_train here holds raw
# score_diff values, which is what made this cell raise ValueError.
# Model the binary outcome "score_diff > 0" instead.
# NOTE(review): ties (score_diff == 0) fall into class 0, and which side a
# positive score_diff favours is not visible in this notebook — confirm this is
# the intended target before interpreting the coefficients.
y_train_binary = (np.ravel(y_train) > 0).astype(int)

# statsmodels does not add an intercept on its own; include one explicitly.
logit_model = sm.Logit(y_train_binary, sm.add_constant(X_train, has_constant="skip"))
result = logit_model.fit()
print(result.summary())

ValueError: ignored

## Fuck it Fast AI Time

In [None]:
from fastai.tabular.all import *

In [None]:
# Show the column names that were selected as model features.
print(feature_cols)

In [None]:
# Join the target and the feature columns on their shared row index into one
# frame (target column first), then preview the first rows.
fai_df = pd.merge(
    left=target,
    right=features,
    left_index=True,
    right_index=True,
)
fai_df.head()

In [None]:
# Build a random splitter that reserves 20% for validation, then apply it to
# the indices produced by range_of(df).
splitter = RandomSplitter(valid_pct=0.2)
splits = splitter(range_of(df))