In [11]:
from sklearn.linear_model import Ridge
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib

In [12]:
base_df = pd.read_csv(r'C:\Users\NickBowen\Projects\NFL Data Project\Game Data\NFL_TeamGame_Stats2024 - Sheet1.csv')

In [13]:
df = base_df.copy()

In [14]:
# Split the data into features and target
X = df.drop(['Points','Week','Win/Loss','Opponent Points','Opponent','Team'], axis=1)
y = df['Points']

In [15]:
#defining categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Create a column transformer for one-hot encoding and imputation
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent value
            ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))  # Ignore unknown categories
        ]), categorical_cols),
        ('num', SimpleImputer(strategy='mean'), X.select_dtypes(include=['float64', 'int64']).columns)  # Impute missing values with the mean for numerical columns
    ],
    remainder='passthrough'  # Keep the rest of the columns unchanged
)

In [18]:
# Create a pipeline with the preprocessor and the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', Ridge(alpha=10.0, random_state=42))
])

In [19]:
# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(drop='first',
                                                                                 handle_unknown='ignore'))]),
                                                  ['Location', 'Stadium Type',
                                                   'Time of Possesion']),
                                                 ('num', SimpleImputer(),
                                                  Index(['First Downs', 'Rushing Attempts', 'Rushing Yards',...
       'Passing Yards', 'Passing Touchdown

In [20]:
# Cell 6: Save model
model_filename = 'NFL_Week5_RidgeModel.pkl'
joblib.dump(pipeline, model_filename)
print(f"Model saved to {model_filename}")

Model saved to NFL_Week5_RidgeModel.pkl
