# Setup

In [1]:
# Import needed libraries
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import nflreadpy as nfl
import seaborn as sns

from sklearn.preprocessing import StandardScaler

In [2]:
# Set display options
pd.set_option('display.max_rows', 1000)

# Import data

In [4]:
df = pd.read_csv('../data/clean_data.csv')
df

Unnamed: 0,season,week,team,season_type,opponent_team,previous_game_completions,previous_game_attempts,previous_game_passing_yards,previous_game_passing_tds,previous_game_passing_interceptions,...,opponent_previous_season_average_gwfg_att,opponent_previous_season_average_gwfg_missed,opponent_previous_season_average_gwfg_blocked,opponent_previous_season_average_gwfg_distance,opponent_previous_season_average_opponent_points_scored,opponent_previous_season_average_outcome,opponent_previous_season_average_points_scored,had_bye_week_last,opponent_had_bye_week_last,outcome
0,2008,1,ARI,REG,SF,23.0,39.0,300.0,3.0,2.0,...,0.062500,0.0625,0.000000,3.250000,22.375000,0.281250,13.187500,1,1,1.0
1,2008,1,ATL,REG,DET,17.0,27.0,251.0,4.0,0.0,...,0.062500,0.0000,0.000000,2.312500,27.250000,0.437500,21.125000,1,1,1.0
2,2008,1,BAL,REG,CIN,16.0,27.0,171.0,1.0,0.0,...,0.000000,0.0000,0.000000,0.000000,24.062500,0.437500,22.875000,1,1,1.0
3,2008,1,BUF,REG,SEA,16.0,30.0,133.0,0.0,0.0,...,0.000000,0.0000,0.000000,0.000000,18.722222,0.611111,24.444444,1,1,1.0
4,2008,1,CAR,REG,LAC,15.0,24.0,174.0,2.0,1.0,...,0.000000,0.0000,0.000000,0.000000,17.000000,0.684211,23.736842,1,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9607,2025,13,SEA,REG,MIN,16.0,26.0,244.0,2.0,0.0,...,0.055556,0.0000,0.000000,1.611111,19.166667,0.777778,24.166667,0,0,1.0
9608,2025,13,SF,REG,CLE,23.0,32.0,193.0,1.0,3.0,...,0.000000,0.0000,0.000000,0.000000,25.588235,0.176471,14.705882,0,0,1.0
9609,2025,13,TB,REG,ARI,17.0,34.0,103.0,1.0,2.0,...,0.117647,0.0000,0.000000,3.882353,22.294118,0.411765,22.588235,0,0,1.0
9610,2025,13,TEN,REG,JAX,28.0,42.0,256.0,1.0,0.0,...,0.058824,0.0000,0.000000,2.882353,25.470588,0.235294,18.470588,0,0,0.0


# Process data

In [5]:
def process_df(df):
    '''
    - Removes unnecessary features
    - Splits the preprocessed team_stats df
    - Scales the features

    Parameters:
    df - the preprocessed team data returned from preprocess_team_data

    Returns:
    X_train, y_train, X_val, y_val, X_test, y_test - feature matrix and responders
    '''

    # # # # # # # # #
    # Drop Columns  #
    # # # # # # # # #

    # Discard draws - draws happen ~0.005% of the time
    # Since the application of this project is to outperform sportsbooks, 
    # and they discard the bet on a draw, we can safely ignore the draw 
    # from our data to only predict wins/losses
    df = df.query('outcome != 0.5')

    # Discard team names - we don't want to learn patterns from team names since 
    # they can change drastically from season to season.
    # e.g. Patriots dynasty occurs exclusively in train data (ended when Tom Brady left)
    df = df.drop(columns=['team', 'opponent_team'])

    # Turn season_type column into indicator for post-season 
    # (only takes on values regular and post season)
    df.rename(columns={'season_type': 'is_postseason'}, inplace=True)
    df['is_postseason'] = (df['is_postseason'] == 'POST').astype(int)

    # No field goals have ever been missed <20 yards out, so all these variables only have value 0
    df = df.drop(columns=['previous_game_fg_missed_0_19',
                          'opponent_previous_game_fg_missed_0_19',
                          'season_average_fg_missed_0_19',
                          'opponent_season_average_fg_missed_0_19',
                          'previous_season_average_fg_missed_0_19',
                          'opponent_previous_season_average_fg_missed_0_19'])
    
    df = df.reset_index(drop=True)

    
    # # # # # # # #
    # Split data  #
    # # # # # # # #

    # Since this is time series data, we want to split by seasons such that train < val < test in time
    # For this we will use train data < 2020, 2020 <= val data < 2023, 2023 <= test data
    train = df.query('season < 2020').reset_index(drop=True)
    X_train = train.drop(columns=['outcome'])
    y_train = train['outcome']
    
    val = df.query('season >= 2020 and season < 2023').reset_index(drop=True)
    X_val = val.drop(columns=['outcome'])
    y_val = val['outcome']
    
    test = df.query('season >= 2023').reset_index(drop=True)
    X_test = test.drop(columns=['outcome'])
    y_test = test['outcome']


    # # # # # # # # # #
    # Scale Features  #
    # # # # # # # # # #
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns, index=X_val.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [6]:
X_train, y_train, X_val, y_val, X_test, y_test = process_df(df)

# EDA