In [None]:
import nfl_data_py as nfl
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
import joblib
import seaborn as sns

In [None]:
# silences pandas warnings
# Turning the chained-assignment check off hides SettingWithCopyWarning for the
# whole session; it only suppresses the message, it does not change what an
# assignment to a filtered frame actually does.
pd.options.mode.chained_assignment = None
# Show up to 300 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 300)

### Functions

In [None]:
# Function to update Elo rating after a match
def update_elo(player1_elo, player2_elo, result, k_factor=32):
    '''
    Return the new Elo ratings of both competitors after one contest.

    result is player 1's score for the contest (1 = win, 0 = loss, 0.5 = draw);
    player 2 is credited with 1 - result. k_factor scales how far a single
    result moves the ratings. The return value is the tuple
    (player1_new_elo, player2_new_elo).
    '''
    # Logistic expectation on the 400-point Elo scale
    rating_gap = player2_elo - player1_elo
    expected_p1 = 1 / (1 + 10 ** (rating_gap / 400))
    expected_p2 = 1 - expected_p1

    actual_p2 = 1 - result
    updated_p1 = player1_elo + k_factor * (result - expected_p1)
    updated_p2 = player2_elo + k_factor * (actual_p2 - expected_p2)

    return updated_p1, updated_p2

# Function to regress Elo ratings towards the mean
def regress_to_mean(elo_ratings, mean_elo, regression_weight=1/3):
    '''
    Pull every rating in elo_ratings['Elo'] towards mean_elo.

    Each rating moves regression_weight of the way to mean_elo. The 'Elo'
    column of the frame that was passed in is overwritten and that same frame
    is returned.
    '''
    current = elo_ratings['Elo']
    pull = regression_weight * (mean_elo - current)
    elo_ratings['Elo'] = current + pull
    return elo_ratings

# Function to calculate mean Elo
def calculate_mean_elo(elo_ratings):
    '''Return the sum of the 'Elo' column divided by the number of rows.'''
    total_elo = elo_ratings['Elo'].sum()
    row_count = len(elo_ratings)
    return total_elo / row_count

def get_elo(team,season,week,df):
    '''
    Return the Elo rating a team carries into the given week of a season.

    df is a history table with 'Team', 'Season', 'Week' and 'Elo' columns.
    The rating recorded for week-1 is used when it exists; otherwise (for
    example after a week without a game) the most recent earlier week of the
    same season is used. When several rows match, the first one is returned.

    Raises IndexError when the team has no row before `week` in that season.
    '''
    team_season = (df['Team']==team)&(df['Season']==season)

    # Preferred source: the rating recorded for the immediately preceding week
    previous_week = df.loc[team_season&(df['Week']==week-1), 'Elo']
    if len(previous_week) > 0:
        return previous_week.values[0]

    # Otherwise fall back to the latest earlier week of the same season.
    # With no earlier row the lookup below is empty and raises IndexError.
    team_week = df.loc[team_season&(df['Week']<week), 'Week'].max()
    return df.loc[team_season&(df['Week']==team_week), 'Elo'].values[0]

def get_qb_elo(qb_id,season,week,historical_elo_qb):
    '''
    This function will grab the specific elo rating of a QB from a specific season and week

    historical_elo_qb is a history table with 'passer_id', 'Season', 'Week'
    and 'Elo' columns. The rating is looked up in this order:

    1. (weeks after week 1 only) the QB's most recent entry earlier in the
       same season - the previous week when he played it, otherwise the
       latest week before `week`;
    2. the last recorded week of the QB's most recent earlier season;
    3. 1500 when the QB has no earlier entry at all.

    Every lookup filters on the qb_id argument only; nothing is read from
    notebook-level variables.
    '''
    default_elo = 1500
    qb_history = historical_elo_qb.loc[historical_elo_qb['passer_id']==qb_id]

    if week != 1:
        this_season = qb_history.loc[(qb_history['Season']==season)&(qb_history['Week']<week)]
        if len(this_season) > 0:
            qb_week = this_season['Week'].max()
            return this_season.loc[this_season['Week']==qb_week, 'Elo'].values[0]

    earlier_seasons = qb_history.loc[qb_history['Season']<season]
    if len(earlier_seasons) > 0:
        qb_season = earlier_seasons['Season'].max()
        last_season = earlier_seasons.loc[earlier_seasons['Season']==qb_season]
        qb_week = last_season['Week'].max()
        return last_season.loc[last_season['Week']==qb_week, 'Elo'].values[0]

    return default_elo

def get_value(df, season, week, team, team_col_name, num, denom):
    '''
    Get weighted average of stats over the last two seasons

    The statistic is sum(num) / sum(denom) over the team's rows, computed once
    for the current season (weeks before `week`) and once for the whole
    previous season. The two are combined as follows:

    - no current-season data yet (or week 1): the previous-season value;
    - after week 12, or when there is no previous-season data: the
      current-season value;
    - otherwise: week/12 of the current value plus (12-week)/12 of the
      previous value.

    Returns NaN when neither season has data for the team.
    '''
    current = df.loc[(df['season']==season)&(df['week']<week)&(df[team_col_name]==team)]
    previous = df.loc[(df['season']==season-1)&(df[team_col_name]==team)]

    # Guard the divisions explicitly so a missing season yields NaN instead of 0/0
    previous_denom = previous[denom].sum()
    value2 = previous[num].sum()/previous_denom if previous_denom != 0 else np.nan

    current_denom = current[denom].sum()
    if current_denom == 0:
        return value2
    value = current[num].sum()/current_denom

    if week == 1:
        return value2
    if week > 12 or pd.isna(value2):
        # Either late in the season, or nothing from last season to blend with
        return value
    return week*value/12 + (12-week)*value2/12

def get_qbr(df, season, week, qb_id):
    '''
    Get weighted average of QBR over the last two seasons

    Within a season the QB's qbr_total values are averaged with qb_plays as
    weights; a season with no rows for the QB counts as 50.0. The
    current-season figure (game weeks before `week`) and the previous-season
    figure are then combined: previous season only in week 1, current season
    only after week 12, and week/12 vs (12-week)/12 in between.
    '''
    neutral_qbr = 50.0

    def _play_weighted_qbr(games):
        # Play-weighted mean of qbr_total, or the neutral value without games
        if len(games) > 0:
            weighted = games['qbr_total']*games['qb_plays']/games['qb_plays'].sum()
            return weighted.sum()
        return neutral_qbr

    current_games = df.loc[(df['season']==season)&(df['game_week']<week)&(df['gsis_id']==qb_id)]
    previous_games = df.loc[(df['season']==season-1)&(df['gsis_id']==qb_id)]

    current_qbr = _play_weighted_qbr(current_games)
    previous_qbr = _play_weighted_qbr(previous_games)

    if week == 1:
        return previous_qbr
    if week > 12:
        return current_qbr
    return week*current_qbr/12 + (12-week)*previous_qbr/12

### Data Extraction

In [None]:
# Seasons 1999 through 2022 (the upper bound of range is exclusive)
years = list(range(1999, 2023))

In [None]:
# Weekly stat rows for every season in `years`, loaded through nfl_data_py
weekly_data = nfl.import_weekly_data(years)

In [None]:
# Game schedule rows for every season in `years`; this frame drives the team
# Elo ratings and the modelling table built further down
matchups = nfl.import_schedules(years)

In [None]:
# Quick look at the last schedule rows that were loaded
matchups.tail()

In [None]:
# Schedule for the 2023 season, loaded separately from the 1999-2022 history
upcoming = nfl.import_schedules([2023])

In [None]:
# Weekly QBR rows for seasons 2007 through 2022
qbr_seasons = list(range(2007, 2023))
qbr = nfl.import_qbr(qbr_seasons, level='nfl', frequency='weekly')

In [None]:
# Play-by-play rows for seasons 1999 through 2022
data = nfl.import_pbp_data(list(range(1999, 2023)))

### Feature Engineering

In [None]:
# Dictionary of all relocated teams
# Keys are the old team abbreviations, values the abbreviations they are
# replaced with wherever this mapping is applied below.
replace_dict = {'OAK':'LV','STL':'LA','SD':'LAC'}

In [None]:
# Replace any relocated teams in both team columns of the play-by-play data
for team_col in ('home_team', 'away_team'):
    data[team_col] = data[team_col].replace(replace_dict)

In [None]:
# Split the play-by-play rows by play type.
# pass_att = data.loc[data['play_type']=='pass']
# Pass attempts are selected on `play_type_nfl`; the commented line above is the
# alternative `play_type`-based filter, left disabled.
pass_att = data.loc[data['play_type_nfl']=='PASS']
rush_att = data.loc[data['play_type']=='rush']
# Rows whose `play` flag equals 1.0
plays = data.loc[data['play']==1.0]

In [None]:
# One row per passer and game: `play_id` is counted (the number of pass rows,
# used below as the attempt count) and the other value columns are summed.
# The result is ordered by that count, largest first.
pass_stats = pass_att.pivot_table(index=['passer_id','passer','season','season_type','week','game_id','game_date','posteam','defteam'], values=['play_id','incomplete_pass','yards_gained','interception', \
    'pass_touchdown'], aggfunc={'play_id':'count', 'incomplete_pass': 'sum','yards_gained': 'sum','interception': 'sum','pass_touchdown': 'sum',}).reset_index().sort_values(by='play_id', \
        ascending=False)

In [None]:
# Create passer rating
def _rating_component(raw):
    # Bound a rating component to the range [0, 2.375]; missing values count as 0
    return raw.clip(lower=0, upper=2.375).fillna(0)

# `play_id` holds the number of pass plays per passer and game
attempts = pass_stats['play_id']

# Per-attempt rates
pass_stats['completion_pct'] = 1-(pass_stats['incomplete_pass']+pass_stats['interception'])/attempts
pass_stats['yards_per_att'] = pass_stats['yards_gained']/attempts
pass_stats['td_per_att'] = pass_stats['pass_touchdown']/attempts
pass_stats['int_per_att'] = pass_stats['interception']/attempts

# The four bounded components: completions, yards, touchdowns, interceptions
pass_stats['a'] = _rating_component(5*(pass_stats['completion_pct']-0.3))
pass_stats['b'] = _rating_component(0.25*(pass_stats['yards_per_att']-3))
pass_stats['c'] = _rating_component(20*pass_stats['td_per_att'])
pass_stats['d'] = _rating_component(2.375-(25*pass_stats['int_per_att']))

pass_stats['passer_rating'] = 100*((pass_stats['a']+pass_stats['b']+pass_stats['c']+pass_stats['d'])/6)

In [None]:
# At least 10 pass attempts
# (`play_id` is the per-game count of pass plays produced by the pivot above;
# passer-games below the threshold are dropped from the Elo calculation)
pass_stats = pass_stats.loc[pass_stats['play_id']>=10]

In [None]:
# Put the passer-games in date order so the Elo loop below processes them
# chronologically. reset_index() is called without drop=True, so the previous
# index is kept as an extra 'index' column.
pass_stats = pass_stats.sort_values(by='game_date').reset_index()

In [None]:
# To check for season change
# season_shift holds the season of the previous row; the first row has no
# predecessor and gets 0.0, which the Elo loop treats as "no change".
pass_stats['season_shift'] = pass_stats['season'].shift().fillna(0.0)

In [None]:
# Initial elo rating
initial_elo = 1500

# Create a dictionary to hold current Elo ratings for each player
# (every passer and every defending team found in pass_stats starts at initial_elo)
qb_ratings = {player: initial_elo for player in pass_stats['passer_id'].unique()}
def_ratings = {team: initial_elo for team in pass_stats['defteam'].unique()}

# Create a separate DataFrame to store the updated Elo ratings
elo_qb = pd.DataFrame(qb_ratings.items(), columns=['passer_id', 'Elo'])
# Attach the passer name. A passer_id that occurs with more than one distinct
# `passer` value produces one row per (passer_id, passer) pair.
elo_qb = elo_qb.merge(pass_stats[['passer_id', 'passer']].drop_duplicates(), how='inner', on='passer_id')
elo_def = pd.DataFrame(def_ratings.items(), columns=['Team', 'Elo'])

In [None]:
# Build the historical weekly Elo ratings for each QB and each pass defence.
# Rows are collected in plain lists and turned into DataFrames once at the end;
# growing a DataFrame with pd.concat on every iteration copies the whole
# history each time.
def_history_columns = ['Team', 'Season', 'Week', 'Elo']
qb_history_columns = ['passer_id', 'Passer', 'Season', 'Week', 'Elo']
def_history_records = []
qb_history_records = []

# Median passer rating of every season, computed once instead of once per row
season_median_rating = pass_stats.groupby('season')['passer_rating'].median().to_dict()

# Iterate through the passer-games in date order and update Elo ratings
for index, row in pass_stats.iterrows():

    # A season boundary is a row whose season differs from the previous row's
    # (season_shift is 0.0 only on the very first row)
    if (row['season_shift']!=0.0) & (row['season'] != row['season_shift']):

        # Calculate mean Elo at the end of the season
        mean_elo = calculate_mean_elo(elo_def)

        # Regress each defence's Elo rating towards the mean
        elo_def = regress_to_mean(elo_def, mean_elo, regression_weight=1/3)

        # Store the regressed ratings as "week 0" of the new season
        for team, team_elo in zip(elo_def['Team'], elo_def['Elo']):
            def_history_records.append({'Team': team, 'Season': row['season'], 'Week': 0, 'Elo': team_elo})

    player1 = row['passer_id']
    player2 = row['defteam']
    ## A passer rating above season median is considered a win for the QB
    result = 1 if row['passer_rating'] >= season_median_rating[row['season']] else 0

    player1_elo = elo_qb.loc[elo_qb['passer_id']==player1].reset_index().loc[0,'Elo']
    player2_elo = elo_def.loc[elo_def['Team']==player2].reset_index().loc[0,'Elo']

    player1_new_elo, player2_new_elo = update_elo(player1_elo, player2_elo, result)

    # Update the current ratings
    elo_qb.loc[elo_qb['passer_id'] == player1, 'Elo'] = player1_new_elo
    elo_def.loc[elo_def['Team'] == player2, 'Elo'] = player2_new_elo

    # Record the updated Elo ratings in the history
    qb_history_records.append({'passer_id': player1, 'Passer': row['passer'], 'Season': row['season'], 'Week': row['week'], 'Elo': player1_new_elo})
    def_history_records.append({'Team': player2, 'Season': row['season'], 'Week': row['week'], 'Elo': player2_new_elo})

historical_elo_def = pd.DataFrame(def_history_records, columns=def_history_columns)
historical_elo_qb = pd.DataFrame(qb_history_records, columns=qb_history_columns)

In [None]:
# Create a dictionary to hold current Elo ratings for each team.
# replace_dict is applied while collecting the names: the team columns of
# `matchups` are only rewritten in a later cell, so without it every old
# abbreviation would get its own rating row that no game ever updates.
all_teams = pd.concat([matchups['away_team'], matchups['home_team']]).replace(replace_dict).unique()
elo_ratings = {team: initial_elo for team in all_teams}

# Step 3: Create a separate DataFrame to store the updated Elo ratings
elo_df = pd.DataFrame(list(elo_ratings.items()), columns=['Team', 'Elo'])

In [None]:
# Replace team names
# Applies replace_dict to both team columns of the schedule so each franchise
# is referred to by a single abbreviation in the Elo loop and in `master`.
matchups['away_team'] = matchups['away_team'].replace(replace_dict)
matchups['home_team'] = matchups['home_team'].replace(replace_dict)
matchups.head()

In [None]:
# To check for season change
matchups['season_shift'] = matchups['season'].shift().fillna(0.0)

# Historical weekly Elo ratings for each team. Rows are collected in a list and
# turned into a DataFrame once at the end instead of concatenating per game.
team_history_columns = ['Team', 'Season', 'Week', 'Elo']
team_history_records = []

# Iterate through matchups DataFrame and update Elo ratings
for index, row in matchups.iterrows():
    # A season boundary is a row whose season differs from the previous row's
    # (season_shift is 0.0 only on the very first row)
    if (row['season_shift']!=0.0) & (row['season'] != row['season_shift']):

        # Calculate mean Elo at the end of the season
        mean_elo = calculate_mean_elo(elo_df)

        # Regress each team's Elo ratings towards the mean
        elo_df = regress_to_mean(elo_df, mean_elo, regression_weight=1/3)

        # Store the regressed ratings as "week 0" of the new season
        for team, team_elo in zip(elo_df['Team'], elo_df['Elo']):
            team_history_records.append({'Team': team, 'Season': row['season'], 'Week': 0, 'Elo': team_elo})

    # A game without a recorded result must not move any rating; without this
    # guard a missing value would fall through to the last branch below and be
    # scored as a loss for the away team.
    if pd.isna(row['result']):
        continue

    # `result` is scored from the away team's point of view (player 1):
    # a negative value is an away win, zero a draw, a positive value a home win
    if row['result'] < 0:
        result = 1
    elif row['result'] == 0:
        result = 0.5
    else:
        result = 0

    player1_elo = elo_df.loc[elo_df['Team']==row['away_team']].reset_index().loc[0,'Elo']
    player2_elo = elo_df.loc[elo_df['Team']==row['home_team']].reset_index().loc[0,'Elo']

    player1_new_elo, player2_new_elo = update_elo(player1_elo, player2_elo, result)

    # Update the main Elo DataFrame with the updated Elo ratings
    elo_df.loc[elo_df['Team'] == row['away_team'], 'Elo'] = player1_new_elo
    elo_df.loc[elo_df['Team'] == row['home_team'], 'Elo'] = player2_new_elo

    # Record the updated Elo ratings in the history (away team first)
    team_history_records.append({'Team': row['away_team'], 'Season': row['season'], 'Week': row['week'], 'Elo': player1_new_elo})
    team_history_records.append({'Team': row['home_team'], 'Season': row['season'], 'Week': row['week'], 'Elo': player2_new_elo})

historical_elo_df = pd.DataFrame(team_history_records, columns=team_history_columns)

In [None]:
# Regress each team's Elo ratings towards the mean
# The loops above regress only when a new season starts, so the ratings left
# over after the final season processed have not been regressed yet; this
# applies the same one-third pull to the current defence and team ratings.
elo_def = regress_to_mean(elo_def, calculate_mean_elo(elo_def), regression_weight=1/3)
elo_df = regress_to_mean(elo_df, calculate_mean_elo(elo_df), regression_weight=1/3)

In [None]:
# Replace team names
# Applies replace_dict to the team column of the weekly stats
weekly_data['recent_team'] = weekly_data['recent_team'].replace(replace_dict)
weekly_data.head()

In [None]:
# Pivot table of weekly team metrics
# The listed columns are summed per season, week and team; rows are ordered
# with the latest season and week first.
weekly_sum = weekly_data.pivot_table(index=['season','week','recent_team'], values=['carries','rushing_yards','rushing_epa',\
    'passing_epa','attempts','sacks'], aggfunc='sum').reset_index().sort_values(by=['season','week'], ascending=False)

In [None]:
# Identify if there was a turnover in a drive/ how many points scored
# drive_turnover is 1.0 when the drive result is 'Turnover' or 'Opp touchdown'
data['drive_turnover'] = np.where(data['fixed_drive_result'].isin(['Turnover','Opp touchdown']), 1.0, 0.0)
# drive_points: 6 for 'Touchdown', 3 for 'Field goal', -2 for 'Safety', 0 for any other result
data['drive_points'] = np.where(data['fixed_drive_result']=='Touchdown', 6.0, np.where(data['fixed_drive_result']=='Field goal', 3.0, np.where(data['fixed_drive_result']=='Safety', -2.0, 0.0)))

In [None]:
# Reduce the play-by-play rows to one row per drive: rows without a
# drive_play_count are dropped, then only the first row of each
# (game_id, fixed_drive) pair is kept.
drive_df = data.loc[~data['drive_play_count'].isna()].drop_duplicates(subset=['game_id','fixed_drive'], keep='first')

In [None]:
# Grab yardage, 3rd and 4th down conversions, drive success rates, epa, drive turnover rates, redzone conversion, QBR
# All three tables have one row per season, week, offence (posteam) and
# defence (defteam), ordered with the latest season and week first.
# off_yardage: play-level columns summed over every play-by-play row
off_yardage = data.pivot_table(index=['season','week','posteam','defteam'], values=['yards_gained','play','third_down_converted','third_down_failed','fourth_down_converted','fourth_down_failed',\
    'epa'], aggfunc='sum').reset_index().sort_values(by=['season','week'], ascending=False)
# drive_data: built from the one-row-per-drive frame; fixed_drive is counted
# (number of drives) and the drive_* columns are summed
drive_data = drive_df.pivot_table(index=['season','week','posteam','defteam'], values=['fixed_drive','drive_points','drive_turnover','drive_play_count','drive_first_downs','drive_inside20',\
    'drive_yards_penalized'], aggfunc={'fixed_drive':'count','drive_points': 'sum', 'drive_turnover': 'sum', 'drive_play_count': 'sum', 'drive_first_downs':'sum', 'drive_inside20':'sum',\
        'drive_yards_penalized':'sum'}).reset_index().sort_values(by=['season','week'], ascending=False)
# rz_data: same aggregation restricted to drives with drive_inside20 == 1, so
# the drive_inside20 sum is the number of such drives
rz_data = drive_df.loc[drive_df['drive_inside20']==1].pivot_table(index=['season','week','posteam','defteam'], values=['drive_inside20','drive_points'], aggfunc='sum').reset_index().sort_values(by=[\
    'season','week'], ascending=False)

In [None]:
# Total third and fourth downs
# (converted + failed; used as the denominators of the conversion rates)
off_yardage['third_down_total'] = off_yardage['third_down_converted'] + off_yardage['third_down_failed']
off_yardage['fourth_down_total'] = off_yardage['fourth_down_converted'] + off_yardage['fourth_down_failed']

In [None]:
## RUN WHEN YOU HAVE WIFI
# Player ID lookup table; its espn_id and gsis_id columns are used below to
# attach a gsis_id to every QBR row.
ids = nfl.import_ids()

In [None]:
# Merge in passer id
# Both join keys are cast to float so that they share one dtype
qbr['player_id'] = qbr['player_id'].astype(float)
ids['espn_id'] = ids['espn_id'].astype(float)
# Left join: every QBR row is kept; rows whose player_id has no matching
# espn_id end up with a missing gsis_id
qbr = qbr.merge(ids[['espn_id','gsis_id']], how='left', left_on ='player_id', right_on='espn_id')

In [None]:
# Replace team names
# Applies replace_dict to the team column of the QBR table
qbr['team_abb'] = qbr['team_abb'].replace(replace_dict)
qbr.head()

In [None]:
# grab a main df of all important info
# One row per scheduled game from the 2008 season onwards, restricted to the
# identifier, score, betting-line, venue and starting-QB columns listed here.
master = matchups.loc[matchups['season']>=2008][['game_id','season','week','away_team','away_score','home_team','home_score','result','location','total','away_rest','home_rest','away_moneyline',\
    'home_moneyline','spread_line','total_line','div_game','roof','surface','away_qb_id','home_qb_id','away_qb_name','home_qb_name']]

In [None]:
# Preview of the modelling table before the features are added
master.head()

In [None]:
# Loop through matchups in master and populate cols (features individually)
# Every feature exists twice, as home_<name> and away_<name>.

# Ratio features that are computed once with the team as the offence (posteam)
# and once as the defence (defteam, column suffix '_def'):
# (feature name, source table, numerator column, denominator column)
offense_defense_stats = [
    ('epa_play', off_yardage, 'epa', 'play'),
    ('yds_play', off_yardage, 'yards_gained', 'play'),
    ('3d_conv', off_yardage, 'third_down_converted', 'third_down_total'),
    ('4d_conv', off_yardage, 'fourth_down_converted', 'fourth_down_total'),
    ('1D_drive', drive_data, 'drive_first_downs', 'fixed_drive'),
    ('RZ_drive', drive_data, 'drive_inside20', 'fixed_drive'),
    ('play_drive', drive_data, 'drive_play_count', 'fixed_drive'),
    ('points_drive', drive_data, 'drive_points', 'fixed_drive'),
    ('to_drive', drive_data, 'drive_turnover', 'fixed_drive'),
    ('pen_yds_drive', drive_data, 'drive_yards_penalized', 'fixed_drive'),
    ('points_RZ', rz_data, 'drive_points', 'drive_inside20'),
]

# Full get_value specs: (feature name, source table, team column, numerator, denominator)
rushing_specs = [
    ('rush_ypc', weekly_sum, 'recent_team', 'rushing_yards', 'carries'),
    ('rush_epa_play', weekly_sum, 'recent_team', 'rushing_epa', 'carries'),
]
team_specs = []
for stat_name, stat_table, stat_num, stat_denom in offense_defense_stats:
    team_specs.append((stat_name, stat_table, 'posteam', stat_num, stat_denom))
    team_specs.append((stat_name + '_def', stat_table, 'defteam', stat_num, stat_denom))

# Output columns in their final order: home before away for every feature
feature_names = ['elo', 'pass_elo_off', 'pass_elo_def'] \
    + [spec[0] for spec in rushing_specs] + ['qbr'] + [spec[0] for spec in team_specs]
feature_columns = [f'{side}_{name}' for name in feature_names for side in ('home', 'away')]

# Change dtypes
master['season'] = master['season'].astype(int)
master['week'] = master['week'].astype(int)
historical_elo_df['Season'] = historical_elo_df['Season'].astype(int)
historical_elo_df['Week'] = historical_elo_df['Week'].astype(int)

master = master.reset_index(drop=True)

feature_rows = []
for i,row in master.iterrows():
    season, week = row['season'], row['week']
    sides = (
        ('home', row['home_team'], row['home_qb_id']),
        ('away', row['away_team'], row['away_qb_id']),
    )
    features = {}

    # Elo ratings carried into the game: team, starting QB, pass defence
    for side, team, qb_id in sides:
        features[f'{side}_elo'] = get_elo(team, season, week, historical_elo_df)
    for side, team, qb_id in sides:
        features[f'{side}_pass_elo_off'] = get_qb_elo(qb_id, season, week, historical_elo_qb)
    for side, team, qb_id in sides:
        features[f'{side}_pass_elo_def'] = get_elo(team, season, week, historical_elo_def)

    # Rushing ratios from the weekly team totals
    for name, table, team_col, num, denom in rushing_specs:
        for side, team, qb_id in sides:
            features[f'{side}_{name}'] = get_value(table, season, week, team, team_col, num, denom)

    # QBR of each starting quarterback
    for side, team, qb_id in sides:
        features[f'{side}_qbr'] = get_qbr(qbr, season, week, qb_id)

    # Play-level, drive-level and red-zone ratios, offence and defence
    for name, table, team_col, num, denom in team_specs:
        for side, team, qb_id in sides:
            features[f'{side}_{name}'] = get_value(table, season, week, team, team_col, num, denom)

    feature_rows.append(features)

# Attach all feature columns in one step. Columns left over from an earlier
# run of this cell are dropped first so that re-running does not duplicate them.
feature_frame = pd.DataFrame(feature_rows, index=master.index, columns=feature_columns).astype(float)
master = pd.concat([master.drop(columns=feature_columns, errors='ignore'), feature_frame], axis=1)

In [None]:
# Display the modelling table with the populated feature columns
master

In [None]:
# Create a binary col to indicate win
# 1.0 when result is positive; ties and missing results are labelled 0.0
home_team_won = master['result'] > 0
master['is_home_win'] = home_team_won.astype(float)

In [None]:
# Indicator columns (1.0 / 0.0) describing the venue:
#   is_dome    - game is in dome
#   is_grass   - game is played on natural grass
#   is_neutral - game is played at neutral site
venue_flags = {
    'is_dome': ('roof', 'dome'),
    'is_grass': ('surface', 'grass'),
    'is_neutral': ('location', 'Neutral'),
}
for flag_col, (source_col, flag_value) in venue_flags.items():
    master[flag_col] = (master[source_col] == flag_value).astype(float)

In [None]:
# Display the modelling table including the win label and venue indicators
master

In [None]:
# Write the modelling table to matchups_df.csv in the working directory,
# without the row index
master.to_csv('matchups_df.csv', index=False)

### EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Distribution of the yards-per-play gap, split by game outcome.
# NOTE(review): no earlier cell creates a `yds_play_diff` column on `master`,
# so it is derived here as home minus away yards per play - confirm that this
# is the intended definition.
yds_play_diff = master['home_yds_play'] - master['away_yds_play']

data_loss = master.loc[master['is_home_win']==0.0]
data_win = master.loc[master['is_home_win']==1.0]

# sns.distplot is deprecated; histplot draws the same plain (no KDE) histogram
fig, ax = plt.subplots()
sns.histplot(yds_play_diff.loc[data_loss.index].dropna(), label='Loss', alpha=0.4, ax=ax)
sns.histplot(yds_play_diff.loc[data_win.index].dropna(), label='Win', alpha=0.4, ax=ax)
ax.legend()
ax.set_title('yds_play_diff histogram')
ax.set_ylabel('frequency')
plt.show()

### Modelling

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss, mean_absolute_error, mean_squared_error
from xgboost import XGBClassifier
import shap

In [None]:
def fit_and_score_model(mdl, X_train, X_val, y_train, y_val, type='regressor'):
    '''
    Fit mdl on the training split and score it on the validation split.

    With type='classifier' the return value is
    (accuracy, precision, recall, f1, roc_auc, logloss); ROC AUC uses the
    second column of predict_proba, i.e. binary classification is assumed.
    For any other type the return value is
    (train_score, test_score, mae, mse, rmse), where the two scores come from
    mdl.score. A one-line summary is printed in both cases.
    '''
    mdl.fit(X_train, y_train)
    y_pred = mdl.predict(X_val)

    if type != 'classifier':
        # Regression metrics
        train_score = mdl.score(X_train, y_train)
        test_score = mdl.score(X_val, y_val)
        mae = mean_absolute_error(y_val, y_pred)
        mse = mean_squared_error(y_val, y_pred)
        rmse = np.sqrt(mse)

        print('The train score is {} and the test score is {}'.format(train_score, test_score))
        return train_score, test_score, mae, mse, rmse

    # Classification metrics; the class probabilities feed ROC AUC and log loss
    y_proba = mdl.predict_proba(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    roc_auc = roc_auc_score(y_val, y_proba[:, 1])
    logloss = log_loss(y_val, y_proba)

    print('The precision score is {} and the recall score is {}'.format(precision, recall))
    return accuracy, precision, recall, f1, roc_auc, logloss
    

def get_feature_importances(mdl, X_train):
    """Print and plot SHAP-based feature importances for ``mdl``.

    The importance of a feature is the mean absolute SHAP value over the rows
    of ``X_train``. Features are printed from most to least important as
    ``name: importance`` and a SHAP summary plot is drawn. Returns ``None``.

    Parameters
    ----------
    mdl : fitted model accepted by ``shap.Explainer``.
    X_train : DataFrame whose columns supply the feature names.
    """
    # Per-row, per-feature SHAP contributions on the training data.
    shap_values = shap.Explainer(mdl).shap_values(X_train)
    feature_names = X_train.columns

    # Collapse rows: mean |SHAP| per feature, then rank in descending order.
    mean_abs_shap = np.abs(shap_values).mean(axis=0)
    ranking = mean_abs_shap.argsort()[::-1]

    for position in ranking:
        print(f"{feature_names[position]}: {mean_abs_shap[position]}")

    shap.summary_plot(shap_values, X_train, feature_names=feature_names)

def print_heavily_correlated_features(df, threshold=0.7):
    """Print, per column, how many columns it is strongly correlated with.

    A pair counts as strongly correlated when the absolute correlation exceeds
    ``threshold``. One is subtracted from every count, which discounts the
    column's correlation with itself. Counts are printed in descending order;
    nothing is returned.
    """
    abs_corr = df.corr().abs()
    # Entries at or below the threshold become NaN so count() skips them.
    strong_only = abs_corr.where(abs_corr > threshold)
    partner_counts = strong_only.count().sort_values(ascending=False) - 1
    print(partner_counts)

## Function to discover interacting features to engineer
def discover_interactions(interactiondf, target, function, threshold):
    """Print feature pairs whose ratio or product correlates better with ``target``.

    For every ordered pair of distinct features the engineered column
    ``feature1 / feature2`` (``function == 'divide'``) or
    ``feature1 * feature2`` (any other value) is correlated with ``target``.
    A pair is printed when the absolute correlation, rounded to 3 decimals,

    * beats the better of the two single-feature correlations by more than
      0.02, and
    * is at least ``threshold``.

    Rows where the engineered column or the target is NaN or infinite (missing
    inputs, division by zero) are left out of that pair's correlation. This
    matches the pairwise-complete baseline produced by ``DataFrame.corr`` and
    stops a single missing value from silently hiding a feature.

    Parameters
    ----------
    interactiondf : DataFrame holding the features and the target column.
    target : name of the target column.
    function : ``'divide'`` for ratios, anything else for products.
    threshold : minimum absolute correlation worth reporting.

    Returns
    -------
    None. Findings are printed.
    """
    base_corrs = interactiondf.corr()[target].drop(index=target)
    target_values = interactiondf[target].to_numpy(dtype=float)
    target_ok = np.isfinite(target_values)

    for feature1 in base_corrs.index:
        for feature2 in base_corrs.index:
            if feature2 == feature1:
                continue

            if function == 'divide':
                new_feature = interactiondf[feature1] / interactiondf[feature2]
            else:
                new_feature = interactiondf[feature1] * interactiondf[feature2]

            # Correlate only over rows where both sides are usable numbers.
            candidate = new_feature.to_numpy(dtype=float)
            usable = np.isfinite(candidate) & target_ok
            if usable.sum() < 2:
                continue
            # A constant candidate yields NaN here; it then fails the comparisons below.
            with np.errstate(divide='ignore', invalid='ignore'):
                raw_corr = np.corrcoef(candidate[usable], target_values[usable])[0, 1]

            new_corr = np.abs(np.round(raw_corr, 3))
            corr1 = np.abs(np.round(base_corrs[feature1], 3))
            corr2 = np.abs(np.round(base_corrs[feature2], 3))

            # Require a 0.02 margin so that the improvement is meaningful,
            # and only show correlations that clear the reporting threshold.
            if new_corr > max(corr1, corr2) + 0.02 and new_corr >= threshold:
                print('{} {} combine to get correlation {} compared to {} {}'.format(
                    feature1, feature2, new_corr, corr1, corr2))

def run_randomized_search(X_train, y_train, n_iter=10, feature_constraints=None):
    """Tune an ``XGBClassifier`` with a randomized hyper-parameter search.

    Parameters
    ----------
    X_train, y_train : training features and binary target.
    n_iter : number of parameter settings sampled by the search.
    feature_constraints : optional monotonic-constraint specification passed
        to ``XGBClassifier(monotone_constraints=...)``; ``None`` builds an
        unconstrained model.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` object (5-fold CV, scored on
    ``neg_log_loss``, refit on the best setting, ``random_state=42``).
    """
    # Identity check: `== None` is evaluated element-wise for array-like
    # constraint specs and then fails inside the `if`.
    if feature_constraints is None:
        mdl = XGBClassifier()
    else:
        mdl = XGBClassifier(monotone_constraints=feature_constraints)

    params_to_search = {
        'learning_rate': [0.05, 0.0625, 0.1],
        'gamma': [0, 0.1, 0.2, 0.3],
        'max_depth': [5, 6, 7, 8],
        'n_estimators': [250, 375, 500],
        'reg_lambda': [0, 0.1, 1, 10]
    }

    search = RandomizedSearchCV(mdl, params_to_search, scoring='neg_log_loss', refit=True, cv=5,
                                n_iter=n_iter, random_state=42)
    search.fit(X_train, y_train)

    return search

In [None]:
# Load the matchup-level modelling table from disk.
# NOTE(review): presumably written by an earlier section of this notebook — confirm.
master = pd.read_csv('matchups_df.csv')

In [None]:
# Quick look at the first rows to check the columns loaded as expected.
master.head()

In [None]:
# Model inputs: game-level context columns, then one home_/away_ pair per team
# stat (home first), then the two venue indicators.
features = (
    ['away_rest', 'home_rest', 'spread_line', 'total_line', 'div_game']
    + [
        f'{side}_{stat}'
        for stat in (
            'elo', 'pass_elo_off', 'pass_elo_def',
            'rush_ypc', 'rush_epa_play', 'qbr',
            'epa_play', 'epa_play_def',
            'yds_play', 'yds_play_def',
            '3d_conv', '3d_conv_def',
            '4d_conv', '4d_conv_def',
            '1D_drive', '1D_drive_def',
            'points_drive', 'points_drive_def',
            'to_drive', 'to_drive_def',
            'pen_yds_drive', 'pen_yds_drive_def',
            'points_RZ', 'points_RZ_def',
        )
        for side in ('home', 'away')
    ]
    + ['is_dome', 'is_grass']
)

# Deliberately left out of the feature set:
# ['home_RZ_drive','away_RZ_drive','home_RZ_drive_def','away_RZ_drive_def','home_play_drive','away_play_drive','home_play_drive_def',
#  'away_play_drive_def']

In [None]:
# Seasons before 2020 form the training window; the interaction scan needs the
# model features (alphabetical order) plus the target column.
df_train = master.loc[master['season'] < 2020]
interactiondf = df_train.loc[:, sorted(features) + ['is_home_win']]

In [None]:
# Report feature ratios whose absolute correlation with is_home_win is at
# least 0.2 and beats both underlying features by more than 0.02.
discover_interactions(interactiondf, 'is_home_win', 'divide', 0.2)

In [None]:
# Display the frame that was scanned for interactions (features + is_home_win).
interactiondf

In [None]:
# Split the data into training and testing sets
# Chronological split: seasons before 2020 train the model, 2020 onwards is held out.
df_train = master.loc[master['season'] < 2020]
df_test = master.loc[master['season'] >= 2020]

# Features are taken in alphabetical order so every frame handed to the model
# shares one column order; the target is the home-win indicator.
X_train = df_train.loc[:, sorted(features)]
y_train = df_train['is_home_win']

X_test = df_test.loc[:, sorted(features)]
y_test = df_test['is_home_win']

In [None]:
# For each training feature, count how many other features it correlates with above |0.7|.
print_heavily_correlated_features(X_train, threshold=0.7)

In [None]:
# 'away_RZ_drive' is not part of `features` (it sits in the commented-out list),
# so it is not a column of X_train and X_train.corr().loc['away_RZ_drive', :]
# raises KeyError. Correlate it against the model features from df_train instead.
# NOTE(review): assumes master still carries an 'away_RZ_drive' column — confirm.
inspected_feature = 'away_RZ_drive'
df_train[sorted(set(features) | {inspected_feature})].corr().loc[inspected_feature, :].sort_values()

In [None]:
# Train basic xgboost classifier
mdl = XGBClassifier()
fit_and_score_model(mdl, X_train, X_test, y_train, y_test, type='classifier')

In [None]:
# Define your monotonic constraints as a dictionary
feature_constraints = {}
feature_constraints['home_3d_conv_def'] = -1  # Negative constraint for 'home_3d_conv_def'
feature_constraints['away_3d_conv_def'] = -1  # Negative constraint for 'away_3d_conv_def'

# Create and train your XGBoost model while specifying the monotonic_constraints parameter
mdl = XGBClassifier(
    monotone_constraints=feature_constraints,  # Set the monotonic constraints
    objective='binary:logistic',  # Use 'binary:logistic' for binary classification
)
fit_and_score_model(mdl, X_train, X_test, y_train, y_test, type='classifier')

In [None]:
# Define your monotonic constraints as a dictionary
feature_constraints = {}
feature_constraints['home_3d_conv_def'] = -1  # Negative constraint for home team's chances
feature_constraints['away_3d_conv_def'] = 1  # Positive constraint for home team's chances 
feature_constraints['home_yds_play'] = 1 
feature_constraints['away_yds_play'] = -1  
feature_constraints['home_to_drive_def'] = 1 
feature_constraints['away_to_drive_def'] = -1  
feature_constraints['home_points_RZ_def'] = -1 
feature_constraints['away_points_RZ_def'] = 1  

In [None]:
# Randomized hyper-parameter search (10 sampled settings) under the monotonic
# constraints, then score the refit best estimator on the 2020+ hold-out.
# Note that fit_and_score_model calls mdl.fit again on X_train before scoring.
optimized_mdl = run_randomized_search(X_train, y_train, n_iter=10, feature_constraints=feature_constraints)
mdl = optimized_mdl.best_estimator_
fit_and_score_model(mdl, X_train, X_test, y_train, y_test, type='classifier')

In [None]:
# Scores kept for reference, in the order fit_and_score_model returns them for
# a classifier: (accuracy, precision, recall, f1, roc_auc, logloss).
# NOTE(review): presumably pasted from an earlier run — confirm which model produced them.
'''(0.6372315035799523,
 0.6525612472160356,
 0.6643990929705216,
 0.6584269662921348,
 0.6979386212923456,
 0.6832563014459843)'''

In [None]:
# Rank features by mean |SHAP| on the training data and draw the SHAP summary plot.
get_feature_importances(mdl, X_train)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current model on the hold-out set, drawn as an annotated heatmap.
y_pred = mdl.predict(X_test)
cfm = confusion_matrix(y_true=y_test, y_pred=y_pred)
ax = sns.heatmap(cfm, annot=True)
# confusion_matrix puts true labels on rows and predictions on columns, hence these axis labels.
ax.set(xlabel='Predicted', ylabel='Actual')

In [None]:
# Persist the current model so the prediction section can reload it without retraining.
joblib.dump(mdl,'NFL_2023_game_prediction.jlb')

### Making Predictions

In [None]:
# Reload the persisted model. joblib files are pickle-based, so only load
# files produced by this notebook or another trusted source.
mdl = joblib.load('NFL_2023_game_prediction.jlb')

In [None]:
# Manually imported sheet of starters
starters = pd.read_csv('starting_qbs.csv')

In [None]:
# Check the starters sheet loaded with the expected columns.
starters.head()

In [None]:
# Pull the 2023 schedule through nfl_data_py; this frame is the base for the prediction rows.
upcoming = nfl.import_schedules([2023])

In [None]:
# Inspect the raw schedule columns before the QB columns are replaced.
upcoming.head()

In [None]:
# merge in starting QB
upcoming = upcoming.drop(['away_qb_id','home_qb_id','away_qb_name','home_qb_name'],axis=1)

upcoming = upcoming.merge(starters[['Team','passer','passer_id']], how='left', left_on='home_team', right_on='Team').rename({'passer':\
    'home_qb_name', 'passer_id':'home_qb_id'},axis=1)

upcoming = upcoming.drop(['Team'],axis=1)

upcoming = upcoming.merge(starters[['Team','passer','passer_id']], how='left', left_on='away_team', right_on='Team').rename({'passer':\
    'away_qb_name', 'passer_id':'away_qb_id'},axis=1)

upcoming = upcoming.drop(['Team'],axis=1)

In [None]:
# Confirm the home_qb_* / away_qb_* columns now come from the starters sheet.
upcoming.head()

In [None]:
# grab a main df of all important info
newseason = upcoming[['game_id','season','week','away_team','away_score','home_team','home_score','result','location','total','away_rest','home_rest','away_moneyline',\
    'home_moneyline','spread_line','total_line','div_game','roof','surface','away_qb_id','home_qb_id','away_qb_name','home_qb_name']]

In [None]:
# Check the trimmed prediction frame before features are attached.
newseason.head()

In [None]:
# Display the Elo history frame before the week-0 rows for the new season are appended.
# NOTE(review): historical_elo_df is created outside this section — confirm it exists on a fresh run.
historical_elo_df

In [None]:
# Calculate mean Elo at the end of the season
mean_elo = calculate_mean_elo(elo_df)

# Regress each team's Elo ratings towards the mean
elo_df = regress_to_mean(elo_df, mean_elo, regression_weight=1/3)

elo_df_temp = elo_df.copy()

elo_df_temp['Week'] = 0
elo_df_temp['Season'] = newseason.loc[0,'season']

historical_elo_df = pd.concat([historical_elo_df,elo_df_temp])

In [None]:
# Calculate mean Elo at the end of the season
mean_elo = calculate_mean_elo(elo_def)

# Regress each team's Elo ratings towards the mean
elo_def = regress_to_mean(elo_def, mean_elo, regression_weight=1/3)

elo_def_temp = elo_def.copy()

elo_def_temp['Week'] = 0
elo_def_temp['Season'] = newseason.loc[0,'season']

historical_elo_def = pd.concat([historical_elo_def,elo_def_temp])

In [None]:
# Loop through matchups in master and populate cols (features individually)
newseason.loc[:,'home_elo'] = np.nan
newseason.loc[:,'away_elo'] = np.nan
newseason.loc[:,'home_pass_elo_off'] = np.nan # QB elo Def elo difference
newseason.loc[:,'away_pass_elo_off'] = np.nan # QB elo Def elo difference
newseason.loc[:,'home_pass_elo_def'] = np.nan # QB elo Def elo difference
newseason.loc[:,'away_pass_elo_def'] = np.nan # QB elo Def elo difference
newseason.loc[:,'home_rush_ypc'] = np.nan
newseason.loc[:,'away_rush_ypc'] = np.nan
newseason.loc[:,'home_rush_epa_play'] = np.nan
newseason.loc[:,'away_rush_epa_play'] = np.nan
newseason.loc[:,'home_qbr'] = np.nan
newseason.loc[:,'away_qbr'] = np.nan
newseason.loc[:,'home_epa_play'] = np.nan 
newseason.loc[:,'away_epa_play'] = np.nan 
newseason.loc[:,'home_epa_play_def'] = np.nan
newseason.loc[:,'away_epa_play_def'] = np.nan
newseason.loc[:,'home_yds_play'] = np.nan
newseason.loc[:,'away_yds_play'] = np.nan
newseason.loc[:,'home_yds_play_def'] = np.nan
newseason.loc[:,'away_yds_play_def'] = np.nan
newseason.loc[:,'home_3d_conv'] = np.nan
newseason.loc[:,'away_3d_conv'] = np.nan
newseason.loc[:,'home_3d_conv_def'] = np.nan
newseason.loc[:,'away_3d_conv_def'] = np.nan
newseason.loc[:,'home_4d_conv'] = np.nan
newseason.loc[:,'away_4d_conv'] = np.nan
newseason.loc[:,'home_4d_conv_def'] = np.nan
newseason.loc[:,'away_4d_conv_def'] = np.nan
newseason.loc[:,'home_1D_drive'] = np.nan
newseason.loc[:,'away_1D_drive'] = np.nan
newseason.loc[:,'home_1D_drive_def'] = np.nan
newseason.loc[:,'away_1D_drive_def'] = np.nan
newseason.loc[:,'home_RZ_drive'] = np.nan
newseason.loc[:,'away_RZ_drive'] = np.nan
newseason.loc[:,'home_RZ_drive_def'] = np.nan
newseason.loc[:,'away_RZ_drive_def'] = np.nan
newseason.loc[:,'home_play_drive'] = np.nan
newseason.loc[:,'away_play_drive'] = np.nan
newseason.loc[:,'home_play_drive_def'] = np.nan
newseason.loc[:,'away_play_drive_def'] = np.nan
newseason.loc[:,'home_points_drive'] = np.nan
newseason.loc[:,'away_points_drive'] = np.nan
newseason.loc[:,'home_points_drive_def'] = np.nan
newseason.loc[:,'away_points_drive_def'] = np.nan
newseason.loc[:,'home_to_drive'] = np.nan
newseason.loc[:,'away_to_drive'] = np.nan
newseason.loc[:,'home_to_drive_def'] = np.nan
newseason.loc[:,'away_to_drive_def'] = np.nan
newseason.loc[:,'home_pen_yds_drive'] = np.nan
newseason.loc[:,'away_pen_yds_drive'] = np.nan
newseason.loc[:,'home_pen_yds_drive_def'] = np.nan
newseason.loc[:,'away_pen_yds_drive_def'] = np.nan
newseason.loc[:,'home_points_RZ'] = np.nan
newseason.loc[:,'away_points_RZ'] = np.nan
newseason.loc[:,'home_points_RZ_def'] = np.nan
newseason.loc[:,'away_points_RZ_def'] = np.nan

# Change dtypes
newseason['season'] = newseason['season'].astype(int)
newseason['week'] = newseason['week'].astype(int)
historical_elo_df['Season'] = historical_elo_df['Season'].astype(int)
historical_elo_df['Week'] = historical_elo_df['Week'].astype(int)

newseason = newseason.reset_index(drop=True)

current_week = newseason.loc[0,'week']

# Rushing stats come from weekly_sum keyed on 'recent_team':
# (output column stem, source frame, team-key column, first stat column, second stat column).
# NOTE(review): get_value is defined elsewhere in the notebook; the last two
# arguments look like a numerator / denominator pair — confirm.
_rushing_specs = (
    ('rush_ypc', weekly_sum, 'recent_team', 'rushing_yards', 'carries'),
    ('rush_epa_play', weekly_sum, 'recent_team', 'rushing_epa', 'carries'),
)

# Stats looked up twice per team: keyed on 'posteam' for the plain column and on
# 'defteam' for the "_def" column.
# (output column stem, source frame, first stat column, second stat column)
_two_way_specs = (
    ('epa_play', off_yardage, 'epa', 'play'),
    ('yds_play', off_yardage, 'yards_gained', 'play'),
    ('3d_conv', off_yardage, 'third_down_converted', 'third_down_total'),
    ('4d_conv', off_yardage, 'fourth_down_converted', 'fourth_down_total'),
    ('1D_drive', drive_data, 'drive_first_downs', 'fixed_drive'),
    ('RZ_drive', drive_data, 'drive_inside20', 'fixed_drive'),
    ('play_drive', drive_data, 'drive_play_count', 'fixed_drive'),
    ('points_drive', drive_data, 'drive_points', 'fixed_drive'),
    ('to_drive', drive_data, 'drive_turnover', 'fixed_drive'),
    ('pen_yds_drive', drive_data, 'drive_yards_penalized', 'fixed_drive'),
    ('points_RZ', rz_data, 'drive_points', 'drive_inside20'),
)
_sides = ('home', 'away')

for i, row in newseason.iterrows():
    _season = row['season']

    # Team Elo, starting-QB Elo and pass-defence Elo, all looked up for current_week.
    for _side in _sides:
        newseason.loc[i, f'{_side}_elo'] = get_elo(row[f'{_side}_team'], _season, current_week, historical_elo_df)
    for _side in _sides:
        newseason.loc[i, f'{_side}_pass_elo_off'] = get_qb_elo(row[f'{_side}_qb_id'], _season, current_week, historical_elo_qb)
    for _side in _sides:
        newseason.loc[i, f'{_side}_pass_elo_def'] = get_elo(row[f'{_side}_team'], _season, current_week, historical_elo_def)

    # Rushing stats
    for _stem, _frame, _team_key, _first, _second in _rushing_specs:
        for _side in _sides:
            newseason.loc[i, f'{_side}_{_stem}'] = get_value(
                _frame, _season, current_week, row[f'{_side}_team'], _team_key, _first, _second)

    # Quarterback rating of each side's listed starter
    for _side in _sides:
        newseason.loc[i, f'{_side}_qbr'] = get_qbr(qbr, _season, current_week, row[f'{_side}_qb_id'])

    # Remaining stats: offence lookup first, then the matching "_def" lookup
    for _stem, _frame, _first, _second in _two_way_specs:
        for _team_key, _suffix in (('posteam', ''), ('defteam', '_def')):
            for _side in _sides:
                newseason.loc[i, f'{_side}_{_stem}{_suffix}'] = get_value(
                    _frame, _season, current_week, row[f'{_side}_team'], _team_key, _first, _second)
    

In [None]:
# Create a col if game is in dome
newseason.loc[:,'is_dome'] = np.where(newseason['roof']=='dome', 1.0, 0.0)

# Create a col if game is played on natural grass
newseason.loc[:,'is_grass'] = np.where(newseason['surface']=='grass',1.0,0.0)

# Create a col if game is played at neutral site
newseason.loc[:,'is_neutral'] = np.where(newseason['location']=='Neutral',1.0,0.0)

In [None]:
# Rows labelled 0 through 31 (.loc slices include the end label) with the populated features.
newseason.loc[0:31,:]

In [None]:
# Same columns and column order (sorted(features)) as the X_train used to fit the model.
pred_data = newseason[sorted(features)]

In [None]:
# Class probabilities for every scheduled game; one column per class.
y_proba = mdl.predict_proba(pred_data)

In [None]:
# Column 1 is the probability of the second class; the model was trained on
# is_home_win, so this is presumably P(home win) — confirm via mdl.classes_.
newseason['home_team_win_prob'] = y_proba[:,1]

In [None]:
# Sum of home-win probabilities per home team (an expected number of home wins), ascending.
newseason.groupby(['home_team'])['home_team_win_prob'].sum().reset_index().sort_values(by='home_team_win_prob')

In [None]:
# Rows labelled 0 through 31 again, now including home_team_win_prob.
newseason.loc[0:31,:]

In [None]:
# Write all games with their features and predicted probabilities to disk.
# The DataFrame index is written as the first, unnamed column (to_csv default).
newseason.to_csv('NFL_2023_week1_predictions.csv')