# NFL Game Score Prediction
We use historical NFL game data to predict team performance, taking into account other independent variables such as stadium, weather, and media odds.

The notebook predicts each team's score from variables such as home-team status, weather, and historical performance. To keep the data current, only seasons from 2000 onward are used.



In [None]:
# Importing libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

# Directory that holds the three source CSVs. Point this single constant at
# another folder to run the notebook elsewhere; the default reproduces the
# original root-level paths.
DATA_DIR = Path('/')
# All three files are read with the same Latin-1 encoding.
CSV_ENCODING = 'latin1'

stadiums = pd.read_csv(DATA_DIR / 'nfl_stadiums.csv', encoding=CSV_ENCODING)
teams = pd.read_csv(DATA_DIR / 'nfl_teams.csv', encoding=CSV_ENCODING)
games = pd.read_csv(DATA_DIR / 'spreadspoke_scores.csv', encoding=CSV_ENCODING)

In [None]:
# Peek at the first two stadium records to check which columns were loaded
stadiums.iloc[:2]

Unnamed: 0,stadium_name,stadium_location,stadium_open,stadium_close,stadium_type,stadium_address,stadium_weather_station_code,stadium_weather_type,stadium_capacity,stadium_surface,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION
0,Alamo Dome,"San Antonio, TX",,,indoor,"100 Montana St, San Antonio, TX 78203",78203.0,dome,72000,FieldTurf,,,,,
1,Allegiant Stadium,"Paradise, NV",2020.0,,indoor,,,dome,65000,Grass,,,,,


In [None]:
# Dropping unnecessary features (address, weather-station and geolocation columns).
# The frame is rebound instead of mutated in place, and errors='ignore' skips
# columns that are already gone, so re-running this cell no longer raises KeyError.
stadiums = stadiums.drop(
    columns=[
        'stadium_address',
        'stadium_weather_station_code',
        'STATION',
        'NAME',
        'LATITUDE',
        'LONGITUDE',
        'ELEVATION',
    ],
    errors='ignore',
)

In [None]:
# Parse the schedule dates into real datetimes so they can be compared and sorted
games = games.assign(schedule_date=pd.to_datetime(games['schedule_date']))

In [None]:
# Removing entries that fall prior to 2000.
# A game is kept when it was played after 1 January 2000 and does not belong to
# the 1999 season (the second condition excludes 1999-season games dated in 2000).
# Both conditions live in one mask, and the result is an explicit copy so that
# the column assignments in later cells write to an independent frame rather
# than to a slice of the original.
mask = (games['schedule_date'] > pd.Timestamp('2000-01-01')) & (games['schedule_season'] != 1999)
games = games.loc[mask].copy()
games['schedule_season'].value_counts()

2021    285
2020    269
2012    267
2019    267
2018    267
2017    267
2016    267
2015    267
2014    267
2013    267
2011    267
2010    267
2009    267
2008    267
2007    267
2006    267
2005    267
2004    267
2003    267
2002    267
2001    259
2000    259
Name: schedule_season, dtype: int64

In [None]:
# Dropping extra unnecessary column.
# Rebinding with errors='ignore' keeps the cell re-runnable: a second execution
# is a no-op instead of a KeyError for the already-removed column.
games = games.drop(columns=['over_under_line'], errors='ignore')

In [None]:
# Cast the playoff and neutral-site flags to 0/1 integers, then preview them
flag_columns = ['schedule_playoff', 'stadium_neutral']
games = games.astype({column: int for column in flag_columns})
games[flag_columns].head(2)

Unnamed: 0,schedule_playoff,stadium_neutral
7354,0,0
7355,0,0


In [None]:
# Count the distinct team names that appear as home side and as away side.
# This counts names, not franchises: the id table built in the next cell has
# fewer entries, presumably because relocated or renamed teams appear under
# more than one name.
tuple(games[side].nunique() for side in ('team_home', 'team_away'))

(36, 36)

In [None]:
# Build the id -> name lookup from the teams table, then pin the display name
# for the ids where the table-derived value is not the one wanted here.
name_overrides = {
    'ARI': 'Arizona Cardinals',
    'LAC': 'Los Angeles Chargers',
    'LAR': 'St. Louis Rams',
    'LVR': 'Las Vegas Raiders',
    'NE': 'New England Patriots',
}
team_id = {**dict(zip(teams.team_id, teams.team_name)), **name_overrides}
team_id

{'ARI': 'Arizona Cardinals',
 'ATL': 'Atlanta Falcons',
 'BAL': 'Baltimore Ravens',
 'BUF': 'Buffalo Bills',
 'CAR': 'Carolina Panthers',
 'CHI': 'Chicago Bears',
 'CIN': 'Cincinnati Bengals',
 'CLE': 'Cleveland Browns',
 'DAL': 'Dallas Cowboys',
 'DEN': 'Denver Broncos',
 'DET': 'Detroit Lions',
 'GB': 'Green Bay Packers',
 'HOU': 'Houston Texans',
 'IND': 'Indianapolis Colts',
 'JAX': 'Jacksonville Jaguars',
 'KC': 'Kansas City Chiefs',
 'LAC': 'Los Angeles Chargers',
 'LAR': 'St. Louis Rams',
 'LVR': 'Las Vegas Raiders',
 'MIA': 'Miami Dolphins',
 'MIN': 'Minnesota Vikings',
 'NE': 'New England Patriots',
 'NO': 'New Orleans Saints',
 'NYG': 'New York Giants',
 'NYJ': 'New York Jets',
 'PHI': 'Philadelphia Eagles',
 'PIT': 'Pittsburgh Steelers',
 'SEA': 'Seattle Seahawks',
 'SF': 'San Francisco 49ers',
 'TB': 'Tampa Bay Buccaneers',
 'TEN': 'Tennessee Titans',
 'WAS': 'Washington Football Team'}

In [None]:
# Changing Team ID's to Team Names.
# Only the favourite column is translated; ids without an entry in `team_id`
# (such as 'PICK') are left as they are.
games = games.assign(team_favorite_id=games['team_favorite_id'].replace(team_id))

# Rows marked 'PICK' have no favourite, so they are removed. The explicit copy
# makes the result an independent frame, so the column assignments in the next
# cell do not write to a slice.
games = games[games['team_favorite_id'] != 'PICK'].copy()
games.tail()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail
13227,2022-01-23,2021,Division,1,Kansas City Chiefs,42.0,36.0,Buffalo Bills,Kansas City Chiefs,-2.5,Arrowhead Stadium,0,35.0,6.0,54.0,
13228,2022-01-23,2021,Division,1,Tampa Bay Buccaneers,27.0,30.0,Los Angeles Rams,Tampa Bay Buccaneers,-3.0,Raymond James Stadium,0,50.0,11.0,69.0,
13229,2022-01-30,2021,Conference,1,Kansas City Chiefs,24.0,27.0,Cincinnati Bengals,Kansas City Chiefs,-7.0,Arrowhead Stadium,0,41.0,4.0,41.0,
13230,2022-01-30,2021,Conference,1,Los Angeles Rams,20.0,17.0,San Francisco 49ers,St. Louis Rams,-3.5,SoFi Stadium,0,72.0,0.0,,DOME
13231,2022-02-13,2021,Superbowl,1,Los Angeles Rams,,,Cincinnati Bengals,St. Louis Rams,-4.0,SoFi Stadium,1,72.0,0.0,,DOME


In [None]:
# Creating home win column and representing favorite team as binary classifier
# depending if favorite team is home team or not.
# NOTE: this cell overwrites `team_favorite_id` (a team name) with the 0/1 flag,
# so it must run exactly once after the id -> name replacement cell.

# 1 when the home side outscored the visitors, else 0 (rows without a score give 0).
games['home_win'] = (games['score_home'] > games['score_away']).astype(int)

# A franchise can appear under several names (e.g. "Los Angeles Rams" as home
# team while the favourite reads "St. Louis Rams"), so comparing raw names
# marks those favourites as "not home". Both sides are first reduced to the one
# name `team_id` holds for the franchise.
# Assumes `teams` lists each historical name against its team_id - TODO confirm;
# names missing from the table fall back to themselves, i.e. plain comparison.
name_to_id = dict(zip(teams.team_name, teams.team_id))


def _canonical_team_name(name):
  """Return the single `team_id` name for the franchise behind `name`."""
  return team_id.get(name_to_id.get(name), name)


favorite_known = games['team_favorite_id'].notna()
favorite_is_home = (
    games['team_favorite_id'].map(_canonical_team_name)
    == games['team_home'].map(_canonical_team_name)
).astype(int)

# Games without a recorded favourite stay missing instead of silently becoming 0.
games['team_favorite_id'] = favorite_is_home.where(favorite_known)
games['team_favorite_id'].value_counts()

1    3568
0    2268
Name: team_favorite_id, dtype: int64

In [None]:
# Renumber the rows 0..n-1 now that filtered-out games have left gaps in the index
games = games.reset_index(drop=True)

In [None]:
# Merging the data for stadiums to the games df.
# A left join keeps every game: with the default inner join, any game whose
# `stadium` has no matching `stadium_name` would be dropped silently and so
# vanish from the win-rate statistics computed later. Unmatched games simply
# get missing stadium attributes.
games = games.merge(stadiums, how='left', left_on='stadium', right_on='stadium_name')

# Resorting to be by date
games = games.sort_values('schedule_date')

In [None]:
# For MVP, decided weather features for model are to be dropped
# (together with the stadium open/close/location columns).
# Rebinding with errors='ignore' keeps the cell re-runnable: columns that are
# already gone are skipped instead of raising KeyError.
games = games.drop(
    columns=[
        'stadium_weather_type',
        'weather_wind_mph',
        'weather_humidity',
        'weather_detail',
        'weather_temperature',
        'stadium_open',
        'stadium_location',
        'stadium_close',
    ],
    errors='ignore',
)

In [None]:
# Keep only games with a final score on both sides; rows lacking either score
# (e.g. a fixture that has not been played yet) are removed.
score_columns = ['score_home', 'score_away']
games = games.dropna(axis='index', how='any', subset=score_columns)

In [None]:
def _win_share(frame, team=None):
  """Return the share of games in `frame` that were won, rounded to 2 decimals.

  With `team=None` a win is a home-side win. Otherwise a win is any game that
  `team` won, whether it was the home or the away side. Returns None for an
  empty frame so the caller can skip the statistic instead of dividing by zero.
  """
  if frame.empty:
    return None
  home_won = frame['score_home'] > frame['score_away']
  if team is None:
    won = home_won
  else:
    away_won = frame['score_home'] < frame['score_away']
    won = ((frame['team_home'] == team) & home_won) | ((frame['team_away'] == team) & away_won)
  return round(float(won.mean()), 2)


def calculate_winner(home_team, away_team):
  """Print a heuristic win probability for `home_team` hosting `away_team`.

  Reads the notebook-level `games` frame (columns team_home, team_away,
  score_home, score_away and the 0/1 team_favorite_id flag) and prints:

  * the head-to-head win rate of `home_team` over all meetings of the two teams,
  * its head-to-head win rate when it was the host,
  * the resulting favourite and its probability.

  The base probability is the mean of the available values among: head-to-head
  rate, head-to-head rate at home, and `home_team`'s win rate over all of its
  home games. Two adjustments of +/-0.02 follow: one for which team has the
  better overall win rate, one for which team is more often the favourite in
  its own home games. The result is clamped to [0, 1].

  Statistics with no underlying games are skipped; if none of the three base
  statistics is available a notice is printed and nothing else happens.

  Returns None; the function only prints.
  """
  teams = [home_team, away_team]

  hosted_by_home = games['team_home'] == home_team
  hosted_by_away = games['team_home'] == away_team
  home_visiting = games['team_away'] == home_team
  away_visiting = games['team_away'] == away_team

  # Every meeting of the two teams, at either venue.
  meetings = games.loc[(hosted_by_home & away_visiting) | (hosted_by_away & home_visiting)]

  # Head-to-head rate counts wins of `home_team` itself, not of whichever side hosted.
  h2h = _win_share(meetings, home_team)
  home = _win_share(games.loc[hosted_by_home & away_visiting])
  home_r = _win_share(games.loc[hosted_by_home])

  # Overall win rate of each team across all of its games, computed independently.
  val1 = _win_share(games.loc[hosted_by_home | home_visiting], home_team)
  val2 = _win_share(games.loc[hosted_by_away | away_visiting], away_team)

  # How often each team was the favourite when it hosted (NaN without home games).
  fav1 = games.loc[hosted_by_home, 'team_favorite_id'].mean()
  fav2 = games.loc[hosted_by_away, 'team_favorite_id'].mean()

  if h2h is not None:
    print(f'The head to head win probability for the home team is {h2h}')
  if home is not None:
    print(f'The head to head win probability for when the home team plays at home is {home}')

  components = [value for value in (h2h, home, home_r) if value is not None]
  if not components:
    print(f'Not enough historical games to compare {home_team} and {away_team}.')
    return
  ave = sum(components) / len(components)

  if val1 is not None and val2 is not None:
    if val1 >= val2:
      ave += 0.02
      print('The home team win rate is higher than the away team for all games, therefore an additional 2% probability is added.')
    else:
      ave -= 0.02
      print('The away team win rate is higher than the home team for all games, therefore an additional 2% probability is subtracted.')

  if pd.notna(fav1) and pd.notna(fav2):
    if fav1 > fav2:
      ave += 0.02
      print('The home team favoritism count is higher than the away team for all games, therefore an additional 2% probability is added.')
    elif fav1 < fav2:
      ave -= 0.02
      print('The away team favoritism count is higher than the home team for all games, therefore an additional 2% probability is removed.')

  # The +/-0.02 adjustments can push the value outside the valid range.
  ave = min(max(ave, 0.0), 1.0)

  if ave > 0.5:
    print(f"The favorite is: {teams[0]}")
    print(f'They have a probability of {round(ave, 2)} to win.')
  else:
    print(f"The favorite is: {teams[1]}")
    print(f'They have a probability of {round(1-ave, 2)} to win.')
  

In [None]:
def calculate_score(team, date):
  """Print the predicted home score of `team` on `date`.

  A one-feature linear regression is fitted on the notebook-level `games`
  frame: the feature is the game date as a proleptic Gregorian ordinal and the
  target is `score_home`, using only games where `team` was the home side.
  70% of those games (fixed random_state=42) are used for fitting.

  Parameters:
    team: team name as it appears in games['team_home'].
    date: prediction date as a 'YYYY-MM-DD' string.

  Raises:
    ValueError: if `date` is not in 'YYYY-MM-DD' form, or if `team` has fewer
      than two home games (a 70/30 split needs at least two rows).

  Returns None; the prediction is printed.
  """
  # Imported inside the function, as in the original cell, so scikit-learn is
  # only needed when a score prediction is actually requested.
  import datetime
  from sklearn.linear_model import LinearRegression
  from sklearn.model_selection import train_test_split

  # Parse first so a malformed date fails before any model is fitted.
  target_ordinal = datetime.datetime.strptime(date, "%Y-%m-%d").toordinal()

  home_games = games.loc[games['team_home'] == team, ['schedule_date', 'score_home']]
  if len(home_games) < 2:
    raise ValueError(f"Need at least two home games to fit a model for {team!r}, found {len(home_games)}")

  # Build the feature frame from a derived Series rather than assigning into the
  # filtered slice, which is what triggered pandas' SettingWithCopyWarning.
  X = home_games['schedule_date'].map(datetime.datetime.toordinal).to_frame()
  y = home_games['score_home'].to_numpy()

  # The held-out 30% is currently unused; the split is kept so predictions stay
  # the same as before.
  X_train, _X_holdout, y_train, _y_holdout = train_test_split(X, y, train_size=0.7, random_state=42)

  model = LinearRegression()
  model.fit(X_train, y_train)

  # Predict from a frame with the training column name; a bare nested list makes
  # scikit-learn warn that "X does not have valid feature names".
  target = pd.DataFrame({'schedule_date': [target_ordinal]})
  result = model.predict(target)
  print(f"The {team} are predicted to score {round(result[0],0)} on {date}")

In [None]:
# Example: Buffalo hosting Kansas City
calculate_winner(home_team='Buffalo Bills', away_team='Kansas City Chiefs')

The head to head win probability for the home team is 0.47
The head to head win probability for when the home team plays at home is 0.4
The home team win rate is higher than the away team for all games, therefore an edditional 2% probability is added.
The away team favoritism count is higher than the home team for all games, therefore an edditional 2% probability is removed.
The favorite is: Kansas City Chiefs
They have a probability of 0.53 to win.


In [None]:
# Example: predicted Buffalo home score for a given date
calculate_score(team='Buffalo Bills', date='2022-03-27')

The Buffalo Bills is predicted to score 24.0 on 2022-03-27


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
  "X does not have valid feature names, but"
