In [1]:
import pandas as pd
import numpy as np
import time
from datetime import datetime, timedelta
import requests as r
from http.client import IncompleteRead

In [2]:
#We want to create a model that predicts which team are most likely to beat their betting spread
#So first, let's scrape historical betting spreads from https://www.sportsoddshistory.com/nfl-game-odds/

In [3]:
#I wrote down the "codes" for the url that we're scraping
#I will add both the team and year that the spread is coming from

In [4]:
team_codes_bet = {'BUF':'Buffalo Bills',
                 'MIA':'Miami Dolphins',
                 'NEP':'New England Patriots',
                 'NYJ': 'New York Jets',
                 'DAL': 'Dallas Cowboys',
                 'NYG': 'New York Giants',
                 'PHI': 'Philadelphia Eagles',
                 'WAS': 'Washington Football Team',
                 'BAL': 'Baltimore Ravens',
                 'CIN': 'Cincinnati Bengals',
                 'CLE': 'Cleveland Browns',
                 'PIT': 'Pittsburgh Steelers',
                 'CHI': 'Chicago Bears',
                 'DET': 'Detroit Lions',
                 'GBP': 'Green Bay Packers',
                 'MIN': 'Minnesota Vikings',
                 'HOU': 'Houston Texans',
                 'IND': 'Indianapolis Colts',
                 'JAC': 'Jacksonville Jaguars',
                 'TEN': 'Tennessee Titans',
                 'ATL': 'Atlanta Falcons',
                 'CAR': 'Carolina Panthers',
                 'NOS': 'New Orleans Saints',
                 'TBB': 'Tampa Bay Buccaneers',
                 'DEN': 'Denver Broncos',
                 'KCC': 'Kansas City Chiefs',
                 'LVR': 'Las Vegas Raiders',
                 'LAC': 'Los Angeles Chargers',
                 'ARZ': 'Arizona Cardinals',
                 'LAR': 'Los Angeles Rams',
                 'SFF':  'San Francisco 49ers',
                 'SEA': 'Seattle Seahawks'
                 }

In [5]:
data_raw = pd.DataFrame()

In [6]:
for i in team_codes_bet:  
    try:
        url = f'https://www.sportsoddshistory.com/nfl-game-team/?tm={i}'
        test_blank = pd.DataFrame()
        for num in range(len(pd.read_html(url))):
            if len(pd.read_html(url)[num]) == 18 \
            or len(pd.read_html(url)[num]) == 17:
                raw = pd.read_html(url)[num]
                raw['Team'] = team_codes_bet[i]
                raw['Year'] = raw.iloc[0,2][-4:]
                test_blank = pd.concat([test_blank,raw])
            else:
                pass
        data_raw = pd.concat([data_raw,test_blank])
        #We don't want too many requests too fast, so make the program stop before continuing.
        #Use a randint so that the scraping behavior is less predictable
        time.sleep(np.random.randint(1,4))
    except:
        continue

In [7]:
data = data_raw.copy().reset_index(drop=True)

In [8]:
#Lets check that we have data for 2020-2022

In [9]:
data['Year'].unique()

array(['2022', '2021', '2020'], dtype=object)

In [10]:
#Also, let's confirm that the legnth of the dataset is what we expect
#We should have about 1.6k rows of data

In [11]:
len(data)

1694

In [12]:
#Now we reset the index

In [13]:
data = data.reset_index(drop=True)

In [14]:
#Then we get rid of rows that don't contain actual data
#Because when we scrape, the table will collect the title row as a regular row of data

In [15]:
data = data.drop(data[data[5] == 'Opponent'].index)

In [16]:
#Let's split the spread column and take the 2nd portion since that's the spread we want

In [17]:
data['Betting Spread'] = data[7].apply(lambda x: x.split(' ')[1])

In [18]:
#Eliminate other columns, and rename the remaining coluumns

In [19]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,Team,Year,Betting Spread
0,1,Thu,"Sep 8, 2022",8:20,@,Los Angeles Rams,W 31-10,W -2.5,U 52,,Buffalo Bills,2022,-2.5
1,2,Mon,"Sep 19, 2022",7:15,,Tennessee Titans,W 41-7,W -10,O 47.5,,Buffalo Bills,2022,-10.0
2,3,Sun,"Sep 25, 2022",1:00,@,Miami Dolphins,L 19-21,L -4.5,U 53,,Buffalo Bills,2022,-4.5
3,4,Sun,"Oct 2, 2022",1:00,@,Baltimore Ravens,W 23-20,P -3,U 51,,Buffalo Bills,2022,-3.0
4,5,Sun,"Oct 9, 2022",1:00,,Pittsburgh Steelers,W 38-3,W -14,U 45.5,,Buffalo Bills,2022,-14.0


In [20]:
bet = data[[0, 5, 'Team','Betting Spread', 'Year']]

In [21]:
bet = bet.rename(columns={0:'Week',
                         5:'Opp'})

In [22]:
bet.columns

Index(['Week', 'Opp', 'Team', 'Betting Spread', 'Year'], dtype='object')

In [23]:
#Change the columns to a specific datatype

In [24]:
bet['Week'] = bet['Week'].astype('str')
bet['Opp'] = bet['Opp'].astype('str')
bet['Team'] = bet['Team'].astype('str')
bet['Betting Spread'] = bet['Betting Spread'].astype('str')
bet['Year'] = bet['Year'].astype('str')

In [25]:
bet.dtypes

Week              object
Opp               object
Team              object
Betting Spread    object
Year              object
dtype: object

In [26]:
bet.head()

Unnamed: 0,Week,Opp,Team,Betting Spread,Year
0,1,Los Angeles Rams,Buffalo Bills,-2.5,2022
1,2,Tennessee Titans,Buffalo Bills,-10.0,2022
2,3,Miami Dolphins,Buffalo Bills,-4.5,2022
3,4,Baltimore Ravens,Buffalo Bills,-3.0,2022
4,5,Pittsburgh Steelers,Buffalo Bills,-14.0,2022


In [27]:
#Now we have our betting spread data for each NFL team since 2020

In [28]:
#Now we need to get actual performance data
#We will collect this from Pro Football Focus: 
#Example source: https://www.pro-football-reference.com/teams/crd/2021.htm#games

In [29]:
team_codes = {'crd': 'Arizona Cardinals', 
              'atl': 'Atlanta Falcons', 
              'rav': 'Baltimore Ravens', 
              'buf': 'Buffalo Bills', 
              'car': 'Carolina Panthers', 
              'chi': 'Chicago Bears', 
              'cin': 'Cincinnati Bengals', 
              'cle': 'Cleveland Browns', 
              'dal': 'Dallas Cowboys', 
              'den': 'Denver Broncos', 
              'det': 'Detroit Lions', 
              'gnb': 'Green Bay Packers', 
              'htx': 'Houston Texans', 
              'clt': 'Indianapolis Colts', 
              'jax': 'Jacksonville Jaguars', 
              'kan': 'Kansas City Chiefs', 
              'rai': 'Las Vegas Raiders', 
              'sdg': 'Los Angeles Chargers', 
              'ram': 'Los Angeles Rams', 
              'mia': 'Miami Dolphins', 
              'min': 'Minnesota Vikings', 
              'nwe': 'New England Patriots', 
              'nor': 'New Orleans Saints', 
              'nyg': 'New York Giants',
              'nyj': 'New York Jets', 
              'phi': 'Philadelphia Eagles', 
              'pit': 'Pittsburgh Steelers', 
              'sfo': 'San Francisco 49ers', 
              'sea': 'Seattle Seahawks', 
              'tam': 'Tampa Bay Buccaneers',
              'oti': 'Tennessee Titans', 
              'was': 'Washington Football Team'}

In [30]:
data_team_raw = pd.DataFrame()

In [31]:
for i in team_codes:
    team_blank = pd.DataFrame()
#     for year in ['2022', '2021', '2020']:
    for year in data['Year'].unique():
        url = f'https://www.pro-football-reference.com/teams/{i}/{year}.htm#games'
        test = pd.read_html(url)[1]
        test = test.droplevel(0,axis=1)
        test = test[:18]
        test['Team'] = team_codes[i]
        test['Year'] = year
        team_blank = pd.concat([team_blank,test])
        #We don't want too many requests too fast, so make the program stop before continuing.
        #Use a randint so that the scraping behavior is less predictable
        time.sleep(np.random.randint(1,4))
    data_team_raw = pd.concat([data_team_raw, team_blank])
    time.sleep(np.random.randint(1,4))

In [32]:
data_team = data_team_raw.copy().reset_index(drop=True)

In [33]:
data_team.head()

Unnamed: 0,Week,Day,Date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,OT,Rec,Unnamed: 8_level_1,Opp,...,1stD,TotYd,PassY,RushY,TO,Offense,Defense,Sp. Tms,Team,Year
0,1,Sun,September 11,4:25PM ET,boxscore,L,,0-1,,Kansas City Chiefs,...,33.0,488.0,360.0,128.0,1.0,2.29,-33.41,6.88,Arizona Cardinals,2022
1,2,Sun,September 18,4:25PM ET,boxscore,W,OT,1-1,@,Las Vegas Raiders,...,20.0,324.0,244.0,80.0,1.0,12.0,-4.86,1.75,Arizona Cardinals,2022
2,3,Sun,September 25,4:25PM ET,boxscore,L,,1-2,,Los Angeles Rams,...,15.0,339.0,239.0,100.0,1.0,0.8,-5.31,-5.72,Arizona Cardinals,2022
3,4,Sun,October 2,4:05PM ET,boxscore,W,,2-2,@,Carolina Panthers,...,11.0,220.0,180.0,40.0,3.0,-6.14,19.54,-1.9,Arizona Cardinals,2022
4,5,Sun,October 9,4:25PM ET,boxscore,L,,2-3,,Philadelphia Eagles,...,24.0,357.0,218.0,139.0,,12.33,-10.63,-5.16,Arizona Cardinals,2022


In [34]:
#We'll need to make sure each index is unique in the dataset

In [35]:
data_team.index.is_unique

True

In [36]:
#Since we dropped the multi-level index, some columns have the same name b/c it's referring to offense or defense
#Manually renaming them because identifying by name will change all columns with the same name

In [37]:
data_team.columns = ['Week', 'Day', 'Date', 'Time', 'boxscore',
       'Result', 'OT', 'Rec', 'Home/Away', 'Opp Name', 'Team Score',
       'Opp Score', 'Off1stD', 'OffTotYd', 'OffPassY', 'OffRushY', 'OffTO', 'Def1stD', 'DefTotYd',
       'DefPassY', 'DefRushY', 'DefTO', 'Offense', 'Defense', 'Sp. Tms', 'Team', 'Year']

In [38]:
data_team.columns

Index(['Week', 'Day', 'Date', 'Time', 'boxscore', 'Result', 'OT', 'Rec',
       'Home/Away', 'Opp Name', 'Team Score', 'Opp Score', 'Off1stD',
       'OffTotYd', 'OffPassY', 'OffRushY', 'OffTO', 'Def1stD', 'DefTotYd',
       'DefPassY', 'DefRushY', 'DefTO', 'Offense', 'Defense', 'Sp. Tms',
       'Team', 'Year'],
      dtype='object')

In [45]:
#Lets change the data types for the columns will likely use
#First, change all N/A value to 0 so any int/float columns can be changed as such

In [46]:
data_team = data_team.fillna(0)

In [49]:
#We'll need to drop the any canceled games

In [50]:
data_team = data_team.drop(data_team[data_team['Team Score'] == 'Canceled'].index)

In [51]:
#Also dropping all bye weeks

In [52]:
data_team = data_team.drop(data_team[data_team['Opp Name'] == 'Bye Week'].index)

In [53]:
#Dropping Playoffs

In [54]:
data_team = data_team.drop(data_team[data_team['Date'] == 'Playoffs'].index)

In [55]:
data_team['Week'] = data_team['Week'].astype('int')
data_team['Date'] = data_team['Date'].astype('str')
data_team['Time'] = data_team['Time'].astype('str')
data_team['Result'] = data_team['Result'].astype('str')
data_team['OT'] = data_team['OT'].astype('str')
data_team['Home/Away'] = data_team['Home/Away'].astype('str')
data_team['Team Score'] = data_team['Team Score'].astype('int')
data_team['Opp Score'] = data_team['Opp Score'].astype('int')
data_team['Off1stD'] = data_team['Off1stD'].astype('int')
data_team['OffTotYd'] = data_team['OffTotYd'].astype('int')
data_team['OffPassY'] = data_team['OffPassY'].astype('int')
data_team['OffRushY'] = data_team['OffRushY'].astype('int')
data_team['OffRushY'] = data_team['OffRushY'].astype('int')
data_team['OffTO'] = data_team['OffTO'].astype('int')
data_team['Def1stD'] = data_team['Def1stD'].astype('int')
data_team['DefTotYd'] = data_team['DefTotYd'].astype('int')
data_team['DefPassY'] = data_team['DefPassY'].astype('int')
data_team['DefRushY'] = data_team['DefRushY'].astype('int')
data_team['DefTO'] = data_team['DefTO'].astype('int')
data_team['Team'] = data_team['Team'].astype('str')

In [56]:
#Sort the table by year, team, and week

In [57]:
data_team = data_team.sort_values(by=['Year', 'Team','Week']).reset_index(drop=True)

In [58]:
#Confirm that these numeric columns appear correctly

In [59]:
data_team.iloc[:,10:20].head()

Unnamed: 0,Team Score,Opp Score,Off1stD,OffTotYd,OffPassY,OffRushY,OffTO,Def1stD,DefTotYd,DefPassY
0,24,20,29,404,224,180,1,18,366,243
1,30,15,22,438,278,160,1,19,316,199
2,23,26,28,377,268,109,3,22,322,232
3,21,31,21,262,133,129,1,30,444,276
4,30,10,28,496,369,127,1,21,285,162


In [60]:
# We want the point spread, which we'll calculate as points scored - points allowed

In [61]:
data_team['Scoring Spread'] = data_team['Team Score'] - data_team['Opp Score']

In [62]:
#Home games are counted as 0, away games are counted as 1

In [63]:
data_team['Home/Away'] = data_team['Home/Away'].apply(lambda x: 1 if x == '@' else 0)

In [64]:
#Let's also introduce a team strength score
#We want to take into account the stregnth of the opponent
#We can make a simple one for now that equals the point spread (across all games played)/# of games played

In [65]:
#We'll need to capture the original weeks that these games took place in
#We'll add this back to the strength table before joining to data_team

In [66]:
original_week = data_team['Week'].copy()

In [67]:
#We should only see up to 18 weeks
original_week.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  9, 10, 11, 12, 13, 14, 15, 16, 17,  8,
       18])

In [68]:
strength = data_team.groupby(['Year','Team'])['Scoring Spread'].expanding().sum().reset_index()

In [69]:
strength = strength.drop('level_2', axis=1)

In [70]:
#Let's make sure when a new team appears, the spread resets
strength[14:25]

Unnamed: 0,Year,Team,Scoring Spread
14,2020,Arizona Cardinals,54.0
15,2020,Arizona Cardinals,43.0
16,2020,Atlanta Falcons,-13.0
17,2020,Atlanta Falcons,-14.0
18,2020,Atlanta Falcons,-18.0
19,2020,Atlanta Falcons,-32.0
20,2020,Atlanta Falcons,-39.0
21,2020,Atlanta Falcons,-22.0
22,2020,Atlanta Falcons,-23.0
23,2020,Atlanta Falcons,-15.0


In [72]:
#Now we add back the "week" column
strength['Week'] = original_week

In [73]:
#Now we have week, we make a strength column

In [74]:
strength['Team_Strength'] = round(strength['Scoring Spread'] / strength['Week'],2)

In [75]:
strength.head(20)

Unnamed: 0,Year,Team,Scoring Spread,Week,Team_Strength
0,2020,Arizona Cardinals,4.0,1,4.0
1,2020,Arizona Cardinals,19.0,2,9.5
2,2020,Arizona Cardinals,16.0,3,5.33
3,2020,Arizona Cardinals,6.0,4,1.5
4,2020,Arizona Cardinals,26.0,5,5.2
5,2020,Arizona Cardinals,54.0,6,9.0
6,2020,Arizona Cardinals,57.0,7,8.14
7,2020,Arizona Cardinals,54.0,9,6.0
8,2020,Arizona Cardinals,56.0,10,5.6
9,2020,Arizona Cardinals,49.0,11,4.45


In [76]:
#We now join strength on Year, Team, and Week

data_team = data_team.merge(strength, on=['Year', 'Team', 'Week'], how='left')

In [77]:
data_team = data_team.rename(columns={'Scoring Spread_y':'Rolling_Scoring_Spread',
                                     'Scoring Spread_x':'Team_Scoring_Spread'})

In [78]:
data_team.head()

Unnamed: 0,Week,Day,Date,Time,boxscore,Result,OT,Rec,Home/Away,Opp Name,...,DefRushY,DefTO,Offense,Defense,Sp. Tms,Team,Year,Team_Scoring_Spread,Rolling_Scoring_Spread,Team_Strength
0,1,Sun,September 13,4:25PM ET,boxscore,W,0,1-0,1,San Francisco 49ers,...,123,0,6.46,-3.88,-0.46,Arizona Cardinals,2020,4,4.0,4.0
1,2,Sun,September 20,4:05PM ET,boxscore,W,0,2-0,0,Washington Football Team,...,117,2,8.92,0.18,6.91,Arizona Cardinals,2020,15,19.0,9.5
2,3,Sun,September 27,4:25PM ET,boxscore,L,0,2-1,0,Detroit Lions,...,90,0,8.07,-8.15,-2.46,Arizona Cardinals,2020,-3,16.0,5.33
3,4,Sun,October 4,1:00PM ET,boxscore,L,0,2-2,1,Carolina Panthers,...,168,1,8.53,-23.36,3.47,Arizona Cardinals,2020,-10,6.0,1.5
4,5,Sun,October 11,1:00PM ET,boxscore,W,0,3-2,1,New York Jets,...,123,0,17.2,4.27,-3.61,Arizona Cardinals,2020,20,26.0,5.2


In [79]:
#We will want an opponent strength score as well
#So we will join to opponent name rather than team name

In [80]:
strength.head()

Unnamed: 0,Year,Team,Scoring Spread,Week,Team_Strength
0,2020,Arizona Cardinals,4.0,1,4.0
1,2020,Arizona Cardinals,19.0,2,9.5
2,2020,Arizona Cardinals,16.0,3,5.33
3,2020,Arizona Cardinals,6.0,4,1.5
4,2020,Arizona Cardinals,26.0,5,5.2


In [81]:
data_team = data_team.merge(strength, 
                            left_on=['Year', 'Opp Name', 'Week'], 
                            right_on=['Year', 'Team', 'Week'], 
                            how='left')

In [82]:
data_team.head()

Unnamed: 0,Week,Day,Date,Time,boxscore,Result,OT,Rec,Home/Away,Opp Name,...,Defense,Sp. Tms,Team_x,Year,Team_Scoring_Spread,Rolling_Scoring_Spread,Team_Strength_x,Team_y,Scoring Spread,Team_Strength_y
0,1,Sun,September 13,4:25PM ET,boxscore,W,0,1-0,1,San Francisco 49ers,...,-3.88,-0.46,Arizona Cardinals,2020,4,4.0,4.0,San Francisco 49ers,-4.0,-4.0
1,2,Sun,September 20,4:05PM ET,boxscore,W,0,2-0,0,Washington Football Team,...,0.18,6.91,Arizona Cardinals,2020,15,19.0,9.5,Washington Football Team,-5.0,-2.5
2,3,Sun,September 27,4:25PM ET,boxscore,L,0,2-1,0,Detroit Lions,...,-8.15,-2.46,Arizona Cardinals,2020,-3,16.0,5.33,Detroit Lions,-22.0,-7.33
3,4,Sun,October 4,1:00PM ET,boxscore,L,0,2-2,1,Carolina Panthers,...,-23.36,3.47,Arizona Cardinals,2020,-10,6.0,1.5,Carolina Panthers,-3.0,-0.75
4,5,Sun,October 11,1:00PM ET,boxscore,W,0,3-2,1,New York Jets,...,4.27,-3.61,Arizona Cardinals,2020,20,26.0,5.2,New York Jets,-86.0,-17.2


In [83]:
data_team.columns

Index(['Week', 'Day', 'Date', 'Time', 'boxscore', 'Result', 'OT', 'Rec',
       'Home/Away', 'Opp Name', 'Team Score', 'Opp Score', 'Off1stD',
       'OffTotYd', 'OffPassY', 'OffRushY', 'OffTO', 'Def1stD', 'DefTotYd',
       'DefPassY', 'DefRushY', 'DefTO', 'Offense', 'Defense', 'Sp. Tms',
       'Team_x', 'Year', 'Team_Scoring_Spread', 'Rolling_Scoring_Spread',
       'Team_Strength_x', 'Team_y', 'Scoring Spread', 'Team_Strength_y'],
      dtype='object')

In [84]:
data_team = data_team.rename(columns={'Team_x':'Team',
                                 'Team_Strength_x':'Rolling_Strength',
                                 'Scoring Spread':'Opp_Scoring_Spread',
                                 'Team_Strength_y':'Opp_Rolling_Strength',
                                 })

In [85]:
data_team = data_team.drop('Team_y', axis=1)

In [86]:
data_team.head()

Unnamed: 0,Week,Day,Date,Time,boxscore,Result,OT,Rec,Home/Away,Opp Name,...,Offense,Defense,Sp. Tms,Team,Year,Team_Scoring_Spread,Rolling_Scoring_Spread,Rolling_Strength,Opp_Scoring_Spread,Opp_Rolling_Strength
0,1,Sun,September 13,4:25PM ET,boxscore,W,0,1-0,1,San Francisco 49ers,...,6.46,-3.88,-0.46,Arizona Cardinals,2020,4,4.0,4.0,-4.0,-4.0
1,2,Sun,September 20,4:05PM ET,boxscore,W,0,2-0,0,Washington Football Team,...,8.92,0.18,6.91,Arizona Cardinals,2020,15,19.0,9.5,-5.0,-2.5
2,3,Sun,September 27,4:25PM ET,boxscore,L,0,2-1,0,Detroit Lions,...,8.07,-8.15,-2.46,Arizona Cardinals,2020,-3,16.0,5.33,-22.0,-7.33
3,4,Sun,October 4,1:00PM ET,boxscore,L,0,2-2,1,Carolina Panthers,...,8.53,-23.36,3.47,Arizona Cardinals,2020,-10,6.0,1.5,-3.0,-0.75
4,5,Sun,October 11,1:00PM ET,boxscore,W,0,3-2,1,New York Jets,...,17.2,4.27,-3.61,Arizona Cardinals,2020,20,26.0,5.2,-86.0,-17.2


In [87]:
#We want the model to take into account recent performance
#Select the data we really want for our model
#Isolate the features from the result
#Take into account last 3 games

In [88]:
data_team.columns

Index(['Week', 'Day', 'Date', 'Time', 'boxscore', 'Result', 'OT', 'Rec',
       'Home/Away', 'Opp Name', 'Team Score', 'Opp Score', 'Off1stD',
       'OffTotYd', 'OffPassY', 'OffRushY', 'OffTO', 'Def1stD', 'DefTotYd',
       'DefPassY', 'DefRushY', 'DefTO', 'Offense', 'Defense', 'Sp. Tms',
       'Team', 'Year', 'Team_Scoring_Spread', 'Rolling_Scoring_Spread',
       'Rolling_Strength', 'Opp_Scoring_Spread', 'Opp_Rolling_Strength'],
      dtype='object')

In [89]:
rolling = data_team.groupby(['Year', 'Team'])[['Off1stD', 'OffTotYd', 'OffPassY',
       'OffRushY', 'OffTO', 'Def1stD', 'DefTotYd', 'DefPassY', 'DefRushY',
       'DefTO']].rolling(3).mean().reset_index()

In [90]:
#When we aggregate, we lose the "week"

In [91]:
rolling = rolling.drop('level_2', axis=1)

In [92]:
len(rolling)

1598

In [93]:
#Now we add week back in

In [94]:
rolling['Week'] = original_week

In [95]:
#Double check that when the next team appears in the table, the data resets the "roll"
#We don't want the "rolling avg" to include values from another team 

In [96]:
rolling[14:20]

Unnamed: 0,Year,Team,Off1stD,OffTotYd,OffPassY,OffRushY,OffTO,Def1stD,DefTotYd,DefPassY,DefRushY,DefTO,Week
14,2020,Arizona Cardinals,22.666667,422.0,294.666667,127.333333,1.666667,19.333333,326.333333,185.666667,140.666667,1.333333,16
15,2020,Arizona Cardinals,20.333333,363.333333,273.0,90.333333,2.0,23.0,384.333333,233.0,151.333333,1.0,17
16,2020,Atlanta Falcons,,,,,,,,,,,1
17,2020,Atlanta Falcons,,,,,,,,,,,2
18,2020,Atlanta Falcons,24.0,419.0,309.333333,109.666667,1.0,26.333333,463.333333,350.333333,113.0,1.666667,3
19,2020,Atlanta Falcons,22.666667,359.333333,247.666667,111.666667,0.333333,26.333333,470.0,355.666667,114.333333,1.666667,4


In [97]:
#Now we join the rolling data to our dataframe

data_team = data_team.merge(rolling, on=['Year', 'Team','Week'], how='left')

In [98]:
data_team[(data_team['Team'] == 'Arizona Cardinals') & (data_team['Year'] == '2021')]

Unnamed: 0,Week,Day,Date,Time,boxscore,Result,OT,Rec,Home/Away,Opp Name,...,Off1stD_y,OffTotYd_y,OffPassY_y,OffRushY_y,OffTO_y,Def1stD_y,DefTotYd_y,DefPassY_y,DefRushY_y,DefTO_y
512,1,Sun,September 12,1:00PM ET,boxscore,W,0,1-0,1,Tennessee Titans,...,,,,,,,,,,
513,2,Sun,September 19,4:05PM ET,boxscore,W,0,2-0,0,Minnesota Vikings,...,,,,,,,,,,
514,3,Sun,September 26,1:00PM ET,boxscore,W,0,3-0,1,Jacksonville Jaguars,...,22.0,432.333333,322.333333,110.0,1.333333,19.666667,342.666667,202.0,140.666667,2.333333
515,4,Sun,October 3,4:05PM ET,boxscore,W,0,4-0,1,Los Angeles Rams,...,23.666667,448.666667,312.0,136.666667,1.0,22.0,393.666667,241.333333,152.333333,2.0
516,5,Sun,October 10,4:25PM ET,boxscore,W,0,5-0,0,San Francisco 49ers,...,23.333333,392.0,258.333333,133.666667,0.666667,21.0,366.666667,222.666667,144.0,2.333333
517,6,Sun,October 17,4:05PM ET,boxscore,W,0,6-0,1,Cleveland Browns,...,23.666667,373.666667,222.333333,151.333333,0.333333,19.666667,343.0,227.666667,115.333333,2.0
518,7,Sun,October 24,4:25PM ET,boxscore,W,0,7-0,0,Houston Texans,...,22.666667,351.0,214.333333,136.666667,0.666667,14.333333,262.666667,173.666667,89.0,1.666667
519,8,Thu,October 28,8:20PM ET,boxscore,L,0,7-1,0,Green Bay Packers,...,23.333333,361.0,231.0,130.0,1.333333,16.0,261.666667,173.0,88.666667,1.333333
520,9,Sun,November 7,4:25PM ET,boxscore,W,0,8-1,1,San Francisco 49ers,...,22.666667,389.333333,253.0,136.333333,1.333333,18.0,277.333333,200.0,77.333333,1.333333
521,10,Sun,November 14,4:05PM ET,boxscore,L,0,8-2,0,Carolina Panthers,...,18.333333,313.333333,212.666667,100.666667,1.666667,23.333333,337.666667,219.0,118.666667,1.666667


In [99]:
#Let's change the column names from the rolling table to identify it's a rolling metric

In [100]:
data_team.columns

Index(['Week', 'Day', 'Date', 'Time', 'boxscore', 'Result', 'OT', 'Rec',
       'Home/Away', 'Opp Name', 'Team Score', 'Opp Score', 'Off1stD_x',
       'OffTotYd_x', 'OffPassY_x', 'OffRushY_x', 'OffTO_x', 'Def1stD_x',
       'DefTotYd_x', 'DefPassY_x', 'DefRushY_x', 'DefTO_x', 'Offense',
       'Defense', 'Sp. Tms', 'Team', 'Year', 'Team_Scoring_Spread',
       'Rolling_Scoring_Spread', 'Rolling_Strength', 'Opp_Scoring_Spread',
       'Opp_Rolling_Strength', 'Off1stD_y', 'OffTotYd_y', 'OffPassY_y',
       'OffRushY_y', 'OffTO_y', 'Def1stD_y', 'DefTotYd_y', 'DefPassY_y',
       'DefRushY_y', 'DefTO_y'],
      dtype='object')

In [101]:
data_team = data_team.rename(columns={'Off1stD_y':'Rolling_Off1stD',
                            'OffTotYd_y':'Rolling_OffTotYd', 
                            'OffPassY_y':'Rolling_OffPassY', 
                            'OffRushY_y':'Rolling_OffRushY', 
                            'OffTO_y':'Rolling_OffTO', 
                            'Def1stD_y':'Rolling_Def1stD',
                            'DefTotYd_y':'Rolling_DefTotYd', 
                            'DefPassY_y':'Rolling_DefPassY', 
                            'DefRushY_y':'Rolling_DefRushY', 
                            'DefTO_y':'Rolling_DefTO'})

In [102]:
data_team.head()

Unnamed: 0,Week,Day,Date,Time,boxscore,Result,OT,Rec,Home/Away,Opp Name,...,Rolling_Off1stD,Rolling_OffTotYd,Rolling_OffPassY,Rolling_OffRushY,Rolling_OffTO,Rolling_Def1stD,Rolling_DefTotYd,Rolling_DefPassY,Rolling_DefRushY,Rolling_DefTO
0,1,Sun,September 13,4:25PM ET,boxscore,W,0,1-0,1,San Francisco 49ers,...,,,,,,,,,,
1,2,Sun,September 20,4:05PM ET,boxscore,W,0,2-0,0,Washington Football Team,...,,,,,,,,,,
2,3,Sun,September 27,4:25PM ET,boxscore,L,0,2-1,0,Detroit Lions,...,26.333333,406.333333,256.666667,149.666667,1.666667,19.666667,334.666667,224.666667,110.0,0.666667
3,4,Sun,October 4,1:00PM ET,boxscore,L,0,2-2,1,Carolina Panthers,...,23.666667,359.0,226.333333,132.666667,1.666667,23.666667,360.666667,235.666667,125.0,1.0
4,5,Sun,October 11,1:00PM ET,boxscore,W,0,3-2,1,New York Jets,...,25.666667,378.333333,256.666667,121.666667,1.666667,24.333333,350.333333,223.333333,127.0,0.333333


In [103]:
#We should also include data about the opponent's rolling performance as well

In [104]:
data_team = data_team.merge(rolling, left_on=['Year', 'Opp Name','Week'],
                            right_on=['Year', 'Team','Week'],how='left')

In [105]:
data_team.columns

Index(['Week', 'Day', 'Date', 'Time', 'boxscore', 'Result', 'OT', 'Rec',
       'Home/Away', 'Opp Name', 'Team Score', 'Opp Score', 'Off1stD_x',
       'OffTotYd_x', 'OffPassY_x', 'OffRushY_x', 'OffTO_x', 'Def1stD_x',
       'DefTotYd_x', 'DefPassY_x', 'DefRushY_x', 'DefTO_x', 'Offense',
       'Defense', 'Sp. Tms', 'Team_x', 'Year', 'Team_Scoring_Spread',
       'Rolling_Scoring_Spread', 'Rolling_Strength', 'Opp_Scoring_Spread',
       'Opp_Rolling_Strength', 'Rolling_Off1stD', 'Rolling_OffTotYd',
       'Rolling_OffPassY', 'Rolling_OffRushY', 'Rolling_OffTO',
       'Rolling_Def1stD', 'Rolling_DefTotYd', 'Rolling_DefPassY',
       'Rolling_DefRushY', 'Rolling_DefTO', 'Team_y', 'Off1stD', 'OffTotYd',
       'OffPassY', 'OffRushY', 'OffTO', 'Def1stD', 'DefTotYd', 'DefPassY',
       'DefRushY', 'DefTO'],
      dtype='object')

In [106]:
#Rename columns to identify opponent rolling metrics

data_team = data_team.rename(columns={'Off1stD':'Opp_Rolling_Off1stD', 
                                      'OffTotYd':'Opp_Rolling_OffTotYd', 
                                      'OffPassY':'Opp_Rolling_OffPassY',
                                      'OffRushY':'Opp_Rolling_OffRushY', 
                                      'OffTO':'Opp_Rolling_OffTO',  
                                      'Def1stD':'Opp_Rolling_Deff1stD', 
                                      'DefTotYd':'Opp_Rolling_DefTotYd',
                                      'DefPassY':'Opp_Rolling_DefPassY',
                                      'DefRushY':'Opp_Rolling_DefRushY',
                                      'DefTO':'Opp_Rolling_DefTO',
                                      'Team_x':'Team'})

data_team = data_team.drop('Team_y', axis=1)

In [107]:
#Now it's time to join the betting data with the performance data

In [108]:
#First we need to convert week back to string
#We had it as an int so we could get the strength based on week

In [109]:
data_team['Week'] = data_team['Week'].astype('str')

In [110]:
data_team = data_team.merge(bet, on=['Year', 'Team', 'Week'], how='left')

In [111]:
data_team = data_team.drop('Opp', axis=1)

In [112]:
data_team.head()

Unnamed: 0,Week,Day,Date,Time,boxscore,Result,OT,Rec,Home/Away,Opp Name,...,Opp_Rolling_OffTotYd,Opp_Rolling_OffPassY,Opp_Rolling_OffRushY,Opp_Rolling_OffTO,Opp_Rolling_Off1stD,Opp_Rolling_DefTotYd,Opp_Rolling_DefPassY,Opp_Rolling_DefRushY,Opp_Rolling_DefTO,Betting Spread
0,1,Sun,September 13,4:25PM ET,boxscore,W,0,1-0,1,San Francisco 49ers,...,,,,,,,,,,7.0
1,2,Sun,September 20,4:05PM ET,boxscore,W,0,2-0,0,Washington Football Team,...,,,,,,,,,,-7.5
2,3,Sun,September 27,4:25PM ET,boxscore,L,0,2-1,0,Detroit Lions,...,351.666667,246.0,105.666667,0.666667,26.666667,409.333333,237.0,172.333333,1.0,-4.5
3,4,Sun,October 4,1:00PM ET,boxscore,L,0,2-2,1,Carolina Panthers,...,391.0,279.0,112.0,1.666667,22.333333,345.666667,223.0,122.666667,2.333333,-3.0
4,5,Sun,October 11,1:00PM ET,boxscore,W,0,3-2,1,New York Jets,...,288.666667,168.333333,120.333333,1.0,23.333333,402.666667,281.666667,121.0,1.333333,-7.0


In [113]:
#Now let's get the weather information
#I want this weather to be based on the date, time (nearest hour), and location of the stadium

In [114]:
data_team['Datetime'] = pd.to_datetime(data_team['Year']+' '+data_team['Date']+' '+data_team['Time'].str[:-5],format='%Y %B %d %H:%M')


In [115]:
data_team['Datetime'] = np.where((data_team['Time'].str[-5:-3] == 'PM') \
                                 & (data_team['Time'].str[:2] != '12'), 
                                 data_team['Datetime'] + timedelta(hours=12),
                                 data_team['Datetime'])



In [116]:
data_team['Date'] = pd.to_datetime(data_team['Year']+' '+data_team['Date'],format='%Y %B %d').dt.date

In [117]:
data_team['Time'] = data_team['Datetime'].dt.time

In [118]:
data_team['Time_Rounded'] = data_team['Datetime'].round('H')

In [120]:
# We'll need a string/modified time column that the weather API can recognize

In [119]:
data_team['Time_Rounded'] = data_team['Time_Rounded'].apply(lambda x: str(x).replace(' ', 'T')[:-3])

In [122]:
#NFL has international games, which need to be accounted for
#Both for determining home/away, as well as getting accurate weather information

In [123]:
#We should also denote which team is the home team, so that we can join stadium data
#For international games, I consider this an away for both teams

In [124]:
data_team.head()

Unnamed: 0,Week,Day,Date,Time,boxscore,Result,OT,Rec,Home/Away,Opp Name,...,Opp_Rolling_OffRushY,Opp_Rolling_OffTO,Opp_Rolling_Off1stD,Opp_Rolling_DefTotYd,Opp_Rolling_DefPassY,Opp_Rolling_DefRushY,Opp_Rolling_DefTO,Betting Spread,Datetime,Time_Rounded
0,1,Sun,2020-09-13,16:25:00,boxscore,W,0,1-0,1,San Francisco 49ers,...,,,,,,,,7.0,2020-09-13 16:25:00,2020-09-13T16:00
1,2,Sun,2020-09-20,16:05:00,boxscore,W,0,2-0,0,Washington Football Team,...,,,,,,,,-7.5,2020-09-20 16:05:00,2020-09-20T16:00
2,3,Sun,2020-09-27,16:25:00,boxscore,L,0,2-1,0,Detroit Lions,...,105.666667,0.666667,26.666667,409.333333,237.0,172.333333,1.0,-4.5,2020-09-27 16:25:00,2020-09-27T16:00
3,4,Sun,2020-10-04,13:00:00,boxscore,L,0,2-2,1,Carolina Panthers,...,112.0,1.666667,22.333333,345.666667,223.0,122.666667,2.333333,-3.0,2020-10-04 13:00:00,2020-10-04T13:00
4,5,Sun,2020-10-11,13:00:00,boxscore,W,0,3-2,1,New York Jets,...,120.333333,1.0,23.333333,402.666667,281.666667,121.0,1.333333,-7.0,2020-10-11 13:00:00,2020-10-11T13:00


In [125]:
url = 'https://en.wikipedia.org/wiki/NFL_International_Series'

In [126]:
#Lets get the London Games
london = pd.read_html(url)[1]
london['City'] = 'London'
london['State'] = 'England'
london['Designated home team'] = london['Designated home team'].astype('str')
london['Designated visitor'] = london['Designated visitor'].astype('str')
london['Year'] = london['Year'].str.replace(r'\[\d+\]', '', regex=True)
london['Designated home team'] = london['Designated home team'].str.replace(r'\[\d+\]', '', regex=True)
london['Designated visitor'] = london['Designated visitor'].str.replace(r'\[\d+\]', '', regex=True)
london['Datetime'] = pd.to_datetime(london['Year']+' '+london['Date'])
london['Datetime'] = london['Datetime'].astype('str')

In [127]:
#Mexico Games
mx = pd.read_html(url)[2]
#City is defined in this table
mx['State'] = 'Mexico'
mx['Year'] = mx['Year'].astype('str')
mx['Designated home team'] = mx['Designated home team'].astype('str')
mx['Designated visitor'] = mx['Designated visitor'].astype('str')
mx['Year'] = mx['Year'].str.replace(r'\[\d+\]', '', regex=True)
mx['Designated home team'] = mx['Designated home team'].str.replace(r'\[\d+\]', '', regex=True)
mx['Designated visitor'] = mx['Designated visitor'].str.replace(r'\[\d+\]', '', regex=True)
mx['Datetime'] = pd.to_datetime(mx['Year']+' '+mx['Date'])
mx['Datetime'] = mx['Datetime'].astype('str')

In [128]:
#Germany Games
ger = pd.read_html(url)[3]
#City is defined in this table
ger['State'] = 'Germany'
ger['Year'] = ger['Year'].astype('str')
ger['Designated home team'] = ger['Designated home team'].astype('str')
ger['Designated visitor'] = ger['Designated visitor'].astype('str')
ger['Year'] = ger['Year'].str.replace(r'\[\d+\]', '', regex=True)
ger['Designated home team'] = ger['Designated home team'].str.replace(r'\[\d+\]', '', regex=True)
ger['Designated visitor'] = ger['Designated visitor'].str.replace(r'\[\d+\]', '', regex=True)
ger['Datetime'] = pd.to_datetime(ger['Year']+' '+ger['Date'])
ger['Datetime'] = ger['Datetime'].astype('str')

In [129]:
intl = pd.concat([london, mx, ger])

In [130]:
intl['is_intl'] = 1

In [131]:
intl.head()

Unnamed: 0,Year,Date,Designated visitor,Score,Designated home team,Score.1,Stadium,Attendance,City,State,Datetime,is_intl
0,2007,October 28,New York Giants,13,Miami Dolphins,10,Wembley Stadium,81176,London,England,2007-10-28,1
1,2008,October 26,San Diego Chargers,32,New Orleans Saints,37,Wembley Stadium,83226,London,England,2008-10-26,1
2,2009,October 25,New England Patriots,35,Tampa Bay Buccaneers,7,Wembley Stadium,84254,London,England,2009-10-25,1
3,2010,October 31,Denver Broncos,16,San Francisco 49ers,24,Wembley Stadium,83941,London,England,2010-10-31,1
4,2011,October 23,Chicago Bears,24,Tampa Bay Buccaneers,18,Wembley Stadium,76981,London,England,2011-10-23,1


In [132]:
data_team['Home_Team'] = np.where(data_team['Home/Away'] == 0, data_team['Team'], data_team['Opp Name'])
data_team['Away_Team'] = np.where(data_team['Home/Away'] == 1, data_team['Team'], data_team['Opp Name'])

In [133]:
#We will join on year and home Team and away team
#International games still have a "home" and "away"
#However, I'm international games as away for both teams

In [134]:
data_team = data_team.merge(intl[['Year','Designated home team','Designated visitor',
                                  'City', 'State', 'Datetime', 'is_intl']], 
                            left_on=['Year', 'Home_Team', 'Away_Team'],
                            right_on=['Year', 'Designated home team', 'Designated visitor'],
                            how='left')

In [135]:
#Any row that doesn't have is_intl is in the US, so we mark it as 0

data_team['is_intl'] = np.where(data_team['is_intl'].isna(), 0, 1)

In [136]:
data_team = data_team.drop(['Designated home team', 'Designated visitor',
                           'Datetime_y'], axis=1)

In [137]:
data_team.head()

Unnamed: 0,Week,Day,Date,Time,boxscore,Result,OT,Rec,Home/Away,Opp Name,...,Opp_Rolling_DefRushY,Opp_Rolling_DefTO,Betting Spread,Datetime_x,Time_Rounded,Home_Team,Away_Team,City,State,is_intl
0,1,Sun,2020-09-13,16:25:00,boxscore,W,0,1-0,1,San Francisco 49ers,...,,,7.0,2020-09-13 16:25:00,2020-09-13T16:00,San Francisco 49ers,Arizona Cardinals,,,0
1,2,Sun,2020-09-20,16:05:00,boxscore,W,0,2-0,0,Washington Football Team,...,,,-7.5,2020-09-20 16:05:00,2020-09-20T16:00,Arizona Cardinals,Washington Football Team,,,0
2,3,Sun,2020-09-27,16:25:00,boxscore,L,0,2-1,0,Detroit Lions,...,172.333333,1.0,-4.5,2020-09-27 16:25:00,2020-09-27T16:00,Arizona Cardinals,Detroit Lions,,,0
3,4,Sun,2020-10-04,13:00:00,boxscore,L,0,2-2,1,Carolina Panthers,...,122.666667,2.333333,-3.0,2020-10-04 13:00:00,2020-10-04T13:00,Carolina Panthers,Arizona Cardinals,,,0
4,5,Sun,2020-10-11,13:00:00,boxscore,W,0,3-2,1,New York Jets,...,121.0,1.333333,-7.0,2020-10-11 13:00:00,2020-10-11T13:00,New York Jets,Arizona Cardinals,,,0


In [138]:
#Now we have the date/times, so we can scrape for weather data

In [139]:
#First, let's get the stadium info for each team

In [140]:
stadiums = pd.DataFrame(pd.read_html('https://geojango.com/pages/list-of-nfl-teams')[0])

In [141]:
#Washington changed their name, so we need to add their information manually

In [142]:
stadiums.loc[len(stadiums)] = ['Washington Football Team', 'FedExField', 'Landover, Maryland', 82000, 1997]

In [143]:
stadiums['City'] = stadiums['Arena Location'].apply(lambda x: x.split(', ')[0])
stadiums['State'] = stadiums['Arena Location'].apply(lambda x: x.split(', ')[1])

In [144]:
# Checking on Washington

In [145]:
stadiums[stadiums['Team Name'] == 'Washington Football Team']

Unnamed: 0,Team Name,Arena Name,Arena Location,Seating Capacity,Opening Year,City,State
32,Washington Football Team,FedExField,"Landover, Maryland",82000,1997,Landover,Maryland


In [146]:
data_team = data_team.merge(stadiums, left_on='Home_Team', right_on='Team Name')

In [147]:
data_team.columns

Index(['Week', 'Day', 'Date', 'Time', 'boxscore', 'Result', 'OT', 'Rec',
       'Home/Away', 'Opp Name', 'Team Score', 'Opp Score', 'Off1stD_x',
       'OffTotYd_x', 'OffPassY_x', 'OffRushY_x', 'OffTO_x', 'Def1stD_x',
       'DefTotYd_x', 'DefPassY_x', 'DefRushY_x', 'DefTO_x', 'Offense',
       'Defense', 'Sp. Tms', 'Team', 'Year', 'Team_Scoring_Spread',
       'Rolling_Scoring_Spread', 'Rolling_Strength', 'Opp_Scoring_Spread',
       'Opp_Rolling_Strength', 'Rolling_Off1stD', 'Rolling_OffTotYd',
       'Rolling_OffPassY', 'Rolling_OffRushY', 'Rolling_OffTO',
       'Rolling_Def1stD', 'Rolling_DefTotYd', 'Rolling_DefPassY',
       'Rolling_DefRushY', 'Rolling_DefTO', 'Opp_Rolling_Off1stD',
       'Opp_Rolling_OffTotYd', 'Opp_Rolling_OffPassY', 'Opp_Rolling_OffRushY',
       'Opp_Rolling_OffTO', 'Opp_Rolling_Off1stD', 'Opp_Rolling_DefTotYd',
       'Opp_Rolling_DefPassY', 'Opp_Rolling_DefRushY', 'Opp_Rolling_DefTO',
       'Betting Spread', 'Datetime_x', 'Time_Rounded', 'Home_Team',
   

In [148]:
#We'll condense the information from our international table and stadiums table into one

In [149]:
data_team['City'] = np.where(data_team['City_x'].isna(), data_team['City_y'], data_team['City_x'])
data_team['State'] = np.where(data_team['State_x'].isna(), data_team['State_y'], data_team['State_x'])

In [150]:
data_team = data_team.drop(['City_x', 'City_y', 'State_x', 'State_y', 'Team Name', 'Arena Name', 'Arena Location',
               'Seating Capacity', 'Opening Year'], axis=1)

In [151]:
data_team = data_team.rename(columns={'Datetime_x':'Datetime'})

In [152]:
#Lastly, make any international game a away game
#If it's international, or already marked as away, keep it away, otherwise it's a home game

In [153]:
data_team['Home/Away'] = np.where((data_team['is_intl'] == 1) | (data_team['Home/Away'] == 1), 1, 0)

In [154]:
data_team.head()

Unnamed: 0,Week,Day,Date,Time,boxscore,Result,OT,Rec,Home/Away,Opp Name,...,Opp_Rolling_DefRushY,Opp_Rolling_DefTO,Betting Spread,Datetime,Time_Rounded,Home_Team,Away_Team,is_intl,City,State
0,1,Sun,2020-09-13,16:25:00,boxscore,W,0,1-0,1,San Francisco 49ers,...,,,7.0,2020-09-13 16:25:00,2020-09-13T16:00,San Francisco 49ers,Arizona Cardinals,0,Santa Clara,California
1,13,Mon,2020-12-07,20:15:00,boxscore,W,0,9-3,1,San Francisco 49ers,...,107.0,2.333333,1.5,2020-12-07 20:15:00,2020-12-07T20:00,San Francisco 49ers,Buffalo Bills,0,Santa Clara,California
2,9,Thu,2020-11-05,20:20:00,boxscore,W,0,6-2,1,San Francisco 49ers,...,102.0,1.333333,-6.0,2020-11-05 20:20:00,2020-11-05T20:00,San Francisco 49ers,Green Bay Packers,0,Santa Clara,California
3,6,Sun,2020-10-18,20:20:00,boxscore,L,0,4-2,1,San Francisco 49ers,...,100.0,0.666667,-2.5,2020-10-18 20:20:00,2020-10-18T20:00,San Francisco 49ers,Los Angeles Rams,0,Santa Clara,California
4,5,Sun,2020-10-11,16:05:00,boxscore,W,0,2-3,1,San Francisco 49ers,...,84.333333,1.333333,8.5,2020-10-11 16:05:00,2020-10-11T16:00,San Francisco 49ers,Miami Dolphins,0,Santa Clara,California


In [155]:
#To get weather, I'll pass the cities through an API via openweathermap.org
#Free 1000 API calls/day

In [156]:
%store -r api_key

In [157]:
#Once we have the lat/lon for each location, we get the weather via openweathermap api (using an API key from above)
#In the US, some city names are the same across states, so we make the function below to choose the right one

def state_finder(x):
    for i in range(len(x)):
        if x[i]['state'] == state:
            return i

In [158]:
#We'll then need a function that gets the weather at the "rounded" time the game starts

def time_finder(x):
    for i in range(len(x['hourly']['time'])):
        if x['hourly']['time'][i] == time:
            return i

In [159]:
data_weather = []
for city, state, date, time, team, year, week, is_intl in zip(data_team['City'], 
                                   data_team['State'],
                                   data_team['Date'],
                                   data_team['Time_Rounded'],
                                   data_team['Team'],
                                   data_team['Year'],
                                   data_team['Week'],
                                   data_team['is_intl']):
    try:
        coordinates = r.get(f'http://api.openweathermap.org/geo/1.0/direct?q={city}, {state}&limit=10&appid={api_key}').json()
        if is_intl == 0:
            index = state_finder(coordinates)
            lat = coordinates[index]['lat']
            lon = coordinates[index]['lon']
            weather = r.get(f'https://archive-api.open-meteo.com/v1/archive?latitude={lat}&longitude={lon}&start_date={date}&end_date={date}&hourly=temperature_2m,weathercode,windspeed_10m&temperature_unit=fahrenheit&windspeed_unit=mph&timezone=America%2FNew_York').json()
            index = time_finder(weather)
            temp = weather['hourly']['temperature_2m'][index]
            weather_code = weather['hourly']['weathercode'][index]
            windspeed = weather['hourly']['windspeed_10m'][index]
            data_weather.append([team, year, week, city, state, temp, weather_code, windspeed])
        else:
            index = 0
            lat = coordinates[index]['lat']
            lon = coordinates[index]['lon']
            weather = r.get(f'https://archive-api.open-meteo.com/v1/archive?latitude={lat}&longitude={lon}&start_date={date}&end_date={date}&hourly=temperature_2m,weathercode,windspeed_10m&temperature_unit=fahrenheit&windspeed_unit=mph&timezone=America%2FNew_York').json()
            index = time_finder(weather)
            temp = weather['hourly']['temperature_2m'][index]
            weather_code = weather['hourly']['weathercode'][index]
            windspeed = weather['hourly']['windspeed_10m'][index]
            data_weather.append([team, year, week, city, state, temp, weather_code, windspeed])
    except Exception as e:
#         print(e)
#         break
        continue

In [160]:
len(data_weather)

1598

In [161]:
weather = pd.DataFrame(data_weather, columns = ['Team', 'Year', 'Week', 'City', 'State', 'Temp', 'Weather Code', 'Windspeed'])





In [162]:
#We pulled in weather code, which we will breakout
#https://www.nodc.noaa.gov/archive/arc0021/0002199/1.1/data/0-data/HTML/WMO-CODE/WMO4677.HTM

In [163]:
#These codes indicate the type of weather being experienced
#So we need a binary 0/1 column for each weather type

In [164]:
weather_type = pd.get_dummies(weather['Weather Code'])

In [165]:
weather = pd.concat([weather, weather_type], axis=1)

In [166]:
#Let's join the weather data to team performance data

In [167]:
data_team = data_team.merge(weather, on=['Year', 'Team', 'Week'], how='left')

In [168]:
#Make sure we have no games w/o weather data
data_team[data_team['Temp'].isna()].head()

Unnamed: 0,Week,Day,Date,Time,boxscore,Result,OT,Rec,Home/Away,Opp Name,...,2,3,51,53,55,61,63,71,73,75


In [169]:
data_team.columns

Index([                  'Week',                    'Day',
                         'Date',                   'Time',
                     'boxscore',                 'Result',
                           'OT',                    'Rec',
                    'Home/Away',               'Opp Name',
                   'Team Score',              'Opp Score',
                    'Off1stD_x',             'OffTotYd_x',
                   'OffPassY_x',             'OffRushY_x',
                      'OffTO_x',              'Def1stD_x',
                   'DefTotYd_x',             'DefPassY_x',
                   'DefRushY_x',                'DefTO_x',
                      'Offense',                'Defense',
                      'Sp. Tms',                   'Team',
                         'Year',    'Team_Scoring_Spread',
       'Rolling_Scoring_Spread',       'Rolling_Strength',
           'Opp_Scoring_Spread',   'Opp_Rolling_Strength',
              'Rolling_Off1stD',       'Rolling_OffTotYd

In [170]:
data_team = data_team.drop(['City_y', 'State_y'], axis=1)
data_team = data_team.rename(columns={'City_x':'City',
                                     'State_x':'State'})

In [171]:
#Get week back to an int
#This way when we sort by week, it goes 1-2-3, not 1-10-11

data_team['Week'] = data_team['Week'].astype('int')

In [172]:
#Making sure our rolling data is still skipping the first two weeks

data_team.sort_values(by=['Year', 'Team', 'Week']).iloc[14:20,30:45]

Unnamed: 0,Opp_Scoring_Spread,Opp_Rolling_Strength,Rolling_Off1stD,Rolling_OffTotYd,Rolling_OffPassY,Rolling_OffRushY,Rolling_OffTO,Rolling_Def1stD,Rolling_DefTotYd,Rolling_DefPassY,Rolling_DefRushY,Rolling_DefTO,Opp_Rolling_Off1stD,Opp_Rolling_OffTotYd,Opp_Rolling_OffPassY
57,-11.0,-0.69,22.666667,422.0,294.666667,127.333333,1.666667,19.333333,326.333333,185.666667,140.666667,1.333333,23.666667,400.0,238.333333
400,76.0,4.47,20.333333,363.333333,273.0,90.333333,2.0,23.0,384.333333,233.0,151.333333,1.0,19.0,323.333333,209.333333
450,13.0,13.0,,,,,,,,,,,,,
201,-2.0,-1.0,,,,,,,,,,,,,
451,12.0,4.0,24.0,419.0,309.333333,109.666667,1.0,26.333333,463.333333,350.333333,113.0,1.666667,22.666667,368.0,230.0
500,51.0,12.75,22.666667,359.333333,247.666667,111.666667,0.333333,26.333333,470.0,355.666667,114.333333,1.666667,24.0,420.0,271.666667


In [173]:
#Some games are "pick-em" meaning the sportsbooks didnt pick a favorite
#I consider this as if the spread is 0 because either side could win

In [174]:
data_team['Betting Spread'] = np.where(data_team['Betting Spread'] == 'PK', 0, data_team['Betting Spread'])

In [175]:
data_team['Betting Spread'] = data_team['Betting Spread'].astype('float')

In [177]:
#The regression will solve for how well the team covers the spread, which is the betting spread + scoring spread
#If a team is -7 (expected to win by 7) and wins by 6, the "cover" is -7+6 or -1, meaning they didn't cover the spread
#Initially I want to see how well a team covers the spread
#If a team is expected to cover the spread by a lot of points, I'm more willing to bet that they'll cover the spread
#I will also use this data to view a binary 0/1 as to whether or not a team will cover

In [178]:
data_team['Cover'] = data_team['Betting Spread'] + data_team['Team_Scoring_Spread']

In [179]:
data_team.columns

Index([                  'Week',                    'Day',
                         'Date',                   'Time',
                     'boxscore',                 'Result',
                           'OT',                    'Rec',
                    'Home/Away',               'Opp Name',
                   'Team Score',              'Opp Score',
                    'Off1stD_x',             'OffTotYd_x',
                   'OffPassY_x',             'OffRushY_x',
                      'OffTO_x',              'Def1stD_x',
                   'DefTotYd_x',             'DefPassY_x',
                   'DefRushY_x',                'DefTO_x',
                      'Offense',                'Defense',
                      'Sp. Tms',                   'Team',
                         'Year',    'Team_Scoring_Spread',
       'Rolling_Scoring_Spread',       'Rolling_Strength',
           'Opp_Scoring_Spread',   'Opp_Rolling_Strength',
              'Rolling_Off1stD',       'Rolling_OffTotYd

In [180]:
#We will save this as a csv, so in the future when we run our models, we don't have to re-scrape everything

In [181]:
data_team.to_csv('/Users/SwagMawi/Documents/GitHub/projects/PredictingNFLGames/training_data_2023.csv')

In [None]:
#Ideas for 