# Dealing with Data — Team E

## Football and Weather Analysis from the 2017 NFL Season

# Setup and Preliminaries

Install extra dependencies, import requirements, and set up plotting

In [0]:
# install dependencies for matplotlib Basemap
# Both commands below are IPython shell escapes (`!`), so they run in the notebook host's shell.

# system package installed ahead of the Basemap build (presumably the GEOS headers Basemap compiles against)
!apt-get install libgeos-dev
# installs Basemap from the current tip of the upstream master branch (no version pin)
!pip install https://github.com/matplotlib/basemap/archive/master.zip

In [0]:
# Render our plots inline
%matplotlib inline
# ask the inline backend for 'retina' (high-resolution) figure output
%config InlineBackend.figure_format = 'retina'

# Import all requirements
import os
from datetime import datetime, timedelta

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from google.colab import files

# Make the graphs a bit bigger
# NOTE(review): 'seaborn-notebook' is applied after 'seaborn-talk', so any settings the two
# styles share are presumably taken from 'seaborn-notebook' — confirm this is the intended sizing.
matplotlib.style.use(['seaborn-talk', 'seaborn-ticks', 'seaborn-whitegrid'])
plt.style.use('seaborn-notebook')

# Download and load our data

Download our 2017 NFL season datasets from GitHub repositories, load them into dataframes, and clean them up where necessary

In [0]:
# --- NFL play by play dataset; 'Date' is parsed from ISO (YYYY-MM-DD) strings ---
nfl_pbp_2017 = pd.read_csv(
    'https://raw.githubusercontent.com/mstefaniak10/dwd_nfl/master/pbp_2017.csv',
    low_memory=False,
)
nfl_pbp_2017['Date'] = pd.to_datetime(nfl_pbp_2017['Date'], format='%Y-%m-%d')

# --- NFL game outcomes dataset with weather data and attendance ---
# Here 'Date' is stored as M/D/YY, and temperature is coerced to whole numbers.
nfl_games_2017 = pd.read_csv(
    'https://raw.githubusercontent.com/mstefaniak10/dwd_nfl/master/nfl_games_2017.csv',
    low_memory=False,
)
nfl_games_2017['Date'] = pd.to_datetime(nfl_games_2017['Date'], format='%m/%d/%y')
nfl_games_2017['temperature'] = nfl_games_2017['temperature'].astype(int)

# --- NFL stadiums dataset ---
nfl_stadiums = pd.read_csv(
    'https://raw.githubusercontent.com/mstefaniak10/dwd_nfl/master/nfl_stadiums.csv',
    low_memory=False,
)

# For a lot of this analysis it makes sense to drop the home games of teams that play indoors:
# four fixed-roof domes (Lions, Falcons, Saints & Vikings) plus four retractable roofs
# (Cowboys, Texans, Colts & Cardinals).
dome_home_teams = ['ARI', 'IND', 'DAL', 'ATL', 'DET', 'NO', 'HOU', 'MIN']
played_in_dome = nfl_games_2017['home'].isin(dome_home_teams)
nfl_outside_games_2017 = nfl_games_2017[~played_in_dome].copy()

Create bins of data for certain weather attributes

In [0]:
# Bucket the continuous weather attributes of each outdoor game into labelled ranges.
#
# include_lowest=True closes the first bin on the left, so a value equal to the lowest edge
# (for example a precip_intensity or wind_gust of exactly 0) lands in the first bucket instead
# of becoming NaN. Values outside the outermost edges are still left unbucketed (NaN).

# temperature bucket
temp_bucket_labels = ['Freezing (-5 to 32)', 'Cold (33 to 55)', 'Mild (56 to 80)', 'Hot (81 to 100)']
nfl_outside_games_2017['temp_bucket'] = pd.cut(
    nfl_outside_games_2017['temperature'],
    bins=[-5, 32, 55, 80, 100],
    labels=temp_bucket_labels,
    include_lowest=True,
)

# wind gust bucket
wind_gust_bucket_labels = ['Low Wind', 'Moderate Wind', 'High Wind', 'Very High Wind']
nfl_outside_games_2017['wind_gust_bucket'] = pd.cut(
    nfl_outside_games_2017['wind_gust'],
    bins=[0, 3, 6, 10, 20],
    labels=wind_gust_bucket_labels,
    include_lowest=True,
)

# pressure bucket
pressure_bucket_labels = ['Low Pressure', 'Moderate Pressure', 'High Pressure']
nfl_outside_games_2017['pressure_bucket'] = pd.cut(
    nfl_outside_games_2017['pressure'],
    bins=[1005, 1013, 1021, 1030],
    labels=pressure_bucket_labels,
    include_lowest=True,
)

# precipitation bucket
precip_bucket_labels = ['Low Precipitation (0 to .01)', 'High Precipitation (.01 to .02)']
nfl_outside_games_2017['precip_bucket'] = pd.cut(
    nfl_outside_games_2017['precip_intensity'],
    bins=[0, 0.01, 0.02],
    labels=precip_bucket_labels,
    include_lowest=True,
)

Merge games and play by play to add weather data

In [0]:
# Inner join on GameID: keeps only the plays whose game appears in the outdoor-games frame and
# attaches that game's columns (weather attributes and the buckets created above) to every play.
outdoor_data_nfl = pd.merge(nfl_pbp_2017, nfl_outside_games_2017, on='GameID', how='inner')

# Disabled debugging aid: print the column dtypes of each source frame.
# print('Play by play \n\n%s \n\n' % nfl_pbp_2017.dtypes)
# print('Game outcomes \n\n%s \n\n' % nfl_games_2017.dtypes)
# print('Stadiums \n\n%s' % nfl_stadiums.dtypes)

# What weather attribute has the greatest effect on total points scored per game?

Run a linear regression using different weather attributes as features (realizing this is not perfect as most weather attributes are not truly independent variables)

In [0]:
# Fit a linear regression of total points per game on the weather attributes.
# X = feature matrix (weather attributes), y = regression target (total score).

from sklearn import linear_model, model_selection
from sklearn.metrics import mean_squared_error, r2_score

# add column for total score
nfl_outside_games_2017['total_score'] = nfl_outside_games_2017['homescore'] + nfl_outside_games_2017['awayscore']

nfl_data = nfl_outside_games_2017[['precip_intensity', 'temperature', 'dew_point', 'humidity', 'pressure', 'wind_speed', 'visibility', 'total_score']]

X = nfl_data.drop('total_score', axis=1)

# split into training/test data (fixed random_state so the split is reproducible)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, nfl_data['total_score'], test_size=0.33, random_state=5)

regr = linear_model.LinearRegression()

# use linear regression to fit the data
regr.fit(X_train, y_train)

# NOTE: the features are fitted on their raw scales, so each coefficient is "points per one
# unit of that feature". Coefficient sizes therefore also reflect each feature's units and are
# not directly comparable as a measure of importance.
df_coefficients = pd.DataFrame(list(zip(X.columns, regr.coef_)), columns = ['features', 'estimatedCoefficients'])
print(df_coefficients)
coef_plot = df_coefficients.set_index('features').sort_values('estimatedCoefficients', ascending=False).plot(kind='bar', figsize=(15,10))
coef_plot.set_xlabel('Weather feature')
# these are fitted regression coefficients, not correlation coefficients
coef_plot.set_ylabel('Estimated Regression Coefficient')
coef_plot.set_title('Linear regression of weather attributes effect on total score')
coef_plot

Now make predictions using the test data set

In [0]:
# Predict total scores for the held-out test games
nfl_total_score_y_pred = regr.predict(X_test)

# Compute each metric once, then report them in the same order as before
total_score_mse = mean_squared_error(y_test, nfl_total_score_y_pred)
total_score_r2 = r2_score(y_test, nfl_total_score_y_pred)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)
print("Mean squared error: %.2f" % total_score_mse)
print("Root mean squared error: %.2f" % np.sqrt(total_score_mse))
# r2_score: 1 is perfect prediction
print('Variance score: %.2f' % total_score_r2)

# Scatter of actual (x) against predicted (y) total score
plt.figure(figsize=(15,10))
plt.scatter(y_test, nfl_total_score_y_pred)
plt.xlabel('Score')
plt.ylabel('Predicted Score')
plt.title('Score vs Predicted Score')
plt.show()

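There probably isn't enough data here, which is why we have a high mean squared error. Precipitation intensity has the largest estimated coefficient, which suggests it has the greatest effect on total score — with the caveat that the features are not standardized, so coefficient sizes also reflect each feature's units and range. Even so, we are able to predict total scores to within around 14.5 points of the actual total scores.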

# What weather attribute has the greatest effect on the length of passing attempts?

In [0]:
# Regress pass length (AirYards) on the weather attributes, using passing plays only
y_column = 'AirYards'
weather_feature_columns = ['precip_intensity', 'temperature', 'dew_point', 'humidity', 'pressure',
                           'wind_speed', 'visibility']

is_pass_play = outdoor_data_nfl['PlayType'] == 'Pass'
nfl_plays = outdoor_data_nfl[is_pass_play].copy()

# keep just the features plus the target column
nfl_data = nfl_plays[weather_feature_columns + [y_column]]
X = nfl_data.drop(y_column, axis=1)

# same split proportions and seed as the total-score model
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, nfl_data[y_column], test_size=0.33, random_state=5
)

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

df_coefficients = pd.DataFrame(
    list(zip(X.columns, regr.coef_)),
    columns = ['features', 'estimatedCoefficients']
)
print(df_coefficients)

# Make predictions using the testing set
nfl_air_yards_y_pred = regr.predict(X_test)

# compute the metrics once, then report them
air_yards_mse = mean_squared_error(y_test, nfl_air_yards_y_pred)
air_yards_r2 = r2_score(y_test, nfl_air_yards_y_pred)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)
print("Mean squared error: %.2f" % air_yards_mse)
print("Root mean squared error: %.2f" % np.sqrt(air_yards_mse))
# r2_score: 1 is perfect prediction
print('Variance score: %.2f' % air_yards_r2)

# actual (x) against predicted (y) pass length
plt.figure(figsize=(15,10))
plt.scatter(y_test, nfl_air_yards_y_pred)
plt.xlabel('Passing Length Attempt')
plt.ylabel('Predicted Passing Length Attempt')
plt.title('Passing Length Attempt vs Predicted Passing Length Attempt')
plt.show()


This model predicts the length of a pass attempt (air yards) to within roughly 10 yards

# Visualize Map of NFL Stadiums by Conference

Generate a plot showing the locations of various NFL team stadiums, separated by conference, with bubble size representing stadium capacity

In [0]:
import matplotlib.patches as mpatches
from mpl_toolkits.basemap import Basemap

# colour palette for the map and the two conferences
land_color = '#f5f5f3'
water_color = '#cdd2d4'
coastline_color = '#666666'
border_color = '#999999'
afc_color = '#013369'
nfc_color = '#D50A0A'

# map extent in metres; the figure is sized to the same height:width ratio as the map
map_width_m = 5000 * 1000
map_height_m = 3500 * 1000
fig_width_in = 17
fig_height_in = fig_width_in * (map_height_m / float(map_width_m))
plt.figure(figsize=[fig_width_in, fig_height_in])

# Albers Conical Equal Area projection for US
m = Basemap(
    ellps='WGS84', projection='aea',
    lat_1=33, lat_2=45, lon_0=-98.4, lat_0=39,
    width=map_width_m, height=map_height_m,
    resolution='l', area_thresh=10000, fix_aspect=False,
)

# base layers
m.drawcoastlines(color=coastline_color)
m.drawcountries(color=border_color)
m.drawstates(color=border_color)
m.fillcontinents(color=land_color, lake_color=water_color)
m.drawmapboundary(fill_color=water_color)

# one scatter layer per conference; marker area scales with stadium capacity
conference_colors = [('AFC', afc_color), ('NFC', nfc_color)]
for conference, color in conference_colors:
    in_conference = nfl_stadiums['conference'] == conference
    subset = nfl_stadiums[in_conference]
    sizes = subset['capacity'] / 100
    # project longitude/latitude into map coordinates
    x, y = m(subset.longitude.values, subset.latitude.values)
    m.scatter(x=x, y=y, s=sizes, color=color, edgecolor='#555555', alpha=0.6, zorder=3)

# legend built from proxy patches, one per conference
afc_patch = mpatches.Patch(fc=afc_color, alpha=0.7, ec='w', label='AFC Team')
nfc_patch = mpatches.Patch(fc=nfc_color, alpha=0.7, ec='w', label='NFC Team')
legend = plt.legend(handles=[afc_patch, nfc_patch], loc=3, prop={'size':14})

plt.title('AFC & NFC Stadium Locations')
plt.show()

A noticeable takeaway from this plot is how much smaller the Los Angeles (formerly San Diego) Chargers' stadium is compared to every other team's. It turns out they are currently playing in a temporary stadium until their new permanent stadium in Los Angeles is completed. Another point of note is the darker red circle in NYC, due to the fact that the Giants and Jets play their home games at the same stadium.

# Generate density plot and histogram of stadium capacity

In [0]:
# Histogram plus kernel density estimate of stadium capacity
plt.figure(figsize=(15,10))
capacity_plot = sns.distplot(nfl_stadiums['capacity'],
             #bins=8, # let seaborn figure it out
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}
        )
# label the figure so it can stand alone, like the other plots in this notebook
capacity_plot.set_xlabel('Stadium Capacity')
capacity_plot.set_ylabel('Density')
capacity_plot.set_title('Distribution of NFL Stadium Capacity')
plt.show()

The average stadium capacity in the NFL is around 70,000, with a few notable outliers

# How does temperature affect attendance?

In [0]:
# Join with stadium dataframe so we can create attendance percent full column
games_with_stadiums = pd.merge(nfl_outside_games_2017, nfl_stadiums, how='inner', left_on='home', right_on='city')
games_with_stadiums['attendance_percent_full'] = ((games_with_stadiums['attendance'] / games_with_stadiums['capacity']) * 100).astype(int)

# there are some outlier percent fulls because the games were played in London, so let's get rid of those
games_with_stadiums = games_with_stadiums[games_with_stadiums['attendance_percent_full'] < 105]

# create pivot: one row per temperature bucket, holding the mean percent-full of its games
pivot = pd.pivot_table(
    data = games_with_stadiums,
    index = 'temp_bucket', # specifies the rows
    values = 'attendance_percent_full',  # specifies the content of the cells
    aggfunc = 'mean' # average attendance percent full within each temperature bucket
)
plot = pivot.plot(figsize=(15,10), colormap = 'RdBu')
# Take the tick labels from the pivot's own index rather than a fixed list, so the labels
# stay aligned with the plotted points even if a temperature bucket has no games.
plot.set_xticks(list(range(len(pivot.index))))
plot.set_xticklabels(list(pivot.index))
plot.set_xlabel('Temperature')
plot.set_ylabel('Attendance Percent Full')
plot.set_title('NFL Outdoor Stadiums Average Percent Capacity by Temperature')
plot


NFL fans are a hardy bunch! On average, stadiums are filled closer to capacity when the temperature is colder. This could be explained by the fact that the NFL season starts in late summer and finishes in early winter. So fans may be attending games more often later in the season when the temperature is colder, but the stakes are higher.



# How does wind speed affect field goals?

In [0]:
# Field goal attempts that were either made or missed (other results are left out)
made_or_missed = outdoor_data_nfl['FieldGoalResult'].isin(['Good', 'No Good'])
field_goal_plays = outdoor_data_nfl[made_or_missed].copy()

# encode the result as 1 (made) / 0 (missed) so that its mean is the make rate
field_goal_plays['FieldGoalResult'] = field_goal_plays['FieldGoalResult'].replace({'Good': 1, 'No Good': 0})

# bucket field goals by attempt length
field_goal_bucket_labels = ['Very Short (15 to 29)', 'Short (30 to 39)', 'Medium (40 to 49)', 'Long (50 to 70)']
field_goal_plays['field_goal_bucket'] = pd.cut(
    field_goal_plays['FieldGoalDistance'],
    bins=[15, 29, 39, 49, 70],
    labels=field_goal_bucket_labels,
)

# make percentage per (wind gust bucket, distance bucket)
pivot = field_goal_plays.pivot_table(
    index='wind_gust_bucket',
    columns='field_goal_bucket',
    values='FieldGoalResult',
    aggfunc='mean',
).mul(100)  # fraction -> percent

plot = pivot.plot(kind='bar', figsize=(20,10), colormap='RdBu')
plot.set(
    xlabel='Wind Speed',
    ylabel='Percent Made',
    title='Field Goal Percentage Made at Different Wind Speeds',
)
plot


The interesting takeaway here is that the percentage made of longer distance field goal attempts actually increases on windy game days. This makes sense, as kickers are more accurate on long field goal tries when they have a strong wind at their back, and generally won't attempt long field goals without a favorable wind.

# How does temperature affect field goals?

In [0]:
# make percentage per (temperature bucket, distance bucket), reusing the field goal frame
pivot = field_goal_plays.pivot_table(
    index='temp_bucket',
    columns='field_goal_bucket',
    values='FieldGoalResult',
    aggfunc='mean',
).mul(100)  # fraction -> percent

plot = pivot.plot(kind='bar', figsize=(20,10), colormap='RdBu')
plot.set(
    xlabel='Temperature',
    ylabel='Percent Made',
    title='Field Goal Percentage Made at Different Temperature',
)
plot

Temperature seems to have no noticeable effect on field goal percentage

# **What are Tom Brady's Optimal Weather Conditions?**

See the Deflategate controversy from 2014-15: Tom Brady was accused of deflating footballs to gain an advantage. We want to see if the outside air pressure affected his game at all during the most recent season.

In [0]:
# All 2017 pass attempts thrown by Tom Brady
is_brady_pass = (nfl_pbp_2017['PassAttempt'] == 1) & (nfl_pbp_2017['Passer'] == 'T.Brady')
tom_brady_passes = nfl_pbp_2017[is_brady_pass].copy()

# encode completions as 1 and incompletions as 0 so that the mean is the completion rate
tom_brady_passes['PassOutcome'] = tom_brady_passes['PassOutcome'].replace({'Complete': 1, 'Incomplete Pass': 0})

# attach per-game weather; the inner join also restricts the passes to outdoor games
tom_brady_passes = pd.merge(tom_brady_passes, nfl_outside_games_2017, how = 'inner', on = 'GameID')

# qcut splits the pressures into 3 equally populated buckets (quantiles), reusing the pressure labels
tom_brady_passes['pressure_bucket_qcut'] = pd.qcut(
    tom_brady_passes['pressure'],
    3,
    labels=pressure_bucket_labels,
)

# completion percentage per pressure bucket
pivot = tom_brady_passes.pivot_table(
    index='pressure_bucket_qcut',
    values='PassOutcome',
    aggfunc='mean',
).mul(100)  # fraction -> percent

plot = pivot.plot(kind='bar', figsize=(15,8), colormap='RdBu')
plot.set(
    xlabel='Pressure (mbar)',
    ylabel='Completion Percentage',
    title='Tom Brady Completion Percentage at Different Pressures',
)
plot

We can clearly see that Tom Brady has a nearly identical completion percentage whether it's high or low atmospheric pressure at gametime. Therefore, he gains no advantage from footballs being overly inflated or deflated. Anybody with a brain knew Deflategate was a witch hunt, and we have the data here to prove it.

# Does temperature affect home team advantage?

In [0]:
# Home margin of victory per game (positive means the home team won)
nfl_outside_games_2017['score_difference'] = nfl_outside_games_2017['homescore'] - nfl_outside_games_2017['awayscore']

# daily averages of margin and temperature, indexed by game date
pivot = pd.pivot_table(
    data = nfl_outside_games_2017,
    index = 'Date',
    values = ['score_difference', 'temperature'],
    aggfunc = 'mean'
)
# Resample the daily averages to calendar months. 'M' is the documented month-end alias;
# the lowercase 'm' used previously is not a documented offset alias.
pivot = pivot.resample('M').mean()
plot = pivot.plot(kind='line', secondary_y='temperature', figsize=(20,10), colormap = 'RdBu')
plot.set_xlabel('Month')
plot.set_ylabel('Average Home Team Margin of Victory')
# temperature is drawn against the secondary (right-hand) axis, so label that axis too
plot.right_ax.set_ylabel('Average Temperature')
plot.set_title('Home Field Advantage by Month with Temperature Change')
plot

It seems that colder temperatures neutralize the home team advantage in the NFL

### **How does precipitation (rain, snow) affect % of Pass vs % of Run Plays?**

In [0]:
# Pass and rush attempts per game, then summed over all outdoor games
pivot_pie = pd.pivot_table(outdoor_data_nfl,index=["GameID"],values=["PassAttempt", "RushAttempt"],
                                   aggfunc=np.sum)
total = pivot_pie.sum(axis='index')

# Compute the split shown in the title from the data instead of hard-coding it,
# so the title cannot drift out of sync with the chart.
play_share_pct = 100 * total / total.sum()

plot_total = total.plot(kind='pie',
               figsize=(8,8),
               colormap = 'RdBu'
               )
plot_total.set_title('The NFL is a Passing League!\n%.0f%% Passing, %.0f%% Rushing'
                     % (play_share_pct['PassAttempt'], play_share_pct['RushAttempt']))
plot_total.legend(bbox_to_anchor=(1, 0.75))
plot_total.set_ylabel('Play Selection')
plot_total

Before taking weather into account, the NFL is clearly a pass happy league

In [0]:
# NOTE: column_list is defined here but is not used by the pivot below
column_list = ["Date" , "GameID", "play_id" , "desc", "RushAttempt" , "PassAttempt"]

# pass and rush attempt totals per game, keyed by home team and that game's weather
pivot1_index = ["HomeTeam", "GameID", "weather_summary", "precip_intensity", "wind_gust"]
pivot1 = outdoor_data_nfl.pivot_table(
    index=pivot1_index,
    values=["PassAttempt", "RushAttempt"],
    aggfunc=np.sum,
)
pivot1

## Impact of weather type on % pass vs rush

In [0]:

# NOTE: column_list is defined here but is not used by the pivot below
column_list = ["Date" , "GameID", "play_id" , "desc", "RushAttempt" , "PassAttempt"]

# total pass and rush attempts for each weather summary
pivot_summary = outdoor_data_nfl.pivot_table(
    index=["weather_summary"],
    values=["PassAttempt", "RushAttempt"],
    aggfunc=np.sum,
)

# divide each row by its own total so pass + rush = 1 for every weather summary
summary_row_totals = pivot_summary.sum(axis='columns')
dfp_norm3 = pivot_summary.div(summary_row_totals, axis='index').sort_index()

plot_weather = dfp_norm3.plot(kind="bar", figsize=(12,8), colormap='RdBu')
plot_weather.set_title('NFL Weather Impact on Play Selection')
plot_weather.set_ylabel('Play Selection')
plot_weather.set_xlabel('Weather Summary')
plot_weather.legend(bbox_to_anchor=(1, 0.5))
plot_weather

Sleet seems to be the only type of weather that will convince teams to run the ball as often as they pass it

## Impact of Temperature on % pass vs. % rush

In [0]:

# total pass and rush attempts for each temperature bucket
pivot_temp = outdoor_data_nfl.pivot_table(
    index=["temp_bucket"],
    values=["PassAttempt", "RushAttempt"],
    aggfunc=np.sum,
)

# divide each row by its own total so pass + rush = 1 for every bucket
temp_row_totals = pivot_temp.sum(axis='columns')
dfp_norm4 = pivot_temp.div(temp_row_totals, axis='index').sort_index()

plot_temp = dfp_norm4.plot(kind="bar", figsize=(12,8), colormap='RdBu')
plot_temp.set_title('NFL Temperature Impact on Play Selection')
plot_temp.set_ylabel('Play Selection')
plot_temp.set_xlabel('Temperature')
plot_temp.legend(bbox_to_anchor=(1, 0.5))
plot_temp

Temperature has no impact on play selection. Even if the temperature is freezing, NFL teams are passing the ball

## Impact of Precipitation on % pass vs. % rush

In [0]:
# total pass and rush attempts for each precipitation bucket
pivot_precip = pd.pivot_table(outdoor_data_nfl,index=["precip_bucket",
                                   ],values=["PassAttempt", "RushAttempt"],aggfunc=np.sum)
# divide each row by its own total so pass + rush = 1 for every bucket
dfp_norm5 = pivot_precip.div(pivot_precip.sum(axis='columns'), axis='index' ).sort_index()
# horizontal bars: the buckets run along the y axis and the play share along the x axis
plot_precip = dfp_norm5.plot(kind = "barh",
               figsize=(12,8),
               colormap = 'RdBu'
               )
# title previously read "Impact on on Play Selection" (duplicated word)
plot_precip.set_title('NFL Precipitation Impact on Play Selection')
plot_precip.set_ylabel('Precipitation')
plot_precip.set_xlabel('Play Selection')
plot_precip.legend(bbox_to_anchor=(1, 0.75))
plot_precip

More intense precipitation brings the ratio of pass to rush slightly closer to even

## Impact of Wind on % pass vs. % rush

In [0]:
# total pass and rush attempts for each wind gust bucket
pivot_wind = outdoor_data_nfl.pivot_table(
    index=["wind_gust_bucket"],
    values=["PassAttempt", "RushAttempt"],
    aggfunc=np.sum,
)

# divide each row by its own total so pass + rush = 1 for every bucket
wind_row_totals = pivot_wind.sum(axis='columns')
dfp_norm6 = pivot_wind.div(wind_row_totals, axis='index').sort_index()

plot_wind = dfp_norm6.plot(kind="bar", figsize=(12,8), colormap='RdBu')
plot_wind.set_title('NFL Wind Impact on Play Selection')
plot_wind.set_ylabel('Play Selection')
plot_wind.set_xlabel('Wind Speed (MPH)')
plot_wind.legend(bbox_to_anchor=(1, 0.75))
plot_wind

Wind speed has very little impact on play selection

## Weather Impact on Home Field Advantage

In [0]:
# mean home and away score for each weather summary, over outdoor games only
pivot = nfl_outside_games_2017.pivot_table(
    index='weather_summary',            # one group of bars per weather summary
    values=["homescore", "awayscore"],  # one bar each for the home and away score
    aggfunc='mean',                     # average rather than sum, so groups of different sizes compare fairly
)

# grouped bar chart
plot = pivot.plot(kind='bar', figsize=(20,10), colormap='RdBu')
plot.set(
    xlabel='Weather Summary',
    ylabel='Mean Final Score',
    title='Weather Impact on Score',
)
plot

Humidity is the noticeable factor here, with home teams enjoying a clear advantage when the weather is humid

## Weather Impact on Total Score

In [0]:
# combined points scored by both teams in each outdoor game
nfl_outside_games_2017['total_score'] = nfl_outside_games_2017['homescore'] + nfl_outside_games_2017['awayscore']

pivot = pd.pivot_table(
    data = nfl_outside_games_2017,  # pulls data from outside games data set
    index = 'weather_summary',    # uses weather summary as x axis value
    values = ["total_score"],   # uses the combined home + away score as the plotted value
    aggfunc = 'mean'   #finds average of scores, makes more sense than sum
)

#plots chart
plot = pivot.plot(kind='bar', figsize=(20,10), colormap = 'RdBu')
plot.set_xlabel('Weather Summary')
# label and title name the total score, so this chart is not confused with the home/away chart
plot.set_ylabel('Mean Total Score')
plot.set_title('Weather Impact on Total Score')
plot

Interestingly, rainy games had high total scores. We suspect that if we pulled in data from additional seasons this would not hold true.

# Appendix

Below is how we added weather data to the 2017 game outcome dataset. We did not want to make repeated api calls to the weather api for historical data, so we saved this output as a new csv, uploaded it to GitHub and loaded the file directly in our analysis above

In [0]:
# Appendix: enrich the raw 2017 game outcomes with historical weather from the Dark Sky API
# and save the result to CSV. Relies on os, datetime, timedelta, requests and pd from the
# notebook's import cell.

# nfl game outcome dataset
nfl_games_2017 = pd.read_csv('https://github.com/ryurko/nflscrapR-data/raw/master/data/season_games/games_2017.csv', low_memory=False)

# nfl stadiums dataset
nfl_stadiums = pd.read_csv('https://raw.githubusercontent.com/mstefaniak10/dwd_nfl/master/nfl_stadiums.csv', low_memory=False)

# join datasets so that every game row carries its home stadium's latitude/longitude
games_with_locations = pd.merge(nfl_games_2017, nfl_stadiums, how='inner', left_on='home', right_on='city')

# The weather values are written back to nfl_games_2017 by position, so the join must not
# have dropped or duplicated any game. Check before spending any API calls.
if len(games_with_locations) != len(nfl_games_2017):
    raise ValueError('Stadium join produced %d rows for %d games; cannot align weather data'
                     % (len(games_with_locations), len(nfl_games_2017)))

# (dataframe column, key in the Dark Sky "currently" block), in output column order
weather_fields = [
    ('weather_summary', 'summary'),
    ('weather_icon', 'icon'),
    ('precip_intensity', 'precipIntensity'),
    ('precip_probability', 'precipProbability'),
    ('temperature', 'temperature'),
    ('apparent_temperature', 'apparentTemperature'),
    ('dew_point', 'dewPoint'),
    ('humidity', 'humidity'),
    ('pressure', 'pressure'),
    ('wind_speed', 'windSpeed'),
    ('wind_gust', 'windGust'),
    ('wind_bearing', 'windBearing'),
    ('cloud_cover', 'cloudCover'),
    ('visibility', 'visibility'),
]

# The API key is read from the environment rather than being committed in the notebook.
dark_sky_api_key = os.environ.get('DARK_SKY_API_KEY')
if not dark_sky_api_key:
    raise RuntimeError('Set the DARK_SKY_API_KEY environment variable before running this cell')

# placeholders: api key, latitude, longitude, unix timestamp
dark_sky_url = 'https://api.darksky.net/forecast/%s/%s,%s,%s?exclude=hourly,daily,flags'

weather_records = []
for index, row in games_with_locations.iterrows():
    game_date = datetime.strptime(row['date'], "%Y-%m-%d")
    # estimate games played around 4pm eastern
    # NOTE(review): this adds 15 hours (3pm), and timestamp() on a naive datetime is interpreted
    # in the local timezone of the machine running the notebook, not US Eastern — confirm the
    # intended kickoff estimate before regenerating the data.
    game_date += timedelta(hours=15)
    full_url = dark_sky_url % (dark_sky_api_key, row['latitude'], row['longitude'], int(game_date.timestamp()))

    # make api request to dark sky; fail loudly on HTTP errors instead of parsing an error body
    response = requests.get(full_url, timeout=30)
    response.raise_for_status()
    historical_weather_data = response.json()['currently']

    # one record per game, in weather_fields order; a field missing from the response is
    # stored as None instead of aborting the whole run
    weather_records.append([historical_weather_data.get(api_field) for column, api_field in weather_fields])

# add columns to dataframe
for position, (column, api_field) in enumerate(weather_fields):
    nfl_games_2017[column] = [record[position] for record in weather_records]

# save to file
nfl_games_2017.to_csv('nfl_games_2017.csv')


Below is how we added attendance data to the game file

In [0]:
# Maps a team nickname (the last word of the attendance file's 'Tm' value, as produced by
# get_short_name below) to the team abbreviation used in the games file's 'home' column.
team_abbr_map = {
    'Bears': 'CHI',
    'Bengals': 'CIN',
    'Bills': 'BUF',
    'Broncos': 'DEN',
    'Browns': 'CLE',
    'Buccaneers': 'TB',
    'Cardinals': 'ARI',
    'Chargers': 'LAC',
    'Chiefs': 'KC',
    'Colts': 'IND',
    'Cowboys': 'DAL',
    'Dolphins': 'MIA',
    'Eagles': 'PHI',
    'Falcons': 'ATL',
    'Forty-Niners': 'SF',
    'Giants': 'NYG',
    'Jaguars': 'JAX',
    'Jets': 'NYJ',
    'Lions': 'DET',
    'Packers': 'GB',
    'Panthers': 'CAR',
    'Patriots': 'NE',
    'Raiders': 'OAK',
    'Rams': 'LA',
    'Ravens': 'BAL',
    'Redskins': 'WAS',
    'Saints': 'NO',
    'Seahawks': 'SEA',
    'Steelers': 'PIT',
    'Texans': 'HOU',
    'Titans': 'TEN',
    'Vikings': 'MIN',
}


# game outcomes file (already carrying the weather columns) that attendance will be added to
nfl_games_2017 = pd.read_csv(
    'https://raw.githubusercontent.com/mstefaniak10/dwd_nfl/master/nfl_games_2017.csv',
    low_memory=False,
)

# 2017 attendance file, with one 'Tm' value per row
nfl_attendance_2017 = pd.read_csv(
    'https://raw.githubusercontent.com/mstefaniak10/dwd_nfl/master/nfl_attendance_2017.csv',
    low_memory=False,
)

def get_short_name(input):
    """Return the text after the last space in ``input``.

    A string without any space is returned unchanged; a string that ends with a
    space yields an empty string.
    """
    _, _, last_word = input.rpartition(' ')
    return last_word
  
# derive the nickname from 'Tm', then translate it to the team abbreviation via team_abbr_map
# (a nickname that is not a key of team_abbr_map maps to NaN)
nfl_attendance_2017['short_name'] = nfl_attendance_2017['Tm'].map(get_short_name)
nfl_attendance_2017['team'] = nfl_attendance_2017['short_name'].map(team_abbr_map)

# '<team>_<week column>' keys that find_attendance has already handed out, with the value returned
team_game_counter_dict = {}
# attendance per game, in the row order of nfl_games_2017
attendance_list = []

def find_attendance(home_team):
    """Return the next not-yet-used attendance figure for ``home_team``.

    Scans the team's row of ``nfl_attendance_2017`` column by column, looking only at
    columns whose name contains 'Week'. A week is skipped when it was already returned by
    an earlier call (tracked in ``team_game_counter_dict``), when its cell is missing, or
    when its cell is the text 'Bye' or contains '*' (presumably marking a game that is not
    one of this team's home games — verify against the attendance source).

    Each call therefore consumes one week, so calling it once per home game in schedule
    order yields that team's attendances in order.

    Args:
        home_team: team abbreviation matching the 'team' column of ``nfl_attendance_2017``.

    Returns:
        The attendance as an int, or None when the team has no unused week left.

    Raises:
        AssertionError: if ``home_team`` does not match exactly one attendance row.
    """
    team_df = nfl_attendance_2017[nfl_attendance_2017['team'] == home_team]
    assert team_df.shape[0] == 1
    team_row = team_df.iloc[0]
    for column in list(team_df):
        dict_key = home_team + '_' + column
        if 'Week' not in column or dict_key in team_game_counter_dict:
            continue
        value = team_row[column]
        if isinstance(value, str):
            if '*' in value or value == 'Bye':
                continue
        elif pd.isnull(value):
            # a missing cell cannot be an attendance figure
            continue
        # Cells from an all-numeric column arrive as numbers rather than strings; the
        # substring test above would raise TypeError on them, so they come straight here.
        value = int(value)
        team_game_counter_dict[dict_key] = value
        return value
    return None

# Look up the attendance for every game in file order. find_attendance hands out each team's
# home attendances one at a time, so the order of the games matters.
for home_team in nfl_games_2017['home']:
    attendance_list.append(find_attendance(home_team))

nfl_games_2017['attendance'] = attendance_list
# Save next to the notebook (as the weather cell does) instead of to a machine-specific
# absolute path that only exists on the original author's computer.
nfl_games_2017.to_csv('nfl_games_2017.csv')