In [18]:
import nfl_data_py as nfl
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns

Getting NFL data from the last 9 seasons (2015–2023)

In [2]:
# Seasons to load: 2015 through 2023 inclusive (the range stop is exclusive).
years = [*range(2015, 2024)]

In [3]:
# Weekly data for every season in `years`, requesting every column the
# library lists via see_weekly_cols(); downcast=True is passed through.
weekly_columns = list(nfl.see_weekly_cols())
nfl_main_df = nfl.import_weekly_data(years, columns=weekly_columns, downcast=True)

Downcasting floats.


In [None]:
# Load play-by-play data one season at a time, then combine the seasons with
# a single concat. Concatenating inside the loop would re-copy every
# previously loaded season on each iteration.
pbp_columns = list(nfl.see_pbp_cols())  # loop-invariant, so computed once
yearly_pbp_frames = []

for y in years:

    local_pbp_df = nfl.import_pbp_data([y], columns=pbp_columns, downcast=True, cache=False, alt_path=None)

    yearly_pbp_frames.append(local_pbp_df)

# pd.concat raises on an empty list, so fall back to an empty frame
# (which is what the previous loop produced when `years` was empty).
pbp_df = pd.concat(yearly_pbp_frames) if yearly_pbp_frames else pd.DataFrame()

2015 done.
Downcasting floats.
2016 done.
Downcasting floats.


Preparing the dataframe we will use for the analysis. The idea is to focus on 4 features: pass, complete_pass, air_yards and deep_complete_pass. The last one does not exist in the data, so we have to create it ourselves. The glossary for the play-by-play data is available at the link below:

https://nflreadr.nflverse.com/articles/dictionary_pbp.html

In [None]:
# Flag plays that are both a completed pass and labelled "deep" in pass_length.
is_deep_completion = (pbp_df["pass_length"] == "deep") & (pbp_df["complete_pass"] == 1)
pbp_df["deep_complete_pass"] = 0
pbp_df.loc[is_deep_completion, "deep_complete_pass"] = 1

# Per-game totals of the four passing stats.
passing_cols = ["pass", "complete_pass", "air_yards", "deep_complete_pass"]
pass_df = pbp_df.groupby("game_id", as_index=False)[passing_cols].sum()

# Distinct (game_id, temp, wind) rows where neither reading is missing,
# joined to the per-game passing totals on their only shared column.
weather_df = pbp_df[["game_id", "temp", "wind"]].dropna().drop_duplicates()
temp_wind_df = weather_df.merge(pass_df, on="game_id")

The correlation of temperature and wind with the passing stats. Of course, this only shows the linear correlation between the stats, but it's an important first look.

In [None]:
# Correlate only the numeric columns. temp_wind_df also carries the game_id
# identifier (presumably a string - confirm), and since pandas 2.0 corr() no
# longer drops non-numeric columns silently; it raises instead.
temp_wind_df.select_dtypes(include="number").corr()[["temp","wind"]]

Let's look at the distributions of wind and temperature to get a better idea of what the lows and highs of those features are.

In [None]:
# sns.distplot is deprecated (the warning is hidden by filterwarnings above);
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(temp_wind_df["wind"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
# sns.distplot is deprecated (the warning is hidden by filterwarnings above);
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(temp_wind_df["temp"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
# Per-game passing stats compared across weather conditions below.
stats = [
    'pass',
    'complete_pass',
    'air_yards',
    'deep_complete_pass',
]

In [None]:
# Fraction of rows in temp_wind_df whose wind reading is above 15.
len(temp_wind_df.loc[temp_wind_df["wind"] > 15]) / len(temp_wind_df)

In [None]:
# Fraction of rows in temp_wind_df whose temperature reading is below 35.
len(temp_wind_df.loc[temp_wind_df["temp"] < 35]) / len(temp_wind_df)

In [None]:
threshold = 15

# Rows with wind above the threshold; filtered once and reused for every stat.
high_wind_games = temp_wind_df[temp_wind_df["wind"] > threshold]

for stat_name in stats:
    overall_avg = round(temp_wind_df[stat_name].mean(), 2)
    high_wind_avg = round(high_wind_games[stat_name].mean(), 2)

    print(f"Average {stat_name} in a game: ", overall_avg)
    print(f"Average {stat_name} in a game with high wind: ", high_wind_avg)
    print("")

In [None]:
threshold = 35

# Rows with temperature below the threshold; filtered once and reused for every stat.
low_temp_games = temp_wind_df[temp_wind_df["temp"] < threshold]

for stat_name in stats:
    overall_avg = round(temp_wind_df[stat_name].mean(), 2)
    low_temp_avg = round(low_temp_games[stat_name].mean(), 2)

    print(f"Average {stat_name} in a game: ", overall_avg)
    print(f"Average {stat_name} in a game with low temperature: ", low_temp_avg)
    print("")

In [35]:
import statsmodels.api as sm

# Design matrix: wind and temp as predictors, with a constant column added
# so each model includes an intercept.
X = sm.add_constant(temp_wind_df[['wind', 'temp']])

# Targets: per-game pass attempts and completed passes.
y_pass = temp_wind_df['pass']
y_complete_pass = temp_wind_df['complete_pass']

# OLS fit of pass attempts on wind and temp.
model_pass = sm.OLS(y_pass, X).fit()
print(model_pass.summary())

# OLS fit of completed passes on the same design matrix.
model_complete_pass = sm.OLS(y_complete_pass, X).fit()
print(model_complete_pass.summary())

                            OLS Regression Results                            
Dep. Variable:                   pass   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     6.894
Date:                Sat, 07 Sep 2024   Prob (F-statistic):            0.00104
Time:                        14:15:51   Log-Likelihood:                -6295.1
No. Observations:                1624   AIC:                         1.260e+04
Df Residuals:                    1621   BIC:                         1.261e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         79.9493      1.131     70.673      0.0