In [1]:
# loading in packages and data queried from CBA database
import pandas as pd
from pandasql import sqldf
import statsmodels.api as sm
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

pysqldf = lambda q: sqldf(q, globals())

df10 = pd.read_csv('~/projects/NFL Five Factors/plays_in_drives_10.csv', header=0)
df11 = pd.read_csv('~/projects/NFL Five Factors/plays_in_drives_11.csv', header=0)
df12 = pd.read_csv('~/projects/NFL Five Factors/plays_in_drives_12.csv', header=0)
df13 = pd.read_csv('~/projects/NFL Five Factors/plays_in_drives_13.csv', header=0)
df14 = pd.read_csv('~/projects/NFL Five Factors/plays_in_drives_14.csv', header=0)
df15 = pd.read_csv('~/projects/NFL Five Factors/plays_in_drives_15.csv', header=0)

turnovers = pd.read_csv('~/projects/NFL Five Factors/turnovers.csv', header=0)
position = pd.read_csv('~/projects/NFL Five Factors/field_position.csv', header=0)
home = pd.read_csv('~/projects/NFL Five Factors/home.csv', header=0)
visiting = pd.read_csv('~/projects/NFL Five Factors/visiting.csv', header=0)

frames = [df10, df11, df12, df13, df14, df15]

df = pd.concat(frames, ignore_index = True)

In [2]:
# cleaning
plays = df.fillna(method='ffill')
plays['reverse_play_number_in_drive'] = plays.sort_values(['play_number_in_drive'], ascending = [False]).groupby(['game_id','team','drive_id']).cumcount() + 1
plays['drive_id'].astype('int32')

0         62085
1         62085
2         62085
3         62085
4         62085
5         62086
6         62086
7         62086
8         62087
9         62087
10        62087
11        62088
12        62088
13        62088
14        62088
15        62088
16        62088
17        62089
18        62089
19        62089
20        62089
21        62089
22        62089
23        62089
24        62089
25        62089
26        62090
27        62090
28        62090
29        62090
          ...  
211560    99641
211561    99641
211562    99641
211563    99641
211564    99641
211565    99641
211566    99642
211567    99642
211568    99642
211569    99643
211570    99643
211571    99643
211572    99644
211573    99644
211574    99644
211575    99645
211576    99645
211577    99645
211578    99646
211579    99646
211580    99646
211581    99647
211582    99647
211583    99647
211584    99648
211585    99648
211586    99648
211587    99649
211588    99649
211589    99649
Name: drive_id, Length: 

In [3]:
# this query finds the number of points scored in each drive
q = """
SELECT points_scored,
       drive_id
FROM plays
WHERE reverse_play_number_in_drive = 1
"""

drivepoints = pysqldf(q)

In [4]:
# this query finds the number of points scored per scoring opportunity
q2 = """
SELECT game_id,
       team,
       AVG(points_scored) AS points_per_trip_inside_forty,
       season
FROM (
        SELECT DISTINCT
               p.game_id,
               p.team,
               p.drive_id,
               p.season,
               CASE WHEN p.yardline >= 60 THEN 1
                    ELSE 0
                    END AS scoring_zone,
               p.points_scored

        FROM plays p
        JOIN
            (SELECT game_id,
                    drive_id,
                    team,
                    points_scored
             FROM plays
             WHERE reverse_play_number_in_drive = 1) a ON a.game_id = p.game_id
                                                      AND a.drive_id = p.drive_id
                                                      AND a.team = p.team
        WHERE scoring_zone = 1)

GROUP BY game_id,
         team,
         season
"""

ppo = pysqldf(q2)

In [5]:
# this query finds the expected points of a drive by down, distance, and yardline
q3 = """
SELECT p.down,
       p.distance,
       p.yardline,
       AVG(d.points_scored) AS exp_points,
       count(*)
FROM plays p
JOIN drivepoints d ON d.drive_id = p.drive_id
GROUP BY p.down,
         p.distance, 
         p.yardline
"""

lookup = pysqldf(q3)

In [6]:
# this query finds the expected points for each play

q4 = """
SELECT p.team,
       p.game_id,
       p.play_number_in_drive,
       p.drive_id,
       p.points_scored,
       l.exp_points
FROM plays p
JOIN lookup l ON l.down = p.down
             AND l.distance = p.distance
             AND l.yardline = p.yardline

"""

expected = pysqldf(q4)

In [7]:
# this query success rate by team and game

q5 = """
SELECT s.team,
       s.game_id,
       AVG(s.success) AS success_rate
FROM
(SELECT e.team,
       e.game_id,
       e.play_number_in_drive,
       e.drive_id,
       e.exp_points,
       e2.exp_points AS next_exp_points,
       CASE WHEN e.points_scored > 0 THEN 1
            WHEN e.exp_points <= e2.exp_points THEN 1
            ELSE 0
       END AS success
FROM expected e
LEFT JOIN expected e2 ON e.drive_id = e2.drive_id
                  AND e.play_number_in_drive = (e2.play_number_in_drive - 1)) s
GROUP BY s.team,
         s.game_id
"""

success = pysqldf(q5)

In [8]:
# this query finds average gained expected points on successful plays

q6 = """
SELECT s.team,
       s.game_id,
       AVG((CASE WHEN s.points_scored > 0 THEN points_scored ELSE s.next_exp_points END) - s.exp_points) AS explosiveness
FROM
(SELECT e.team,
       e.game_id,
       e.play_number_in_drive,
       e.drive_id,
       e.points_scored,
       e.exp_points,
       e2.exp_points AS next_exp_points,
       CASE WHEN e.points_scored > 0 THEN 1
            WHEN e.exp_points <= e2.exp_points THEN 1
            ELSE 0
       END AS success
FROM expected e
LEFT JOIN expected e2 ON e.drive_id = e2.drive_id
                  AND e.play_number_in_drive = (e2.play_number_in_drive - 1)) s
WHERE s.success = 1
GROUP BY s.team,
         s.game_id
"""

explode = pysqldf(q6)

In [9]:
# this query brings all of the data together, ready for analysis

q7 = """
SELECT s.team,
       s.game_id,
       s.success_rate - s2.success_rate AS suc_margin,
       e.explosiveness - e2.explosiveness AS expl_margin,
       ppo.points_per_trip_inside_forty - ppo2.points_per_trip_inside_forty AS ppo_margin,
       t.turnover_diff AS turn_margin,
       p.average_starting_field_position - p2.average_starting_field_position AS pos_margin,
       COALESCE(h.win, v.win) AS win,
       ppo.season
       
FROM success s
JOIN success s2 ON s.team != s2.team AND s.game_id = s2.game_id

JOIN ppo ON ppo.team = s.team AND ppo.game_id = s.game_id
JOIN ppo ppo2 ON ppo2.team != s.team AND ppo2.game_id = s.game_id

JOIN explode e ON e.team = s.team AND e.game_id = s.game_id
JOIN explode e2 ON e2.team != s.team AND e2.game_id = s.game_id


JOIN turnovers t ON t.team = s.team AND t.game_id = s.game_id

JOIN position p ON p.team = s.team AND p.game_id = s.game_id
JOIN position p2 ON p2.team != s.team AND p2.game_id = s.game_id


LEFT JOIN home h ON h.home_team = s.team AND h.game_id = s.game_id
LEFT JOIN visiting v ON v.visiting_team = s.team AND v.game_id = s.game_id 

WHERE COALESCE(h.win, v.win) IS NOT NULL
"""

data = pysqldf(q7)

final = data.set_index(['game_id', 'team'])

# making test and train set
train = final[final['season'] != 2015]
test = final[final['season'] == 2015]

ytrain = train['win']
xtrain = train[['suc_margin','expl_margin','ppo_margin','turn_margin','pos_margin']]
ytest = test['win']
xtest = test[['suc_margin','expl_margin','ppo_margin','turn_margin','pos_margin']]

In [10]:
# fitting a logit
model = sm.Logit(ytrain, xtrain)
reg = model.fit()
print(reg.summary())

Optimization terminated successfully.
         Current function value: 0.315555
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                    win   No. Observations:                 2662
Model:                          Logit   Df Residuals:                     2657
Method:                           MLE   Df Model:                            4
Date:                Tue, 11 Dec 2018   Pseudo R-squ.:                  0.5448
Time:                        21:24:45   Log-Likelihood:                -840.01
converged:                       True   LL-Null:                       -1845.2
                                        LLR p-value:                     0.000
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
suc_margin     17.5655      0.988     17.784      0.000      15.630      19.501
expl_margin     3.3237    

In [11]:
# getting logit marginal effects
mEff = reg.get_margeff(at = 'overall', method = 'eydx')
mEff.summary()

0,1
Dep. Variable:,win
Method:,eydx
At:,overall

Unnamed: 0,d(lny)/dx,std err,z,P>|z|,[0.025,0.975]
suc_margin,8.7828,0.494,17.784,0.0,7.815,9.751
expl_margin,1.6619,0.158,10.506,0.0,1.352,1.972
ppo_margin,0.3609,0.041,8.804,0.0,0.281,0.441
turn_margin,0.3075,0.023,13.484,0.0,0.263,0.352
pos_margin,0.0676,0.005,14.936,0.0,0.059,0.076


In [12]:
# generating predictions on test data
pred = pd.DataFrame({'probability':reg.predict(xtest)})
predictions = pred.reset_index()

In [13]:
# making a dataframe of actual wins and win probability
q8 = """
SELECT d.game_id,
       d.team,
       p.probability
FROM data d
JOIN predictions p ON p.game_id = d.game_id AND d.team = p.team
WHERE d.win = 1
"""

example2 = pysqldf(q8)

In [14]:
# making a dataframe of of each game and win probability
q9 = """
SELECT d.game_id,
       e.team AS winner,
       d.team AS loser,
       e.probability
FROM example2 e
JOIN data d ON d.team != e.team AND e.game_id = d.game_id
ORDER BY e.probability ASC
"""

e = pysqldf(q9)
e

Unnamed: 0,game_id,winner,loser,probability
0,4072,DET,CHI,0.049045
1,4070,MIN,KC,0.093439
2,4236,IND,TEN,0.093608
3,4124,TB,DAL,0.096504
4,4134,ARI,SEA,0.105862
5,4176,SF,CHI,0.111341
6,4118,SF,ATL,0.114885
7,4076,CAR,SEA,0.141776
8,3997,STL,SEA,0.154287
9,4019,DAL,PHI,0.164095


In [15]:
# distribution of win probabilities
data = [go.Histogram(
                x = example2.probability, 
                nbinsx = 30)]
layout = go.Layout(
    title = "Distribution of win probabilities for actual wins",
    yaxis = dict(title = 'Count'),
    xaxis = dict(title = 'Win Probability'),
    bargap = 0.05)

fig = go.Figure(data = data, layout = layout)

iplot(fig)

In [16]:
# comparing wins to second order wins
q10 = """
SELECT team,
       wins,
       second_order_wins,
       wins - second_order_wins AS win_diff
FROM
    (SELECT d.team,
           SUM(d.win) AS wins,
           SUM(p.probability) AS second_order_wins
    FROM final d
    JOIN predictions p ON p.game_id = d.game_id AND d.team = p.team
    GROUP BY d.team)
    
ORDER BY win_diff
"""

d = pysqldf(q10)
d

Unnamed: 0,team,wins,second_order_wins,win_diff
0,SEA,10,12.650179,-2.650179
1,NYG,6,8.569771,-2.569771
2,CHI,6,8.556152,-2.556152
3,CLE,3,5.398457,-2.398457
4,TEN,3,5.061726,-2.061726
5,JAC,5,6.92517,-1.92517
6,OAK,7,8.23817,-1.23817
7,SD,4,5.135787,-1.135787
8,DAL,4,5.018898,-1.018898
9,MIA,6,6.626741,-0.626741


In [17]:
# differences between wins and second order wins
data = [go.Histogram(
                x = d.win_diff, 
                xbins = dict(size = 0.5))]
layout = go.Layout(
    title = "Differences between wins and second order wins",
    yaxis = dict(title = 'Count of Teams'),
    xaxis = dict(title = 'Difference'),
    bargap = 0.1)

fig = go.Figure(data = data, layout = layout)

iplot(fig)

In [18]:
# 1st and 10 from every yardline
example = lookup[(lookup['down'] == 1) & (lookup['distance'] == 10)]
example.head()

Unnamed: 0,down,distance,yardline,exp_points,count(*)
124,1,10,1,0.294643,224
125,1,10,2,0.94,200
126,1,10,3,0.602941,204
127,1,10,4,0.47032,219
128,1,10,5,0.929688,256


In [19]:
# scatter of expected points 1st and 10
trace = go.Scatter(x = example.yardline, 
                   y = example.exp_points, 
                   mode = 'lines+markers',
                   name = 'Expected Points')

layout = go.Layout(
    title = "Expected Points for 1st and 10",
    yaxis = dict(title = 'Exp Points'),
    xaxis = dict(title = 'Yards from own goal'))

data = [trace]

fig = go.Figure(data = data, layout = layout)

iplot(fig)

In [20]:
f = e[e['winner'] == 'ARI']
f

Unnamed: 0,game_id,winner,loser,probability
4,4134,ARI,SEA,0.105862
19,4098,ARI,CLE,0.302908
22,4148,ARI,CIN,0.336742
25,4162,ARI,SF,0.375654
96,4251,ARI,GB,0.756216
98,4182,ARI,MIN,0.770949
135,3998,ARI,NO,0.887396
139,4094,ARI,BAL,0.893838
162,4227,ARI,GB,0.936898
198,4011,ARI,CHI,0.970308


In [21]:
g = e[e['loser'] == 'SEA']
g

Unnamed: 0,game_id,winner,loser,probability
4,4134,ARI,SEA,0.105862
7,4076,CAR,SEA,0.141776
8,3997,STL,SEA,0.154287
33,4055,CIN,SEA,0.409527
39,4226,STL,SEA,0.462464
42,4252,CAR,SEA,0.469103
131,4020,GB,SEA,0.88469
