<a href="https://colab.research.google.com/github/microprediction/firstdown/blob/main/breakdown_by_play_call.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
# Play-by-play source file (seasons 2009-2013 according to the file name).
PBP_CSV_URL = 'https://raw.githubusercontent.com/microprediction/nflMarkov/master/inputData/pbp_nfldb_2009_2013.csv'

# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.set_option('mode.chained_assignment', None)

# One row per play; later cells add derived columns to this frame.
df = pd.read_csv(PBP_CSV_URL)

## Do you want 2nd and 1-yard or 1st and 10-yards? 
Thanks to Ben Dilday for compiling the down data.

This notebook examines 2nd-down-and-short situations and asks whether 2nd-and-1 is superior to 1st-and-10 one yard further downfield.

### Step 1: We add the final field position achieved in the drive and a few other things... 

In [2]:
# Annotate every play with drive-level and series-level context.
#
# A "drive" is a maximal run of consecutive rows with the same `posteam`.
# `down_index` labels a series of downs: it is bumped *before* labelling the
# first play after a change of possession, and bumped *after* labelling a
# 1st-down play inside a drive. A 1st-down play earned mid-drive therefore
# shares its label with the 2nd/3rd/4th-down plays that led up to it, so the
# group maximum of `yfog` over a label includes the spot where the next
# series started.

# Running state. Everything is (re)initialised here so that re-running the
# cell does not pick up values left over from a previous execution.
drive_start = 0
drive_index = 0
down_index = 0
running_yfog = 0
running_first_yfog = 0   # was never initialised: NameError if the first row is not a 1st down
prev_team = None

# Per-row accumulators, turned into columns after the loop.
is_firsts = list()
running_yfogs = list()         # copy of yfog, kept as a sanity check
running_first_yfogs = list()   # yfog at the most recent 1st-down play seen so far
drive_indexes = list()
drive_starts = list()
down_indexes = list()

for yfog, dwn, posteam in zip(df['yfog'].values, df['dwn'].values, df['posteam'].values):

    is_first_play_of_drive = (prev_team != posteam)
    is_firsts.append(dwn == 1)

    if is_first_play_of_drive:
        drive_start = yfog
        drive_index += 1

    running_yfog = yfog
    if dwn == 1:
        # NOTE(review): this value is not reset on a change of possession, so a
        # drive whose first recorded play is not a 1st down inherits the
        # previous drive's value -- confirm whether that can occur in the data.
        running_first_yfog = yfog

    running_first_yfogs.append(running_first_yfog)
    running_yfogs.append(running_yfog)
    drive_indexes.append(drive_index)
    drive_starts.append(drive_start)

    if is_first_play_of_drive:
        # New possession: open a new series, then label the play with it.
        down_index += 1
        down_indexes.append(down_index)
    else:
        # Same possession: label with the current series; a 1st-down play
        # closes that series, so the *next* play starts a new one.
        down_indexes.append(down_index)
        if dwn == 1:
            down_index += 1

    prev_team = posteam

df['is_first'] = is_firsts
df['running_yfog'] = running_yfogs
df['running_first_yfog'] = running_first_yfogs
df['drive_index'] = drive_indexes
df['drive_start'] = drive_starts
df['down_index'] = down_indexes

# Furthest field position recorded at any snap of the drive.
# The string alias 'max' replaces the Python builtin `max`, which newer pandas
# deprecates as an argument to transform(); the result is the same.
df['eventual_yfog'] = df.groupby('drive_index')['running_yfog'].transform('max')

# Furthest field position within the series. Rows with dwn > 1 already sit in
# the group that contains the following 1st-down snap. A 1st-down row sits in
# the *previous* group, so it borrows the value of the row that follows it.
# NOTE(review): for the last 1st-down play of a drive the following row belongs
# to the other team's drive (e.g. a negative eventual_down_gain can result);
# confirm whether such rows should be excluded downstream.
series_best_yfog = df.groupby('down_index')['running_yfog'].transform('max')
df['eventual_down_yfog'] = series_best_yfog.where(df['dwn'] > 1, series_best_yfog.shift(-1))

# Furthest 1st-down spot recorded during the drive.
df['eventual_first_yfog'] = df.groupby('drive_index')['running_first_yfog'].transform('max')

df['eventual_drive'] = df['eventual_yfog'] - df['drive_start']
df['eventual_gain'] = df['eventual_yfog'] - df['yfog']
df['eventual_down_gain'] = df['eventual_down_yfog'] - df['yfog']
df['will_get_first_down'] = df['eventual_down_gain'] >= df['ytg']

assert (df['running_yfog'] == df['yfog']).all(), "running_yfog should mirror yfog row for row"




In [26]:
# Show ten consecutive plays (positions 170-179) to eyeball the derived columns.
df.iloc[170:180]

Unnamed: 0,game_id,playername,dwn,ytg,yfog,type,posteam,yds,seas,away_team,home_team,is_first,running_yfog,running_first_yfog,drive_index,drive_start,down_index,eventual_yfog,eventual_down_yfog,eventual_first_yfog,eventual_drive,eventual_gain,eventual_down_gain,will_get_first_down
170,2009091300,MichaelTurner,2,2,32,RUSH,ATL,1,2009,MIA,ATL,False,32,24,32,24,78,33,33.0,24,9,1,1.0,False
171,2009091300,MattRyan,3,1,33,PASS,ATL,0,2009,MIA,ATL,False,33,24,32,24,78,33,33.0,24,9,0,0.0,False
172,2009091300,RickyWilliams,1,10,36,RUSH,MIA,7,2009,MIA,ATL,True,36,36,33,36,79,84,58.0,84,48,48,22.0,True
173,2009091300,PatWhite,2,3,43,PASS,MIA,0,2009,MIA,ATL,False,43,36,33,36,79,84,58.0,84,48,41,15.0,True
174,2009091300,ChadPennington,3,3,43,PASS,MIA,15,2009,MIA,ATL,False,43,36,33,36,79,84,58.0,84,48,41,15.0,True
175,2009091300,ChadPennington,1,10,58,PASS,MIA,0,2009,MIA,ATL,True,58,58,33,36,79,84,84.0,84,48,26,26.0,True
176,2009091300,ChadPennington,2,5,63,PASS,MIA,21,2009,MIA,ATL,False,63,58,33,36,80,84,84.0,84,48,21,21.0,True
177,2009091300,ChadPennington,1,10,84,PASS,MIA,0,2009,MIA,ATL,True,84,84,33,36,80,84,82.0,84,48,0,-2.0,False
178,2009091300,MattRyan,1,10,69,PASS,ATL,11,2009,MIA,ATL,True,69,69,34,69,82,82,82.0,69,13,13,13.0,True
179,2009091300,MattRyan,2,18,72,PASS,ATL,10,2009,MIA,ATL,False,72,69,34,69,82,82,82.0,69,13,10,10.0,False


## Empirical analysis of stopping shy strategies



We compare stopping shy with 1, 2 or 3 yards to go against making the first down with 0, 1, 2 or 3 extra yards. 

We report the implied value of possession, measured in yards: the value of possession at which stopping shy and taking the first down are equally attractive (a wash). 

In [None]:
# Parameters for a single stopping-shy scenario.
# NOTE(review): these three constants are not referenced by the sweep at the end
# of this cell, which enumerates every combination itself; presumably they are
# meant for one-off calls to strategy_implied_yards_per_possession -- confirm.
FIRST_PLAY = 'PASS'
SECOND_PLAY = 'PASS' # 'RUSH' or 'PASS'; presumably the call after FIRST_PLAY (the function below applies second_play to 3rd down) -- confirm
# Parameter for alternative scenario where we reach first down 
EXTRA_YARDS = 0      # 0 means we get 10 yards on 1st down to complete the first down 
                     # 1 means we get 11 yards on 1st down to complete the first down one yard further down the field

 

def strategy_implied_yards_per_possession(first_play:str, second_play:str, yards_to_go_on_second:int, extra_yards:int, data=None):
    """
    Implied value of possession, in yards, at which stopping shy of the first
    down and taking the first down are equally attractive.

    The stopping-shy strategy is simulated from empirical outcome frequencies:
    a 2nd-down play with `yards_to_go_on_second` to go and, when that play
    falls short, a 3rd-down play with the remaining distance. Only plays
    snapped strictly between yfog 20 and yfog 70 are used.

        first_play:            'RUSH' or 'PASS', play type used on 2nd down
        second_play:           'RUSH' or 'PASS', play type used on 3rd down
        yards_to_go_on_second: distance to the marker on 2nd down
        extra_yards:           yards beyond the marker gained in the
                               alternative "take the first down" scenario
        data:                  play-by-play frame with columns 'dwn', 'ytg',
                               'type', 'yfog' and 'yds'; defaults to the
                               notebook-level `df`

    Returns the implied possession value (a float).

    Raises AssertionError when the simulated outcome probabilities do not sum
    to one, which happens when there is no 2nd-down data or when some required
    3rd-down situation has no observations. Raises ZeroDivisionError when the
    simulated conversion probability is exactly 0 or 1.
    """
    from collections import Counter

    plays = df if data is None else data
    in_field_window = (plays['yfog'] < 70) & (plays['yfog'] > 20)

    def _yardage_counts(down, to_go, play_type):
        # Frequency of each 'yds' value for the given down / distance / play type.
        chosen = (plays['dwn'] == down) & (plays['ytg'] == to_go) & (plays['type'] == play_type) & in_field_window
        return Counter(plays.loc[chosen, 'yds'].values)

    second_down_outcomes = _yardage_counts(2, yards_to_go_on_second, first_play)
    denom2 = 1.0 * sum(second_down_outcomes.values())

    probs = list()
    yards = list()      # yards relative to the first-down marker
    completed = list()  # 1.0 if the first down was made, else 0.0
    unobserved_third_downs = list()

    for yd2, ct2 in second_down_outcomes.items():
        p2 = (1.0 * ct2) / denom2
        if yd2 < yards_to_go_on_second:
            # Short on 2nd down: play 3rd down with whatever distance is left.
            ytg3 = yards_to_go_on_second - yd2
            third_down_outcomes = _yardage_counts(3, ytg3, second_play)
            denom3 = 1.0 * sum(third_down_outcomes.values())
            if not third_down_outcomes:
                # No data: this branch's probability mass is lost and the
                # normalisation check below will report it.
                unobserved_third_downs.append(ytg3)
            for yd3, ct3 in third_down_outcomes.items():
                p3 = ct3 / denom3
                probs.append(p2 * p3)
                completed.append(1.0 if yd3 >= ytg3 else 0.0)
                yards.append(yd3 + yd2 - yards_to_go_on_second)
        else:
            probs.append(p2)
            yards.append(yd2 - yards_to_go_on_second)
            completed.append(1.0)

    sum_prob = sum(probs)
    assert abs(sum_prob - 1) < 0.0001, (
        "outcome probabilities sum to %r instead of 1; 3rd-down distances with no data: %r"
        % (sum_prob, unobserved_third_downs))

    completion_prob = sum([w * c for w, c in zip(probs, completed)])
    cond_mean_yards = sum([w * y for w, y, c in zip(probs, yards, completed) if c]) / completion_prob
    cond_mean_loss = sum([w * y for w, y, c in zip(probs, yards, completed) if c < 0.5]) / (1 - completion_prob)

    def implied_poss_yards(p, cy, cl, x, t=2/160.):
        """ Implied value of possession in yards.

            Both turnovers and touchdowns are ambiguous. We are conservative, and
            assume that with probability t a turnover occurs.

            Assuming no turnover, we use the empirical yardages in the database.
            We complete the first down with probability p and conditional
            *relative* gain cy-x.

            Set
                   rel gain      -   rel loss           - turnover = 0
                (1-t)*p*(cy-x)   - (1-t)*(1-p)*(cl+v+x) - t*T      = 0
             =>   (1-t)*(1-p)*(cl+v+x) = (1-t)*p*(cy-x) - t*T
             =>   v = p/(1-p)*(cy-x) - t/((1-t)*(1-p))*T - x - cl

            where
                v  = value of possession in yards
                p  = first down completion probability
                t  = probability of a turnover
                cy = mean yards relative to the "10-yard 1st down", given completion
                cl = mean yards relative to the "10-yard 1st down", given no completion
                x  = extra yards when making first down
                T  = yardage value of a turnover, assumed 80 yards

            NOTE(review): the derivation treats cl as yards *lost*, but the
            caller passes a signed mean that is non-positive when the offence
            ends short of the marker. Confirm the intended sign convention.
        """
        T = 80
        v = (p / (1 - p)) * (cy - x) - t / ((1 - t) * (1 - p)) * T - x - cl
        return v

    return implied_poss_yards(p=completion_prob, cy=cond_mean_yards, cl=cond_mean_loss, x=extra_yards)
      
from pprint import pprint

# Evaluate every (extra yards, yards to go on 2nd, 2nd-down call, 3rd-down call)
# combination; each entry ends with the implied possession value.
strategies = [
    (extra, to_go, call_on_second, call_on_third,
     strategy_implied_yards_per_possession(call_on_second, call_on_third, to_go, extra))
    for extra in [0, 1, 2, 3]
    for to_go in [1, 2, 3]
    for call_on_second in ['RUSH', 'PASS']
    for call_on_third in ['RUSH', 'PASS']
]
pprint(strategies)

In [None]:
# Print viable alternatives to first down 

In [36]:
# Emit a LaTeX table of the strategies whose implied possession value exceeds
# 30 yards and that involve at least one extra yard.
LATEX_TABLE_HEAD = """ \\begin{table}
        \\begin{centering}
        \\begin{tabular}{|c|c|c|c|c|}
        \\hline
          Extra yards & Yards to go & On second & On third & Implied \\\\
          \\hline """

LATEX_TABLE_TAIL = """    \\hline 
      \\end{tabular}


      \\caption{A team given the opportunity to gain a first down and extra yards in the
              process might nonetheless choose to stop shy with $1$, $2$, or even $3$ yards to go. 
              The decision is expressed in terms of an implied belief in the value of a possession, measured in yards. 
              For example if possession is believed to be worth less than $45$ yards, then a team might ``decline'' a $12$-yard 
              first down completion by stopping one yard shy instead, assuming they plan to pass on second down and rush on third}
      \\label{tab:viable}
      \\end{centering}
      \\end{table}"""

print(LATEX_TABLE_HEAD)

for extra, to_go, call_on_second, call_on_third, implied in strategies:
    if (implied > 30) and (extra > 0):
        cells = [str(extra), str(to_go), call_on_second.lower(), call_on_third.lower(), str(round(implied))]
        print(' & '.join(cells) + ' \\\\ ')

print(LATEX_TABLE_TAIL)

 \begin{table}
        \begin{centering}
        \begin{tabular}{|c|c|c|c|c|}
        \hline
          Extra yards & Yards to go & On second & On third & Implied \\
          \hline 
1 & 1 & rush & rush & 43 \\ 
1 & 1 & rush & pass & 43 \\ 
1 & 1 & pass & rush & 44 \\ 
1 & 1 & pass & pass & 43 \\ 
1 & 2 & pass & rush & 39 \\ 
1 & 2 & pass & pass & 37 \\ 
1 & 3 & pass & pass & 32 \\ 
2 & 1 & rush & pass & 31 \\ 
2 & 1 & pass & rush & 36 \\ 
2 & 1 & pass & pass & 37 \\ 
2 & 2 & pass & rush & 32 \\ 
2 & 2 & pass & pass & 31 \\ 
3 & 1 & pass & pass & 31 \\ 
    \hline 
      \end{tabular}


      \caption{A team given the opportunity to gain a first down and extra yards in the
              process might nonetheless choose to stop shy with $1$, $2$, or even $3$ yards to go. 
              The decision is expressed in terms of an implied belief in the value of a possession, measured in yards. 
              For example if possession is believed to be worth less than $45$ yards, then a team 