In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load one play-by-play file per season (2010 through 2020) and combine them
# with a single concat. Concatenating inside the loop would re-copy every
# previously loaded row on each pass.
season_frames = []
for year in range(2010, 2021):
    i_df = pd.read_csv(f'pbp_data/play_by_play_{year}.csv.gz', compression='gzip', low_memory=False)
    season_frames.append(i_df)
# The per-file row index is kept as-is (not reset), so index labels repeat
# across seasons.
pbp = pd.concat(season_frames)

In [6]:
# Key each play by drive: the string forms of game_id and fixed_drive joined
# with an underscore.
pbp['drive_id'] = (
    pbp['game_id'].astype(str)
    + '_'
    + pbp['fixed_drive'].astype(str)
)

In [10]:
# Pair each play's penalty type with its penalty yardage as a tuple. zip walks
# the two columns directly, which avoids building a row Series and calling a
# lambda for every play as DataFrame.apply(axis=1) does.
pbp['penalty_type_and_yards'] = list(zip(pbp['penalty_type'], pbp['penalty_yards']))

In [59]:
# Penalty-type labels treated as defensive penalties in this notebook. Each
# label is searched for in pbp.penalty_type (case-sensitive substring match)
# and also becomes the name of a 0/1 flag column on `drives`.
# NOTE(review): presumably not an exhaustive list of defensive fouls - confirm
# against the distinct values of penalty_type.
def_penalties = [
    'Defensive Pass Interference',
    'Defensive Holding',
    'Defensive Offside',
    'Neutral Zone Infraction',
    'Roughing the Passer',
    'Encroachment',
    'Horse Collar Tackle',
    'Defensive Too Many Men on Field'
]

In [60]:
def get_first_fn(series):
    """Return the element at position 0 of ``series``.

    The lookup is positional (``iloc``), so the index labels do not matter.
    Raises ``IndexError`` if ``series`` is empty.
    """
    first_value = series.iloc[0]
    return first_value

def contains_fn(series, looking_for, regex=False):
    """Return 1 if any entry of ``series`` contains ``looking_for``, else 0.

    Parameters
    ----------
    series : pandas.Series of strings
        Values to search; missing entries are counted as non-matches.
    looking_for : str
        Text to search for (case-sensitive substring match).
    regex : bool, default False
        When False, ``looking_for`` is matched literally, so characters such
        as ``(`` or ``.`` in a name are not interpreted as regex syntax.
        Pass True to treat ``looking_for`` as a regular expression.

    Returns
    -------
    int
        1 when at least one entry matches, otherwise 0.
    """
    matches = series.str.contains(looking_for, regex=regex, na=False)
    return int(matches.any())

# One row per drive_id, carrying the fixed_drive_result of the drive's first
# play (by position within the group).
drives = pbp.groupby('drive_id').agg({'fixed_drive_result': get_first_fn})
# 1 when the drive result is exactly 'Touchdown', otherwise 0.
drives['touchdown'] = (drives['fixed_drive_result'] == 'Touchdown').astype('int64')

In [61]:
# Flag, for every drive, whether each defensive penalty type appears on at
# least one of its plays. The substring match runs once per penalty over the
# whole penalty_type column and is then reduced per drive, instead of invoking
# a Python callback for every (drive, penalty) pair.
drive_keys = pbp['drive_id'].to_numpy()
for penalty in def_penalties:
    # Literal, case-sensitive match; plays with no penalty_type count as False.
    play_has_penalty = pbp['penalty_type'].str.contains(penalty, regex=False, na=False)
    # Grouping by the raw key array is positional; the result is indexed by
    # drive_id and therefore aligns with `drives` on assignment.
    drives[penalty] = play_has_penalty.groupby(drive_keys).any().astype('int64')

In [35]:
# Show the per-drive table (the rendered output is truncated to the first and
# last rows).
# NOTE(review): the saved output of this cell carries execution count 35, which
# is older than the cells above it, and its header reads "Roughing The Passer"
# while def_penalties uses "Roughing the Passer" - re-run to refresh it.
drives

Unnamed: 0_level_0,fixed_drive_result,touchdown,Defensive Pass Interference,Defensive Holding,Defensive Offside,Neutral Zone Infraction,Roughing The Passer,Encroachment,Horse Collar Tackle,Defensive Too Many Men on Field
drive_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010_01_ARI_STL_1,Turnover,0,0,0,0,0,0,0,0,0
2010_01_ARI_STL_10,Field goal,0,0,0,0,0,0,1,0,0
2010_01_ARI_STL_11,Touchdown,1,0,0,0,0,0,0,0,0
2010_01_ARI_STL_12,Punt,0,0,0,0,0,0,0,0,0
2010_01_ARI_STL_13,Punt,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
2020_01_TEN_DEN_5,Punt,0,0,0,0,0,0,0,0,0
2020_01_TEN_DEN_6,Turnover,0,0,0,0,0,0,0,0,0
2020_01_TEN_DEN_7,Touchdown,1,0,0,0,1,0,0,0,0
2020_01_TEN_DEN_8,Turnover on downs,0,0,0,0,0,0,0,0,0


In [62]:
def def_penalty_fn(row):
    """Return 1 if ``row`` flags at least one penalty in ``def_penalties``, else 0.

    ``row`` is indexed by penalty name (for example one row of ``drives``);
    the flags listed in the module-level ``def_penalties`` are summed and the
    result is 1 when that sum is positive.
    """
    return int(sum(row[penalty] for penalty in def_penalties) > 0)

# Same rule as def_penalty_fn, evaluated column-wise for all drives at once
# rather than through a per-row apply. skipna=False keeps a missing flag from
# being ignored, matching the plain Python sum in def_penalty_fn.
drives['defensive_penalty'] = (
    drives[def_penalties].sum(axis=1, skipna=False) > 0
).astype('int64')

In [63]:
# Headline counts. Touchdown drives are counted by summing the 0/1 `touchdown`
# flag: this gives 0 when a subset has no touchdown drives, whereas
# value_counts()[1] raises KeyError when the label 1 is absent.
td_drives = drives['touchdown'].sum()
total_drives = len(drives)

# Drives with at least one flagged defensive penalty.
td_drives_penalty = drives.loc[drives['defensive_penalty'] == 1, 'touchdown'].sum()
total_drives_penalty = drives['defensive_penalty'].sum()

# Drives with a defensive pass interference flag.
td_drives_dpi = drives.loc[drives['Defensive Pass Interference'] == 1, 'touchdown'].sum()
total_drives_dpi = drives['Defensive Pass Interference'].sum()

def print_division_fn(description, num, den):
    """Print ``num/den`` as a labelled percentage with one decimal place.

    The printed line has the form ``"<description>: <num>/<den> = <pct>%"``.
    ``den`` must be non-zero.
    """
    percentage = (num / den) * 100
    message = f"{description}: {num}/{den} = {percentage:.1f}%"
    print(message)

# Print every headline rate in a fixed order: (label, numerator, denominator).
summary_rows = (
    ("% of drives ending with TD", td_drives, total_drives),
    ("% of drives with defensive penalties", total_drives_penalty, total_drives),
    ("% of drives with DPI", total_drives_dpi, total_drives),
    ("% of drives w defensive penalties ending in TD", td_drives_penalty, total_drives_penalty),
    ("% of drives w DPI ending in TD", td_drives_dpi, total_drives_dpi),
)
for label, numerator, denominator in summary_rows:
    print_division_fn(label, numerator, denominator)

% of drives ending with TD: 12730/63415 = 20.1%
% of drives with defensive penalties: 8204/63415 = 12.9%
% of drives with DPI: 2378/63415 = 3.7%
% of drives w defensive penalties ending in TD: 3124/8204 = 38.1%
% of drives w DPI ending in TD: 1133/2378 = 47.6%


In [69]:
# For each defensive penalty, tally (touchdown drives, all drives) among the
# drives flagged with that penalty. Summing the 0/1 `touchdown` flag yields 0
# when no flagged drive scored, so no exception handling is required; a
# value_counts()[1] lookup raises KeyError (not IndexError) in that case.
penalty_td_rates = {}
for penalty in def_penalties:
    flagged = drives[penalty] == 1
    penalty_td_rates[penalty] = (
        drives.loc[flagged, 'touchdown'].sum(),
        drives[penalty].sum(),
    )

# Rows = penalties, columns = the two tallies plus the touchdown rate.
penalty_td_df = pd.DataFrame(penalty_td_rates).transpose()
penalty_td_df.columns = ['td_drives', 'total_drives']
penalty_td_df['td_rate'] = penalty_td_df['td_drives'] / penalty_td_df['total_drives']
# Last expression of the cell: displayed sorted by touchdown rate, highest first.
penalty_td_df.sort_values('td_rate', ascending=False)

Unnamed: 0,td_drives,total_drives,td_rate
Defensive Too Many Men on Field,36,73,0.493151
Defensive Pass Interference,1133,2378,0.476451
Roughing the Passer,402,1017,0.39528
Defensive Holding,718,1889,0.380095
Encroachment,205,545,0.376147
Horse Collar Tackle,80,215,0.372093
Defensive Offside,566,1621,0.349167
Neutral Zone Infraction,398,1271,0.313139
