In [44]:
import pandas as pd
import numpy as np

features = ["posteam", "posteam_home", "posteam_away", "defteam", "score_differential", "yardline_100", "half_seconds_remaining", "qtr",
            "down", "ydstogo", "normal_first_down", "first_and_short", "first_and_long", "second_and_short", "second_and_medium", "second_and_long",
            "third_and_short", "third_and_medium", "third_and_long", "in_redzone", "is_two_minute_drill", "is_goal_to_go", "curr_drive_length"] 

output_columns = ["play_type", "play_length"]

def convert_drive_start_to_raw_yards(drive_start_label: str, posteam: str) -> int:
    if (drive_start_label == "50"):
        return 50
    drive_start_label_split = drive_start_label.split(" ")
    side_of_field = drive_start_label_split[0].strip()
    relative_yardline = int(drive_start_label_split[1])
    if (posteam == side_of_field):
        return int(100 - int(relative_yardline))
    else:
        return int(relative_yardline)

In [45]:
df = pd.read_csv("raw_pbp_data/2024_NFL.csv", low_memory=False)
df = df[df['season_type'] == 'REG']
df = df[df['down'].isin([1,2,3])]
df = df[df['play_type'].isin(['pass', 'run'])] 

df[["posteam", "drive_start_yard_line_100"]] = df[["posteam", "drive_start_yard_line"]].apply(lambda row: convert_drive_start_to_raw_yards(row["drive_start_yard_line"], row["posteam"]), axis=1, result_type='broadcast')

df["curr_drive_length"] = df["drive_start_yard_line_100"] - df["yardline_100"]

df['posteam_home'] = (df['posteam'] == df['home_team']).astype(int)
df['posteam_away'] = (df['posteam'] != df['home_team']).astype(int)

df['normal_first_down'] = ((df['down'] == 1) & (df['ydstogo'] <= 10) & (df['ydstogo'] >= 7)).astype(int)
df['first_and_short'] = ((df['down'] == 1) & (df['ydstogo'] < 5)).astype(int)
df['first_and_long'] = ((df['down'] == 1) & (df['ydstogo'] > 10)).astype(int)
df['second_and_short'] = ((df['down'] == 2) & (df['ydstogo'] <= 3)).astype(int)
df['second_and_medium'] = ((df['down'] == 2) & (df['ydstogo'] > 3) & (df['ydstogo'] <= 7)).astype(int)
df['second_and_long'] = ((df['down'] == 2) & (df['ydstogo'] > 7)).astype(int)
df['third_and_short'] = ((df['down'] == 3) & (df['ydstogo'] <= 3)).astype(int)
df['third_and_medium'] = ((df['down'] == 3) & (df['ydstogo'] > 3) & (df['ydstogo'] <= 7)).astype(int)
df['third_and_long'] = ((df['down'] == 3) & (df['ydstogo'] > 7)).astype(int)
df['is_two_minute_drill'] = ((df['half_seconds_remaining'] <= 120) & 
                                 ((df['qtr'] == 2) | (df['qtr'] == 4))).astype(int)
df['in_redzone'] = (df['yardline_100'] <= 20).astype(int)
df['is_goal_to_go'] = ((df['in_redzone'] == 1) & (df['ydstogo'] == df['yardline_100'])).astype(int)

df['play_length'] = np.where(
        df['play_type'].isin(['pass']), df['air_yards'], 
        df['yards_gained']
    )

df.head()

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,second_and_short,second_and_medium,second_and_long,third_and_short,third_and_medium,third_and_long,is_two_minute_drill,in_redzone,is_goal_to_go,play_length
2,61,2024_01_ARI_BUF,2024090801,BUF,ARI,REG,1,70,away,BUF,...,0,0,0,0,0,0,0,0,0,3.0
3,83,2024_01_ARI_BUF,2024090801,BUF,ARI,REG,1,70,away,BUF,...,0,1,0,0,0,0,0,0,0,-3.0
4,108,2024_01_ARI_BUF,2024090801,BUF,ARI,REG,1,70,away,BUF,...,0,0,0,0,0,0,0,0,0,2.0
5,133,2024_01_ARI_BUF,2024090801,BUF,ARI,REG,1,70,away,BUF,...,1,0,0,0,0,0,0,0,0,2.0
6,155,2024_01_ARI_BUF,2024090801,BUF,ARI,REG,1,70,away,BUF,...,0,0,0,0,0,0,0,0,0,2.0


In [46]:
training_df = df[features]
training_df.head()

Unnamed: 0,posteam,posteam_home,posteam_away,defteam,score_differential,yardline_100,half_seconds_remaining,qtr,down,ydstogo,...,second_and_short,second_and_medium,second_and_long,third_and_short,third_and_medium,third_and_long,in_redzone,is_two_minute_drill,is_goal_to_go,curr_drive_length
2,70,0,1,BUF,0.0,70.0,1800.0,1,1.0,10,...,0,0,0,0,0,0,0,0,0,0.0
3,70,0,1,BUF,0.0,67.0,1767.0,1,2.0,7,...,0,1,0,0,0,0,0,0,0,3.0
4,70,0,1,BUF,0.0,45.0,1723.0,1,1.0,10,...,0,0,0,0,0,0,0,0,0,25.0
5,70,0,1,BUF,0.0,36.0,1682.0,1,2.0,1,...,1,0,0,0,0,0,0,0,0,34.0
6,70,0,1,BUF,0.0,34.0,1646.0,1,1.0,10,...,0,0,0,0,0,0,0,0,0,36.0
