# Initial Feature Extraction

This notebook extracts features from the different plays and analyzes the correlation between the different plays. 
The following functions are developed during this process:
- **extract_play_outcome_features**: This function extracts information regarding the outcome of each play: pocket status, touchdown, pass completion, etc.
- **extract_formation_features**: This function analyzes the initial formation, describing the position of the different players
- **extract_foul_features**: This function extracts foul-related information
- **extract_injury_features**: This function extracts injury-related information

These functions will be stored inside the folder __feature_extraction__ for future use.

Furthermore, an initial data visualization is performed to understand the correlation between variables.

# 0. Load libraries and constants

In [1]:
import os
import re
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append('../')
from definitions.fouls_def import ignore_fouls

In [2]:
# Define input parameters
# Relative path of the directory that holds the CSV files read by this notebook
# (it is joined with 'plays.csv' and 'games_enhanced.csv' further down).
input_path = '../input'


# 1. Feature Extraction

In [4]:
# Load the plays table from <input_path>/plays.csv and preview its first five rows
plays_data = pd.read_csv(os.path.join(input_path, 'plays.csv'))
plays_data.head(5)

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType
0,2021090900,97,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,...,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man
1,2021090900,137,(13:18) (Shotgun) D.Prescott pass deep left to...,1,1,10,DAL,TB,DAL,2,...,,108.0,EMPTY,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 4 LB, 3 DB",TRADITIONAL,0,Cover-3,Zone
2,2021090900,187,(12:23) (Shotgun) D.Prescott pass short middle...,1,2,6,DAL,TB,DAL,34,...,,76.0,SHOTGUN,"0 RB, 2 TE, 3 WR",6.0,"3 DL, 3 LB, 5 DB",TRADITIONAL,0,Cover-3,Zone
3,2021090900,282,(9:56) D.Prescott pass incomplete deep left to...,1,1,10,DAL,TB,TB,39,...,,49.0,SINGLEBACK,"1 RB, 2 TE, 2 WR",6.0,"4 DL, 3 LB, 4 DB",TRADITIONAL,1,Cover-3,Zone
4,2021090900,349,(9:46) (Shotgun) D.Prescott pass incomplete sh...,1,3,15,DAL,TB,TB,44,...,,54.0,SHOTGUN,"1 RB, 1 TE, 3 WR",7.0,"3 DL, 4 LB, 4 DB",TRADITIONAL,0,Cover-3,Zone


In [5]:
# List the columns available in the plays table
plays_data.columns

Index(['gameId', 'playId', 'playDescription', 'quarter', 'down', 'yardsToGo',
       'possessionTeam', 'defensiveTeam', 'yardlineSide', 'yardlineNumber',
       'gameClock', 'preSnapHomeScore', 'preSnapVisitorScore', 'passResult',
       'penaltyYards', 'prePenaltyPlayResult', 'playResult', 'foulName1',
       'foulNFLId1', 'foulName2', 'foulNFLId2', 'foulName3', 'foulNFLId3',
       'absoluteYardlineNumber', 'offenseFormation', 'personnelO',
       'defendersInBox', 'personnelD', 'dropBackType', 'pff_playAction',
       'pff_passCoverage', 'pff_passCoverageType'],
      dtype='object')

In [6]:
def extract_play_outcome_features(plays_data):
    """Build one row of outcome features per play.

    Parameters
    ----------
    plays_data : pd.DataFrame
        Plays table. The columns ``gameId``, ``playId``, ``passResult``,
        ``playResult`` and ``playDescription`` are read.

    Returns
    -------
    pd.DataFrame
        Frame with the columns ``gameId``, ``playId``, ``has_pocket_hold``,
        ``was_qb_sacked``, ``achieved_pos_yards``, ``gained_yards`` and
        ``touchdown``, in that order, indexed like ``plays_data``.
    """
    pass_result = plays_data.passResult

    # Lower-case the description once; astype(str) plus na=False make a missing
    # description count as "no touchdown" instead of raising on ``.lower()``.
    description = plays_data.playDescription.astype(str).str.lower()

    return pd.DataFrame({
        # Play identifiers
        'gameId': plays_data.gameId,
        'playId': plays_data.playId,
        # has_pocket_hold - was the QB able to pass from inside the pocket?
        # This includes C, I and IN, as in all of these plays the QB gets the pass off.
        'has_pocket_hold': pass_result.isin(['C', 'I', 'IN']),
        # was_qb_sacked - passResult is 'S'
        'was_qb_sacked': pass_result.isin(['S']),
        # achieved_pos_yards - binary flag, True when playResult is strictly positive
        'achieved_pos_yards': plays_data.playResult.gt(0),
        # gained_yards - playResult copied as-is
        'gained_yards': plays_data.playResult,
        # touchdown - the play description mentions "touchdown" (case-insensitive)
        'touchdown': description.str.contains('touchdown', regex=False, na=False),
    })

In [7]:
def extract_number_players_regexp(x, pattern):
    """Return the leading digit of the first ``pattern`` match in ``x``.

    Parameters
    ----------
    x : str
        Text to search (e.g. a personnel string such as "1 RB, 1 TE, 3 WR").
    pattern : str
        Regular expression whose match is expected to start with a digit.

    Returns
    -------
    int
        The first character of the matched text converted to ``int``,
        or 0 when the pattern does not occur in ``x``.
    """
    found = re.search(pattern, x)

    # Only the first character of the matched text is interpreted as the count
    return int(found.group(0)[0]) if found is not None else 0

def extract_formation_features(plays_data):
    """Describe the offensive formation of each play.

    Parameters
    ----------
    plays_data : pd.DataFrame
        Plays table. The columns ``gameId``, ``playId``, ``personnelO`` and
        ``offenseFormation`` are read.

    Returns
    -------
    pd.DataFrame
        Frame with the columns ``gameId``, ``playId``, ``num_rb``, ``num_wr``,
        ``num_te``, ``num_ol`` and ``formation``.
    """
    play_formation = pd.DataFrame()

    # Play Identifiers
    play_formation['gameId'] = plays_data.gameId
    play_formation['playId'] = plays_data.playId

    # Set of features describing the formation
    # num_XX - # of players on the position, parsed from the personnelO text
    # (e.g. "1 RB, 1 TE, 3 WR"). A position that is not listed counts as 0.
    # Positions (standard football abbreviations):
    # RB -> Running Back
    # WR -> Wide Receiver
    # TE -> Tight End
    # OL -> Offensive Lineman
    # QB -> Quarterback - Ignored for now, this is not relevant
    # Cast once so missing personnel strings become text that simply never matches.
    personnel = plays_data.personnelO.astype(str)
    for position in ('rb', 'wr', 'te', 'ol'):
        # Raw string: '\d' inside a normal string literal is an invalid escape sequence
        play_formation[f'num_{position}'] = personnel.apply(
            extract_number_players_regexp, pattern=rf'\d {position.upper()}'
        )
    # Note - Would we obtain the same result if we check the players position?

    # Extract the offense formation used
    play_formation['formation'] = plays_data.offenseFormation

    return play_formation

In [8]:
def determine_foul_side(x, ignored_fouls=None):
    """Split the penalties called on a play between offense and defense.

    The play description is scanned for "penalty on <TEAM>-<I>.<Name>,"
    fragments. The n-th fragment found is paired with the ``foulName<n>``
    field of the play; fouls whose name is in ``ignored_fouls`` are skipped.

    Parameters
    ----------
    x : pd.Series
        One play record. ``playDescription``, ``possessionTeam``,
        ``defensiveTeam`` and the ``foulName<n>`` fields are read.
    ignored_fouls : container of str, optional
        Foul names that must not be counted. Defaults to the module-level
        ``ignore_fouls`` definition.

    Returns
    -------
    tuple
        ``(off_foul, off_penalty, off_players, def_foul, def_penalty,
        def_players)``: the number of counted fouls, the foul names and the
        (lower-cased) player names, first for the offense, then for the defense.

    Raises
    ------
    AssertionError
        If a counted penalty is called on a team that is neither the
        possession team nor the defensive team.
    """
    if ignored_fouls is None:
        ignored_fouls = ignore_fouls

    # Define pattern - Accept teams with either 2 or 3 letters
    # NOTE(review): the player part only allows letters, so names containing a
    # hyphen, apostrophe or space are not matched; that would also shift the
    # positional pairing with the foulName<n> fields - confirm against the data.
    pattern = r'penalty on [a-z]{2,3}-[a-z]\.[a-z]*,'

    # Extract valuable fields
    descrip = x.playDescription.lower()
    offense = x.possessionTeam.lower()
    defense = x.defensiveTeam.lower()

    # Create final variables
    off_penalty, off_players = [], []
    def_penalty, def_players = [], []

    for foul_number, match in enumerate(re.findall(pattern, descrip), start=1):
        # Extract team and player from the last token, e.g. "dal-j.lewis,"
        team, _, player = match.split(" ")[-1].partition("-")
        player = player.replace(",", "")

        # Determine whether it is a valid foul. ``get`` yields None when the
        # description lists more penalties than there are foulName<n> fields.
        foul_name = x.get(f"foulName{foul_number}")
        if foul_name in ignored_fouls:
            # Skip ignored fouls (the previous bare ``next`` did nothing)
            continue

        # Determine which team has committed the foul & store the penalty
        if team == offense:
            off_penalty.append(foul_name)
            off_players.append(player)
        elif team == defense:
            def_penalty.append(foul_name)
            def_players.append(player)
        else:
            # Explicit raise so the check is not stripped when running with -O
            raise AssertionError("Foul to neither team??")

    return (len(off_players), off_penalty, off_players,
            len(def_players), def_penalty, def_players)

def extract_foul_features(plays_data):
    """Extract the foul-related features of each play.

    How an offensive/defensive foul is detected (see ``determine_foul_side``):
    1- The text "Penalty on XX" is found in the play description
    2- The "possessionTeam"/"defensiveTeam" is equal to XX
    3- The foul called should not be ignored

    Parameters
    ----------
    plays_data : pd.DataFrame
        Plays table; ``gameId`` and ``playId`` are copied and every row is
        passed to ``determine_foul_side``.

    Returns
    -------
    pd.DataFrame
        Frame with the columns ``gameId``, ``playId``, ``num_off_foul``,
        ``off_penalties``, ``off_penalties_players``, ``num_def_foul``,
        ``def_penalties`` and ``def_penalties_players``. An empty plays table
        yields an empty frame with the same columns.
    """
    play_foul = pd.DataFrame()

    # Play Identifiers
    play_foul['gameId'] = plays_data.gameId
    play_foul['playId'] = plays_data.playId

    # One 6-tuple per play. Building the list row by row (instead of
    # DataFrame.apply) keeps the result a plain list even for an empty table.
    records = [determine_foul_side(play) for _, play in plays_data.iterrows()]

    feature_names = [
        'num_off_foul', 'off_penalties', 'off_penalties_players',
        'num_def_foul', 'def_penalties', 'def_penalties_players',
    ]

    # Transpose the per-play tuples into one sequence per output column
    per_feature = list(zip(*records)) if records else [()] * len(feature_names)

    # Assign to DataFrame
    for name, values in zip(feature_names, per_feature):
        play_foul[name] = list(values)

    return play_foul

In [35]:
def determine_injury_side(x):
    """Split the injuries reported on a play between offense and defense.

    The play description is scanned for "<TEAM>-<I>.<Name> was injured"
    fragments and each one is attributed to the possession team or to the
    defensive team based on its team prefix.

    Parameters
    ----------
    x : pd.Series
        One play record. ``playDescription``, ``possessionTeam`` and
        ``defensiveTeam`` are read.

    Returns
    -------
    tuple
        ``(off_injury, off_players, def_injury, def_players)``: the number of
        injured players and their (lower-cased) names, first for the offense,
        then for the defense.

    Raises
    ------
    AssertionError
        If an injured player belongs to neither of the two teams of the play.
    """
    # Define pattern - Accept teams with either 2 or 3 letters; the player name
    # may contain letters, whitespace and hyphens.
    pattern = r'[a-z]{2,3}-[a-z]\.[\s\-a-z]* was injured'

    # Extract valuable fields
    descrip = x.playDescription.lower()
    offense = x.possessionTeam.lower()
    defense = x.defensiveTeam.lower()

    # Create final variables
    off_players = []
    def_players = []

    for match in re.findall(pattern, descrip):
        # Extract team from string: everything before the first hyphen
        team = match.split("-")[0].split(" ")[-1]
        # Player: text after the first hyphen, so hyphenated surnames stay intact
        player = match.replace(" was injured", "").split("-", 1)[1]

        # Determine which team has an injured player
        if team == offense:
            off_players.append(player)
        elif team == defense:
            def_players.append(player)
        else:
            # Explicit raise so the check is not stripped when running with -O
            raise AssertionError("Injury to neither team??")

    return len(off_players), off_players, len(def_players), def_players

def extract_injury_features(plays_data):
    """Extract the injury-related features of each play.

    How an offensive/defensive injury is detected (see ``determine_injury_side``):
    1- The text "XX-<Full Name> was injured" is found in the play description
    2- The "possessionTeam"/"defensiveTeam" is equal to XX

    Parameters
    ----------
    plays_data : pd.DataFrame
        Plays table; ``gameId`` and ``playId`` are copied and every row is
        passed to ``determine_injury_side``.

    Returns
    -------
    pd.DataFrame
        Frame with the columns ``gameId``, ``playId``, ``num_off_injuries``,
        ``off_players_injured``, ``num_def_injuries`` and
        ``def_players_injured``. An empty plays table yields an empty frame
        with the same columns.
    """
    play_injury = pd.DataFrame()

    # Play Identifiers
    play_injury['gameId'] = plays_data.gameId
    play_injury['playId'] = plays_data.playId

    # One 4-tuple per play. Building the list row by row (instead of
    # DataFrame.apply) keeps the result a plain list even for an empty table.
    records = [determine_injury_side(play) for _, play in plays_data.iterrows()]

    feature_names = [
        'num_off_injuries', 'off_players_injured',
        'num_def_injuries', 'def_players_injured',
    ]

    # Transpose the per-play tuples into one sequence per output column
    per_feature = list(zip(*records)) if records else [()] * len(feature_names)

    # Assign to DataFrame
    for name, values in zip(feature_names, per_feature):
        play_injury[name] = list(values)

    return play_injury

In [36]:
# Perform all feature extractions
# Each call builds a separate DataFrame from plays_data that carries the
# gameId/playId identifiers of every play alongside the extracted features.
plays_outcomes = extract_play_outcome_features(plays_data)
plays_formation = extract_formation_features(plays_data)
plays_fouls = extract_foul_features(plays_data)
plays_injury = extract_injury_features(plays_data)

# 1.1 Game Feature Extraction

In [3]:
# Load the games table from <input_path>/games_enhanced.csv and preview its first five rows
games_enhanced_data = pd.read_csv(os.path.join(input_path, 'games_enhanced.csv'))
games_enhanced_data.head(5)

Unnamed: 0,gameId,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeScore,visitorScore,whoWon
0,2021090900,2021,1,09/09/2021,20:20:00,TB,DAL,31,29,TB
1,2021091200,2021,1,09/12/2021,13:00:00,ATL,PHI,6,32,PHI
2,2021091201,2021,1,09/12/2021,13:00:00,BUF,PIT,16,23,PIT
3,2021091202,2021,1,09/12/2021,13:00:00,CAR,NYJ,19,14,CAR
4,2021091203,2021,1,09/12/2021,13:00:00,CIN,MIN,27,24,CIN


In [14]:
def process_game_record(game):
    """Turn one game record into two team-level rows (home first, then visitor).

    Parameters
    ----------
    game : pd.DataFrame
        Record(s) of a single game. Only the first row is used; the columns
        ``homeTeamAbbr``, ``visitorTeamAbbr``, ``homeScore``, ``visitorScore``
        and ``whoWon`` are read.

    Returns
    -------
    pd.DataFrame
        Two rows with the columns ``team``, ``gameScore`` and ``hasWon``.
        ``hasWon`` is True only for the team whose abbreviation equals
        ``whoWon``; if ``whoWon`` names neither team (e.g. a tie or a missing
        value) both rows are False.
    """
    home_team = game['homeTeamAbbr'].iloc[0]
    visitor_team = game['visitorTeamAbbr'].iloc[0]
    winner = game['whoWon'].iloc[0]

    return pd.DataFrame({
        'team': [home_team, visitor_team],
        'gameScore': [game['homeScore'].iloc[0], game['visitorScore'].iloc[0]],
        # Compare each team explicitly instead of defaulting to "visitor won"
        # whenever the home team is not the winner.
        'hasWon': [bool(winner == home_team), bool(winner == visitor_team)],
    })


def extract_game_features(games_data):
    """Expand every game of ``games_data`` into its per-team rows.

    Each ``gameId`` group is passed to ``process_game_record`` and the results
    are stacked; the inner index level created by the stacking is dropped.
    """
    per_game = games_data.groupby(['gameId'])
    stacked = per_game.apply(process_game_record)

    # Level 1 is the row position inside each per-game result
    return stacked.reset_index(level=1, drop=True)

In [16]:
# Build the per-team game features (team, gameScore, hasWon) from the games table
game_features = extract_game_features(games_enhanced_data)