# Quarterback Data Cleaning

In [1]:
# import packages
import os
import pandas as pd
import numpy as np
import re
import sys
sys.path.append('../src/')
from helpers import *

In [2]:
# Import all data
# os.walk returns file names in arbitrary (filesystem-dependent) order, while
# later cells pair dataframes by their position in this list. Sorting makes the
# order deterministic, and the filter skips stray non-csv files in the folder.
path, dirs, files = next(os.walk("../data/raw_data/qb_stats/"))
files = sorted(name for name in files if name.lower().endswith('.csv'))
file_count = len(files)

# Read every csv into its own dataframe; dataframes_list[i] corresponds to files[i].
# thousands=',' lets values such as "4,321" parse as numbers.
dataframes_list = [
    pd.read_csv("../data/raw_data/qb_stats/" + name, thousands=',')
    for name in files
]

All the csv files are formatted as "year_xx_pos_stats", so this code takes each filename, strips the ".csv" suffix, moves the year to the end (e.g. "2018_qb_stats.csv" becomes "qb_stats_2018"), and stores the corresponding dataframe in a variable with that name.

In [3]:
# Names under which each dataframe is published as a notebook-level variable
dataframes_names = []

for idx, frame in enumerate(dataframes_list):
    # Rows with a null Player value are dropped
    frame = frame.dropna(subset=['Player'])
    dataframes_list[idx] = frame

    # Build the variable name from the csv filename with the year moved to
    # the end (i.e. 2018_qb_stats.csv --> qb_stats_2018)
    csv_name = files[idx]
    frame_name = csv_name[5:].removesuffix('.csv') + '_' + csv_name[:4]

    # Publish the cleaned frame under that name and remember the name
    globals()[frame_name] = frame
    dataframes_names.append(frame_name)

The "comp_percent" helper function calculates the completion percentage as completions divided by pass attempts. While a completion percentage was already included in the data, it was not in a numerical format. The "team_remove" function removes the team identifier from the Player string. The "merge" function takes two dataframes and merges them on their shared "Player" column.

In [6]:
# function to add completion percentage
def comp_percent (row):
    """Return completions divided by pass attempts for a single row.

    `row` is indexed by the keys 'CMP' and 'P_ATT'. When 'P_ATT' equals 0
    the result is 0, so no division by zero takes place.
    """
    attempts = row['P_ATT']
    return 0 if attempts == 0 else row['CMP'] / attempts

def merge(df1, df2):
    """Left-join two stat tables on 'Player' and tidy the combined frame.

    Steps, in order:
      1. left merge of df1 with df2 on the 'Player' column;
      2. call the `drop_y` and `rename_x` helpers on the merged frame;
      3. insert 'COMP_PER' (comp_percent per row, rounded to 3 decimals)
         at column position 5;
      4. insert 'NAME' (team_remove per row) at position 2 and drop 'Player';
      5. coerce every column from position 3 onward to numeric, turning
         unparseable values into NaN.

    Returns the resulting dataframe; df1 and df2 are not reassigned here.
    """
    # merge stats and advanced stats
    combined = pd.merge(left=df1, right=df2, on='Player', how='left')

    # Both helpers are called for their effect on `combined` (their return
    # values are ignored). Presumably they resolve the _x/_y suffixes that
    # pandas adds to overlapping column names -- confirm in helpers.
    drop_y(combined)
    rename_x(combined)

    # Add completion percentage
    completion = combined.apply(comp_percent, axis=1)
    combined.insert(5, 'COMP_PER', round(completion, 3))

    # Fix Names
    clean_names = combined.apply(team_remove, axis=1)
    combined.insert(2, 'NAME', clean_names)
    combined = combined.drop(columns='Player')

    # Convert all objects to floats (everything from the fourth column onward)
    numeric_cols = combined.columns[3:]
    combined[numeric_cols] = combined[numeric_cols].apply(pd.to_numeric, errors='coerce')

    return combined

Iterate through the list of dataframes, rename the columns, and merge the regular and advanced yearly stats. Since the dataframes are listed in alphabetical order (i.e. "adv_qb_stats_2018", "qb_stats_2018"), we just need to compare whether the last 4 characters of two dataframe names match. Since the two frames for a given year are listed next to each other, we can compare the ith frame with the (i+1)th frame.

In [7]:
# Names of the dataframes that were actually replaced by a merged result
merged_names = []

for i in range(len(dataframes_names)-1):
    first_name, second_name = dataframes_names[i], dataframes_names[i+1]

    # Neighbouring names that end in the same 4-character year form one pair
    if first_name[-4:] != second_name[-4:]:
        continue

    df1 = globals()[first_name]
    df2 = globals()[second_name]

    # rename columns in the first dataframe of the pair and drop the ones not kept
    df1 = df1.rename(
        columns={'Rank':'RANK','ATT':'P_ATT', 'RZ ATT':'RZ_ATT','YDS':'P_YDS', 'Y/A':'P_Y/A'}).drop(
            columns=['PCT', '10+ YDS', '20+ YDS', '30+ YDS', '40+ YDS', '50+ YDS',
                     'PKT TIME', 'SK', 'KNCK', 'HRRY', 'BLITZ','POOR','DROP'])
    # same for the second dataframe; the '.1' columns are renamed to RUS_* names
    df2 = df2.rename(
        columns={'Rank':'RANK','ATT':'P_ATT', 'YDS':'P_YDS', 'Y/A':'P_Y/A', 'ATT.1':'RUS_ATT', 'YDS.1':'RUS_YDS', 'TD.1':'RUS_TD'}).drop(
            columns=['PCT', 'ROST'])

    # Store the merged dataframe under the second name of the pair
    globals()[second_name] = merge(df1, df2)
    merged_names.append(second_name)

# Change dataframes_names to contain only the mutated dataframes.
# Recording the names as they are merged (instead of slicing [1::2]) stays
# correct even if some year has only one file and the list does not alternate.
dataframes_names = merged_names

In [8]:
# export new csv files
# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no effect when it is already there)
os.makedirs('../data/clean_data/qb_stats_clean', exist_ok=True)

# dataframes_names holds variable names; the frames themselves live in globals()
for frame_name in dataframes_names:
    globals()[frame_name].to_csv(f'../data/clean_data/qb_stats_clean/{frame_name}_clean.csv', index=False)