# Running Back Data Cleaning

In [1]:
# import packages
import os
import pandas as pd
import numpy as np
import re
import sys
sys.path.append('../src/')
from helpers import *


In [2]:
# Load every raw running-back CSV and expose each one as a notebook-level
# DataFrame named after its file (i.e. 2018_rb_stats.csv --> rb_stats_2018).
path, dirs, files = next(os.walk("../data/raw_data/rb_stats/"))

# os.walk yields directory entries in arbitrary, filesystem-dependent order,
# but the cells below index the frames positionally in groups of four.
# Sorting makes that order deterministic; non-CSV entries (e.g. .DS_Store)
# are skipped so they can neither break read_csv nor shift the grouping.
files = sorted(f for f in files if f.endswith('.csv'))
file_count = len(files)

# Loaded frames and their generated variable names, kept in the same order
dataframes_list = []
dataframes_names = []

for file_name in files:
    # thousands=',' lets values such as "1,234" parse as numbers
    frame = pd.read_csv(os.path.join(path, file_name), thousands=',')

    # If a player value is null, drop the row
    frame = frame.dropna(subset=['Player'])

    # Drop the first five characters (the "YYYY_" prefix in the example above)
    # and append the first four characters (the year) at the end instead.
    frame_name = file_name[5:].removesuffix('.csv') + '_' + file_name[:4]

    # Later cells look the frames up by name through globals()
    globals()[frame_name] = frame
    dataframes_list.append(frame)
    dataframes_names.append(frame_name)

In [3]:
# Function to compute implied attempts (rushing attempts + targets)
def comp_imp_att (row):
    """Return a player's implied attempts: rushing attempts plus targets.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide numeric values under the keys 'RUS_ATT' and 'TGT'.

    Returns
    -------
    number
        ``row['RUS_ATT'] + row['TGT']``.
    """
    # The earlier guard (`RUS_ATT == 0 and TGT`) returned 0 for players who had
    # targets but no rushing attempts, discarding their targets. When both
    # values are 0 the sum is already 0, so no special case is needed.
    return row['RUS_ATT'] + row['TGT']

def merge(df1, df2, season_idx=None):
    """Combine one season's stats with its advanced stats, snaps and green-zone attempts.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Stats frame; the left side of the join on 'Player'.
    df2 : pandas.DataFrame
        Advanced-stats frame; the right side of the join on 'Player'.
    season_idx : int, optional
        Position of the season in the notebook-level ``snaps`` and ``grz_att``
        lists. When omitted, it is derived from the notebook-level loop
        variable ``i`` as ``i // 4``, which is what this function always did
        before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        The merged frame with 'Player' replaced by 'NAME', an 'IMP_ATT'
        column added, missing values set to 0, and 'FPTS' and 'FPTS/G' moved
        to the last two columns.
    """
    if season_idx is None:
        # Backward-compatible fallback to the caller's global loop variable.
        season_idx = i // 4

    # merge stats and advanced stats
    df3 = pd.merge(left=df1, right=df2, on='Player', how='left').fillna(0)
    # Helpers from `helpers`; presumably they drop the duplicated `_y` columns
    # and strip the `_x` suffix in place -- TODO confirm against helpers.py
    drop_y(df3)
    rename_x(df3)

    # Add implied attempts
    df3.insert(2, 'IMP_ATT', df3.apply(comp_imp_att, axis=1))

    # Fix Names: NAME is derived per row by team_remove and replaces Player
    df3.insert(2, 'NAME', df3.apply(team_remove, axis=1))
    df3 = df3.drop(columns='Player')

    # Add Snaps + Green zone Att for this season
    tmp_df = pd.merge(snaps[season_idx], grz_att[season_idx], how='outer', on='NAME').fillna(0)
    df3 = pd.merge(left=df3, right=tmp_df, how='left', on='NAME')

    # Convert every column from the fourth onward to numeric;
    # values that cannot be parsed become NaN and are zero-filled below
    cols = df3.columns
    df3[cols[3:]] = df3[cols[3:]].apply(pd.to_numeric, errors='coerce')

    # Move Fantasy Point to end of Dataframe
    df3 = df3[[c for c in df3 if c not in ['FPTS', 'FPTS/G']] + ['FPTS', 'FPTS/G']]
    df3 = df3.fillna(0)
    return df3

In [4]:
# Per-season lookup frames: snap totals and green-zone attempts
snaps = []
grz_att = []

# Frames are consumed in consecutive groups of four; within each group the
# second frame supplies 'ATT' and the third supplies 'TTL'.
n_groups = len(dataframes_names) // 4
for i in range(n_groups):
    first = 4 * i
    snap_source = globals()[dataframes_names[first + 2]]
    grz_source = globals()[dataframes_names[first + 1]]

    # Snaps: keep Player/TTL and relabel them NAME/SNAPS
    snap_counts = snap_source[['Player', 'TTL']]
    snaps.append(snap_counts.rename(columns={'Player': 'NAME', 'TTL': 'SNAPS'}))

    # Green zone: derive NAME from each row via team_remove, then drop the
    # raw Player column and relabel ATT as GRZ_ATT
    grz_counts = grz_source[['Player', 'ATT']]
    grz_counts.insert(1, 'NAME', grz_counts.apply(team_remove, axis=1))
    grz_att.append(grz_counts.drop(columns='Player').rename(columns={'ATT': 'GRZ_ATT'}))

In [5]:
# Column relabelling applied to the stats frame (position i+3)
stats_renames = {'Rank':'RANK','ATT':'RUS_ATT', 'YDS':'RUS_YDS', 'Y/A':'RUS_Y/A',
                 '20+':'EXPLO', 'TD':'RUS_TD', 'YDS.1':'REC_YDS', 'TD.1':'REC_TD'}

# Column relabelling and removals applied to the advanced-stats frame (position i)
adv_renames = {'Rank':'RANK','ATT':'RUS_ATT', 'YDS':'RUS_YDS', 'Y/A':'RUS_Y/A', 'YACON.1':'RUS_YACON'}
adv_drops = ['TK LOSS', 'TK LOSS YDS', 'LNG TD', '10+ YDS', '20+ YDS', '30+ YDS', '40+ YDS', '50+ YDS','LNG', 'Y/ATT']

# NOTE: the loop variable must stay named `i` -- merge() reads it as a global
# to pick the matching snaps / green-zone frames.
for i in range(len(dataframes_names)-3):
    adv_name = dataframes_names[i]
    stats_name = dataframes_names[i+3]

    # Only pair frames whose names end in the same four characters (the year)
    if adv_name[-4:] != stats_name[-4:]:
        continue

    df1 = globals()[stats_name].rename(columns=stats_renames).drop(columns=['ROST'])
    df2 = globals()[adv_name].rename(columns=adv_renames).drop(columns=adv_drops)

    # The merged result replaces the stats frame under its existing name
    globals()[stats_name] = merge(df1, df2)

# Change dataframes_names to contain only the mutated dataframes
dataframes_names = dataframes_names[3::4]

In [6]:
# Export each merged frame as <name>_clean.csv
clean_dir = '../data/clean_data/rb_stats_clean'

# Create the output folder when it is missing (e.g. on a fresh checkout),
# otherwise to_csv raises because it does not create parent directories.
os.makedirs(clean_dir, exist_ok=True)

for frame_name in dataframes_names:
    # dataframes_names holds variable names; the frames themselves live in globals()
    globals()[frame_name].to_csv(f'{clean_dir}/{frame_name}_clean.csv', index=False)