# Data Cleaning

## Import packages

In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
%matplotlib inline

Read in csv and examine data

In [74]:
combine = pd.read_csv('../data/nfl_combine_data.csv')

In [75]:
# Look at descriptive stats
print(combine.shape, '\n')
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line after the report.
combine.info()
combine.describe().T

(9950, 17) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9950 entries, 0 to 9949
Data columns (total 17 columns):
full_name               9950 non-null object
first_name              9950 non-null object
last_name               9950 non-null object
combine_year            9950 non-null int64
college                 9950 non-null object
position                9950 non-null object
height_inches           9950 non-null float64
weight_lbs              9950 non-null int64
hand_size_inches        8400 non-null float64
arm_length_inches       8082 non-null float64
40_yard_dash            9073 non-null float64
bench_press_reps        6779 non-null float64
vertical_leap_inches    8050 non-null float64
broad_jump_inches       7903 non-null float64
3_cone_drill            4518 non-null float64
20_yard_shuttle         7176 non-null float64
60_yard_shuttle         3169 non-null float64
dtypes: float64(10), int64(2), object(5)
memory usage: 1.3+ MB
None


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
combine_year,9950.0,2002.204824,9.302148,1987.0,1994.0,2002.0,2011.0,2017.0
height_inches,9950.0,73.73564,2.645112,64.9,71.88,74.0,75.75,82.4
weight_lbs,9950.0,240.282513,45.046373,142.0,203.0,232.0,275.0,387.0
hand_size_inches,8400.0,9.528956,0.629254,7.13,9.13,9.5,10.0,11.88
arm_length_inches,8082.0,32.221633,1.49868,25.63,31.25,32.25,33.25,38.5
40_yard_dash,9073.0,4.830682,0.309814,4.21,4.59,4.76,5.05,6.12
bench_press_reps,6779.0,19.833014,6.540305,1.0,15.0,20.0,24.0,51.0
vertical_leap_inches,8050.0,32.001801,4.203431,17.5,29.0,32.0,35.0,46.0
broad_jump_inches,7903.0,112.30558,9.306169,7.0,106.0,113.0,119.0,147.0
3_cone_drill,4518.0,7.346076,0.446751,6.34,7.01,7.26,7.62,9.61


3_cone_drill, 60_yard_shuttle have many missing values

In [76]:
combine.head()

Unnamed: 0,full_name,first_name,last_name,combine_year,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle
0,jamal_adams,jamal,adams,2017,louisiana_state,db,71.63,214,9.25,33.38,4.56,18.0,31.5,120.0,6.96,4.13,
1,montravius_adams,montravius,adams,2017,auburn,dl,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,,
2,rodney_adams,rodney,adams,2017,south_florida,wr,73.25,189,9.0,32.0,4.44,8.0,29.5,125.0,6.98,4.28,11.39
3,quincy_adeboyejo,quincy,adeboyejo,2017,mississippi,wr,74.75,197,9.38,31.75,4.42,8.0,34.5,123.0,6.73,4.14,
4,brian_allen,brian,allen,2017,utah,db,74.88,215,10.0,34.0,4.48,15.0,34.5,117.0,6.64,4.34,


In [77]:
# Add full name as a column
# (space-separated "first last"; the full_name column already in the CSV is
# underscore-joined, e.g. "jamal_adams")
combine['player_name'] = combine.first_name + ' ' + combine.last_name

In [78]:
# Check years of data
combine.combine_year.unique()

array([2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007,
       2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996,
       1995, 1994, 1993, 1992, 1991, 1990, 1989, 1988, 1987])

In [123]:
combine.groupby('combine_year').count()

Unnamed: 0_level_0,player_name,first_name,last_name,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle
combine_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1987,267,267,267,267,267,267,267,237,237,237,215,237,233,0,223,142
1988,322,322,322,322,322,322,322,316,1,272,277,264,260,0,249,159
1989,308,308,308,308,308,308,308,0,0,270,208,258,255,0,248,157
1990,324,324,324,324,324,324,324,314,314,287,207,277,279,0,257,147
1991,428,428,428,428,428,428,428,426,426,371,297,369,369,0,349,211
1992,438,438,438,438,438,438,438,421,421,364,288,366,358,0,340,197
1993,338,338,338,338,338,338,338,337,337,270,223,267,261,0,248,153
1994,315,315,315,315,315,315,315,315,315,249,181,245,246,0,229,128
1995,310,310,310,310,310,310,310,309,309,218,179,212,205,0,188,105
1996,296,296,296,296,296,296,296,296,296,222,164,226,218,0,205,116


In [80]:
combine.loc[combine.combine_year == 2015].isnull().sum()

full_name                 0
first_name                0
last_name                 0
combine_year              0
college                   0
position                  0
height_inches             0
weight_lbs                0
hand_size_inches        428
arm_length_inches       429
40_yard_dash              5
bench_press_reps        517
vertical_leap_inches    485
broad_jump_inches       486
3_cone_drill            516
20_yard_shuttle         508
60_yard_shuttle         648
player_name               0
dtype: int64

## Addressing 2015 data

- The dataset lists 741 unique players for 2015, but only 320 players actually attended the combine, so we need to remove the extra players from our dataset.
- Load in actual attendee data
- Split first and last names into 2 columns
- Remove special characters
- Make everything lower case
- Left join the attendee data onto the original combine data (attendees with no match are kept, with missing measurements)
- Drop 2015 rows from original combine data
- Concat joined result with combine data

In [81]:
# Reload the raw CSV and rebuild the space-separated player_name column.
combine = pd.read_csv('../data/nfl_combine_data.csv').assign(
    player_name=lambda frame: frame.first_name + ' ' + frame.last_name
)

In [82]:
# Read in 2015 data with 320 attendees
# Keep only the identifying columns and rename them to match the combine frame.
cols = ['Player','Year','Pos','School']
df15 = (
    pd.read_excel('../data/2015_combine_data.xlsx')[cols]
    .rename(columns={'Player':'player_name','Year':'combine_year', 'Pos':'position', 'School': 'college'})
)

In [83]:
# Take an explicit copy of the 2015 rows: later cells assign columns on this
# frame, and doing that on a bare .loc slice triggers pandas'
# SettingWithCopyWarning.
combine15 = combine.loc[combine.combine_year == 2015].copy()

In [84]:
df15.head()

Unnamed: 0,player_name,combine_year,position,college
0,Ameer Abdullah,2015,RB,Nebraska
1,Nelson Agholor,2015,WR,USC
2,Jay Ajayi,2015,RB,Boise St.
3,Kwon Alexander,2015,OLB,LSU
4,Mario Alford,2015,WR,West Virginia


In [85]:
combine15.head()

Unnamed: 0,full_name,first_name,last_name,combine_year,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle,player_name
638,ameer_abdullah,ameer,abdullah,2015,nebraska,rb,68.75,205,8.63,30.0,4.6,24.0,42.5,130.0,6.79,3.95,11.18,ameer abdullah
639,nelson_agholor,nelson,agholor,2015,southern_california,wr,72.13,198,9.25,32.25,4.42,12.0,,,,,,nelson agholor
640,malcolm_agnew,malcolm,agnew,2015,southern_illinois,rb,70.0,202,,,4.59,,,,,,,malcolm agnew
641,jay_ajayi,jay,ajayi,2015,boise_state,rb,71.75,221,10.0,32.0,4.57,19.0,39.0,121.0,7.1,4.1,11.1,jay ajayi
642,brandon_alexander,brandon,alexander,2015,central_florida,db,74.0,195,,,4.59,,,,,,,brandon alexander


## Cleaning Functions

In [86]:
# Raw string: "\|" is not a valid escape in an ordinary Python string literal,
# so the non-raw form relies on deprecated behaviour. The compiled pattern is
# unchanged.
regex = re.compile(r"[@_!#$%^&*()<>?/\|}{~:`'']")

In [87]:
# Remove special characters
def remove_special(df):
    """Strip special characters from every string cell of ``df``.

    The frame is modified in place, so callers that ignore the return value
    still see the cleaned data; the same frame is also returned. Non-string
    cells (numbers, NaN/None) are left untouched.

    Previously the cleaned result was only bound to the local name ``df`` and
    never returned, so calling this function had no effect.
    """
    # Raw string so "\|" reaches the regex engine without relying on an invalid
    # Python string escape; the character class itself is unchanged.
    special = re.compile(r"[@_!#$%^&*.<>?/\|}{~:`'']")

    def strip_cell(x):
        return special.sub('', x) if isinstance(x, str) else x

    # Numeric columns cannot contain strings, so skip them.
    for col in df.select_dtypes(exclude='number').columns:
        df[col] = df[col].map(strip_cell)
    return df

def lowercase(df, col):
    """Lower-case the string values of ``df[col]`` in place.

    Uses the vectorised ``.str`` accessor, which leaves missing values missing
    instead of raising ``AttributeError`` the way ``x.lower()`` does on a
    float NaN.
    """
    df[col] = df[col].str.lower()

def clean_name(df, col):
    """Split ``df[col]`` on its first space into ``first_name`` / ``last_name``.

    Both columns are written onto ``df`` in place. Everything after the first
    space goes to ``last_name`` (e.g. "mario edwards, jr" -> "edwards, jr").
    """
    parts = df[col].str.split(" ", n=1, expand=True)
    # If no value contains a space, expand=True yields a single column and a
    # two-column assignment would fail; pad to exactly two columns so that
    # last_name is simply missing in that case.
    parts = parts.reindex(columns=[0, 1])
    df['first_name'] = parts[0]
    df['last_name'] = parts[1]

# Clean combine_year record that lists '5' instead of '2005'
def replace_5(df):
    """Rewrite any ``combine_year`` equal to 5 as 2005, in place on ``df``."""
    needs_fix = df['combine_year'].eq(5)
    df.loc[needs_fix, 'combine_year'] = 2005

# map positions

# Add 'events_missed' columns
def events_missed():
    """Placeholder for the planned 'events_missed' column logic; does nothing yet."""
    return None
# Add 'conference' column

# Position standardization

# Regression imputation for missing values

In [88]:
# Apply the cleaning helpers to the 2015 slice of the combine data.
replace_5(combine15)
remove_special(combine15) # return value is discarded, so this only has an effect if remove_special mutates its argument; the same substitution is re-applied with applymap in a later cell
lowercase(combine15,'player_name')
clean_name(combine15,'player_name')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [89]:
# Apply the same cleaning helpers to the 2015 attendee sheet.
remove_special(df15) # return value is discarded, so this only has an effect if remove_special mutates its argument; the same substitution is re-applied with applymap in a later cell
lowercase(df15, 'player_name')
clean_name(df15,'player_name')

In [90]:
# Strip special characters from every string cell of both frames.
# The pattern is a raw string ("\|" is not a valid escape in an ordinary Python
# string literal) and is compiled once instead of being repeated per frame.
special_chars = re.compile(r"[@_!#$%^&*.<>?/\|}{~:`'']")


def _strip_special(value):
    """Return ``value`` with special characters removed; non-strings pass through."""
    return special_chars.sub('', value) if isinstance(value, str) else value


df15 = df15.applymap(_strip_special)
combine15 = combine15.applymap(_strip_special)

In [91]:
# Attendees from the 2015 sheet whose cleaned name has no match in combine15
unmatched_names = df15.loc[~df15.player_name.isin(combine15.player_name), 'player_name'].unique()
print(unmatched_names)
len(unmatched_names)

['trenton brown' 'michael burton' 'yannick cudjoe-virgil'
 'durell eskridge' 'mario edwards, jr' 'donatella luckett'
 'zack wagenmann']


7

In [106]:
# NOTE(review): this cell (In [106]) sits above the cell that creates merged_df
# (In [105]); on a fresh top-to-bottom run it would raise NameError. Move it
# below the merge.
merged_df

Unnamed: 0,player_name_x,combine_year,position_x,college_x,first_name,last_name,full_name,college_y,position_y,height_inches,...,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle,player_name_y
0,ameer abdullah,2015,RB,Nebraska,ameer,abdullah,ameerabdullah,nebraska,rb,68.75,...,8.63,30.00,4.60,24.0,42.5,130.0,6.79,3.95,11.18,ameer abdullah
1,nelson agholor,2015,WR,USC,nelson,agholor,nelsonagholor,southerncalifornia,wr,72.13,...,9.25,32.25,4.42,12.0,,,,,,nelson agholor
2,jay ajayi,2015,RB,Boise St,jay,ajayi,jayajayi,boisestate,rb,71.75,...,10.00,32.00,4.57,19.0,39.0,121.0,7.10,4.10,11.10,jay ajayi
3,kwon alexander,2015,OLB,LSU,kwon,alexander,kwonalexander,louisianastate,lb,72.75,...,9.25,30.25,4.55,24.0,36.0,121.0,7.14,4.20,,kwon alexander
4,mario alford,2015,WR,West Virginia,mario,alford,marioalford,westvirginia,wr,68.50,...,9.38,31.25,4.43,13.0,34.0,121.0,6.64,4.07,11.22,mario alford
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319,jameis winston,2015,QB,Florida St,jameis,winston,jameiswinston,floridastate,qb,76.00,...,9.38,32.00,4.97,,28.5,103.0,7.16,4.36,,jameis winston
320,cam worthy,2015,WR,East Carolina,cam,worthy,camworthy,eastcarolinanc,wr,74.00,...,10.13,33.50,4.59,12.0,,,,,,cam worthy
321,gabe wright,2015,DT,Auburn,gabe,wright,gabewright,auburn,dl,75.00,...,10.75,32.63,5.07,34.0,26.5,100.0,7.73,4.56,,gabe wright
322,tj yeldon,2015,RB,Alabama,tj,yeldon,tjyeldon,alabama,rb,73.00,...,9.00,31.63,4.61,22.0,36.0,117.0,7.19,4.22,,tj yeldon


In [105]:
# Left join keeps every row of the 2015 attendee sheet (df15); attendees with
# no name match in combine15 come through with NaN measurements.
# NOTE(review): the join keys are names + year only, so two players who share a
# name (e.g. "kevin white") produce one row for every pairing.
merged_df = pd.merge(df15, combine15, how='left', on = ['first_name', 'last_name', 'combine_year'])

In [107]:
# Drop the attendee-sheet position/college and the redundant second name column,
# then strip the merge suffixes from the columns that remain.
merged_df = (
    merged_df
    .drop(columns=['position_x', 'player_name_y', 'college_x'])
    .rename(columns={'player_name_x':'player_name', 'college_y':'college','position_y':'position'})
)

In [109]:
combine.head()

Unnamed: 0,full_name,first_name,last_name,combine_year,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle,player_name
0,jamal_adams,jamal,adams,2017,louisiana_state,db,71.63,214,9.25,33.38,4.56,18.0,31.5,120.0,6.96,4.13,,jamal adams
1,montravius_adams,montravius,adams,2017,auburn,dl,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,,,montravius adams
2,rodney_adams,rodney,adams,2017,south_florida,wr,73.25,189,9.0,32.0,4.44,8.0,29.5,125.0,6.98,4.28,11.39,rodney adams
3,quincy_adeboyejo,quincy,adeboyejo,2017,mississippi,wr,74.75,197,9.38,31.75,4.42,8.0,34.5,123.0,6.73,4.14,,quincy adeboyejo
4,brian_allen,brian,allen,2017,utah,db,74.88,215,10.0,34.0,4.48,15.0,34.5,117.0,6.64,4.34,,brian allen


In [110]:
# Put both frames into the same column order (full_name is not in the list, so
# it is dropped from combine here).
ordered_cols = [
    'combine_year', 'player_name', 'first_name', 'last_name', 'college',
    'position', 'height_inches', 'weight_lbs', 'hand_size_inches',
    'arm_length_inches', '40_yard_dash', 'bench_press_reps',
    'vertical_leap_inches', 'broad_jump_inches', '3_cone_drill',
    '20_yard_shuttle', '60_yard_shuttle',
]

merged_df = merged_df[ordered_cols]
combine = combine[ordered_cols]

In [116]:
# Index labels of the 2015 rows that will be dropped from the combine df
is_2015_row = (combine.combine_year == 2015).values
index_drop_list = combine.index[is_2015_row]

In [117]:
combine = combine.drop(index_drop_list)

In [118]:
# Union datasets together.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it, the
# 0-based labels that pd.merge gave merged_df collide with labels still present
# in combine, so later label-based lookups and drops would hit the wrong rows.
combine = pd.concat([combine, merged_df], ignore_index=True)

In [125]:
# 3 repeated names
len(combine.loc[combine.combine_year == 2015].player_name.unique())

321

In [134]:
# Kevin White repeated 4 times
combine.loc[combine.combine_year == 2015].groupby('player_name').count().sort_values('combine_year', ascending = False).head()

Unnamed: 0_level_0,combine_year,first_name,last_name,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
kevin white,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
landon collins,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1
mark glowinski,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
"mario edwards, jr",1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
mario alford,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [131]:
combine.loc[combine.player_name == 'kevin white'] # index 9514, 9516

Unnamed: 0,combine_year,player_name,first_name,last_name,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle
9514,2015,kevin white,kevin,white,texaschristian,lb,69.13,183.0,9.25,32.63,4.63,18.0,35.5,121.0,6.92,4.14,11.52
9515,2015,kevin white,kevin,white,westvirginia,wr,74.63,215.0,9.25,32.63,4.35,23.0,36.5,123.0,6.92,4.14,11.52
9516,2015,kevin white,kevin,white,texaschristian,lb,69.13,183.0,9.25,32.63,4.63,18.0,35.5,121.0,6.92,4.14,11.52
9517,2015,kevin white,kevin,white,westvirginia,wr,74.63,215.0,9.25,32.63,4.35,23.0,36.5,123.0,6.92,4.14,11.52


In [133]:
# Remove the repeated 2015 rows created by the name-based merge.
# The previous hard-coded drop of labels [9514, 9516] removed both copies of the
# texaschristian "kevin white" row and left the westvirginia row duplicated; it
# also depended on whatever index the frame had when the cell was run.
# Dropping exact duplicate rows keeps one row per player and is safe to re-run.
is_2015 = combine.combine_year == 2015
repeat_rows = is_2015 & combine.duplicated(keep='first')
combine = combine.loc[~repeat_rows].reset_index(drop=True)
combine.shape

(9531, 17)

## Adding Events Missed Information

In [137]:
# Rows with no height or no weight recorded
missing_size = combine[['height_inches', 'weight_lbs']].isnull().any(axis=1)
combine.loc[missing_size]

Unnamed: 0,combine_year,player_name,first_name,last_name,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle
9239,2015,trenton brown,trenton,brown,,,,,,,,,,,,,
9245,2015,michael burton,michael,burton,,,,,,,,,,,,,
9277,2015,yannick cudjoe-virgil,yannick,cudjoe-virgil,,,,,,,,,,,,,
9307,2015,durell eskridge,durell,eskridge,,,,,,,,,,,,,
9387,2015,"mario edwards, jr",mario,"edwards, jr",,,,,,,,,,,,,
9402,2015,donatella luckett,donatella,luckett,,,,,,,,,,,,,
9507,2015,zack wagenmann,zack,wagenmann,,,,,,,,,,,,,


In [135]:
combine.isnull().sum()

combine_year               0
player_name                0
first_name                 0
last_name                  0
college                    7
position                   7
height_inches              7
weight_lbs                 7
hand_size_inches        1139
arm_length_inches       1457
40_yard_dash             879
bench_press_reps        2764
vertical_leap_inches    1493
broad_jump_inches       1640
3_cone_drill            5024
20_yard_shuttle         2366
60_yard_shuttle         6364
dtype: int64

In [55]:
combine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9531 entries, 0 to 9530
Data columns (total 18 columns):
index                   9531 non-null int64
combine_year            9531 non-null int64
player_name             9531 non-null object
first_name              9531 non-null object
last_name               9531 non-null object
college                 9531 non-null object
position                9531 non-null object
height_inches           9209 non-null float64
weight_lbs              9209 non-null float64
hand_size_inches        8087 non-null float64
arm_length_inches       7770 non-null float64
40_yard_dash            8337 non-null float64
bench_press_reps        6555 non-null float64
vertical_leap_inches    7794 non-null float64
broad_jump_inches       7648 non-null float64
3_cone_drill            4293 non-null float64
20_yard_shuttle         6943 non-null float64
60_yard_shuttle         3076 non-null float64
dtypes: float64(11), int64(2), object(5)
memory usage: 1.3+ MB


## Regression Imputation