# Data Cleaning

## Import packages

In [495]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import re
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
%matplotlib inline

Read in csv and examine data

In [616]:
combine = pd.read_csv('../data/nfl_combine_data.csv')

In [617]:
# Look at descriptive stats
print(combine.shape, '\n')
print(combine.info())
combine.describe().T

(9950, 17) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9950 entries, 0 to 9949
Data columns (total 17 columns):
full_name               9950 non-null object
first_name              9950 non-null object
last_name               9950 non-null object
combine_year            9950 non-null int64
college                 9950 non-null object
position                9950 non-null object
height_inches           9950 non-null float64
weight_lbs              9950 non-null int64
hand_size_inches        8400 non-null float64
arm_length_inches       8082 non-null float64
40_yard_dash            9073 non-null float64
bench_press_reps        6779 non-null float64
vertical_leap_inches    8050 non-null float64
broad_jump_inches       7903 non-null float64
3_cone_drill            4518 non-null float64
20_yard_shuttle         7176 non-null float64
60_yard_shuttle         3169 non-null float64
dtypes: float64(10), int64(2), object(5)
memory usage: 1.3+ MB
None


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
combine_year,9950.0,2002.204824,9.302148,1987.0,1994.0,2002.0,2011.0,2017.0
height_inches,9950.0,73.73564,2.645112,64.9,71.88,74.0,75.75,82.4
weight_lbs,9950.0,240.282513,45.046373,142.0,203.0,232.0,275.0,387.0
hand_size_inches,8400.0,9.528956,0.629254,7.13,9.13,9.5,10.0,11.88
arm_length_inches,8082.0,32.221633,1.49868,25.63,31.25,32.25,33.25,38.5
40_yard_dash,9073.0,4.830682,0.309814,4.21,4.59,4.76,5.05,6.12
bench_press_reps,6779.0,19.833014,6.540305,1.0,15.0,20.0,24.0,51.0
vertical_leap_inches,8050.0,32.001801,4.203431,17.5,29.0,32.0,35.0,46.0
broad_jump_inches,7903.0,112.30558,9.306169,7.0,106.0,113.0,119.0,147.0
3_cone_drill,4518.0,7.346076,0.446751,6.34,7.01,7.26,7.62,9.61


3_cone_drill and 60_yard_shuttle have many missing values

In [498]:
combine.head()

Unnamed: 0,full_name,first_name,last_name,combine_year,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle
0,jamal_adams,jamal,adams,2017,louisiana_state,db,71.63,214,9.25,33.38,4.56,18.0,31.5,120.0,6.96,4.13,
1,montravius_adams,montravius,adams,2017,auburn,dl,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,,
2,rodney_adams,rodney,adams,2017,south_florida,wr,73.25,189,9.0,32.0,4.44,8.0,29.5,125.0,6.98,4.28,11.39
3,quincy_adeboyejo,quincy,adeboyejo,2017,mississippi,wr,74.75,197,9.38,31.75,4.42,8.0,34.5,123.0,6.73,4.14,
4,brian_allen,brian,allen,2017,utah,db,74.88,215,10.0,34.0,4.48,15.0,34.5,117.0,6.64,4.34,


In [499]:
# Add full name as a column
combine['player_name'] = combine.first_name + ' ' + combine.last_name

In [500]:
# Check years of data
combine.combine_year.unique()

array([2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007,
       2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996,
       1995, 1994, 1993, 1992, 1991, 1990, 1989, 1988, 1987])

In [501]:
combine.groupby('combine_year').count()

Unnamed: 0_level_0,full_name,first_name,last_name,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle,player_name
combine_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1987,267,267,267,267,267,267,267,237,237,237,215,237,233,0,223,142,267
1988,322,322,322,322,322,322,322,316,1,272,277,264,260,0,249,159,322
1989,308,308,308,308,308,308,308,0,0,270,208,258,255,0,248,157,308
1990,324,324,324,324,324,324,324,314,314,287,207,277,279,0,257,147,324
1991,428,428,428,428,428,428,428,426,426,371,297,369,369,0,349,211,428
1992,438,438,438,438,438,438,438,421,421,364,288,366,358,0,340,197,438
1993,338,338,338,338,338,338,338,337,337,270,223,267,261,0,248,153,338
1994,315,315,315,315,315,315,315,315,315,249,181,245,246,0,229,128,315
1995,310,310,310,310,310,310,310,309,309,218,179,212,205,0,188,105,310
1996,296,296,296,296,296,296,296,296,296,222,164,226,218,0,205,116,296


In [502]:
combine.loc[combine.combine_year == 2015].isnull().sum()

full_name                 0
first_name                0
last_name                 0
combine_year              0
college                   0
position                  0
height_inches             0
weight_lbs                0
hand_size_inches        428
arm_length_inches       429
40_yard_dash              5
bench_press_reps        517
vertical_leap_inches    485
broad_jump_inches       486
3_cone_drill            516
20_yard_shuttle         508
60_yard_shuttle         648
player_name               0
dtype: int64

## Addressing 2015 data

- 2015 has 741 unique attendees. There are only 320, so we need to get rid of the extra attendees in our dataset.
- Load in actual attendee data
- Split first and last names into 2 columns
- Remove special characters
- Make everything lower case
- Inner join original combine and attendee data
- Drop 2015 rows from original combine data
- Concat joined result with combine data

In [503]:
combine = pd.read_csv('../data/nfl_combine_data.csv')
combine['player_name'] = combine.first_name + ' ' + combine.last_name

In [504]:
# Read in 2015 data with 320 attendees
df15 = pd.read_excel('../data/2015_combine_data.xlsx')
# drop unnecessary rows
cols = ['Player','Year','Pos','School']
df15 = df15[cols]
df15.rename(columns ={'Player':'player_name','Year':'combine_year', 'Pos':'position', 'School': 'college'},inplace=True)

In [505]:
combine15 = combine.loc[combine.combine_year == 2015]

In [506]:
df15.head()

Unnamed: 0,player_name,combine_year,position,college
0,Ameer Abdullah,2015,RB,Nebraska
1,Nelson Agholor,2015,WR,USC
2,Jay Ajayi,2015,RB,Boise St.
3,Kwon Alexander,2015,OLB,LSU
4,Mario Alford,2015,WR,West Virginia


In [507]:
combine15.head()

Unnamed: 0,full_name,first_name,last_name,combine_year,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle,player_name
638,ameer_abdullah,ameer,abdullah,2015,nebraska,rb,68.75,205,8.63,30.0,4.6,24.0,42.5,130.0,6.79,3.95,11.18,ameer abdullah
639,nelson_agholor,nelson,agholor,2015,southern_california,wr,72.13,198,9.25,32.25,4.42,12.0,,,,,,nelson agholor
640,malcolm_agnew,malcolm,agnew,2015,southern_illinois,rb,70.0,202,,,4.59,,,,,,,malcolm agnew
641,jay_ajayi,jay,ajayi,2015,boise_state,rb,71.75,221,10.0,32.0,4.57,19.0,39.0,121.0,7.1,4.1,11.1,jay ajayi
642,brandon_alexander,brandon,alexander,2015,central_florida,db,74.0,195,,,4.59,,,,,,,brandon alexander


## Cleaning Functions

In [508]:
regex = re.compile("[@_!#$%^&*()<>?/\|}{~:`'']") 

In [527]:
# Remove special characters
def remove_special(df):
    df = df.applymap(lambda x: re.sub("[@_!#$%^&*.<>?/\|}{~:`'']", '', x) if type(x) == str else x)

# Make text lowercase
def lowercase(df, col):
    df[col] = df[col].apply(lambda x: x.lower())

def clean_name(df, col):
    df[['first_name','last_name']] = df[col].str.split(" ",n=1,expand=True)

# Drop '60 yard shuttle'
def drop_60_shuttle(df):
    df.drop('60_yard_shuttle', axis=1, inplace=True)

# Add 'events_missed' columns
def drills_missed(df):
    drills = ['hand_size_inches'
               ,'arm_length_inches'
               ,'40_yard_dash'
               ,'bench_press_reps'
               ,'vertical_leap_inches'
               ,'broad_jump_inches'
               ,'3_cone_drill'
               ,'20_yard_shuttle'
               ,'60_yard_shuttle']
    for drill in drills:
        df[drill+'_missed'] = df[drill].apply(lambda x: 1 if math.isnan(x) else 0)
    
# Add 'conference' column

# Regression imputation for missing values

In [528]:
replace_5(combine15)
remove_special(combine15) # doesn't work
lowercase(combine15,'player_name')
clean_name(combine15,'player_name')

In [511]:
remove_special(df15) # doesn't work
lowercase(df15, 'player_name')
clean_name(df15,'player_name')

In [512]:
df15 = df15.applymap(lambda x: re.sub("[@_!#$%^&*.<>?/\|}{~:`'']", '', x) if type(x) == str else x)
combine15 = combine15.applymap(lambda x: re.sub("[@_!#$%^&*.<>?/\|}{~:`'']", '', x) if type(x) == str else x)

In [513]:
print(df15[~df15.player_name.isin(combine15.player_name)].player_name.unique())
len(df15[~df15.player_name.isin(combine15.player_name)].player_name.unique())

['trenton brown' 'michael burton' 'yannick cudjoe-virgil'
 'durell eskridge' 'mario edwards, jr' 'donatella luckett'
 'zack wagenmann']


7

In [516]:
merged_df = pd.merge(df15, combine15, on = ['first_name', 'last_name', 'combine_year'])

In [517]:
merged_df = merged_df.drop(['position_x', 'player_name_y', 'college_x'],axis=1)
merged_df.rename(columns = {'player_name_x':'player_name', 'college_y':'college','position_y':'position'}, inplace=True)

In [518]:
combine.head()

Unnamed: 0,full_name,first_name,last_name,combine_year,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle,player_name
0,jamal_adams,jamal,adams,2017,louisiana_state,db,71.63,214,9.25,33.38,4.56,18.0,31.5,120.0,6.96,4.13,,jamal adams
1,montravius_adams,montravius,adams,2017,auburn,dl,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,,,montravius adams
2,rodney_adams,rodney,adams,2017,south_florida,wr,73.25,189,9.0,32.0,4.44,8.0,29.5,125.0,6.98,4.28,11.39,rodney adams
3,quincy_adeboyejo,quincy,adeboyejo,2017,mississippi,wr,74.75,197,9.38,31.75,4.42,8.0,34.5,123.0,6.73,4.14,,quincy adeboyejo
4,brian_allen,brian,allen,2017,utah,db,74.88,215,10.0,34.0,4.48,15.0,34.5,117.0,6.64,4.34,,brian allen


In [519]:
# cols = combine
merged_df = merged_df[['combine_year', 'player_name','first_name', 'last_name', 'college', 'position', 'height_inches',
       'weight_lbs', 'hand_size_inches', 'arm_length_inches', '40_yard_dash',
       'bench_press_reps', 'vertical_leap_inches', 'broad_jump_inches',
       '3_cone_drill', '20_yard_shuttle', '60_yard_shuttle']]

combine = combine[['combine_year', 'player_name','first_name', 'last_name', 'college', 'position', 'height_inches',
       'weight_lbs', 'hand_size_inches', 'arm_length_inches', '40_yard_dash',
       'bench_press_reps', 'vertical_leap_inches', 'broad_jump_inches',
       '3_cone_drill', '20_yard_shuttle', '60_yard_shuttle']]

In [520]:
# drop 2015 rows from combine df
index_drop_list = combine.loc[combine.combine_year == 2015].index

In [521]:
combine = combine.drop(index_drop_list)

In [522]:
# Union datasets together
combine = pd.concat([combine,merged_df])

In [523]:
# 2 repeated names
len(combine.loc[combine.combine_year == 2015].player_name.unique())

314

In [524]:
# Kevin White repeated 4 times
combine.loc[combine.combine_year == 2015].groupby('player_name').count().sort_values('combine_year', ascending = False).head()

Unnamed: 0_level_0,combine_year,first_name,last_name,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
kevin white,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
aaron davis,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
lorenzo mauldin,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
marcus murphy,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0
marcus mariota,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0


In [525]:
combine.loc[combine.player_name == 'kevin white']
kw_index = combine.loc[combine.player_name == 'kevin white'].index[0::2] # index 298, 300

In [526]:
combine.drop(kw_index, inplace=True)
combine.reset_index(drop=True, inplace=True)
combine.shape

(9522, 17)

## Adding Events Missed Information

In [529]:
combine.loc[combine.hand_size_inches.isnull() == True]

Unnamed: 0,combine_year,player_name,first_name,last_name,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle
7,2017,gavin andrews,gavin,andrews,oregon_state,ol,77.00,340,,,,,,,,5.09,
10,2017,antony auclair,antony,auclair,laval,fb_te,78.00,254,,,,,,,,4.45,12.08
20,2017,collin bevins,collin,bevins,northwest_missouri_state,dl,78.00,285,,,,,,,,4.39,
61,2017,jeremy cutrer,jeremy,cutrer,middle_tennessee_state,lb,74.00,170,,,,,,,,4.26,
86,2017,darius english,darius,english,south_carolina,lb,78.00,245,,,,,,,,4.94,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9296,2015,alvin dupree,alvin,dupree,kentucky,dl,76.00,269,,,4.56,,42.0,138.0,,,
9372,2015,will johnson,will,johnson,texasstate,kp,73.88,206,,,5.02,,,,,,
9388,2015,dez lewis,dez,lewis,centralarkansas,wr,75.75,214,,,4.58,16.0,33.5,119.0,7.11,4.28,
9399,2015,justin manton,justin,manton,louisiana-monroe,kp,74.75,196,,,4.93,,,,,,


In [530]:
combine.isnull().sum()

combine_year               0
player_name                0
first_name                 0
last_name                  0
college                    0
position                   0
height_inches              0
weight_lbs                 0
hand_size_inches        1131
arm_length_inches       1449
40_yard_dash             871
bench_press_reps        2756
vertical_leap_inches    1485
broad_jump_inches       1632
3_cone_drill            5015
20_yard_shuttle         2359
60_yard_shuttle         6355
dtype: int64

In [531]:
# Use previously defined function
drills_missed(combine)
combine.head()

Unnamed: 0,combine_year,player_name,first_name,last_name,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,60_yard_shuttle_missed
0,2017,jamal adams,jamal,adams,louisiana_state,db,71.63,214,9.25,33.38,4.56,18.0,31.5,120.0,6.96,4.13,,0,0,0,0,0,0,0,0,1
1,2017,montravius adams,montravius,adams,auburn,dl,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,,,0,0,0,0,0,0,0,1,1
2,2017,rodney adams,rodney,adams,south_florida,wr,73.25,189,9.0,32.0,4.44,8.0,29.5,125.0,6.98,4.28,11.39,0,0,0,0,0,0,0,0,0
3,2017,quincy adeboyejo,quincy,adeboyejo,mississippi,wr,74.75,197,9.38,31.75,4.42,8.0,34.5,123.0,6.73,4.14,,0,0,0,0,0,0,0,0,1
4,2017,brian allen,brian,allen,utah,db,74.88,215,10.0,34.0,4.48,15.0,34.5,117.0,6.64,4.34,,0,0,0,0,0,0,0,0,1


In [535]:
combine.to_csv('../data/nfl_combine2.csv')

## Regression Imputation

Since there are a quite a few combine attendees who did not participate in all the drills, we will need to impute those values by regression imputation. We can assume these values are not missing at random, so a simple imputation is not viable. By segmenting the data by position and using regression imputation, a more realistic value can be obtained.

In [541]:
from sklearn.linear_model import LinearRegression
from sklearn.impute import KNNImputer

In [542]:
drills = ['hand_size_inches'
               ,'arm_length_inches'
               ,'40_yard_dash'
               ,'bench_press_reps'
               ,'vertical_leap_inches'
               ,'broad_jump_inches'
               ,'3_cone_drill'
               ,'20_yard_shuttle'
               ,'60_yard_shuttle']

In [543]:
# player counts by position
combine.groupby('position').count().player_name

position
db        709
dl       1454
fb_te     766
fbte       22
g          19
k_p       113
kp         12
lb       2087
ls          5
ol       1623
qb        553
rb        893
wr       1266
Name: player_name, dtype: int64

In [794]:
# Take note of columns with missing data:
for pos in ['qb','rb','fb_te','wr', 'ol']:
    print(pos)
    print(combine.loc[combine.position == pos, drills].shape, '\n')
    print(combine.loc[combine.position == pos, drills].isnull().sum(), '\n', '\n')
    
def missing_drills(df,pos):
    print(pos)
    print(df.loc[df.position == pos, drills].shape, '\n')
    print(df.loc[df.position == pos, drills].isnull().sum(), '\n', '\n')

qb
(580, 9) 

hand_size_inches         72
arm_length_inches       116
40_yard_dash             55
bench_press_reps        547
vertical_leap_inches    108
broad_jump_inches       107
3_cone_drill            280
20_yard_shuttle         126
60_yard_shuttle         576
dtype: int64 
 

rb
(919, 9) 

hand_size_inches        123
arm_length_inches       157
40_yard_dash            103
bench_press_reps        204
vertical_leap_inches    166
broad_jump_inches       181
3_cone_drill            549
20_yard_shuttle         317
60_yard_shuttle         491
dtype: int64 
 

fb_te
(824, 9) 

hand_size_inches        126
arm_length_inches       148
40_yard_dash             68
bench_press_reps        165
vertical_leap_inches    145
broad_jump_inches       155
3_cone_drill            452
20_yard_shuttle         208
60_yard_shuttle         327
dtype: int64 
 

wr
(1321, 9) 

hand_size_inches        211
arm_length_inches       260
40_yard_dash             95
bench_press_reps        892
vertical_leap_inches 

Too many players skipped the 60 yard shuttle, so it makes sense the drop the column as a whole.

In [540]:
# use earlier defined function
drop_60_shuttle(combine)

### Linemen Imputation

In [793]:
merged_data.shape

(9544, 29)

In [792]:
merged_data = pd.read_csv('../data/nfl_merged_data.csv')
merged_data.head()

Unnamed: 0,player_name,first_name,last_name,college,position,combine_year,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,round,pick,team,draft_status,draft_value
0,jamal_adams,jamal,adams,louisiana_state,db,2017,71.63,214,9.25,33.38,4.56,18.0,31.5,120.0,6.96,4.13,0,0,0,0,0,0,0,0,1,6,nyj,1,1
1,montravius_adams,montravius,adams,auburn,dl,2017,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,,0,0,0,0,0,0,0,1,3,93,gb,1,2
2,rodney_adams,rodney,adams,south_florida,wr,2017,73.25,189,9.0,32.0,4.44,8.0,29.5,125.0,6.98,4.28,0,0,0,0,0,0,0,0,5,170,min,1,3
3,quincy_adeboyejo,quincy,adeboyejo,mississippi,wr,2017,74.75,197,9.38,31.75,4.42,8.0,34.5,123.0,6.73,4.14,0,0,0,0,0,0,0,0,13,337,udfa,0,5
4,brian_allen,brian,allen,utah,db,2017,74.88,215,10.0,34.0,4.48,15.0,34.5,117.0,6.64,4.34,0,0,0,0,0,0,0,0,5,173,pit,1,3


In [758]:
linemen = merged_data.drop(['round','pick','team','draft_value'],axis=1).loc[(merged_data.position == 'dl') |(merged_data.position == 'ol')]
print(linemen.shape,'\n')
# Must reset the index or else merging the KNN imputation will not work 
linemen = linemen.reset_index(drop=True)
linemen.head()

(3098, 25) 



Unnamed: 0,player_name,first_name,last_name,college,position,combine_year,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,draft_status
0,montravius_adams,montravius,adams,auburn,dl,2017,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,,0,0,0,0,0,0,0,1,1
1,jonathan_allen,jonathan,allen,alabama,dl,2017,74.63,286,9.38,33.63,5.0,21.0,30.0,108.0,7.49,4.44,0,0,0,0,0,0,0,0,1
2,gavin_andrews,gavin,andrews,oregon_state,ol,2017,77.0,340,,,,,,,,5.09,1,1,1,1,1,1,1,0,0
3,isaac_asiata,isaac,asiata,utah,ol,2017,75.13,323,10.38,33.75,5.34,35.0,25.5,102.0,7.83,4.93,0,0,0,0,0,0,0,0,1
4,erik_austell,erik,austell,charleston_southern,ol,2017,75.13,301,9.0,32.0,5.23,24.0,27.5,107.0,8.13,4.9,0,0,0,0,0,0,0,0,0


Start with a KNNImputer as a baseline. This will give us pseudo values to use in our regression model.

In [796]:
# Call our earlier function
missing_drills(df,'ol')
missing_drills(df,'dl')

ol
(1642, 9) 

hand_size_inches         199
arm_length_inches        249
40_yard_dash             112
bench_press_reps         233
vertical_leap_inches     234
broad_jump_inches        287
3_cone_drill             800
20_yard_shuttle          302
60_yard_shuttle         1642
dtype: int64 
 

dl
(1456, 9) 

hand_size_inches         171
arm_length_inches        205
40_yard_dash             105
bench_press_reps         237
vertical_leap_inches     226
broad_jump_inches        259
3_cone_drill             703
20_yard_shuttle          336
60_yard_shuttle         1456
dtype: int64 
 



Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [760]:
# include only numerical data
print(linemen.columns)
linemen = linemen.select_dtypes(include=['int','float'])

Index(['player_name', 'first_name', 'last_name', 'college', 'position',
       'combine_year', 'height_inches', 'weight_lbs', 'hand_size_inches',
       'arm_length_inches', '40_yard_dash', 'bench_press_reps',
       'vertical_leap_inches', 'broad_jump_inches', '3_cone_drill',
       '20_yard_shuttle', 'hand_size_inches_missed',
       'arm_length_inches_missed', '40_yard_dash_missed',
       'bench_press_reps_missed', 'vertical_leap_inches_missed',
       'broad_jump_inches_missed', '3_cone_drill_missed',
       '20_yard_shuttle_missed', 'draft_status'],
      dtype='object')


Although many values are missing for the 3 cone drill, it is known to be a very important evaluation metric for linemen since it measures their short-space agility.

Start with KNNImputer as a baseline.

In [761]:
# Only independent variables
linemen_X = linemen.drop('draft_status',axis=1)

In [762]:
imputer = KNNImputer(n_neighbors=5, copy=True)
knn_df = pd.DataFrame(imputer.fit_transform(linemen_X))

In [763]:
# list of the columns with missing values
missing_cols = linemen.isnull().sum()[linemen.isnull().sum() > 0].index
missing_cols

Index(['hand_size_inches', 'arm_length_inches', '40_yard_dash',
       'bench_press_reps', 'vertical_leap_inches', 'broad_jump_inches',
       '3_cone_drill', '20_yard_shuttle'],
      dtype='object')

In [764]:
linemen_X

Unnamed: 0,combine_year,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed
0,2017,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,,0,0,0,0,0,0,0,1
1,2017,74.63,286,9.38,33.63,5.00,21.0,30.0,108.0,7.49,4.44,0,0,0,0,0,0,0,0
2,2017,77.00,340,,,,,,,,5.09,1,1,1,1,1,1,1,0
3,2017,75.13,323,10.38,33.75,5.34,35.0,25.5,102.0,7.83,4.93,0,0,0,0,0,0,0,0
4,2017,75.13,301,9.00,32.00,5.23,24.0,27.5,107.0,8.13,4.90,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3093,2015,73.00,290,10.13,31.88,4.96,,32.0,105.0,7.40,4.30,0,0,0,1,0,0,0,0
3094,2015,76.88,319,10.25,32.25,5.25,25.0,27.0,103.0,7.91,4.78,0,0,0,0,0,0,0,0
3095,2015,77.00,327,9.75,35.00,5.34,27.0,26.0,97.0,,5.15,0,0,0,0,0,0,1,0
3096,2015,76.63,302,10.63,34.63,4.97,,29.5,106.0,7.59,4.53,0,0,0,1,0,0,0,0


In [765]:
knn_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,2017.0,75.63,304.0,9.250,32.750,4.87,22.0,29.0,108.0,7.620,4.602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2017.0,74.63,286.0,9.380,33.630,5.00,21.0,30.0,108.0,7.490,4.440,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2017.0,77.00,340.0,9.978,34.328,5.40,25.6,27.7,102.8,8.016,5.090,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,2017.0,75.13,323.0,10.380,33.750,5.34,35.0,25.5,102.0,7.830,4.930,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2017.0,75.13,301.0,9.000,32.000,5.23,24.0,27.5,107.0,8.130,4.900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3093,2015.0,73.00,290.0,10.130,31.880,4.96,22.8,32.0,105.0,7.400,4.300,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3094,2015.0,76.88,319.0,10.250,32.250,5.25,25.0,27.0,103.0,7.910,4.780,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3095,2015.0,77.00,327.0,9.750,35.000,5.34,27.0,26.0,97.0,8.170,5.150,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3096,2015.0,76.63,302.0,10.630,34.630,4.97,25.0,29.5,106.0,7.590,4.530,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [766]:
missing_cols

Index(['hand_size_inches', 'arm_length_inches', '40_yard_dash',
       'bench_press_reps', 'vertical_leap_inches', 'broad_jump_inches',
       '3_cone_drill', '20_yard_shuttle'],
      dtype='object')

In [768]:
# creating imputed values columns
for index, col in enumerate(linemen_X.columns):
    if col in missing_cols:
        print(index, col, 'MISSING')
        linemen[col + '_imp'] = knn_df[index]
    else:
        print(index,col)
        continue

0 combine_year
1 height_inches
2 weight_lbs
3 hand_size_inches MISSING
4 arm_length_inches MISSING
5 40_yard_dash MISSING
6 bench_press_reps MISSING
7 vertical_leap_inches MISSING
8 broad_jump_inches MISSING
9 3_cone_drill MISSING
10 20_yard_shuttle MISSING
11 hand_size_inches_missed
12 arm_length_inches_missed
13 40_yard_dash_missed
14 bench_press_reps_missed
15 vertical_leap_inches_missed
16 broad_jump_inches_missed
17 3_cone_drill_missed
18 20_yard_shuttle_missed


In [769]:
linemen

Unnamed: 0,combine_year,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,draft_status,hand_size_inches_imp,arm_length_inches_imp,40_yard_dash_imp,bench_press_reps_imp,vertical_leap_inches_imp,broad_jump_inches_imp,3_cone_drill_imp,20_yard_shuttle_imp
0,2017,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,,0,0,0,0,0,0,0,1,1,9.250,32.750,4.87,22.0,29.0,108.0,7.620,4.602
1,2017,74.63,286,9.38,33.63,5.00,21.0,30.0,108.0,7.49,4.44,0,0,0,0,0,0,0,0,1,9.380,33.630,5.00,21.0,30.0,108.0,7.490,4.440
2,2017,77.00,340,,,,,,,,5.09,1,1,1,1,1,1,1,0,0,9.978,34.328,5.40,25.6,27.7,102.8,8.016,5.090
3,2017,75.13,323,10.38,33.75,5.34,35.0,25.5,102.0,7.83,4.93,0,0,0,0,0,0,0,0,1,10.380,33.750,5.34,35.0,25.5,102.0,7.830,4.930
4,2017,75.13,301,9.00,32.00,5.23,24.0,27.5,107.0,8.13,4.90,0,0,0,0,0,0,0,0,0,9.000,32.000,5.23,24.0,27.5,107.0,8.130,4.900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3093,2015,73.00,290,10.13,31.88,4.96,,32.0,105.0,7.40,4.30,0,0,0,1,0,0,0,0,0,10.130,31.880,4.96,22.8,32.0,105.0,7.400,4.300
3094,2015,76.88,319,10.25,32.25,5.25,25.0,27.0,103.0,7.91,4.78,0,0,0,0,0,0,0,0,1,10.250,32.250,5.25,25.0,27.0,103.0,7.910,4.780
3095,2015,77.00,327,9.75,35.00,5.34,27.0,26.0,97.0,,5.15,0,0,0,0,0,0,1,0,1,9.750,35.000,5.34,27.0,26.0,97.0,8.170,5.150
3096,2015,76.63,302,10.63,34.63,4.97,,29.5,106.0,7.59,4.53,0,0,0,1,0,0,0,0,1,10.630,34.630,4.97,25.0,29.5,106.0,7.590,4.530


In [770]:
model_df = linemen.drop('combine_year',axis=1)
model_df = model_df.drop(missing_cols,axis=1)

In [771]:
model_df

Unnamed: 0,height_inches,weight_lbs,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,draft_status,hand_size_inches_imp,arm_length_inches_imp,40_yard_dash_imp,bench_press_reps_imp,vertical_leap_inches_imp,broad_jump_inches_imp,3_cone_drill_imp,20_yard_shuttle_imp
0,75.63,304,0,0,0,0,0,0,0,1,1,9.250,32.750,4.87,22.0,29.0,108.0,7.620,4.602
1,74.63,286,0,0,0,0,0,0,0,0,1,9.380,33.630,5.00,21.0,30.0,108.0,7.490,4.440
2,77.00,340,1,1,1,1,1,1,1,0,0,9.978,34.328,5.40,25.6,27.7,102.8,8.016,5.090
3,75.13,323,0,0,0,0,0,0,0,0,1,10.380,33.750,5.34,35.0,25.5,102.0,7.830,4.930
4,75.13,301,0,0,0,0,0,0,0,0,0,9.000,32.000,5.23,24.0,27.5,107.0,8.130,4.900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3093,73.00,290,0,0,0,1,0,0,0,0,0,10.130,31.880,4.96,22.8,32.0,105.0,7.400,4.300
3094,76.88,319,0,0,0,0,0,0,0,0,1,10.250,32.250,5.25,25.0,27.0,103.0,7.910,4.780
3095,77.00,327,0,0,0,0,0,0,1,0,1,9.750,35.000,5.34,27.0,26.0,97.0,8.170,5.150
3096,76.63,302,0,0,0,1,0,0,0,0,1,10.630,34.630,4.97,25.0,29.5,106.0,7.590,4.530


In [772]:
missing_cols

Index(['hand_size_inches', 'arm_length_inches', '40_yard_dash',
       'bench_press_reps', 'vertical_leap_inches', 'broad_jump_inches',
       '3_cone_drill', '20_yard_shuttle'],
      dtype='object')

In [774]:
linemen.head()

Unnamed: 0,combine_year,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,draft_status,hand_size_inches_imp,arm_length_inches_imp,40_yard_dash_imp,bench_press_reps_imp,vertical_leap_inches_imp,broad_jump_inches_imp,3_cone_drill_imp,20_yard_shuttle_imp
0,2017,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,,0,0,0,0,0,0,0,1,1,9.25,32.75,4.87,22.0,29.0,108.0,7.62,4.602
1,2017,74.63,286,9.38,33.63,5.0,21.0,30.0,108.0,7.49,4.44,0,0,0,0,0,0,0,0,1,9.38,33.63,5.0,21.0,30.0,108.0,7.49,4.44
2,2017,77.0,340,,,,,,,,5.09,1,1,1,1,1,1,1,0,0,9.978,34.328,5.4,25.6,27.7,102.8,8.016,5.09
3,2017,75.13,323,10.38,33.75,5.34,35.0,25.5,102.0,7.83,4.93,0,0,0,0,0,0,0,0,1,10.38,33.75,5.34,35.0,25.5,102.0,7.83,4.93
4,2017,75.13,301,9.0,32.0,5.23,24.0,27.5,107.0,8.13,4.9,0,0,0,0,0,0,0,0,0,9.0,32.0,5.23,24.0,27.5,107.0,8.13,4.9


In [775]:
# for for pos in ['qb','rb','fb_te','wr', 'ol']:
    for feature in missing_cols:
        X = model_df.drop(feature+'_imp', axis=1)
        y = model_df[feature+'_imp']

        model = LinearRegression()
        model.fit(X,y)

        linemen.loc[linemen[feature].isnull(), feature] = model.predict(linemen[model_df.columns].drop(feature+'_imp', axis=1))[linemen[feature].isnull()]

# new_df[feature + '_imp'] = model.predict(X,y)
    


In [779]:
linemen.head()

Unnamed: 0,combine_year,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,draft_status,hand_size_inches_imp,arm_length_inches_imp,40_yard_dash_imp,bench_press_reps_imp,vertical_leap_inches_imp,broad_jump_inches_imp,3_cone_drill_imp,20_yard_shuttle_imp
0,2017,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,4.5172,0,0,0,0,0,0,0,1,1,9.25,32.75,4.87,22.0,29.0,108.0,7.62,4.602
1,2017,74.63,286,9.38,33.63,5.0,21.0,30.0,108.0,7.49,4.44,0,0,0,0,0,0,0,0,1,9.38,33.63,5.0,21.0,30.0,108.0,7.49,4.44
2,2017,77.0,340,10.11866,33.870491,5.430197,27.21122,25.812246,98.570883,8.473006,5.09,1,1,1,1,1,1,1,0,0,9.978,34.328,5.4,25.6,27.7,102.8,8.016,5.09
3,2017,75.13,323,10.38,33.75,5.34,35.0,25.5,102.0,7.83,4.93,0,0,0,0,0,0,0,0,1,10.38,33.75,5.34,35.0,25.5,102.0,7.83,4.93
4,2017,75.13,301,9.0,32.0,5.23,24.0,27.5,107.0,8.13,4.9,0,0,0,0,0,0,0,0,0,9.0,32.0,5.23,24.0,27.5,107.0,8.13,4.9


In [782]:
final_ol_df = linemen.iloc[:,:-8].drop('combine_year',axis=1)

In [783]:
final_ol_df.head()

Unnamed: 0,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,draft_status
0,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,4.5172,0,0,0,0,0,0,0,1,1
1,74.63,286,9.38,33.63,5.0,21.0,30.0,108.0,7.49,4.44,0,0,0,0,0,0,0,0,1
2,77.0,340,10.11866,33.870491,5.430197,27.21122,25.812246,98.570883,8.473006,5.09,1,1,1,1,1,1,1,0,0
3,75.13,323,10.38,33.75,5.34,35.0,25.5,102.0,7.83,4.93,0,0,0,0,0,0,0,0,1
4,75.13,301,9.0,32.0,5.23,24.0,27.5,107.0,8.13,4.9,0,0,0,0,0,0,0,0,0


Upload dataset ready for modeling:

In [784]:
final_ol_df.to_csv('../data/nfl_linemen_data.csv')

In [802]:
final_ol_df.loc[(final_ol_df.weight_lbs == 329) & (final_ol_df['40_yard_dash'] == 5.42)]

Unnamed: 0,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,draft_status
197,78.75,329,9.93903,34.412607,5.42,17.0,23.0,92.0,8.36,5.12,1,1,0,0,0,0,0,0,0


In [805]:
fld.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3098 entries, 0 to 3097
Data columns (total 19 columns):
height_inches                  3098 non-null float64
weight_lbs                     3098 non-null int64
hand_size_inches               3098 non-null float64
arm_length_inches              3098 non-null float64
40_yard_dash                   3098 non-null float64
bench_press_reps               3098 non-null float64
vertical_leap_inches           3098 non-null float64
broad_jump_inches              3098 non-null float64
3_cone_drill                   3098 non-null float64
20_yard_shuttle                3098 non-null float64
hand_size_inches_missed        3098 non-null int64
arm_length_inches_missed       3098 non-null int64
40_yard_dash_missed            3098 non-null int64
bench_press_reps_missed        3098 non-null int64
vertical_leap_inches_missed    3098 non-null int64
broad_jump_inches_missed       3098 non-null int64
3_cone_drill_missed            3098 non-null int64
20_yard_