# Data Cleaning

## Import packages

In [495]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import re
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
%matplotlib inline

Read in csv and examine data

In [496]:
combine = pd.read_csv('../data/nfl_combine_data.csv')

In [497]:
# Look at descriptive stats
print(combine.shape, '\n')
print(combine.info())
combine.describe().T

(9950, 17) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9950 entries, 0 to 9949
Data columns (total 17 columns):
full_name               9950 non-null object
first_name              9950 non-null object
last_name               9950 non-null object
combine_year            9950 non-null int64
college                 9950 non-null object
position                9950 non-null object
height_inches           9950 non-null float64
weight_lbs              9950 non-null int64
hand_size_inches        8400 non-null float64
arm_length_inches       8082 non-null float64
40_yard_dash            9073 non-null float64
bench_press_reps        6779 non-null float64
vertical_leap_inches    8050 non-null float64
broad_jump_inches       7903 non-null float64
3_cone_drill            4518 non-null float64
20_yard_shuttle         7176 non-null float64
60_yard_shuttle         3169 non-null float64
dtypes: float64(10), int64(2), object(5)
memory usage: 1.3+ MB
None


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
combine_year,9950.0,2002.204824,9.302148,1987.0,1994.0,2002.0,2011.0,2017.0
height_inches,9950.0,73.73564,2.645112,64.9,71.88,74.0,75.75,82.4
weight_lbs,9950.0,240.282513,45.046373,142.0,203.0,232.0,275.0,387.0
hand_size_inches,8400.0,9.528956,0.629254,7.13,9.13,9.5,10.0,11.88
arm_length_inches,8082.0,32.221633,1.49868,25.63,31.25,32.25,33.25,38.5
40_yard_dash,9073.0,4.830682,0.309814,4.21,4.59,4.76,5.05,6.12
bench_press_reps,6779.0,19.833014,6.540305,1.0,15.0,20.0,24.0,51.0
vertical_leap_inches,8050.0,32.001801,4.203431,17.5,29.0,32.0,35.0,46.0
broad_jump_inches,7903.0,112.30558,9.306169,7.0,106.0,113.0,119.0,147.0
3_cone_drill,4518.0,7.346076,0.446751,6.34,7.01,7.26,7.62,9.61


3_cone_drill and 60_yard_shuttle have many missing values

In [498]:
combine.head()

Unnamed: 0,full_name,first_name,last_name,combine_year,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle
0,jamal_adams,jamal,adams,2017,louisiana_state,db,71.63,214,9.25,33.38,4.56,18.0,31.5,120.0,6.96,4.13,
1,montravius_adams,montravius,adams,2017,auburn,dl,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,,
2,rodney_adams,rodney,adams,2017,south_florida,wr,73.25,189,9.0,32.0,4.44,8.0,29.5,125.0,6.98,4.28,11.39
3,quincy_adeboyejo,quincy,adeboyejo,2017,mississippi,wr,74.75,197,9.38,31.75,4.42,8.0,34.5,123.0,6.73,4.14,
4,brian_allen,brian,allen,2017,utah,db,74.88,215,10.0,34.0,4.48,15.0,34.5,117.0,6.64,4.34,


In [499]:
# Add full name as a column
combine['player_name'] = combine.first_name + ' ' + combine.last_name

In [500]:
# Check years of data
combine.combine_year.unique()

array([2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007,
       2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996,
       1995, 1994, 1993, 1992, 1991, 1990, 1989, 1988, 1987])

In [501]:
combine.groupby('combine_year').count()

Unnamed: 0_level_0,full_name,first_name,last_name,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle,player_name
combine_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1987,267,267,267,267,267,267,267,237,237,237,215,237,233,0,223,142,267
1988,322,322,322,322,322,322,322,316,1,272,277,264,260,0,249,159,322
1989,308,308,308,308,308,308,308,0,0,270,208,258,255,0,248,157,308
1990,324,324,324,324,324,324,324,314,314,287,207,277,279,0,257,147,324
1991,428,428,428,428,428,428,428,426,426,371,297,369,369,0,349,211,428
1992,438,438,438,438,438,438,438,421,421,364,288,366,358,0,340,197,438
1993,338,338,338,338,338,338,338,337,337,270,223,267,261,0,248,153,338
1994,315,315,315,315,315,315,315,315,315,249,181,245,246,0,229,128,315
1995,310,310,310,310,310,310,310,309,309,218,179,212,205,0,188,105,310
1996,296,296,296,296,296,296,296,296,296,222,164,226,218,0,205,116,296


In [502]:
combine.loc[combine.combine_year == 2015].isnull().sum()

full_name                 0
first_name                0
last_name                 0
combine_year              0
college                   0
position                  0
height_inches             0
weight_lbs                0
hand_size_inches        428
arm_length_inches       429
40_yard_dash              5
bench_press_reps        517
vertical_leap_inches    485
broad_jump_inches       486
3_cone_drill            516
20_yard_shuttle         508
60_yard_shuttle         648
player_name               0
dtype: int64

## Addressing 2015 data

- 2015 has 741 unique attendees. There are only 320, so we need to get rid of the extra attendees in our dataset.
- Load in actual attendee data
- Split first and last names into 2 columns
- Remove special characters
- Make everything lower case
- Inner join original combine and attendee data
- Drop 2015 rows from original combine data
- Concat joined result with combine data

In [503]:
combine = pd.read_csv('../data/nfl_combine_data.csv')
combine['player_name'] = combine.first_name + ' ' + combine.last_name

In [504]:
# Read in 2015 data with 320 attendees
df15 = pd.read_excel('../data/2015_combine_data.xlsx')
# drop unnecessary rows
cols = ['Player','Year','Pos','School']
df15 = df15[cols]
df15.rename(columns ={'Player':'player_name','Year':'combine_year', 'Pos':'position', 'School': 'college'},inplace=True)

In [505]:
combine15 = combine.loc[combine.combine_year == 2015]

In [506]:
df15.head()

Unnamed: 0,player_name,combine_year,position,college
0,Ameer Abdullah,2015,RB,Nebraska
1,Nelson Agholor,2015,WR,USC
2,Jay Ajayi,2015,RB,Boise St.
3,Kwon Alexander,2015,OLB,LSU
4,Mario Alford,2015,WR,West Virginia


In [507]:
combine15.head()

Unnamed: 0,full_name,first_name,last_name,combine_year,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle,player_name
638,ameer_abdullah,ameer,abdullah,2015,nebraska,rb,68.75,205,8.63,30.0,4.6,24.0,42.5,130.0,6.79,3.95,11.18,ameer abdullah
639,nelson_agholor,nelson,agholor,2015,southern_california,wr,72.13,198,9.25,32.25,4.42,12.0,,,,,,nelson agholor
640,malcolm_agnew,malcolm,agnew,2015,southern_illinois,rb,70.0,202,,,4.59,,,,,,,malcolm agnew
641,jay_ajayi,jay,ajayi,2015,boise_state,rb,71.75,221,10.0,32.0,4.57,19.0,39.0,121.0,7.1,4.1,11.1,jay ajayi
642,brandon_alexander,brandon,alexander,2015,central_florida,db,74.0,195,,,4.59,,,,,,,brandon alexander


## Cleaning Functions

In [508]:
regex = re.compile("[@_!#$%^&*()<>?/\|}{~:`'']") 

In [527]:
# Remove special characters
def remove_special(df):
    df = df.applymap(lambda x: re.sub("[@_!#$%^&*.<>?/\|}{~:`'']", '', x) if type(x) == str else x)

# Make text lowercase
def lowercase(df, col):
    df[col] = df[col].apply(lambda x: x.lower())

def clean_name(df, col):
    df[['first_name','last_name']] = df[col].str.split(" ",n=1,expand=True)

# Drop '60 yard shuttle'
def drop_60_shuttle(df):
    df.drop('60_yard_shuttle', axis=1, inplace=True)

# Add 'events_missed' columns
def drills_missed(df):
    drills = ['hand_size_inches'
               ,'arm_length_inches'
               ,'40_yard_dash'
               ,'bench_press_reps'
               ,'vertical_leap_inches'
               ,'broad_jump_inches'
               ,'3_cone_drill'
               ,'20_yard_shuttle'
               ,'60_yard_shuttle']
    for drill in drills:
        df[drill+'_missed'] = df[drill].apply(lambda x: 1 if math.isnan(x) else 0)
    
# Add 'conference' column

# Regression imputation for missing values

In [528]:
replace_5(combine15)
remove_special(combine15) # doesn't work
lowercase(combine15,'player_name')
clean_name(combine15,'player_name')

In [511]:
remove_special(df15) # doesn't work
lowercase(df15, 'player_name')
clean_name(df15,'player_name')

In [512]:
df15 = df15.applymap(lambda x: re.sub("[@_!#$%^&*.<>?/\|}{~:`'']", '', x) if type(x) == str else x)
combine15 = combine15.applymap(lambda x: re.sub("[@_!#$%^&*.<>?/\|}{~:`'']", '', x) if type(x) == str else x)

In [513]:
print(df15[~df15.player_name.isin(combine15.player_name)].player_name.unique())
len(df15[~df15.player_name.isin(combine15.player_name)].player_name.unique())

['trenton brown' 'michael burton' 'yannick cudjoe-virgil'
 'durell eskridge' 'mario edwards, jr' 'donatella luckett'
 'zack wagenmann']


7

In [516]:
merged_df = pd.merge(df15, combine15, on = ['first_name', 'last_name', 'combine_year'])

In [517]:
merged_df = merged_df.drop(['position_x', 'player_name_y', 'college_x'],axis=1)
merged_df.rename(columns = {'player_name_x':'player_name', 'college_y':'college','position_y':'position'}, inplace=True)

In [518]:
combine.head()

Unnamed: 0,full_name,first_name,last_name,combine_year,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle,player_name
0,jamal_adams,jamal,adams,2017,louisiana_state,db,71.63,214,9.25,33.38,4.56,18.0,31.5,120.0,6.96,4.13,,jamal adams
1,montravius_adams,montravius,adams,2017,auburn,dl,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,,,montravius adams
2,rodney_adams,rodney,adams,2017,south_florida,wr,73.25,189,9.0,32.0,4.44,8.0,29.5,125.0,6.98,4.28,11.39,rodney adams
3,quincy_adeboyejo,quincy,adeboyejo,2017,mississippi,wr,74.75,197,9.38,31.75,4.42,8.0,34.5,123.0,6.73,4.14,,quincy adeboyejo
4,brian_allen,brian,allen,2017,utah,db,74.88,215,10.0,34.0,4.48,15.0,34.5,117.0,6.64,4.34,,brian allen


In [519]:
# cols = combine
merged_df = merged_df[['combine_year', 'player_name','first_name', 'last_name', 'college', 'position', 'height_inches',
       'weight_lbs', 'hand_size_inches', 'arm_length_inches', '40_yard_dash',
       'bench_press_reps', 'vertical_leap_inches', 'broad_jump_inches',
       '3_cone_drill', '20_yard_shuttle', '60_yard_shuttle']]

combine = combine[['combine_year', 'player_name','first_name', 'last_name', 'college', 'position', 'height_inches',
       'weight_lbs', 'hand_size_inches', 'arm_length_inches', '40_yard_dash',
       'bench_press_reps', 'vertical_leap_inches', 'broad_jump_inches',
       '3_cone_drill', '20_yard_shuttle', '60_yard_shuttle']]

In [520]:
# drop 2015 rows from combine df
index_drop_list = combine.loc[combine.combine_year == 2015].index

In [521]:
combine = combine.drop(index_drop_list)

In [522]:
# Union datasets together
combine = pd.concat([combine,merged_df])

In [523]:
# 2 repeated names
len(combine.loc[combine.combine_year == 2015].player_name.unique())

314

In [524]:
# Kevin White repeated 4 times
combine.loc[combine.combine_year == 2015].groupby('player_name').count().sort_values('combine_year', ascending = False).head()

Unnamed: 0_level_0,combine_year,first_name,last_name,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
kevin white,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
aaron davis,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
lorenzo mauldin,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
marcus murphy,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0
marcus mariota,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0


In [525]:
combine.loc[combine.player_name == 'kevin white']
kw_index = combine.loc[combine.player_name == 'kevin white'].index[0::2] # index 298, 300

In [526]:
combine.drop(kw_index, inplace=True)
combine.reset_index(drop=True, inplace=True)
combine.shape

(9522, 17)

## Adding Events Missed Information

In [529]:
combine.loc[combine.hand_size_inches.isnull() ==True]

Unnamed: 0,combine_year,player_name,first_name,last_name,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle
7,2017,gavin andrews,gavin,andrews,oregon_state,ol,77.00,340,,,,,,,,5.09,
10,2017,antony auclair,antony,auclair,laval,fb_te,78.00,254,,,,,,,,4.45,12.08
20,2017,collin bevins,collin,bevins,northwest_missouri_state,dl,78.00,285,,,,,,,,4.39,
61,2017,jeremy cutrer,jeremy,cutrer,middle_tennessee_state,lb,74.00,170,,,,,,,,4.26,
86,2017,darius english,darius,english,south_carolina,lb,78.00,245,,,,,,,,4.94,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9296,2015,alvin dupree,alvin,dupree,kentucky,dl,76.00,269,,,4.56,,42.0,138.0,,,
9372,2015,will johnson,will,johnson,texasstate,kp,73.88,206,,,5.02,,,,,,
9388,2015,dez lewis,dez,lewis,centralarkansas,wr,75.75,214,,,4.58,16.0,33.5,119.0,7.11,4.28,
9399,2015,justin manton,justin,manton,louisiana-monroe,kp,74.75,196,,,4.93,,,,,,


In [530]:
combine.isnull().sum()

combine_year               0
player_name                0
first_name                 0
last_name                  0
college                    0
position                   0
height_inches              0
weight_lbs                 0
hand_size_inches        1131
arm_length_inches       1449
40_yard_dash             871
bench_press_reps        2756
vertical_leap_inches    1485
broad_jump_inches       1632
3_cone_drill            5015
20_yard_shuttle         2359
60_yard_shuttle         6355
dtype: int64

In [531]:
# Use previously defined function
drills_missed(combine)
combine.head()

Unnamed: 0,combine_year,player_name,first_name,last_name,college,position,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,3_cone_drill,20_yard_shuttle,60_yard_shuttle,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,60_yard_shuttle_missed
0,2017,jamal adams,jamal,adams,louisiana_state,db,71.63,214,9.25,33.38,4.56,18.0,31.5,120.0,6.96,4.13,,0,0,0,0,0,0,0,0,1
1,2017,montravius adams,montravius,adams,auburn,dl,75.63,304,9.25,32.75,4.87,22.0,29.0,108.0,7.62,,,0,0,0,0,0,0,0,1,1
2,2017,rodney adams,rodney,adams,south_florida,wr,73.25,189,9.0,32.0,4.44,8.0,29.5,125.0,6.98,4.28,11.39,0,0,0,0,0,0,0,0,0
3,2017,quincy adeboyejo,quincy,adeboyejo,mississippi,wr,74.75,197,9.38,31.75,4.42,8.0,34.5,123.0,6.73,4.14,,0,0,0,0,0,0,0,0,1
4,2017,brian allen,brian,allen,utah,db,74.88,215,10.0,34.0,4.48,15.0,34.5,117.0,6.64,4.34,,0,0,0,0,0,0,0,0,1


In [535]:
combine.to_csv('../data/nfl_combine2.csv')

## Regression Imputation

Since there are a quite a few combine attendees who did not participate in all the drills, we will need to impute those values by regression imputation. We can assume these values are not missing at random, so a simple imputation is not viable. By segmenting the data by position and using regression imputation, a more realistic value can be obtained.

In [541]:
from sklearn.linear_model import LinearRegression
from sklearn.impute import KNNImputer

In [542]:
drills = ['hand_size_inches'
               ,'arm_length_inches'
               ,'40_yard_dash'
               ,'bench_press_reps'
               ,'vertical_leap_inches'
               ,'broad_jump_inches'
               ,'3_cone_drill'
               ,'20_yard_shuttle'
               ,'60_yard_shuttle']

In [543]:
# player counts by position
combine.groupby('position').count().player_name

position
db        709
dl       1454
fb_te     766
fbte       22
g          19
k_p       113
kp         12
lb       2087
ls          5
ol       1623
qb        553
rb        893
wr       1266
Name: player_name, dtype: int64

In [544]:
# Take note of columns with missing data:
for pos in ['qb','rb','fb_te','wr', 'ol']:
    print(pos)
    print(combine.loc[combine.position == pos, drills].shape, '\n')
    print(combine.loc[combine.position == pos, drills].isnull().sum(), '\n', '\n')
    
def missing_drills(pos):
    print(pos)
    print(combine.loc[combine.position == pos, drills].shape, '\n')
    print(combine.loc[combine.position == pos, drills].isnull().sum(), '\n', '\n')

qb
(553, 9) 

hand_size_inches         46
arm_length_inches        90
40_yard_dash             54
bench_press_reps        521
vertical_leap_inches     82
broad_jump_inches        81
3_cone_drill            254
20_yard_shuttle         101
60_yard_shuttle         553
dtype: int64 
 

rb
(893, 9) 

hand_size_inches         97
arm_length_inches       131
40_yard_dash            103
bench_press_reps        178
vertical_leap_inches    140
broad_jump_inches       155
3_cone_drill            523
20_yard_shuttle         291
60_yard_shuttle         893
dtype: int64 
 

fb_te
(766, 9) 

hand_size_inches         89
arm_length_inches       111
40_yard_dash             68
bench_press_reps        126
vertical_leap_inches    101
broad_jump_inches       111
3_cone_drill            409
20_yard_shuttle         165
60_yard_shuttle         766
dtype: int64 
 

wr
(1266, 9) 

hand_size_inches         158
arm_length_inches        207
40_yard_dash              95
bench_press_reps         839
vertical_leap_inc

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Too many players skipped the 60 yard shuttle, so it makes sense the drop the column as a whole.

In [540]:
# use earlier defined function
drop_60_shuttle(combine)

### RB Imputation

Start with a KNNImputer as a baseline. This will give us pseudo values to use in our regression model.

In [574]:
rb_df = combine.loc[combine.position == 'rb']
rb_df.reset_index(inplace=True)
# Call our earlier function
missing_drills('rb')

rb
(893, 9) 

hand_size_inches         97
arm_length_inches       131
40_yard_dash            103
bench_press_reps        178
vertical_leap_inches    140
broad_jump_inches       155
3_cone_drill            523
20_yard_shuttle         291
60_yard_shuttle         893
dtype: int64 
 



Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [575]:
# include only numerical data, and drop the 'index' column
print(rb_df.columns)
rb_df = rb_df.select_dtypes(include=['int','float']).drop('index',axis=1)

Index(['index', 'combine_year', 'player_name', 'first_name', 'last_name',
       'college', 'position', 'height_inches', 'weight_lbs',
       'hand_size_inches', 'arm_length_inches', '40_yard_dash',
       'bench_press_reps', 'vertical_leap_inches', 'broad_jump_inches',
       '3_cone_drill', '20_yard_shuttle', 'hand_size_inches_missed',
       'arm_length_inches_missed', '40_yard_dash_missed',
       'bench_press_reps_missed', 'vertical_leap_inches_missed',
       'broad_jump_inches_missed', '3_cone_drill_missed',
       '20_yard_shuttle_missed', '60_yard_shuttle_missed'],
      dtype='object')


Very few RBs ran the 3 cone drill, so it is probably best to drop that column. 

In [576]:
rb_df = rb_df.drop('3_cone_drill', axis=1)

Start with KNNImputer as a baseline.

In [577]:
imputer = KNNImputer(n_neighbors=5, copy=True)
# columns of interest are 97-99
knn_df = pd.DataFrame(imputer.fit_transform(rb_df))

In [578]:
missing_cols = rb_df.isnull().sum()[rb_df.isnull().sum() > 0].index

In [579]:
rb_df

Unnamed: 0,combine_year,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,20_yard_shuttle,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,60_yard_shuttle_missed
0,2017,72.25,218,9.63,33.25,4.58,23.0,37.0,130.0,,0,0,0,0,0,0,1,1,1
1,2017,70.25,220,9.75,30.50,4.68,19.0,28.5,115.0,4.53,0,0,0,0,0,0,1,0,1
2,2017,66.38,179,10.13,29.75,4.42,11.0,31.5,118.0,,0,0,0,0,0,0,1,1,1
3,2017,73.25,233,9.88,31.25,4.65,20.0,29.0,113.0,,0,0,0,0,0,0,1,1,1
4,2017,71.13,210,9.25,32.38,4.49,22.0,30.5,116.0,4.53,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
888,2015,69.00,215,9.50,30.75,4.65,15.0,35.5,116.0,,0,0,0,0,0,0,1,1,1
889,2015,67.88,217,10.13,29.75,4.70,21.0,32.0,113.0,,0,0,0,0,0,0,1,1,1
890,2015,72.75,230,9.75,33.00,4.48,16.0,33.5,117.0,4.46,0,0,0,0,0,0,0,0,1
891,2015,67.50,195,8.25,29.88,4.49,18.0,33.5,119.0,4.12,0,0,0,0,0,0,0,0,0


In [580]:
knn_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,2017.0,72.25,218.0,9.63,33.25,4.58,23.0,37.0,130.0,4.200,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
1,2017.0,70.25,220.0,9.75,30.50,4.68,19.0,28.5,115.0,4.530,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,2017.0,66.38,179.0,10.13,29.75,4.42,11.0,31.5,118.0,4.188,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
3,2017.0,73.25,233.0,9.88,31.25,4.65,20.0,29.0,113.0,4.250,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,2017.0,71.13,210.0,9.25,32.38,4.49,22.0,30.5,116.0,4.530,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
888,2015.0,69.00,215.0,9.50,30.75,4.65,15.0,35.5,116.0,4.368,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
889,2015.0,67.88,217.0,10.13,29.75,4.70,21.0,32.0,113.0,4.280,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
890,2015.0,72.75,230.0,9.75,33.00,4.48,16.0,33.5,117.0,4.460,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
891,2015.0,67.50,195.0,8.25,29.88,4.49,18.0,33.5,119.0,4.120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [581]:
# creating imputed values column in rb_df
for index, col in enumerate(rb_df.columns):
    if col in missing_cols:
#         print(index,col, 'missing')
        rb_df[col + '_imp'] = knn_df[index]
    else:
#         print(index,col)
        continue

In [582]:
rb_df

Unnamed: 0,combine_year,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,20_yard_shuttle,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,60_yard_shuttle_missed,hand_size_inches_imp,arm_length_inches_imp,40_yard_dash_imp,bench_press_reps_imp,vertical_leap_inches_imp,broad_jump_inches_imp,20_yard_shuttle_imp
0,2017,72.25,218,9.63,33.25,4.58,23.0,37.0,130.0,,0,0,0,0,0,0,1,1,1,9.63,33.25,4.58,23.0,37.0,130.0,4.200
1,2017,70.25,220,9.75,30.50,4.68,19.0,28.5,115.0,4.53,0,0,0,0,0,0,1,0,1,9.75,30.50,4.68,19.0,28.5,115.0,4.530
2,2017,66.38,179,10.13,29.75,4.42,11.0,31.5,118.0,,0,0,0,0,0,0,1,1,1,10.13,29.75,4.42,11.0,31.5,118.0,4.188
3,2017,73.25,233,9.88,31.25,4.65,20.0,29.0,113.0,,0,0,0,0,0,0,1,1,1,9.88,31.25,4.65,20.0,29.0,113.0,4.250
4,2017,71.13,210,9.25,32.38,4.49,22.0,30.5,116.0,4.53,0,0,0,0,0,0,0,0,1,9.25,32.38,4.49,22.0,30.5,116.0,4.530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
888,2015,69.00,215,9.50,30.75,4.65,15.0,35.5,116.0,,0,0,0,0,0,0,1,1,1,9.50,30.75,4.65,15.0,35.5,116.0,4.368
889,2015,67.88,217,10.13,29.75,4.70,21.0,32.0,113.0,,0,0,0,0,0,0,1,1,1,10.13,29.75,4.70,21.0,32.0,113.0,4.280
890,2015,72.75,230,9.75,33.00,4.48,16.0,33.5,117.0,4.46,0,0,0,0,0,0,0,0,1,9.75,33.00,4.48,16.0,33.5,117.0,4.460
891,2015,67.50,195,8.25,29.88,4.49,18.0,33.5,119.0,4.12,0,0,0,0,0,0,0,0,0,8.25,29.88,4.49,18.0,33.5,119.0,4.120


In [583]:
missing_cols

Index(['hand_size_inches', 'arm_length_inches', '40_yard_dash',
       'bench_press_reps', 'vertical_leap_inches', 'broad_jump_inches',
       '20_yard_shuttle'],
      dtype='object')

In [584]:
model_df

Unnamed: 0,height_inches,weight_lbs,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,60_yard_shuttle_missed,hand_size_inches_imp,arm_length_inches_imp,40_yard_dash_imp,bench_press_reps_imp,vertical_leap_inches_imp,broad_jump_inches_imp,20_yard_shuttle_imp
0,72.25,218,0,0,0,0,0,0,1,1,1,9.63,33.25,4.58,23.0,37.0,130.0,4.200
1,70.25,220,0,0,0,0,0,0,1,0,1,9.75,30.50,4.68,19.0,28.5,115.0,4.530
2,66.38,179,0,0,0,0,0,0,1,1,1,10.13,29.75,4.42,11.0,31.5,118.0,4.188
3,73.25,233,0,0,0,0,0,0,1,1,1,9.88,31.25,4.65,20.0,29.0,113.0,4.250
4,71.13,210,0,0,0,0,0,0,0,0,1,9.25,32.38,4.49,22.0,30.5,116.0,4.530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
888,69.00,215,0,0,0,0,0,0,1,1,1,9.50,30.75,4.65,15.0,35.5,116.0,4.368
889,67.88,217,0,0,0,0,0,0,1,1,1,10.13,29.75,4.70,21.0,32.0,113.0,4.280
890,72.75,230,0,0,0,0,0,0,0,0,1,9.75,33.00,4.48,16.0,33.5,117.0,4.460
891,67.50,195,0,0,0,0,0,0,0,0,0,8.25,29.88,4.49,18.0,33.5,119.0,4.120


In [587]:
model_df = rb_df.drop('combine_year',axis=1)
model_df = model_df.drop(missing_cols,axis=1)

In [588]:
model_df

Unnamed: 0,height_inches,weight_lbs,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,60_yard_shuttle_missed,hand_size_inches_imp,arm_length_inches_imp,40_yard_dash_imp,bench_press_reps_imp,vertical_leap_inches_imp,broad_jump_inches_imp,20_yard_shuttle_imp
0,72.25,218,0,0,0,0,0,0,1,1,1,9.63,33.25,4.58,23.0,37.0,130.0,4.200
1,70.25,220,0,0,0,0,0,0,1,0,1,9.75,30.50,4.68,19.0,28.5,115.0,4.530
2,66.38,179,0,0,0,0,0,0,1,1,1,10.13,29.75,4.42,11.0,31.5,118.0,4.188
3,73.25,233,0,0,0,0,0,0,1,1,1,9.88,31.25,4.65,20.0,29.0,113.0,4.250
4,71.13,210,0,0,0,0,0,0,0,0,1,9.25,32.38,4.49,22.0,30.5,116.0,4.530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
888,69.00,215,0,0,0,0,0,0,1,1,1,9.50,30.75,4.65,15.0,35.5,116.0,4.368
889,67.88,217,0,0,0,0,0,0,1,1,1,10.13,29.75,4.70,21.0,32.0,113.0,4.280
890,72.75,230,0,0,0,0,0,0,0,0,1,9.75,33.00,4.48,16.0,33.5,117.0,4.460
891,67.50,195,0,0,0,0,0,0,0,0,0,8.25,29.88,4.49,18.0,33.5,119.0,4.120


In [589]:
missing_cols

Index(['hand_size_inches', 'arm_length_inches', '40_yard_dash',
       'bench_press_reps', 'vertical_leap_inches', 'broad_jump_inches',
       '20_yard_shuttle'],
      dtype='object')

In [607]:
rb_df.head()

Unnamed: 0,combine_year,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,20_yard_shuttle,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,60_yard_shuttle_missed,hand_size_inches_imp,arm_length_inches_imp,40_yard_dash_imp,bench_press_reps_imp,vertical_leap_inches_imp,broad_jump_inches_imp,20_yard_shuttle_imp
0,2017,72.25,218,9.63,33.25,4.58,23.0,37.0,130.0,4.222797,0,0,0,0,0,0,1,1,1,9.63,33.25,4.58,23.0,37.0,130.0,4.2
1,2017,70.25,220,9.75,30.5,4.68,19.0,28.5,115.0,4.53,0,0,0,0,0,0,1,0,1,9.75,30.5,4.68,19.0,28.5,115.0,4.53
2,2017,66.38,179,10.13,29.75,4.42,11.0,31.5,118.0,4.19712,0,0,0,0,0,0,1,1,1,10.13,29.75,4.42,11.0,31.5,118.0,4.188
3,2017,73.25,233,9.88,31.25,4.65,20.0,29.0,113.0,4.341827,0,0,0,0,0,0,1,1,1,9.88,31.25,4.65,20.0,29.0,113.0,4.25
4,2017,71.13,210,9.25,32.38,4.49,22.0,30.5,116.0,4.53,0,0,0,0,0,0,0,0,1,9.25,32.38,4.49,22.0,30.5,116.0,4.53


In [606]:
for feature in missing_cols:
    X = model_df.drop(feature+'_imp', axis=1)
    y = model_df[feature+'_imp']
    
    model = LinearRegression()
    model.fit(X,y)

    rb_df.loc[rb_df[feature].isnull(), feature] = model.predict(rb_df[model_df.columns].drop(feature+'_imp', axis=1))[rb_df[feature].isnull()]

# new_df[feature + '_imp'] = model.predict(X,y)
    


In [610]:
rb_df.iloc[:,:-7].drop('combine_year',axis=1)

Unnamed: 0,height_inches,weight_lbs,hand_size_inches,arm_length_inches,40_yard_dash,bench_press_reps,vertical_leap_inches,broad_jump_inches,20_yard_shuttle,hand_size_inches_missed,arm_length_inches_missed,40_yard_dash_missed,bench_press_reps_missed,vertical_leap_inches_missed,broad_jump_inches_missed,3_cone_drill_missed,20_yard_shuttle_missed,60_yard_shuttle_missed
0,72.25,218,9.63,33.25,4.58,23.0,37.0,130.0,4.222797,0,0,0,0,0,0,1,1,1
1,70.25,220,9.75,30.50,4.68,19.0,28.5,115.0,4.530000,0,0,0,0,0,0,1,0,1
2,66.38,179,10.13,29.75,4.42,11.0,31.5,118.0,4.197120,0,0,0,0,0,0,1,1,1
3,73.25,233,9.88,31.25,4.65,20.0,29.0,113.0,4.341827,0,0,0,0,0,0,1,1,1
4,71.13,210,9.25,32.38,4.49,22.0,30.5,116.0,4.530000,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
888,69.00,215,9.50,30.75,4.65,15.0,35.5,116.0,4.290577,0,0,0,0,0,0,1,1,1
889,67.88,217,10.13,29.75,4.70,21.0,32.0,113.0,4.294484,0,0,0,0,0,0,1,1,1
890,72.75,230,9.75,33.00,4.48,16.0,33.5,117.0,4.460000,0,0,0,0,0,0,0,0,1
891,67.50,195,8.25,29.88,4.49,18.0,33.5,119.0,4.120000,0,0,0,0,0,0,0,0,0
