In [38]:
import pandas as pd
import re
import numpy as np
from sklearn.preprocessing import scale

In [39]:
df = pd.read_csv('final_wr.csv')

In [40]:
test_df = df[(df.rookie_season == 2013)]
test_df.drop('Unnamed: 0', axis = 1, inplace = True)
print test_df.shape
test_df.head()

(86, 64)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,name,season,age,draft_pos,team,weight,bmi,games,rush_atts,rush_yds,...,rookie_season,height_inches,start_ratio,dpis_drawn,dpi_yards,pct_team_tgts,pct_team_receptions,pct_of_team_passyards,pct_team_touchdowns,years_in_league
50,Jaron Brown,2013.0,23.0,UDFA,ARI,205.0,26.3,16.0,0.0,0.0,...,2013.0,74.0,0.0,0.0,0.0,0.029617,0.030303,0.034983,0.041667,0.0
53,Jaron Brown,2014.0,24.0,UDFA,ARI,205.0,26.3,16.0,0.0,0.0,...,2013.0,74.0,0.125,0.0,0.0,0.053691,0.065476,0.059342,0.090909,1.0
56,Jaron Brown,2015.0,25.0,UDFA,ARI,205.0,26.3,16.0,0.0,0.0,...,2013.0,74.0,0.0,1.0,32.0,0.035714,0.027431,0.027848,0.025641,2.0
103,Darius Johnson,2013.0,22.0,UDFA,ATL,175.0,25.1,10.0,0.0,0.0,...,2013.0,70.0,0.2,2.0,38.0,0.06525,0.049438,0.049493,0.038462,0.0
214,Robert Woods,2013.0,21.0,2-41,BUF,201.0,26.5,14.0,2.0,16.0,...,2013.0,73.0,1.0,1.0,6.0,0.162835,0.133779,0.189172,0.1875,0.0


In [41]:
def feat_engineering(data):
    ## adding in the drop constant
    data['dropK'] = np.log(data['drops'] +1)
    
    ## adding in the yards after catch constant
    data['yacK'] = data.yac*(data.yac/data.rec_yards)
    
    ## compiling the base function score
    data['base'] = (((data.rec_yards+data.yacK+data.dpi_yards+(data.DYAR*100))*(data.receptions+(data.first_down_ctchs*data.first_down_ctchpct)+((data.recs_ovr_25**2)/data.receptions)))/(data.fumbles+data.dropK + (data.targets/data.pct_team_tgts))**2)
    
    ## compiling td_points score
    data['td_points'] = (((data.rec_tds+data.rush_tds)/np.average(data.rec_tds+data.rush_tds))*data.pct_team_touchdowns)
    
    ## creating final compilation score column
    data['compilation'] = (data.base*100) + (data.td_points*7)
    
    ## We know the nulls are all coming from if a player has zero recieving yards so you 
    ## cannot divide by zero and you get a null value. So we are fine putting a zero here
    data.yacK.fillna(value = 0, inplace = True)
    
    return data
    
    

In [42]:
test_df = feat_engineering(test_df)
print test_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-

(86, 69)


In [43]:
def pivot_df(df):
    ## I am creating a list of the columns where there are no null values in order to create
    ## my pivot table with it

    cols_to_keep = []
    for col in df.columns:
        if df[col].isnull().sum() == 0:
            cols_to_keep.append(col)
    
    ## I am removing some columns that I do not want to be used
    cols_to_keep.remove('years_in_league')
    cols_to_keep.remove('team_pass_yds')
    cols_to_keep.remove('team_pass_tds')
    cols_to_keep.remove('team_pass_attempts')
    cols_to_keep.remove('team_completions')
    cols_to_keep.remove('total_points')
    cols_to_keep.append('compilation')
    
    ## creating a pivot table with columns that have no nulls in it and name as the index, with
    ## years in the league as the breakdown for columns
    pivoted = df.pivot_table(index=df['name'], columns='years_in_league', values=cols_to_keep)
    
    ## below we will lay out how we want to fill any null values for players
    ## that might not have been playing for the three initial seasons
    zero_cols = ['games', 'rush_atts', 'rush_yds', 'rush_y/a', 'rush_tds', 'rush_ypg',
             'targets', 'receptions', 'rec_yards', 'yards/reception', 'rec_tds',
             'rec_ypg', 'ctch_pct', 'y/tgt', 'fumbles', 'fumbles_recovered', 'fum_ret_yds',
             'fum_tds', 'forced_fumbles', 'pro_bowls', 'all_pros', '100yd_gms',
             'first_down_ctchs', 'first_down_ctchpct', 'long_ctch', 'drops', 'EYds',
             'DVOA', 'DYAR', '40 Yard', 'start_ratio', 'dpis_drawn', 'dpi_yards',
             'pct_team_tgts', 'pct_team_receptions', 'pct_of_team_passyards',
             'pct_team_touchdowns', 'dropK', 'yacK', 'td_points', 'compilation']

    backfill_cols = ['weight', 'bmi', 'rookie_age',
                     'rookie_season', 'height_inches']

    team_cols = ['team_pass_tds', 'team_pass_yds', 'team_pass_attempts', 'team_completions',
                 'total_points']

    years = [0.0, 1.0, 2.0]
    back_years = [1.0, 2.0]
    
    for col in zero_cols:
        for i in years:
            pivoted[col][i].fillna(0, inplace = True)
    
    for col in backfill_cols:
        for i in back_years:
            pivoted[col][i] = pivoted[col][0.0]
            
    pivoted = pivoted[pivoted.season[0.0].isnull() == False]
    
    ## creating a variable made of the multiindexed columns
    mi = pivoted.columns
    
    ## I am now creating a new set of column names made up of the upper index and the lower
    ## index added together
    new_cols = pd.Index([x[0]+'_'+str(x[1]) for x in mi.tolist()])
    
    ## I will now be renaming the pivoted dataframe columns my new_cols list which is a combination
    ## of the two levels of column names from when we initially pivoted
    pivoted.columns = new_cols
    print pivoted.columns
    pivoted.head()
    
    ## this is simply to get rid of the '.0' at the end of every column name
    pivoted.rename(columns = lambda x: x.replace('.0', ''), inplace = True)
    
    ## here I am correcting the values for season 1, 2
    pivoted['season_1'] = pivoted['season_0']+1
    pivoted['season_2'] = pivoted['season_0']+2
    
    ## here i am correcting the values for age 1 and 2
    pivoted['age_1'] = pivoted['age_0']+1
    pivoted['age_2'] = pivoted['age_0']+2
    
    return pivoted

In [44]:
test_df = pivot_df(test_df)
print test_df.shape

Index([u'season_0.0', u'season_1.0', u'season_2.0', u'age_0.0', u'age_1.0',
       u'age_2.0', u'weight_0.0', u'weight_1.0', u'weight_2.0', u'bmi_0.0',
       ...
       u'dropK_2.0', u'yacK_0.0', u'yacK_1.0', u'yacK_2.0', u'td_points_0.0',
       u'td_points_1.0', u'td_points_2.0', u'compilation_0.0',
       u'compilation_1.0', u'compilation_2.0'],
      dtype='object', length=144)
(44, 144)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Performing the LDA

In [97]:
train_df = pd.read_csv('/Users/TerryONeill/Terry_git/Capstone/GABBERT/wide_receivers/pivot_catcherr.csv')
train_df.drop('Unnamed: 0', axis = 1, inplace = True)

comp_df_cols = ['name', 'compilation_3']

train_df.set_index(train_df.name, drop = True, inplace = True)
train_df.drop('name', axis = 1, inplace = True)
print train_df.shape
train_df.head()

(872, 152)


Unnamed: 0_level_0,season_0,season_1,season_2,age_0,age_1,age_2,weight_0,weight_1,weight_2,bmi_0,...,td_points_2,compilation_0,compilation_1,compilation_2,compilation_3,avg_starts,dpis,dpi_yards,year_1_growth,year_2_growth
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A.J. Green,2011.0,2012.0,2013.0,23.0,24.0,25.0,205.0,205.0,205.0,25.0,...,1.562539,53.00641,98.47157,82.935491,59.741763,1.0,0.0,0.0,9.264454,-13.711468
A.J. Jenkins,2012.0,2013.0,2014.0,23.0,24.0,25.0,192.0,192.0,192.0,26.0,...,0.0,0.0,0.597268,0.0,0.0,0.020833,0.0,0.0,0.121706,-0.527123
Aaron Brown,2009.0,2010.0,2011.0,24.0,25.0,26.0,196.0,196.0,196.0,26.6,...,0.0,1.007213,0.0,0.0,0.0,0.044444,0.0,0.0,-0.20524,0.0
Aaron Halterman,2007.0,2008.0,2009.0,25.0,26.0,27.0,255.0,255.0,255.0,30.2,...,0.0,0.030196,0.0,0.0,0.0,0.0,0.0,0.0,-0.006153,0.0
Aaron Moorehead,2003.0,2004.0,2005.0,23.0,24.0,25.0,200.0,200.0,200.0,25.0,...,0.0,0.240777,0.02588,0.222518,0.274562,0.0,0.0,0.0,-0.04379,0.173544


In [98]:
# Create an average starts column
train_df['avg_starts'] = (train_df.start_ratio_0 + train_df.start_ratio_1 + train_df.start_ratio_2) / 3

#Create a column that adds up a player's dpi yards and penaltys drawn
train_df['dpis'] = train_df.dpis_drawn_0 + train_df.dpis_drawn_1 + train_df.dpis_drawn_2
train_df['dpi_yards'] = train_df.dpi_yards_0 + train_df.dpi_yards_1 + train_df.dpi_yards_2

# Try adding a column about year over year growth to see if that helps with modeling
# First we'll need to define variables that show how much growth an average player had over that time period.
year_1_growth = (train_df[train_df.compilation_1 >0].compilation_1 - train_df[train_df.compilation_1 > 0].compilation_0).mean()
year_2_growth = (train_df[train_df.compilation_2 >0].compilation_2 - train_df[train_df.compilation_2 >0].compilation_1).mean()

train_df['year_1_growth'] = (train_df.compilation_1 - train_df.compilation_0) / year_1_growth
train_df['year_2_growth'] = (train_df.compilation_2 - train_df.compilation_1) / year_2_growth

In [99]:
train_df.drop(['receptions_3', 'rec_yards_3'], axis = 1, inplace = True)

print train_df.shape
train_df.head()


(872, 150)


Unnamed: 0_level_0,season_0,season_1,season_2,age_0,age_1,age_2,weight_0,weight_1,weight_2,bmi_0,...,td_points_2,compilation_0,compilation_1,compilation_2,compilation_3,avg_starts,dpis,dpi_yards,year_1_growth,year_2_growth
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A.J. Green,2011.0,2012.0,2013.0,23.0,24.0,25.0,205.0,205.0,205.0,25.0,...,1.562539,53.00641,98.47157,82.935491,59.741763,1.0,0.0,0.0,9.264454,-13.711468
A.J. Jenkins,2012.0,2013.0,2014.0,23.0,24.0,25.0,192.0,192.0,192.0,26.0,...,0.0,0.0,0.597268,0.0,0.0,0.020833,0.0,0.0,0.121706,-0.527123
Aaron Brown,2009.0,2010.0,2011.0,24.0,25.0,26.0,196.0,196.0,196.0,26.6,...,0.0,1.007213,0.0,0.0,0.0,0.044444,0.0,0.0,-0.20524,0.0
Aaron Halterman,2007.0,2008.0,2009.0,25.0,26.0,27.0,255.0,255.0,255.0,30.2,...,0.0,0.030196,0.0,0.0,0.0,0.0,0.0,0.0,-0.006153,0.0
Aaron Moorehead,2003.0,2004.0,2005.0,23.0,24.0,25.0,200.0,200.0,200.0,25.0,...,0.0,0.240777,0.02588,0.222518,0.274562,0.0,0.0,0.0,-0.04379,0.173544


In [100]:
train_df = train_df[train_df.season_0 < 2013]
train_df.shape

(872, 150)

In [101]:
## this is a  list of features without any first year stats in it as they
## have been found to be not very indicative of the target
features_no_year_1 = ['age_2', 'weight_2', 'bmi_2',
         'rush_y/a_1', 'rush_y/a_2',
         'receptions_1', 'receptions_2',
        'rec_yards_1','rec_yards_2', 'rec_tds_1',
        'rec_tds_2', 'ctch_pct_1', 'ctch_pct_2',
         'first_down_ctchpct_1',
        'first_down_ctchpct_2',  'long_ctch_1', 'long_ctch_2',
         'drops_1', 'drops_2',  'EYds_1', 'EYds_2',
        'DVOA_1', 'DVOA_2', 'height_inches_2', 'avg_starts', 'dpis', 'dpi_yards',
         'pct_team_tgts_1',
        'pct_team_tgts_2', 'compilation_0', 'compilation_1', 'compilation_2', 'yacK_2',
                 'year_1_growth', 'year_2_growth']

## creating dataframe to perform LDA on
lda_df = train_df[features_no_year_1]


In [102]:
# Create categories for player season_3 ratings

bins = [-1, 10, 30, 65, 200]
labels = ['below average', 'league_average', 'quality starter', 'all_pro']
train_df['categories'] =  pd.cut(train_df['compilation_3'], bins, labels=labels)

comp_df_cols = ['compilation_3', 'categories']
comp_df = pd.DataFrame(train_df[comp_df_cols], columns = comp_df_cols, index = train_df.index)

In [103]:
train_df.shape

(872, 151)

In [104]:
from sklearn.lda import LDA

lda = LDA(n_components=4)

X = scale(train_df.drop(['compilation_3', 'categories'], axis = 1))
y = train_df['categories']

print X.shape



(872, 149)


In [105]:
## fit and transform the standardized data
lda_model = lda.fit(X, y)
lda_cols = lda_model.transform(X)
X.shape

(872, 149)

In [106]:
train_df.drop(['compilation_3', 'categories'], axis = 1)

Unnamed: 0_level_0,season_0,season_1,season_2,age_0,age_1,age_2,weight_0,weight_1,weight_2,bmi_0,...,td_points_1,td_points_2,compilation_0,compilation_1,compilation_2,avg_starts,dpis,dpi_yards,year_1_growth,year_2_growth
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A.J. Green,2011.0,2012.0,2013.0,23.0,24.0,25.0,205.0,205.0,205.0,25.0,...,1.897369,1.562539,53.006410,98.471570,82.935491,1.000000,0.0,0.0,9.264454,-13.711468
A.J. Jenkins,2012.0,2013.0,2014.0,23.0,24.0,25.0,192.0,192.0,192.0,26.0,...,0.000000,0.000000,0.000000,0.597268,0.000000,0.020833,0.0,0.0,0.121706,-0.527123
Aaron Brown,2009.0,2010.0,2011.0,24.0,25.0,26.0,196.0,196.0,196.0,26.6,...,0.000000,0.000000,1.007213,0.000000,0.000000,0.044444,0.0,0.0,-0.205240,0.000000
Aaron Halterman,2007.0,2008.0,2009.0,25.0,26.0,27.0,255.0,255.0,255.0,30.2,...,0.000000,0.000000,0.030196,0.000000,0.000000,0.000000,0.0,0.0,-0.006153,0.000000
Aaron Moorehead,2003.0,2004.0,2005.0,23.0,24.0,25.0,200.0,200.0,200.0,25.0,...,0.000000,0.000000,0.240777,0.025880,0.222518,0.000000,0.0,0.0,-0.043790,0.173544
Aaron Walker,2006.0,2007.0,2008.0,26.0,27.0,28.0,252.0,252.0,252.0,29.1,...,0.000000,0.000000,0.412599,0.000000,0.000000,0.125000,0.0,0.0,-0.084075,0.000000
Adam Jennings,2007.0,2008.0,2009.0,25.0,26.0,27.0,181.0,181.0,181.0,26.7,...,0.000000,0.000000,0.585068,0.000000,0.000000,0.041667,0.0,0.0,-0.119220,0.000000
Adrian Arrington,2010.0,2011.0,2012.0,25.0,26.0,27.0,185.0,185.0,185.0,23.8,...,0.000000,0.000000,0.268449,0.066052,0.000000,0.000000,0.0,0.0,-0.041242,-0.058295
Adrian Madise,2003.0,2004.0,2005.0,23.0,24.0,25.0,215.0,215.0,215.0,30.0,...,0.000000,0.000000,0.036275,0.000000,0.000000,0.000000,0.0,0.0,-0.007392,0.000000
Ahmad Merritt,2001.0,2002.0,2003.0,24.0,25.0,26.0,195.0,195.0,195.0,28.0,...,0.000000,0.000000,0.047423,0.258573,0.054774,0.105556,1.0,5.0,0.043026,-0.179864


In [107]:
lda_df = pd.DataFrame(lda_cols, columns = ['lda1', 'lda2', 'lda3'])


In [108]:
lda_df.set_index(train_df.index, drop = False, inplace = True)
lda_df.head()

Unnamed: 0_level_0,lda1,lda2,lda3
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A.J. Green,-6.777309,5.213721,-2.349613
A.J. Jenkins,0.647419,1.011704,-0.039015
Aaron Brown,1.021427,0.053724,0.333371
Aaron Halterman,0.474874,-0.065886,0.500346
Aaron Moorehead,0.025532,0.152643,-0.915714


In [109]:
## need to join the compilation score and the bin score for the player to the lda dataframe

lda_comp_df = lda_df.join(comp_df)
lda_comp_df

Unnamed: 0_level_0,lda1,lda2,lda3,compilation_3,categories
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A.J. Green,-6.777309,5.213721,-2.349613,59.741763,quality starter
A.J. Jenkins,0.647419,1.011704,-0.039015,0.000000,below average
Aaron Brown,1.021427,0.053724,0.333371,0.000000,below average
Aaron Halterman,0.474874,-0.065886,0.500346,0.000000,below average
Aaron Moorehead,0.025532,0.152643,-0.915714,0.274562,below average
Aaron Walker,0.848070,0.553799,0.009269,0.000000,below average
Adam Jennings,1.175468,0.428332,0.381803,0.000000,below average
Adrian Arrington,0.291072,0.645354,0.800769,0.000000,below average
Adrian Madise,0.330100,-0.206074,0.720408,0.000000,below average
Ahmad Merritt,0.995817,-0.679317,0.521043,0.000000,below average


In [110]:
X = lda_comp_df.drop(['compilation_3', 'categories'], axis = 1)
y = lda_comp_df['categories']
print X.shape
print y.shape

(872, 3)
(872,)


In [111]:
X

Unnamed: 0_level_0,lda1,lda2,lda3
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A.J. Green,-6.777309,5.213721,-2.349613
A.J. Jenkins,0.647419,1.011704,-0.039015
Aaron Brown,1.021427,0.053724,0.333371
Aaron Halterman,0.474874,-0.065886,0.500346
Aaron Moorehead,0.025532,0.152643,-0.915714
Aaron Walker,0.848070,0.553799,0.009269
Adam Jennings,1.175468,0.428332,0.381803
Adrian Arrington,0.291072,0.645354,0.800769
Adrian Madise,0.330100,-0.206074,0.720408
Ahmad Merritt,0.995817,-0.679317,0.521043


In [112]:
## I am going to weigth each category to try to more accurately predict that bin
cat_weights = {'below average':1, 'league_average':8, 'quality starter':4, 'all_pro':5}

In [113]:
## support vector machine classifier

from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report, recall_score


svc = SVC(C = .8, class_weight=cat_weights, probability = True, kernel='linear', degree = 1, shrinking = True)

cvp = cross_val_predict(svc, X, y, cv = 3, n_jobs = -1, verbose = 1)


print classification_report(y, cvp)
print recall_score(y, cvp, average = 'macro')

             precision    recall  f1-score   support

    all_pro       0.58      0.70      0.63        27
below average       0.97      0.90      0.94       741
league_average       0.42      0.74      0.54        57
quality starter       0.57      0.62      0.59        47

avg / total       0.90      0.87      0.88       872

0.740437655331


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


In [114]:
svc_model = SVC(C = .8, class_weight = cat_weights, probability = True, kernel = 'linear', shrinking = True)
svc_fit = svc_model.fit(X, y)
svc_fit.score(X, y)

0.88188073394495414

In [115]:
# Create an average starts column
test_df['avg_starts'] = (test_df.start_ratio_0 + test_df.start_ratio_1 + test_df.start_ratio_2) / 3

#Create a column that adds up a player's dpi yards and penaltys drawn
test_df['dpis'] = test_df.dpis_drawn_0 + test_df.dpis_drawn_1 + test_df.dpis_drawn_2
test_df['dpi_yards'] = test_df.dpi_yards_0 + test_df.dpi_yards_1 + test_df.dpi_yards_2

# Try adding a column about year over year growth to see if that helps with modeling
# First we'll need to define variables that show how much growth an average player had over that time period.
year_1_growth = (test_df[test_df.compilation_1 >0].compilation_1 - test_df[test_df.compilation_1 > 0].compilation_0).mean()
year_2_growth = (test_df[test_df.compilation_2 >0].compilation_2 - test_df[test_df.compilation_2 >0].compilation_1).mean()

test_df['year_1_growth'] = (test_df.compilation_1 - test_df.compilation_0) / year_1_growth
test_df['year_2_growth'] = (test_df.compilation_2 - test_df.compilation_1) / year_2_growth

print test_df.shape
test_df.head()

(44, 149)


Unnamed: 0_level_0,season_0,season_1,season_2,age_0,age_1,age_2,weight_0,weight_1,weight_2,bmi_0,...,td_points_1,td_points_2,compilation_0,compilation_1,compilation_2,avg_starts,dpis,dpi_yards,year_1_growth,year_2_growth
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Dobson,2013.0,2014.0,2015.0,22.0,23.0,24.0,210.0,210.0,210.0,26.2,...,0.0,0.0,10.546819,0.083563,0.966747,0.458333,7.0,147.0,-2.527641,0.476617
Ace Sanders,2013.0,2014.0,2015.0,22.0,23.0,24.0,173.0,173.0,173.0,27.1,...,0.0,0.0,12.428353,0.229302,0.0,0.088889,0.0,0.0,-2.946962,-0.123745
Ben Watson,2013.0,2014.0,2015.0,33.0,34.0,35.0,255.0,255.0,255.0,31.9,...,0.0,0.0,0.0,0.0,0.0,0.155556,0.0,0.0,0.0,0.0
Chris Hogan,2013.0,2014.0,2015.0,25.0,26.0,27.0,220.0,220.0,220.0,29.0,...,0.376265,0.094066,0.327132,10.805001,10.623025,0.125,1.0,5.0,2.531171,-0.098205
Cordarrelle Patterson,2013.0,2014.0,2015.0,22.0,23.0,24.0,216.0,216.0,216.0,27.7,...,0.063633,0.0,19.58242,7.735541,0.06359,0.291667,6.0,160.0,-2.861887,-4.140231


In [116]:
X_test = scale(test_df)

test_lda_cols = lda_model.transform(scale(test_df))

test_lda_df = pd.DataFrame(test_lda_cols, columns = ['LDA1', 'LDA2', 'LDA3'])

test_lda_df.head()

Unnamed: 0,LDA1,LDA2,LDA3
0,1.757126,-1.088666,-3.088434
1,1.157981,-0.25812,0.295615
2,1.235852,-0.201291,1.724468
3,-2.785315,-2.751434,-0.082557
4,-0.997164,-2.343705,-0.168027


In [117]:
train_df.shape

(872, 151)

In [118]:
test_lda_df.set_index(test_df.index, inplace = True)


In [119]:
rookie_pred = svc_fit.predict(test_lda_df)
test_lda_df['prediction'] = rookie_pred
test_lda_df

Unnamed: 0_level_0,LDA1,LDA2,LDA3,prediction
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aaron Dobson,1.757126,-1.088666,-3.088434,below average
Ace Sanders,1.157981,-0.25812,0.295615,below average
Ben Watson,1.235852,-0.201291,1.724468,below average
Chris Hogan,-2.785315,-2.751434,-0.082557,league_average
Cordarrelle Patterson,-0.997164,-2.343705,-0.168027,league_average
Da'Rick Rogers,0.865497,0.573908,0.02423,below average
Darius Johnson,0.710085,-0.189241,0.296804,below average
David Johnson,0.633285,0.335408,0.817796,below average
DeAndre Hopkins,-5.572474,-3.25233,-2.031542,league_average
DeVier Posey,1.830728,0.267881,2.333915,below average


# PCA

In [120]:
from sklearn.decomposition import PCA

print train_df.shape
train_df.head()

(872, 151)


Unnamed: 0_level_0,season_0,season_1,season_2,age_0,age_1,age_2,weight_0,weight_1,weight_2,bmi_0,...,compilation_0,compilation_1,compilation_2,compilation_3,avg_starts,dpis,dpi_yards,year_1_growth,year_2_growth,categories
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A.J. Green,2011.0,2012.0,2013.0,23.0,24.0,25.0,205.0,205.0,205.0,25.0,...,53.00641,98.47157,82.935491,59.741763,1.0,0.0,0.0,9.264454,-13.711468,quality starter
A.J. Jenkins,2012.0,2013.0,2014.0,23.0,24.0,25.0,192.0,192.0,192.0,26.0,...,0.0,0.597268,0.0,0.0,0.020833,0.0,0.0,0.121706,-0.527123,below average
Aaron Brown,2009.0,2010.0,2011.0,24.0,25.0,26.0,196.0,196.0,196.0,26.6,...,1.007213,0.0,0.0,0.0,0.044444,0.0,0.0,-0.20524,0.0,below average
Aaron Halterman,2007.0,2008.0,2009.0,25.0,26.0,27.0,255.0,255.0,255.0,30.2,...,0.030196,0.0,0.0,0.0,0.0,0.0,0.0,-0.006153,0.0,below average
Aaron Moorehead,2003.0,2004.0,2005.0,23.0,24.0,25.0,200.0,200.0,200.0,25.0,...,0.240777,0.02588,0.222518,0.274562,0.0,0.0,0.0,-0.04379,0.173544,below average


In [123]:
## going to create and scale a new data frame of just the feature columns we want to use
## for PCA

pca_df = train_df[features_no_year_1]
pca_df = scale(pca_df)

In [124]:
## creating the covariance matrix - this explains the variance between the different
## features within our dataframe

## for example, the value in the i,j position within the matrix explains the variance
## between the ith and the jth elements of a random vector, or between our features

cov_mat = np.cov(pca_df.T)

In [125]:
## creating my eigenvalues and corresponding eigenvectors

eigenValues, eigenVectors = np.linalg.eig(cov_mat)


In [126]:
## creating the eigenpairs - just pairing the eigenvalue with its eigenvector
eigenPairs = [(np.abs(eigenValues[i]), eigenVectors[:,i]) for i in range(len(eigenValues))]

## sort in ascending order and then reverse to descending (for clarification's sake)
# eigenPairs.sort()
# eigenPairs.reverse()

## loop through the eigenpairs and printing out the first row (eigenvalue)
## this is also seen in the code block above but just wanted to loop through again
## as it is a bit more clear like this
## I am also creating a list of the eigenvalues in ascending order to be able to reference it
sort_values = []
for i in eigenPairs:
    print i[0]
    sort_values.append(i[0])

15.9865197402
3.10576302391
2.48750785344
1.64437554327
1.5082482694
1.22161235017
1.12754095358
1.04043249825
0.917719338057
0.830283111813
0.774969766868
0.649287636388
0.635009444538
0.542322868587
0.411381307014
0.349671073791
0.27083659808
0.235648595754
0.220396154743
0.185147723815
0.146134129701
0.140098615504
0.126960649577
0.121886591744
0.10632889429
0.0755289899866
0.0676389406711
0.042833506206
0.0310440818611
0.0179058773735
0.0118276498813
0.00601783703884
0.00130408143002
3.64960706758e-17
3.29723518238e-16


In [127]:
## we have the eigenvalues above showing us feature correlation explanation, but it helps
## to see the cumulative variance explained as well, which i can show below

## need to sum the eigen values to get percentages
sumEigenvalues = sum(eigenValues)

## this is a percentage explanation
variance_explained = [(i/sumEigenvalues)*100 for i in sort_values]
variance_explained

[45.623390215194057,
 8.8634324830548401,
 7.0990148766386305,
 4.6928279757211966,
 4.3043389339682241,
 3.4863183387866705,
 3.2178511486450505,
 2.9692552620561021,
 2.6190483074968638,
 2.3695170065163067,
 2.2116601144882382,
 1.8529801156421686,
 1.8122320648506247,
 1.5477169676910612,
 1.1740272556007239,
 0.99791449957986922,
 0.77293144471668884,
 0.67250959011055866,
 0.6289811624542283,
 0.52838685269600927,
 0.41704727054262686,
 0.39982271986864149,
 0.3623287214323388,
 0.34784803869194325,
 0.30344844995638476,
 0.21554964049254294,
 0.19303249451027865,
 0.1222411006076167,
 0.088595659570865459,
 0.051100980315639692,
 0.033754531607640192,
 0.017174102427361466,
 0.0037216740679843988,
 1.041549068106593e-16,
 9.4098684267686055e-16]

In [134]:
### based on the above results, it seems that sticking to 16 features would be a decent
## cutoff point since the variance explained per feature drops below 1%

## this can very easily be manipulated by changing n_components adn then adding/subtracting
## columns to the dataframe in the code block below

## instantiate
pca = PCA(n_components = 16)

## fit and transform the standardized data
pca_model = pca.fit(pca_df)
pca_cols = pca_model.transform(pca_df)

In [135]:
## Here I am simply creating the column headers for the pca features
pca_col_list = []

for i in range(1, 17):
    pca_col_list.append('pca'+str(i))

In [136]:
## going to organize the columns into dataframe for organization
pca_df = pd.DataFrame(pca_cols, columns = pca_col_list)

##previewing dataframe
print pca_df.shape

(872, 16)


In [137]:
## We used all of our columns to perform the PCA so we only need to join the names back on
## since we would not want to build a model off of the PCA features as well as the 
## original features that were used to construct the PCA columns

## I am going to set the index of our pca dataframe to the names of the related player

pca_df.set_index(train_df.index, drop = False, inplace = True)
pca_df.head()

Unnamed: 0_level_0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,pca10,pca11,pca12,pca13,pca14,pca15,pca16
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
A.J. Green,-12.892792,1.329578,-2.777141,2.452978,-4.338498,1.490638,1.723164,0.998773,-1.914128,0.843979,-1.392522,-0.107002,2.699691,-1.619266,0.011533,0.454178
A.J. Jenkins,1.453281,1.322052,1.236139,0.516286,0.147259,1.260387,-0.431638,-0.552503,-0.359431,0.644517,0.239868,0.295308,0.189341,0.064761,-0.593146,-0.396209
Aaron Brown,2.87819,-0.135445,0.220238,-0.328004,-0.900965,0.189693,0.110739,0.066989,-0.169119,-0.009064,0.0468,0.146688,-0.367358,-0.339924,0.045357,-0.059018
Aaron Halterman,3.118955,-0.315891,-2.590588,0.920189,0.489418,-0.386888,-0.172732,0.14871,-0.029566,0.087971,0.432581,-0.056638,0.114427,-0.080593,0.110085,0.043237
Aaron Moorehead,0.969374,-0.251551,1.505049,1.284337,1.42974,0.975902,-0.019919,0.785061,-0.796259,-0.690721,0.250392,0.428363,0.938047,0.775258,0.317561,-0.039776


In [138]:
pca_joined_df = pca_df.join(comp_df)

In [139]:
# Create categories for player season_3 ratings

bins = [-1, 10, 30, 65, 200]
labels = ['below average', 'league_average', 'quality starter', 'all_pro']
pca_joined_df['categories'] =  pd.cut(pca_joined_df['compilation_3'], bins, labels=labels)
pca_joined_df

Unnamed: 0_level_0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,pca10,pca11,pca12,pca13,pca14,pca15,pca16,compilation_3,categories
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
A.J. Green,-12.892792,1.329578,-2.777141,2.452978,-4.338498,1.490638,1.723164,0.998773,-1.914128,0.843979,-1.392522,-0.107002,2.699691,-1.619266,0.011533,0.454178,59.741763,quality starter
A.J. Jenkins,1.453281,1.322052,1.236139,0.516286,0.147259,1.260387,-0.431638,-0.552503,-0.359431,0.644517,0.239868,0.295308,0.189341,0.064761,-0.593146,-0.396209,0.000000,below average
Aaron Brown,2.878190,-0.135445,0.220238,-0.328004,-0.900965,0.189693,0.110739,0.066989,-0.169119,-0.009064,0.046800,0.146688,-0.367358,-0.339924,0.045357,-0.059018,0.000000,below average
Aaron Halterman,3.118955,-0.315891,-2.590588,0.920189,0.489418,-0.386888,-0.172732,0.148710,-0.029566,0.087971,0.432581,-0.056638,0.114427,-0.080593,0.110085,0.043237,0.000000,below average
Aaron Moorehead,0.969374,-0.251551,1.505049,1.284337,1.429740,0.975902,-0.019919,0.785061,-0.796259,-0.690721,0.250392,0.428363,0.938047,0.775258,0.317561,-0.039776,0.274562,below average
Aaron Walker,3.013970,-0.269620,-2.625744,0.677416,0.563743,-0.237749,0.003479,0.395056,0.028626,-0.084283,0.785045,-0.253703,0.542811,-0.291671,0.119224,0.055124,0.000000,below average
Adam Jennings,2.878849,-0.055542,0.901534,-0.910277,-1.473379,0.281649,-0.093502,-0.675676,0.231557,-0.465309,-0.444792,0.298390,-0.428576,-0.109878,0.210936,-0.135636,0.000000,below average
Adrian Arrington,1.806616,1.056161,1.626470,0.410156,0.705266,2.228908,-0.244692,0.106920,-0.104391,-0.142290,0.919345,-0.288578,0.954624,0.460765,-1.080136,0.182419,0.000000,below average
Adrian Madise,2.989113,-0.246314,-0.576047,0.278871,-0.665936,-0.257046,-0.169049,-0.620172,-0.056011,0.256014,-0.637354,0.459049,-1.041793,0.061517,-0.004265,-0.090552,0.000000,below average
Ahmad Merritt,0.386600,0.206790,1.373420,-0.247224,0.243270,-0.953224,-0.762603,-0.504800,0.449854,0.272211,-1.271946,0.630523,0.960674,1.189778,0.272013,-0.779214,0.000000,below average


In [143]:
pca_test_df = scale(test_df)
pca_test_cols = pca_model.transform(pca_test_df)
pca_test_cols

ValueError: operands could not be broadcast together with shapes (44,149) (16,) 

In [91]:
## setting my X and y in order to build a model off of the data

pca_X = pca_joined_df.drop(['compilation_3', 'categories'], axis = 1)
pca_y = pca_joined_df['categories']
print pca_X.shape
print pca_y.shape

(872, 16)
(872,)


In [93]:
svc = SVC(C = .8, class_weight=cat_weights, probability = True, kernel='linear', degree = 1, shrinking = True)

pca_cvp = cross_val_predict(svc, pca_X, pca_y, cv = 10, n_jobs = -1, verbose = 1)


print classification_report(pca_y, pca_cvp)
print recall_score(pca_y, pca_cvp, average = 'macro')

             precision    recall  f1-score   support

    all_pro       0.33      0.33      0.33        27
below average       0.96      0.88      0.92       741
league_average       0.22      0.54      0.31        57
quality starter       0.30      0.19      0.23        47

avg / total       0.86      0.80      0.82       872

0.486131449737


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.7s finished


In [94]:
pca_svc_model = SVC(C = .8, class_weight = cat_weights, probability = True, kernel = 'linear', shrinking = True)
pca_svc_fit = svc_model.fit(pca_X, pca_y)
pca_svc_fit.score(pca_X, pca_y)

0.83715596330275233

In [None]:
pca_rookie_pred = svc_fit.predict(test_lda_df)
test_lda_df['prediction'] = rookie_pred
test_lda_df