In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
from collections import defaultdict
import math

In [2]:
# Load every candidate dataset from CSV files in the working directory.
# Only the frame assigned to `data` below is consumed by the cells that follow.
combine_cleaned = pd.read_csv('combine_cleaned.csv')
final_continuous = pd.read_csv('finalData.csv')
final_discrete = pd.read_csv('discreteData.csv')
final_combined = pd.read_csv('combinedFeatures.csv')
final_beast = pd.read_csv('superbeastfire.csv')

# Choose the dataset to model. The feature list picked in the configuration cell
# (`features`) must name columns that exist in this frame.
data = final_discrete

In [3]:
# TODO: figure out how to add 'school' and 'conference' back in.
# Continuous feature names, grouped by the prefix they share.
featuresStandard = [
    # body measurements and workout numbers
    'heightinchestotal', 'weight', 'fortyyd', 'vertical', 'bench',
    'twentyss', 'threecone', 'broad',
    # games and rushing*
    'games', 'rushingAtt', 'rushingYds', 'rushingAvg', 'rushingTD',
    # pass*
    'passCmp', 'passPct', 'passYds', 'passTD', 'passInt', 'passRate',
    # rec*
    'recYds', 'recAtt', 'recTD', 'recAvg',
    # tackles, sacks, interceptions, passes defended
    'soloTackles', 'tackleAssists', 'totalTackles', 'sacks', 'ints',
    'intTDs', 'passDef',
    # fumbles
    'fumbles', 'fumblesForced',
]
# Continuous features plus engineered interaction columns (products / ratios).
# Every name appears exactly once: a repeated name would select the same column
# twice and hand the regression two perfectly collinear copies of one feature.
featuresCombined = [
    # raw measurements and statistics
    'heightinchestotal', 'weight', 'fortyyd', 'twentyss', 'threecone', 'vertical', 'broad', 'bench',
    'games', 'rushingAtt', 'rushingYds', 'rushingAvg', 'rushingTD', 'passCmp',
    'passAtt', 'passPct', 'passYds', 'passTD', 'passInt', 'passRate',
    'recYds', 'recAtt', 'recTD', 'recAvg',
    'soloTackles', 'tackleAssists', 'totalTackles', 'sacks', 'ints', 'intTDs', 'passDef',
    'fumbles', 'fumblesForced',
    # engineered interaction terms ('rushingAtt * rushingYds' used to be listed twice)
    'heightinchestotal * weight', 'rushingAtt * rushingYds', 'passTD / passInt', 'passRate * passPct',
    'rushingAvg * rushingTD', 'recAvg * recTD', 'recAtt * recYds', 'soloTackles * tackleAssists',
    'totalTackles * sacks', 'intTDs * ints',
    # NOTE(review): spelled 'passDefs' although the raw column above is 'passDef';
    # confirm this matches the header in combinedFeatures.csv.
    'passDefs * ints',
    'fumblesForced * totalTackles',
]
# Discretized feature names: one 'conf<Conference>' entry per conference, then for
# every statistic the five suffixed names <stat>None, <stat>Q1 ... <stat>Q4
# (presumably a missing-value bucket and four quantile buckets -- confirm against
# the code that produced discreteData.csv). The generated order reproduces the
# hand-written list exactly: conferences first, then statistics in the order below.
featuresDiscrete = list(
    'conf' + conference
    for conference in (
        'ACC', 'Pac-12', 'Unknown', 'SEC', 'Big 12', 'Big Ten', 'American',
        'Big East', 'Pac-10', 'MAC', 'Sun Belt', 'MWC', 'WAC', 'CUSA', 'Ind',
        'Southern', 'MVC', 'Pac-8', 'Big West', 'SWC', 'Southland', 'Big 8', 'SWAC',
    )
) + list(
    statistic + bucket
    for statistic in (
        'heightinchestotal', 'weight', 'fortyyd', 'vertical', 'bench',
        'twentyss', 'threecone', 'broad',
        'games', 'rushingAtt', 'rushingYds', 'rushingAvg', 'rushingTD',
        'passCmp', 'passPct', 'passYds', 'passTD', 'passInt', 'passRate',
        'recYds', 'recAtt', 'recTD', 'recAvg',
        'soloTackles', 'tackleAssists', 'totalTackles', 'sacks', 'ints',
        'intTDs', 'passDef', 'fumbles', 'fumblesForced',
    )
    for bucket in ('None', 'Q1', 'Q2', 'Q3', 'Q4')
)
# Every discretized name followed by every continuous name, as a new list.
featuresBeast = list(featuresDiscrete)
featuresBeast.extend(featuresStandard)

In [4]:
# Draft year held out as the test set; rows from every other year are used for training.
TEST_YEAR = 2015
# Feature list to model with. It must name columns present in `data`.
# Alternatives kept for switching datasets:
# features = featuresStandard if data.equals(final_continuous) else featuresDiscrete
# features = featuresCombined
features = featuresDiscrete
# Forwarded as `normalize=` when the regression estimator is constructed.
shouldNormalize = False
# Alternative: normalize only when one of the continuous feature lists is selected.
# shouldNormalize = True if features == featuresStandard or features == featuresCombined else False

In [5]:
# Build draft-value and draft-pick lookups from the selected dataset
def get_player_draft_info():
    """Return ``(valuesMap, pickMap)`` built from the global ``data`` frame.

    Both dictionaries are keyed by ``(name, year)``. ``pickMap`` stores the raw
    ``pick`` column and ``valuesMap`` stores ``MAX_VALUE - pick``, so an earlier
    pick yields a larger value. When a key occurs more than once, the last row
    wins.
    """
    # Ceiling that pick numbers are subtracted from.
    # NOTE(review): this was previously described as 7 rounds * 32 picks + 32
    # compensatory + 1, which adds up to 257 rather than 256 -- confirm the value.
    MAX_VALUE = 256
    keys = list(zip(data['name'], data['year']))
    picks = list(data['pick'])
    valuesMap = {key: MAX_VALUE - pick for key, pick in zip(keys, picks)}
    pickMap = dict(zip(keys, picks))
    return valuesMap, pickMap

In [6]:
# Distinct values of the 'pos' column; the rows are partitioned by these later on.
positionsSet = set(data['pos'])
positions = list(positionsSet)

In [7]:
# Lookups keyed by (name, year): draft value (MAX_VALUE - pick) and raw pick number.
draftValues, draftPositions = get_player_draft_info()

In [8]:
# Swap in a NEW dataset here (through `data` and `features`).
# The three identifier columns must stay first: build_data_arrays drops them
# positionally with row[3:] and treats everything after them as model features.
combineFeatures = data[['name', 'pos', 'year'] + features]

In [9]:
def get_position_maps():
    """Split the global ``combineFeatures`` frame by position.

    Returns a dict that maps each entry of the global ``positions`` list to the
    rows of ``combineFeatures`` whose 'pos' column equals that entry.
    """
    posColumn = combineFeatures['pos']
    return {label: combineFeatures[posColumn == label] for label in positions}

In [10]:
# One sub-frame of combineFeatures per position, keyed by position label.
positionMaps = get_position_maps()

In [11]:
# Assemble per-position feature rows and draft-value targets
def build_data_arrays():
    """Collect feature rows and targets for every position in ``positionMaps``.

    For each row, the features are everything after the first three columns
    (``row[3:]``) and the target is ``draftValues[(name, year)]``.

    Returns seven ``defaultdict(list)`` objects keyed by position, in this order:
    ``X`` / ``Y``          -- all rows and targets,
    ``xTrain`` / ``yTrain`` -- rows whose year is not ``TEST_YEAR``,
    ``xTest`` / ``yTest``   -- rows whose year equals ``TEST_YEAR``,
    ``names``              -- player names of the test rows, aligned with ``xTest``.
    """
    X, Y = defaultdict(list), defaultdict(list)
    xTrain, yTrain = defaultdict(list), defaultdict(list)
    xTest, yTest = defaultdict(list), defaultdict(list)
    names = defaultdict(list)
    for position, frame in positionMaps.items():
        for _, player in frame.iterrows():
            target = draftValues[(player['name'], player['year'])]
            X[position].append(player[3:])
            Y[position].append(target)
            # Each row lands in exactly one of the test / train partitions.
            if player['year'] == TEST_YEAR:
                xTest[position].append(player[3:])
                yTest[position].append(target)
                names[position].append(player['name'])
            else:
                xTrain[position].append(player[3:])
                yTrain[position].append(target)
    return X, Y, xTrain, yTrain, xTest, yTest, names

In [12]:
# Per-position feature rows / targets: full set, training split, TEST_YEAR split, and test-row names.
X, Y, xTrain, yTrain, xTest, yTest, names = build_data_arrays()

In [13]:
# Ridge regression; RidgeCV picks the regularisation strength from `alphas`.
# Plain least squares is kept as a commented-out alternative:
# predictor = linear_model.LinearRegression(fit_intercept=True, normalize=shouldNormalize, copy_X=True, n_jobs=1)
# NOTE(review): the `normalize` argument was deprecated and later removed from
# scikit-learn's linear models -- confirm the installed version still accepts it.
# This single estimator object is reused (and refitted) by every later cell.
predictor = linear_model.RidgeCV(alphas=[0.1, 1.0, 10], fit_intercept=True, normalize=shouldNormalize)

In [14]:
numFolds = 10
seed = 1
# The folds are deliberately left unshuffled, which keeps the splits identical to
# the ones this notebook has always used. `random_state` is only honoured together
# with shuffle=True (recent scikit-learn raises ValueError otherwise), so it is not
# passed here; use KFold(n_splits=numFolds, shuffle=True, random_state=seed) if
# seeded, shuffled folds are wanted instead.
kFold = KFold(n_splits=numFolds)
def eval_data():
    """Cross-validate the global ``predictor`` separately for every position.

    Positions with no more than ``numFolds`` rows are reported and skipped.

    Returns:
        dict mapping position -> ``(mean, std)`` of the per-fold scores. The scorer
        is 'neg_mean_squared_error', so the means are negated MSE values (closer
        to zero is better), even though the printed label reads
        "Mean Squared Error".
    """
    errorMap = {}
    for p in positions:
        sampleCount = len(X[p])
        print('{} {}'.format(p, sampleCount))
        if sampleCount > numFolds:
            # 'neg_mean_squared_error' is the supported name of the scorer that used
            # to be registered as 'mean_squared_error'; the sign convention is the same.
            results = cross_val_score(predictor, np.array(X[p]), np.array(Y[p]), cv=kFold,
                                      scoring='neg_mean_squared_error')
            errorMap[p] = (results.mean(), results.std())
            print('Results[{}]: mean={} (std={}) Mean Squared Error'.format(p, results.mean(), results.std()))
        else:
            print('Position: {} has too little data ({} row(s))'.format(p, sampleCount))
    return errorMap

In [15]:
# Run the per-position cross-validation; maps position -> (mean, std) of the fold scores.
errors = eval_data()

C 110
Results[C]: mean=-4109.97554039 (std=1280.47663554) Mean Squared Error
LB

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample

 541
Results[LB]: mean=-4807.45521717 (std=768.635127799) Mean Squared Error
G 260
Results[G]: mean=-4483.46837972 (std=709.786564043) Mean Squared Error
LS 1
Position: LS has too little data (1 row(s))
K 38
Results[K]: mean=-3598.26917116 (std=2628.40784676) Mean Squared Error
DE 396


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


Results[DE]: mean=-5608.17598469 (std=1157.85463689) Mean Squared Error
DB 847


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


Results[DB]: mean=-4362.7661728 (std=907.376621327) Mean Squared Error
P 34
Results[P]: mean=-2257.66548719 (std=1355.30085952) Mean Squared Error
FB 49
Results[FB]: mean=-3000.18367253 (std=1534.36266116) Mean Squared Error
QB 211
Results[QB]: mean=-5399.10313765 (std=1400.12451635) Mean Squared Error


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample

WR 552
Results[WR]: mean=-4772.7942548 (std=627.121240703) Mean Squared Error
RB 342
Results[RB]: mean=-4932.69259811 (std=1024.69594231) Mean Squared Error
DT 350


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


Results[DT]: mean=-6112.39089771 (std=825.957665125) Mean Squared Error
TE 256
Results[TE]: mean=-3762.13202132 (std=765.565378872) Mean Squared Error
T 345
Results[T]: mean=-5796.99831682 (std=926.192286891) Mean Squared Error


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


In [16]:
# Display the cross-validation summary: position -> (mean score, std of scores).
errors

{'C': (-4109.9755403932377, 1280.4766355442666),
 'DB': (-4362.7661727956283, 907.37662132728894),
 'DE': (-5608.1759846891709, 1157.8546368881243),
 'DT': (-6112.3908977117862, 825.95766512474643),
 'FB': (-3000.1836725295389, 1534.3626611616951),
 'G': (-4483.4683797199732, 709.78656404260857),
 'K': (-3598.2691711568427, 2628.4078467632203),
 'LB': (-4807.4552171689056, 768.63512779930386),
 'P': (-2257.6654871918281, 1355.3008595232361),
 'QB': (-5399.1031376474712, 1400.1245163544027),
 'RB': (-4932.6925981058448, 1024.6959423110661),
 'T': (-5796.9983168239651, 926.19228689113424),
 'TE': (-3762.1320213223325, 765.56537887173738),
 'WR': (-4772.7942547990579, 627.12124070290463)}

In [17]:
# Map players to their true and predicted draft positions
def get_relative_error(results, test_year):
    """Score a predicted draft ordering against the true ordering.

    Args:
        results: DataFrame with a 'name' column whose row order is the predicted
            draft order (first row = predicted to be picked first). An 'error'
            column is added to it in place so the frame can be inspected later.
        test_year: year that completes the ``(name, year)`` key used to look up
            each player's true pick in the global ``draftPositions``.

    Returns:
        The sum over players of ``abs(true rank - predicted rank)`` divided by the
        number of rows in ``results``; ``0.0`` when ``results`` is empty.

    Ranks are positions within this group of players (0, 1, 2, ...), not league
    wide pick numbers. Ties in the true pick are broken by name. Players are
    identified by name alone, so a repeated name collapses onto one entry.
    """
    playerNames = list(results['name'])
    if not playerNames:
        # Nothing to rank; avoid dividing by zero below.
        results['error'] = 0.0
        return 0.0

    truePick = {}       # true pick number
    predictedRank = {}  # relative pick based on regression
    for rank, name in enumerate(playerNames):
        truePick[name] = draftPositions[(name, test_year)]  # primary key is (name, year)
        predictedRank[name] = rank

    # Rank players by their true pick and compare with the predicted rank.
    trueOrder = sorted(truePick, key=lambda name: (truePick[name], name))
    errors = {}
    for rank, name in enumerate(trueOrder):
        errors[name] = math.fabs(rank - predictedRank[name])
    absolute_error = sum(errors.values())

    # Add the errors to the results DataFrame (for visualization).
    results['error'] = results['name'].map(errors)

    # Average the error over the total number of players.
    return absolute_error / float(len(results))

In [18]:
# Fit on every non-test year and evaluate on TEST_YEAR, one model per position.
scores = {}         # position -> (mean squared error, R^2) on the test year
relativeError = {}  # position -> mean absolute rank error from get_relative_error
coefficients = {}   # position -> DataFrame of (feature, coefficient), largest first
output = {}         # position -> DataFrame of (name, predicted value), best first
for p in positions:
    if len(xTrain[p]) > 1 and len(xTest[p]) > 1:
        # The shared estimator is refitted for each position.
        predictor.fit(np.array(xTrain[p]), np.array(yTrain[p]))
        # zip() is materialised so the DataFrame constructor never receives a one-shot iterator.
        coefficients[p] = pd.DataFrame(list(zip(features, predictor.coef_)),
                                       columns=['feature', 'coefficient']
                                       ).sort_values(by=['coefficient'], ascending=False)
        prediction = predictor.predict(np.array(xTest[p]))
        output[p] = pd.DataFrame(list(zip(names[p], prediction)),
                                 columns=['name', 'value']
                                 ).sort_values(by=['value'], ascending=False)
        actual = np.array(yTest[p])
        scores[p] = (mean_squared_error(actual, prediction),
                     r2_score(actual, prediction))
        # Also adds an 'error' column to output[p] in place.
        relativeError[p] = get_relative_error(output[p], TEST_YEAR)
    else:
        print('Not enough data for position: {}'.format(p))

Not enough data for position: LS
Not enough data for position: K
Not enough data for position: P


In [19]:
# Display position -> (mean squared error, R^2) on the TEST_YEAR hold-out.
scores

{'C': (2959.3307790070112, 0.39985925051798454),
 'DB': (4176.4407568159977, 0.27739150985915273),
 'DE': (3769.1942258405525, 0.23998869110413112),
 'DT': (6797.4510558075499, -0.33805261773391515),
 'FB': (3706.6265304646467, -0.72546694851573568),
 'G': (2931.7510033338385, 0.22930345573104793),
 'LB': (4908.956157136422, 0.05342303838827922),
 'QB': (6455.8207114041707, -0.010675012488432278),
 'RB': (2617.4645081555773, 0.39758346719194027),
 'T': (7078.2326043014546, 0.09227942149123225),
 'TE': (3515.0965904496029, 0.17119209118532108),
 'WR': (4168.9710668685457, 0.27715694333202567)}

In [20]:
# Display the per-position predictions (sorted by predicted value) with each player's rank error.
output

{'C':             name       value  error
 1      AliMarpet  163.424409    1.0
 0  CameronErving  152.239793    1.0
 2   HronissGrasu  125.465058    0.0
 4      MaxGarcia  125.465058    1.0
 5     AndyGallik  116.010540    1.0
 3      ShaqMason   69.069627    2.0
 6   AustinReiter   69.069627    0.0,
 'DB':                 name       value  error
 3         ByronJones  189.840812    3.0
 21          JoshShaw  180.075602   20.0
 0         TraeWaynes  170.701670    2.0
 8           EricRowe  168.805175    5.0
 2       MarcusPeters  168.523865    2.0
 29       DamianSwann  165.928011   24.0
 1       KevinJohnson  165.707030    5.0
 9        RonaldDarby  165.091232    2.0
 4   DamariousRandall  164.876207    4.0
 37      QuandreDiggs  162.898229   28.0
 33     CharlesGaines  158.185953   23.0
 26       BobbyMcCain  157.489341   15.0
 16        CraigMager  151.892939    4.0
 14        PJWilliams  151.280343    1.0
 38     GerodHolliman  146.534419   24.0
 17      StevenNelson  146.227703   

In [21]:
# Display position -> mean absolute rank error on the TEST_YEAR hold-out.
relativeError

{'C': 0.8571428571428571,
 'DB': 9.869565217391305,
 'DE': 5.142857142857143,
 'DT': 8.090909090909092,
 'FB': 1.5,
 'G': 5.7894736842105265,
 'LB': 8.944444444444445,
 'QB': 2.2857142857142856,
 'RB': 3.3333333333333335,
 'T': 5.2,
 'TE': 5.0,
 'WR': 7.117647058823529}

In [22]:
# Display the fitted coefficients per position, largest first.
coefficients

{'C':                  feature  coefficient
 27   heightinchestotalQ4    14.502761
 41            verticalQ3    13.733438
 32              weightQ4    13.694954
 50            twentyssQ2    12.765850
 54           threeconeQ1     8.023571
 59               broadQ1     5.592308
 46               benchQ3     4.735542
 34             fortyydQ1     3.336635
 47               benchQ4     3.007020
 49            twentyssQ1     2.100805
 129              recTDQ1     0.000000
 124             recAttQ1     0.000000
 125             recAttQ2     0.000000
 126             recAttQ3     0.000000
 127             recAttQ4     0.000000
 128            recTDNone     0.000000
 0                confACC     0.000000
 130              recTDQ2     0.000000
 131              recTDQ3     0.000000
 132              recTDQ4     0.000000
 139        soloTacklesQ1     0.000000
 134             recAvgQ1     0.000000
 135             recAvgQ2     0.000000
 136             recAvgQ3     0.000000
 137             rec