In [1]:
from numpy.random import seed 
s = 1
seed(s)
from tensorflow import set_random_seed
set_random_seed(s)
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from collections import defaultdict
import math

Using TensorFlow backend.


In [2]:
# format the data to be fed into the network
final_continuous = pd.read_csv('finalData.csv')
final_discrete = pd.read_csv('discreteData.csv')
final_beast = pd.read_csv('superbeastfire.csv')

# Set which of the datasets to use 
data = final_beast
TEST_YEAR = 2015

In [3]:
featuresStandard = ['heightinchestotal', 'weight', 'fortyyd', 'vertical', 'bench', 'twentyss', 'threecone', 'broad', 
            'games', 'rushingAtt', 'rushingYds', 'rushingAvg', 'rushingTD', 'passCmp',
            'passPct', 'passYds', 'passTD', 'passInt', 'passRate', 'recYds', 'recAtt', 'recTD', 'recAvg', 'soloTackles', 
            'tackleAssists', 'totalTackles', 'sacks', 'ints', 'intTDs', 'passDef', 'fumbles', 'fumblesForced']
featuresDiscrete = ['confACC','confPac-12','confUnknown','confSEC','confBig 12','confBig Ten','confAmerican',
            'confBig East','confPac-10','confMAC','confSun Belt','confMWC','confWAC','confCUSA','confInd','confSouthern','confMVC','confPac-8',
            'confBig West','confSWC','confSouthland','confBig 8','confSWAC','heightinchestotalNone','heightinchestotalQ1','heightinchestotalQ2',
            'heightinchestotalQ3','heightinchestotalQ4','weightNone','weightQ1','weightQ2','weightQ3','weightQ4','fortyydNone','fortyydQ1','fortyydQ2',
            'fortyydQ3','fortyydQ4','verticalNone','verticalQ1','verticalQ2','verticalQ3','verticalQ4','benchNone','benchQ1','benchQ2','benchQ3',
            'benchQ4','twentyssNone','twentyssQ1','twentyssQ2','twentyssQ3','twentyssQ4','threeconeNone','threeconeQ1','threeconeQ2','threeconeQ3',
            'threeconeQ4','broadNone','broadQ1','broadQ2','broadQ3','broadQ4','gamesNone','gamesQ1','gamesQ2','gamesQ3','gamesQ4','rushingAttNone',
            'rushingAttQ1','rushingAttQ2','rushingAttQ3','rushingAttQ4','rushingYdsNone','rushingYdsQ1','rushingYdsQ2','rushingYdsQ3','rushingYdsQ4',
            'rushingAvgNone','rushingAvgQ1','rushingAvgQ2','rushingAvgQ3','rushingAvgQ4','rushingTDNone','rushingTDQ1','rushingTDQ2','rushingTDQ3','rushingTDQ4'
            ,'passCmpNone','passCmpQ1','passCmpQ2','passCmpQ3','passCmpQ4','passPctNone','passPctQ1','passPctQ2','passPctQ3','passPctQ4','passYdsNone','passYdsQ1',
            'passYdsQ2','passYdsQ3','passYdsQ4','passTDNone','passTDQ1','passTDQ2','passTDQ3','passTDQ4','passIntNone','passIntQ1','passIntQ2','passIntQ3','passIntQ4','passRateNone',
            'passRateQ1','passRateQ2','passRateQ3','passRateQ4','recYdsNone','recYdsQ1','recYdsQ2','recYdsQ3','recYdsQ4','recAttNone','recAttQ1','recAttQ2','recAttQ3',
            'recAttQ4','recTDNone','recTDQ1','recTDQ2','recTDQ3','recTDQ4','recAvgNone','recAvgQ1','recAvgQ2','recAvgQ3','recAvgQ4','soloTacklesNone','soloTacklesQ1',
            'soloTacklesQ2','soloTacklesQ3','soloTacklesQ4','tackleAssistsNone','tackleAssistsQ1','tackleAssistsQ2','tackleAssistsQ3','tackleAssistsQ4','totalTacklesNone',
            'totalTacklesQ1','totalTacklesQ2','totalTacklesQ3','totalTacklesQ4','sacksNone','sacksQ1','sacksQ2','sacksQ3','sacksQ4','intsNone','intsQ1','intsQ2','intsQ3',
            'intsQ4','intTDsNone','intTDsQ1','intTDsQ2','intTDsQ3','intTDsQ4','passDefNone','passDefQ1','passDefQ2','passDefQ3','passDefQ4','fumblesNone','fumblesQ1',
            'fumblesQ2','fumblesQ3','fumblesQ4','fumblesForcedNone','fumblesForcedQ1','fumblesForcedQ2','fumblesForcedQ3','fumblesForcedQ4']
featuresBeast = featuresDiscrete + featuresStandard
# features = featuresStandard if data.equals(final_continuous) else featuresDiscrete
features = featuresBeast

In [4]:
numFeatures = len(features)

In [5]:
# get player values/draft positions based on 'nfl_draft' data
def get_player_draft_info():
    valuesMap = {}
    pickMap = {}
    MAX_VALUE = 256 # number of draft picks (7 rounds * 32 picks + 32 compensatory + 1)
    for index, row in data.iterrows():
        primaryKey = (row['name'], row['year'])
        valuesMap[primaryKey] = (MAX_VALUE - row['pick'])
        pickMap[primaryKey] = row['pick']
    return valuesMap, pickMap

In [6]:
# Creates list of positions to partition by
positionsSet = set()
for position in data['pos']:
    positionsSet.add(position)
positions = list(positionsSet)

In [7]:
draftValues, draftPositions = get_player_draft_info()

In [8]:
combineFeatures = data[['name', 'pos', 'year'] + features]

In [9]:
def get_position_maps():
    positionMaps = {}
    for position in positions:
        positionMaps[position] = combineFeatures[combineFeatures['pos'] == position]
    return positionMaps

In [10]:
positionMaps = get_position_maps()

In [11]:
# build X matrix and 
def build_data_arrays():
    X = defaultdict(list)
    Y = defaultdict(list)
    xTrain = defaultdict(list)
    yTrain = defaultdict(list)
    xTest = defaultdict(list)
    yTest = defaultdict(list)
    names = defaultdict(list)
    for position in positionMaps:
        for index, row in positionMaps[position].iterrows(): 
            X[position].append(row[3:])                  
            Y[position].append(draftValues[(row['name'], row['year'])])
            if row['year'] == TEST_YEAR:
                xTest[position].append(row[3:]) 
                yTest[position].append(draftValues[(row['name'], row['year'])])
                names[position].append(row['name'])
            if not row['year'] == TEST_YEAR:
                xTrain[position].append(row[3:])                  
                yTrain[position].append(draftValues[(row['name'], row['year'])])
    return X, Y, xTrain, yTrain, xTest, yTest, names

In [12]:
X, Y, xTrain, yTrain, xTest, yTest, names = build_data_arrays()

In [13]:
def base_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [14]:
def deep_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(numFeatures // 2, kernel_initializer='normal'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [15]:
def wide_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(numFeatures * 2, kernel_initializer='normal'))
    model.add(Dense(1, kernel_initializer='normal', activation='linear'))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    model.output_shape
    model.summary()
    return model

In [16]:
def positions_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(len(positions), kernel_initializer='normal'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [17]:
estimators = []
estimators.append(('standardize', StandardScaler())) # Makes data Gaussian 
predictor = KerasRegressor(build_fn=deep_model, nb_epoch=1000, batch_size=5, verbose=0)
estimators.append(('mlp', predictor))
pipeline = Pipeline(estimators)

In [18]:
numFolds = 10
kFold = KFold(n_splits=numFolds, random_state=s)
def eval_data():
    errorMap = {}
    for p in positions:
        if len(X[p]) > numFolds:
            results = cross_val_score(predictor, np.array(X[p]), np.array(Y[p]), cv=kFold)
            errorMap[p] = (results.mean(), results.std())
            print 'Results[{}]: mean={} (std={}) Mean Squared Error'.format(p, results.mean(), results.std())
        else:
            print 'Position: {} has too little data ({} row(s))'.format(p, len(X[p]))
    return errorMap

In [19]:
errors = eval_data()
errors

Results[C]: mean=-5197.67579769 (std=1902.84752465) Mean Squared Error
Results[LB]: mean=-5130.48856205 (std=809.989380131) Mean Squared Error
Results[G]: mean=-4563.06433569 (std=741.287290085) Mean Squared Error
Position: LS has too little data (1 row(s))
Results[K]: mean=-5140.38017578 (std=4372.54165176) Mean Squared Error
Results[DE]: mean=-5494.04582425 (std=1189.13414068) Mean Squared Error
Results[DB]: mean=-5024.38101267 (std=1036.85331149) Mean Squared Error
Results[P]: mean=-4923.29293823 (std=2316.24253703) Mean Squared Error
Results[FB]: mean=-3846.32945557 (std=2718.86504525) Mean Squared Error
Results[QB]: mean=-6190.05264987 (std=2016.92437681) Mean Squared Error
Results[WR]: mean=-5474.94888374 (std=1022.45953815) Mean Squared Error
Results[RB]: mean=-5346.14606501 (std=1077.64195902) Mean Squared Error
Results[DT]: mean=-5372.30746787 (std=641.528902992) Mean Squared Error
Results[TE]: mean=-4454.29576842 (std=990.440713541) Mean Squared Error
Results[T]: mean=-5878.9

{'C': (-5197.675797687878, 1902.8475246543571),
 'DB': (-5024.3810126749404, 1036.8533114859035),
 'DE': (-5494.0458242465293, 1189.1341406771467),
 'DT': (-5372.3074678693501, 641.52890299199464),
 'FB': (-3846.3294555664061, 2718.8650452476404),
 'G': (-4563.0643356910123, 741.28729008531025),
 'K': (-5140.3801757812498, 4372.5416517571211),
 'LB': (-5130.4885620541045, 809.98938013093107),
 'P': (-4923.2929382324219, 2316.2425370314304),
 'QB': (-6190.0526498687223, 2016.9243768142208),
 'RB': (-5346.146065014751, 1077.6419590198273),
 'T': (-5878.9578201037502, 1171.7044102877292),
 'TE': (-4454.2957684150115, 990.44071354067637),
 'WR': (-5474.9488837427898, 1022.4595381499059)}

In [20]:
# Map players to their true and predicted draft positions 
def get_relative_error(results, test_year):
    trueDraftMap = {}      # true pick number
    predictedDraftMap = {} # relative pick based on regression
    i = 0
    for index, row in results.iterrows():
        trueDraftMap[row['name']] = draftPositions[(row['name'], test_year)] #need primary key to get draft positions 
        predictedDraftMap[row['name']] = i
        i += 1
    
    # Compute absolute_error based on relative draft positions
    i = 0
    absolute_error = 0
    errors = {}
    for key, value in sorted(trueDraftMap.iteritems(), key=lambda (k,v): (v,k)):  
        errors[key] = math.fabs(i - predictedDraftMap[key]) 
        absolute_error += errors[key]
        i += 1
        
    #Add the errors to the results Dataframe (for visualization)
    results['error'] = 0.0
    for index, row in results.iterrows():
        results.at[index, 'error'] = errors[row['name']]

    #average the error by the total number of players
    return absolute_error / float(len(results))

In [21]:
scores = {}
output = {}
relativeError = {} 
for p in positions:
    if len(xTrain[p]) > 1 and len(xTest[p]) > 1:
        predictor.fit(np.array(xTrain[p]), np.array(yTrain[p]))
        prediction = predictor.predict(np.array(xTest[p]))
        output[p] = pd.DataFrame(zip(names[p], prediction), columns = ['name', 'value']).sort_values(by=['value'], ascending=False)
        # scores[p] = predictor.score(np.array(xTest[p]), np.array(yTest[p]))
        scores[p] = (mean_squared_error(np.array(yTest[p]), np.array(prediction)), 
                     r2_score(np.array(yTest[p]), np.array(prediction)))
        relativeError[p] = get_relative_error(output[p], TEST_YEAR)
    else:
        print 'Not enough data for position: {}'.format(p)

Not enough data for position: LS
Not enough data for position: K
Not enough data for position: P


In [22]:
scores

{'C': (4054.5767462891695, 0.17774763652246361),
 'DB': (5744.677387570704, 0.0060549412549414372),
 'DE': (4654.1707876361015, 0.061544133787000743),
 'DT': (4880.2815936542465, 0.03933496423065852),
 'FB': (2636.5975183620249, -0.22735911942603937),
 'G': (3407.3827784200489, 0.1042697250402056),
 'LB': (5359.2650882208382, -0.033408468377745137),
 'QB': (4938.6712135652615, 0.22683856730577001),
 'RB': (3510.6664588062881, 0.19201062349842235),
 'T': (7350.3495499262208, 0.05738283570307301),
 'TE': (3929.9196565236225, 0.073382933151081464),
 'WR': (5517.5024680240795, 0.043339882865784496)}

In [23]:
output

{'C':             name       value  error
 0  CameronErving  164.338867    0.0
 4      MaxGarcia  161.217636    3.0
 1      AliMarpet  161.150543    1.0
 5     AndyGallik  157.847031    2.0
 2   HronissGrasu  156.396729    2.0
 3      ShaqMason  121.159370    2.0
 6   AustinReiter  121.159370    0.0,
 'DB':                 name       value  error
 3         ByronJones  161.506241    3.0
 24        AdrianAmos  156.177673   23.0
 8           EricRowe  155.231934    6.0
 19   ClaytonGeathers  154.522385   16.0
 5      LandonCollins  153.668167    1.0
 16        CraigMager  152.551437   11.0
 1       KevinJohnson  151.415802    5.0
 26       BobbyMcCain  151.244308   19.0
 6       JalenCollins  149.875595    2.0
 15        AlexCarter  148.715042    6.0
 14        PJWilliams  148.567764    4.0
 22        DoranGrant  147.815292   11.0
 12    JordanRichards  147.772705    0.0
 2       MarcusPeters  147.699326   11.0
 21          JoshShaw  146.998596    7.0
 9        RonaldDarby  146.964310   

In [24]:
relativeError

{'C': 1.4285714285714286,
 'DB': 9.73913043478261,
 'DE': 5.714285714285714,
 'DT': 5.545454545454546,
 'FB': 1.5,
 'G': 5.578947368421052,
 'LB': 8.722222222222221,
 'QB': 1.7142857142857142,
 'RB': 3.4444444444444446,
 'T': 5.8,
 'TE': 5.2,
 'WR': 8.058823529411764}