In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from collections import defaultdict
import math

Using TensorFlow backend.


In [2]:
# format the data to be fed into the network
combine_cleaned = pd.read_csv('combine_cleaned.csv')
final_continuous = pd.read_csv('finalData.csv')
final_discrete = pd.read_csv('discreteFeatures.csv')
train_continuous = pd.read_csv('finalData.csv')
test_continuous = pd.read_csv('finalData.csv')
train_discrete = pd.read_csv('finalData.csv')
test_discrete = pd.read_csv('finalData.csv')

# Set which of the datasets to use 
data = final_continuous
TEST_YEAR = 2015

In [3]:
features = ['heightinchestotal', 'weight', 'fortyyd', 'vertical', 'bench', 'twentyss', 'threecone', 'broad', 
            'games', 'rushingAtt', 'rushingYds', 'rushingAvg', 'rushingTD', 'passCmp',
            'passPct', 'passYds', 'passTD', 'passInt', 'passRate', 'recYds', 'recAtt', 'recTD', 'recAvg', 'soloTackles', 
            'tackleAssists', 'totalTackles', 'sacks', 'ints', 'intTDs', 'passDef', 'fumbles', 'fumblesForced']

In [4]:
featuresDiscrete = [] # TO-DO - add all features from discretized data

In [5]:
numFeatures = len(features) + 1

In [6]:
# get player values/draft positions based on 'nfl_draft' data
def get_player_draft_info():
    valuesMap = {}
    pickMap = {}
    MAX_VALUE = 256 # number of draft picks + 1 (7 rounds * 32 picks + 32 compensatory + 1)
    for index, row in data.iterrows():
        primaryKey = (row['name'], row['year'])
        valuesMap[primaryKey] = (MAX_VALUE - row['pick'])
        pickMap[primaryKey] = row['pick']
    return valuesMap, pickMap

In [7]:
# Creates list of positions to partition by
positionsSet = set()
for position in data['pos']:
    positionsSet.add(position)
positions = list(positionsSet)

In [8]:
draftValues, draftPositions = get_player_draft_info()

In [9]:
combineFeatures = data[['name', 'pos', 'year'] + features]

In [10]:
def get_position_maps():
    positionMaps = {}
    for position in positions:
        positionMaps[position] = combineFeatures[combineFeatures['pos'] == position]
    return positionMaps

In [11]:
positionMaps = get_position_maps()

In [12]:
# build X matrix and 
def build_data_arrays():
    X = defaultdict(list)
    Y = defaultdict(list)
    xTrain = defaultdict(list)
    yTrain = defaultdict(list)
    xTest = defaultdict(list)
    yTest = defaultdict(list)
    names = defaultdict(list)
    for position in positionMaps:
        for index, row in positionMaps[position].iterrows(): 
            X[position].append(row[3:])                  
            Y[position].append(draftValues[(row['name'], row['year'])])
            if row['year'] == TEST_YEAR:
                xTest[position].append(row[3:]) 
                yTest[position].append(draftValues[(row['name'], row['year'])])
                names[position].append(row['name'])
            if not row['year'] == TEST_YEAR:
                xTrain[position].append(row[3:])                  
                yTrain[position].append(draftValues[(row['name'], row['year'])])
    return X, Y, xTrain, yTrain, xTest, yTest, names

In [13]:
X, Y, xTrain, yTrain, xTest, yTest, names = build_data_arrays()

In [14]:
def base_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [15]:
def deep_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(numFeatures // 2, kernel_initializer='normal'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [16]:
def wide_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(numFeatures * 2, kernel_initializer='normal'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    # model.compile(optimizer='rmsprop', loss='mse')
    return model

In [17]:
seed = 7
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler())) # Makes data Gaussian 
predictor = KerasRegressor(build_fn=base_model, nb_epoch=1000, batch_size=5, verbose=0)
estimators.append(('mlp', predictor))
pipeline = Pipeline(estimators)

In [18]:
numFolds = 10
kFold = KFold(n_splits=numFolds, random_state=seed)
def eval_data():
    errorMap = {}
    for p in positions:
        if len(X[p]) > numFolds:
            results = cross_val_score(pipeline, np.array(X[p]), np.array(Y[p]), cv=kFold)
            errorMap[p] = (results.mean(), results.std())
            print 'Results[{}]: mean={} (std={}) Mean Squared Error'.format(p, results.mean(), results.std())
        else:
            print 'Position: {} has too little data ({} row(s))'.format(p, len(X[p]))
    return errorMap

In [19]:
# errors = eval_data()

In [20]:
# Map players to their true and predicted draft positions 
def get_relative_error(results, test_year):
    trueDraftMap = {}      # true pick number
    predictedDraftMap = {} # relative pick based on regression
    i = 0
    for index, row in results.iterrows():
        trueDraftMap[row['name']] = draftPositions[(row['name'], test_year)] #need primary key to get draft positions 
        predictedDraftMap[row['name']] = i
        i += 1
    
    # Compute absolute_error based on relative draft positions
    i = 0
    absolute_error = 0
    errors = {}
    for key, value in sorted(trueDraftMap.iteritems(), key=lambda (k,v): (v,k)):  
        errors[key] = math.fabs(i - predictedDraftMap[key]) 
        absolute_error += errors[key]
        i += 1
        
    #Add the errors to the results Dataframe (for visualization)
    results['error'] = 0.0
    for index, row in results.iterrows():
        results.at[index, 'error'] = errors[row['name']]

    #average the error by the total number of players
    return absolute_error / float(len(results))

In [21]:
scores = {}
output = {}
relativeError = {} 
scaler = StandardScaler()
for p in positions:
    if len(xTrain[p]) > 1 and len(xTest[p]) > 1:
#         scaler.fit(np.array(xTrain[p]))
#         predictor.fit(scaler.transform(np.array(xTrain[p])), np.array(yTrain[p]))
#         prediction = predictor.predict(scaler.transform(np.array(xTest[p])))
        predictor.fit(np.array(xTrain[p]), np.array(yTrain[p]))
        prediction = predictor.predict(np.array(xTest[p]))
        output[p] = pd.DataFrame(zip(names[p], prediction), columns = ['name', 'value']).sort_values(by=['value'], ascending=False)
        # scores[p] = predictor.score(np.array(xTest[p]), np.array(yTest[p]))
        scores[p] = (mean_squared_error(np.array(yTest[p]), np.array(prediction)), 
                     r2_score(np.array(yTest[p]), np.array(prediction)))
        relativeError[p] = get_relative_error(output[p], TEST_YEAR)
    else:
        print 'Not enough data for position: {}'.format(p)

Not enough data for position: LS
Not enough data for position: K
Not enough data for position: P


In [22]:
scores

{'C': (10977.630418087374, -1.2262206690048147),
 'DB': (5775.454365590319, 0.00072990328294186835),
 'DE': (4946.7564242139815, 0.0025478658061179882),
 'DT': (5102.2083498078919, -0.0043504811780106234),
 'FB': (2519.8988119453643, -0.17303485470675373),
 'G': (3774.8951797279879, 0.0076583356889523557),
 'LB': (5263.2967150021641, -0.014903220373015946),
 'QB': (6658.148287500987, -0.042349904750724976),
 'RB': (4688.1736001201307, -0.078995828438863214),
 'T': (7646.8259933918716, 0.019362360279120527),
 'TE': (4338.6231497419994, -0.022983319901512589),
 'WR': (5605.1514126098964, 0.028142744290840405)}

In [23]:
output

{'C':             name      value  error
 0  CameronErving  57.698215    0.0
 1      AliMarpet  57.569374    0.0
 4      MaxGarcia  57.466431    2.0
 2   HronissGrasu  57.413815    1.0
 5     AndyGallik  57.185734    1.0
 3      ShaqMason  57.035084    2.0
 6   AustinReiter  57.035084    0.0,
 'DB':                 name       value  error
 3         ByronJones  132.282837    3.0
 24        AdrianAmos  132.002823   23.0
 19   ClaytonGeathers  131.900024   17.0
 8           EricRowe  131.767365    5.0
 5      LandonCollins  131.742065    1.0
 12    JordanRichards  131.386795    7.0
 1       KevinJohnson  131.310898    5.0
 16        CraigMager  131.309677    9.0
 26       BobbyMcCain  131.229706   18.0
 22        DoranGrant  131.172745   13.0
 6       JalenCollins  131.091965    4.0
 42        RyanMurphy  131.090546   31.0
 15        AlexCarter  131.067032    3.0
 41     DarrylRoberts  131.065445   28.0
 25   MykkeleThompson  131.062317   11.0
 35      RandallEvans  131.058578   20.0
 20

In [24]:
relativeError

{'C': 0.8571428571428571,
 'DB': 13.391304347826088,
 'DE': 6.0,
 'DT': 5.090909090909091,
 'FB': 1.5,
 'G': 4.842105263157895,
 'LB': 7.888888888888889,
 'QB': 1.7142857142857142,
 'RB': 3.5555555555555554,
 'T': 5.8,
 'TE': 4.9,
 'WR': 8.235294117647058}