In [1]:
from numpy.random import seed 
s = 1
seed(s)
from tensorflow import set_random_seed
set_random_seed(s)
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from collections import defaultdict
import math

Using TensorFlow backend.


In [2]:
# format the data to be fed into the network
final_continuous = pd.read_csv('finalData.csv')
final_discrete = pd.read_csv('discreteData.csv')
final_beast = pd.read_csv('superbeastfire.csv')

# Set which of the datasets to use 
data = final_beast
TEST_YEAR = 2015

In [3]:
featuresStandard = ['heightinchestotal', 'weight', 'fortyyd', 'vertical', 'bench', 'twentyss', 'threecone', 'broad', 
            'games', 'rushingAtt', 'rushingYds', 'rushingAvg', 'rushingTD', 'passCmp',
            'passPct', 'passYds', 'passTD', 'passInt', 'passRate', 'recYds', 'recAtt', 'recTD', 'recAvg', 'soloTackles', 
            'tackleAssists', 'totalTackles', 'sacks', 'ints', 'intTDs', 'passDef', 'fumbles', 'fumblesForced']
featuresDiscrete = ['confACC','confPac-12','confUnknown','confSEC','confBig 12','confBig Ten','confAmerican',
            'confBig East','confPac-10','confMAC','confSun Belt','confMWC','confWAC','confCUSA','confInd','confSouthern','confMVC','confPac-8',
            'confBig West','confSWC','confSouthland','confBig 8','confSWAC','heightinchestotalNone','heightinchestotalQ1','heightinchestotalQ2',
            'heightinchestotalQ3','heightinchestotalQ4','weightNone','weightQ1','weightQ2','weightQ3','weightQ4','fortyydNone','fortyydQ1','fortyydQ2',
            'fortyydQ3','fortyydQ4','verticalNone','verticalQ1','verticalQ2','verticalQ3','verticalQ4','benchNone','benchQ1','benchQ2','benchQ3',
            'benchQ4','twentyssNone','twentyssQ1','twentyssQ2','twentyssQ3','twentyssQ4','threeconeNone','threeconeQ1','threeconeQ2','threeconeQ3',
            'threeconeQ4','broadNone','broadQ1','broadQ2','broadQ3','broadQ4','gamesNone','gamesQ1','gamesQ2','gamesQ3','gamesQ4','rushingAttNone',
            'rushingAttQ1','rushingAttQ2','rushingAttQ3','rushingAttQ4','rushingYdsNone','rushingYdsQ1','rushingYdsQ2','rushingYdsQ3','rushingYdsQ4',
            'rushingAvgNone','rushingAvgQ1','rushingAvgQ2','rushingAvgQ3','rushingAvgQ4','rushingTDNone','rushingTDQ1','rushingTDQ2','rushingTDQ3','rushingTDQ4'
            ,'passCmpNone','passCmpQ1','passCmpQ2','passCmpQ3','passCmpQ4','passPctNone','passPctQ1','passPctQ2','passPctQ3','passPctQ4','passYdsNone','passYdsQ1',
            'passYdsQ2','passYdsQ3','passYdsQ4','passTDNone','passTDQ1','passTDQ2','passTDQ3','passTDQ4','passIntNone','passIntQ1','passIntQ2','passIntQ3','passIntQ4','passRateNone',
            'passRateQ1','passRateQ2','passRateQ3','passRateQ4','recYdsNone','recYdsQ1','recYdsQ2','recYdsQ3','recYdsQ4','recAttNone','recAttQ1','recAttQ2','recAttQ3',
            'recAttQ4','recTDNone','recTDQ1','recTDQ2','recTDQ3','recTDQ4','recAvgNone','recAvgQ1','recAvgQ2','recAvgQ3','recAvgQ4','soloTacklesNone','soloTacklesQ1',
            'soloTacklesQ2','soloTacklesQ3','soloTacklesQ4','tackleAssistsNone','tackleAssistsQ1','tackleAssistsQ2','tackleAssistsQ3','tackleAssistsQ4','totalTacklesNone',
            'totalTacklesQ1','totalTacklesQ2','totalTacklesQ3','totalTacklesQ4','sacksNone','sacksQ1','sacksQ2','sacksQ3','sacksQ4','intsNone','intsQ1','intsQ2','intsQ3',
            'intsQ4','intTDsNone','intTDsQ1','intTDsQ2','intTDsQ3','intTDsQ4','passDefNone','passDefQ1','passDefQ2','passDefQ3','passDefQ4','fumblesNone','fumblesQ1',
            'fumblesQ2','fumblesQ3','fumblesQ4','fumblesForcedNone','fumblesForcedQ1','fumblesForcedQ2','fumblesForcedQ3','fumblesForcedQ4']
featuresBeast = featuresDiscrete + featuresStandard
# features = featuresStandard if data.equals(final_continuous) else featuresDiscrete
features = featuresBeast

In [4]:
numFeatures = len(features)

In [5]:
# get player values/draft positions based on 'nfl_draft' data
def get_player_draft_info():
    valuesMap = {}
    pickMap = {}
    MAX_VALUE = 256 # number of draft picks (7 rounds * 32 picks + 32 compensatory + 1)
    for index, row in data.iterrows():
        primaryKey = (row['name'], row['year'])
        valuesMap[primaryKey] = (MAX_VALUE - row['pick'])
        pickMap[primaryKey] = row['pick']
    return valuesMap, pickMap

In [6]:
# Creates list of positions to partition by
positionsSet = set()
for position in data['pos']:
    positionsSet.add(position)
positions = list(positionsSet)

In [7]:
draftValues, draftPositions = get_player_draft_info()

In [8]:
combineFeatures = data[['name', 'pos', 'year'] + features]

In [9]:
def get_position_maps():
    positionMaps = {}
    for position in positions:
        positionMaps[position] = combineFeatures[combineFeatures['pos'] == position]
    return positionMaps

In [10]:
positionMaps = get_position_maps()

In [11]:
# build X matrix and 
def build_data_arrays():
    X = defaultdict(list)
    Y = defaultdict(list)
    xTrain = defaultdict(list)
    yTrain = defaultdict(list)
    xTest = defaultdict(list)
    yTest = defaultdict(list)
    names = defaultdict(list)
    for position in positionMaps:
        for index, row in positionMaps[position].iterrows(): 
            X[position].append(row[3:])                  
            Y[position].append(draftValues[(row['name'], row['year'])])
            if row['year'] == TEST_YEAR:
                xTest[position].append(row[3:]) 
                yTest[position].append(draftValues[(row['name'], row['year'])])
                names[position].append(row['name'])
            if not row['year'] == TEST_YEAR:
                xTrain[position].append(row[3:])                  
                yTrain[position].append(draftValues[(row['name'], row['year'])])
    return X, Y, xTrain, yTrain, xTest, yTest, names

In [12]:
X, Y, xTrain, yTrain, xTest, yTest, names = build_data_arrays()

In [13]:
def base_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [14]:
def deep_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(numFeatures // 2, kernel_initializer='normal'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [15]:
def wide_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(numFeatures * 2, kernel_initializer='normal'))
    model.add(Dense(1, kernel_initializer='normal', activation='linear'))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    model.output_shape
    model.summary()
    return model

In [16]:
def positions_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(len(positions), kernel_initializer='normal'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [17]:
estimators = []
estimators.append(('standardize', StandardScaler())) # Makes data Gaussian 
predictor = KerasRegressor(build_fn=wide_model, nb_epoch=1000, batch_size=5, verbose=0)
estimators.append(('mlp', predictor))
pipeline = Pipeline(estimators)

In [18]:
numFolds = 10
kFold = KFold(n_splits=numFolds, random_state=s)
def eval_data():
    errorMap = {}
    for p in positions:
        if len(X[p]) > numFolds:
            results = cross_val_score(predictor, np.array(X[p]), np.array(Y[p]), cv=kFold)
            errorMap[p] = (results.mean(), results.std())
            print 'Results[{}]: mean={} (std={}) Mean Squared Error'.format(p, results.mean(), results.std())
        else:
            print 'Position: {} has too little data ({} row(s))'.format(p, len(X[p]))
    return errorMap

In [19]:
# testErrors = eval_data()
# testErrors

In [20]:
def eval_data_train():
    errorMap = {}
    for p in positions:
        if len(X[p]) > 1:
            predictor.fit(np.array(X[p]), np.array(Y[p]))
            errorMap[p] = predictor.score(np.array(X[p]), np.array(Y[p]))
        else:
            print 'Position: {} has too little data ({} row(s))'.format(p, len(X[p]))
    return errorMap

In [21]:
trainErrors = eval_data_train()
trainErrors

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 215)               46440     
_________________________________________________________________
dense_2 (Dense)              (None, 430)               92880     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 431       
Total params: 139,751
Trainable params: 139,751
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 215)               46440     
_________________________________________________________________
dense_5 (Dense)              (None, 430)               92880     
_________________________________________________________________
dens

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_37 (Dense)             (None, 215)               46440     
_________________________________________________________________
dense_38 (Dense)             (None, 430)               92880     
_________________________________________________________________
dense_39 (Dense)             (None, 1)                 431       
Total params: 139,751
Trainable params: 139,751
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_40 (Dense)             (None, 215)               46440     
_________________________________________________________________
dense_41 (Dense)             (None, 430)               92880     
_________________________________________________________________
dens

{'C': -5314.2760814319954,
 'DB': -4713.5223761697025,
 'DE': -5671.2799313863115,
 'DT': -5788.5956006731303,
 'FB': -3929.1448376713965,
 'G': -4450.4740289541387,
 'K': -3805.1363236276725,
 'LB': -4774.4803293760515,
 'P': -2638.8445147346047,
 'QB': -7026.8760037535176,
 'RB': -4708.3974041408965,
 'T': -5755.7278304876281,
 'TE': -3941.4598425030708,
 'WR': -5499.5845631309176}

In [22]:
# Map players to their true and predicted draft positions 
def get_relative_error(results, test_year):
    trueDraftMap = {}      # true pick number
    predictedDraftMap = {} # relative pick based on regression
    i = 0
    for index, row in results.iterrows():
        trueDraftMap[row['name']] = draftPositions[(row['name'], test_year)] #need primary key to get draft positions 
        predictedDraftMap[row['name']] = i
        i += 1
    
    # Compute absolute_error based on relative draft positions
    i = 0
    absolute_error = 0
    errors = {}
    for key, value in sorted(trueDraftMap.iteritems(), key=lambda (k,v): (v,k)):  
        errors[key] = math.fabs(i - predictedDraftMap[key]) 
        absolute_error += errors[key]
        i += 1
        
    #Add the errors to the results Dataframe (for visualization)
    results['error'] = 0.0
    for index, row in results.iterrows():
        results.at[index, 'error'] = errors[row['name']]

    #average the error by the total number of players
    return absolute_error / float(len(results))

In [23]:
scores = {}
output = {}
relativeError = {} 
for p in positions:
    if len(xTrain[p]) > 1 and len(xTest[p]) > 1:
        predictor.fit(np.array(xTrain[p]), np.array(yTrain[p]))
        prediction = predictor.predict(np.array(xTest[p]))
        output[p] = pd.DataFrame(zip(names[p], prediction), columns = ['name', 'value']).sort_values(by=['value'], ascending=False)
        # scores[p] = predictor.score(np.array(xTest[p]), np.array(yTest[p]))
        scores[p] = (mean_squared_error(np.array(yTest[p]), np.array(prediction)), 
                     r2_score(np.array(yTest[p]), np.array(prediction)))
        relativeError[p] = get_relative_error(output[p], TEST_YEAR)
    else:
        print 'Not enough data for position: {}'.format(p)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_43 (Dense)             (None, 215)               46440     
_________________________________________________________________
dense_44 (Dense)             (None, 430)               92880     
_________________________________________________________________
dense_45 (Dense)             (None, 1)                 431       
Total params: 139,751
Trainable params: 139,751
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_46 (Dense)             (None, 215)               46440     
_________________________________________________________________
dense_47 (Dense)             (None, 430)               92880     
_________________________________________________________________
dens

In [24]:
scores

{'C': (5523.6568173699943, -0.12017607689336951),
 'DB': (5780.9414313556426, -0.00021946976910336424),
 'DE': (4940.8083011182498, 0.0037472311209825193),
 'DT': (4884.4906402722381, 0.03850642926966652),
 'FB': (4324.8590360802191, -1.0132595669978617),
 'G': (3491.1441370372104, 0.082250600784390393),
 'LB': (4935.0084780602738, 0.048399459852711657),
 'QB': (5481.5222247163183, 0.14185387256273396),
 'RB': (3104.5740762082601, 0.28547388321492329),
 'T': (6990.0968641913641, 0.10358204878147026),
 'TE': (3798.787483771358, 0.10430196455762064),
 'WR': (4804.4517037021387, 0.1669732172724584)}

In [25]:
output

{'C':             name       value  error
 0  CameronErving  109.009346    0.0
 4      MaxGarcia  107.092743    3.0
 1      AliMarpet  106.810333    1.0
 5     AndyGallik  105.775459    2.0
 2   HronissGrasu  103.097046    2.0
 3      ShaqMason   73.714912    2.0
 6   AustinReiter   73.714912    0.0,
 'DB':                 name       value  error
 3         ByronJones  179.362488    3.0
 1       KevinJohnson  166.505981    0.0
 16        CraigMager  166.411972   14.0
 8           EricRowe  165.883438    5.0
 26       BobbyMcCain  165.213242   22.0
 14        PJWilliams  165.097824    9.0
 24        AdrianAmos  164.585037   18.0
 21          JoshShaw  163.687943   14.0
 9        RonaldDarby  161.987427    1.0
 15        AlexCarter  160.475647    6.0
 6       JalenCollins  160.402039    4.0
 2       MarcusPeters  159.894180    9.0
 5      LandonCollins  159.609970    7.0
 19   ClaytonGeathers  159.355209    6.0
 0         TraeWaynes  158.211319   14.0
 13        DJounSmith  157.465729   

In [26]:
relativeError

{'C': 1.4285714285714286,
 'DB': 8.0,
 'DE': 5.809523809523809,
 'DT': 5.2727272727272725,
 'FB': 1.5,
 'G': 5.578947368421052,
 'LB': 9.61111111111111,
 'QB': 2.0,
 'RB': 3.5555555555555554,
 'T': 5.9,
 'TE': 5.5,
 'WR': 8.058823529411764}