In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from collections import defaultdict
import math

Using TensorFlow backend.


In [2]:
# format the data to be fed into the network
combine_cleaned = pd.read_csv('combine_cleaned.csv')
final_continuous = pd.read_csv('finalData.csv')
final_discrete = pd.read_csv('discreteFeatures.csv')
train_continuous = pd.read_csv('finalData.csv')
test_continuous = pd.read_csv('finalData.csv')
train_discrete = pd.read_csv('finalData.csv')
test_discrete = pd.read_csv('finalData.csv')

# Set which of the datasets to use 
data = final_continuous
TEST_YEAR = 2015

In [3]:
features = ['heightinchestotal', 'weight', 'fortyyd', 'vertical', 'bench', 'twentyss', 'threecone', 'broad', 
            'games', 'rushingAtt', 'rushingYds', 'rushingAvg', 'rushingTD', 'passCmp',
            'passPct', 'passYds', 'passTD', 'passInt', 'passRate', 'recYds', 'recAtt', 'recTD', 'recAvg', 'soloTackles', 
            'tackleAssists', 'totalTackles', 'sacks', 'ints', 'intTDs', 'passDef', 'fumbles', 'fumblesForced']

In [4]:
featuresDiscrete = [] # TO-DO - add all features from discretized data

In [5]:
numFeatures = len(features) + 1

In [6]:
# get player values/draft positions based on 'nfl_draft' data
def get_player_draft_info():
    valuesMap = {}
    pickMap = {}
    MAX_VALUE = 256 # number of draft picks + 1 (7 rounds * 32 picks + 32 compensatory + 1)
    for index, row in data.iterrows():
        primaryKey = (row['name'], row['year'])
        valuesMap[primaryKey] = (MAX_VALUE - row['pick'])
        pickMap[primaryKey] = row['pick']
    return valuesMap, pickMap

In [7]:
# Creates list of positions to partition by
positionsSet = set()
for position in data['pos']:
    positionsSet.add(position)
positions = list(positionsSet)

In [8]:
draftValues, draftPositions = get_player_draft_info()

In [9]:
combineFeatures = data[['name', 'pos', 'year'] + features]

In [10]:
def get_position_maps():
    positionMaps = {}
    for position in positions:
        positionMaps[position] = combineFeatures[combineFeatures['pos'] == position]
    return positionMaps

In [11]:
positionMaps = get_position_maps()

In [12]:
# build X matrix and 
def build_data_arrays():
    X = defaultdict(list)
    Y = defaultdict(list)
    xTrain = defaultdict(list)
    yTrain = defaultdict(list)
    xTest = defaultdict(list)
    yTest = defaultdict(list)
    names = defaultdict(list)
    for position in positionMaps:
        for index, row in positionMaps[position].iterrows(): 
            X[position].append(row[2:])                  
            Y[position].append(draftValues[(row['name'], row['year'])])
            if row['year'] == TEST_YEAR:
                xTest[position].append(row[2:]) 
                yTest[position].append(draftValues[(row['name'], row['year'])])
                names[position].append(row['name'])
            if not row['year'] == TEST_YEAR:
                xTrain[position].append(row[2:])                  
                yTrain[position].append(draftValues[(row['name'], row['year'])])
    return X, Y, xTrain, yTrain, xTest, yTest, names

In [13]:
X, Y, xTrain, yTrain, xTest, yTest, names = build_data_arrays()

In [14]:
def base_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [15]:
def deep_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(numFeatures // 2, kernel_initializer='normal'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [16]:
def wide_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(numFeatures * 2, kernel_initializer='normal'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    # model.compile(optimizer='rmsprop', loss='mse')
    return model

In [17]:
seed = 7
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler())) # Makes data Gaussian 
predictor = KerasRegressor(build_fn=wide_model, nb_epoch=100, batch_size=5, verbose=0)
estimators.append(('mlp', predictor))
pipeline = Pipeline(estimators)

In [18]:
numFolds = 10
kFold = KFold(n_splits=numFolds, random_state=seed)
def eval_data():
    errorMap = {}
    for p in positions:
        if len(X[p]) > numFolds:
            results = cross_val_score(pipeline, np.array(X[p]), np.array(Y[p]), cv=kFold)
            errorMap[p] = (results.mean(), results.std())
            print 'Results[{}]: mean={} (std={}) Mean Squared Error'.format(p, results.mean(), results.std())
        else:
            print 'Position: {} has too little data ({} row(s))'.format(p, len(X[p]))
    return errorMap

In [19]:
# errors = eval_data()

In [20]:
scores = {}
output = {}
scaler = StandardScaler()
for p in positions:
    if len(xTrain[p]) > 1 and len(xTest[p]) > 1:
#         scaler.fit(np.array(xTrain[p]))
#         predictor.fit(scaler.transform(np.array(xTrain[p])), np.array(yTrain[p]))
#         prediction = predictor.predict(scaler.transform(np.array(xTest[p])))
        predictor.fit(np.array(xTrain[p]), np.array(yTrain[p]))
        prediction = predictor.predict(np.array(xTest[p]))
        output[p] = pd.DataFrame(zip(names[p], prediction), columns = ['name', 'value']).sort_values(by=['value'], ascending=False)
        # scores[p] = predictor.score(np.array(xTest[p]), np.array(yTest[p]))
        scores[p] = (mean_squared_error(np.array(yTest[p]), np.array(prediction)), 
                     r2_score(np.array(yTest[p]), np.array(prediction)))
    else:
        print 'Not enough data for position: {}'.format(p)

Not enough data for position: LS
Not enough data for position: K
Not enough data for position: P


In [21]:
scores

{'C': (5983.6430441121092, -0.21345949111212281),
 'DB': (5775.2052514764728, 0.00077300504929278535),
 'DE': (4938.8639660918243, 0.0041392821854341344),
 'DT': (5060.9997924140343, 0.003761268011676977),
 'FB': (3034.7880206454392, -0.41272026796796801),
 'G': (3747.9723042288801, 0.014735801369138102),
 'LB': (5168.8975738628587, 0.0032994380615065655),
 'QB': (6192.5772970221669, 0.030536407873357874),
 'RB': (4945.6807871805895, -0.13826180370567953),
 'T': (7905.8465705712906, -0.01385473236348278),
 'TE': (4824.1882664527247, -0.13747240963742136),
 'WR': (5796.6683339816727, -0.0050636931316190292)}

In [22]:
output

{'C':             name       value
 0  CameronErving  102.452324
 4      MaxGarcia  102.274246
 1      AliMarpet  102.166084
 2   HronissGrasu  101.954193
 5     AndyGallik  101.789841
 3      ShaqMason   99.553421
 6   AustinReiter   99.553413, 'DB':                 name       value
 19   ClaytonGeathers  125.546227
 24        AdrianAmos  125.481461
 12    JordanRichards  125.267471
 5      LandonCollins  125.233475
 8           EricRowe  125.216911
 3         ByronJones  125.173737
 35      RandallEvans  125.071228
 42        RyanMurphy  125.063644
 41     DarrylRoberts  125.027718
 25   MykkeleThompson  125.019760
 20  IbraheimCampbell  124.997238
 36       DerronSmith  124.935341
 31    KyshoenJarrett  124.866219
 22        DoranGrant  124.837036
 32      TevinMitchel  124.760529
 27    CedricThompson  124.747574
 37      QuandreDiggs  124.707314
 16        CraigMager  124.707153
 1       KevinJohnson  124.645142
 38     GerodHolliman  124.618172
 15        AlexCarter  124.598969
 