In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from collections import defaultdict
import math

Using TensorFlow backend.


In [2]:
# format the data to be fed into the network
combine_cleaned = pd.read_csv('combine_cleaned.csv')
final_continuous = pd.read_csv('finalData.csv')
final_discrete = pd.read_csv('discreteData.csv')

# Set which of the datasets to use 
data = final_continuous
TEST_YEAR = 2015

In [3]:
featuresStandard = ['heightinchestotal', 'weight', 'fortyyd', 'vertical', 'bench', 'twentyss', 'threecone', 'broad', 
            'games', 'rushingAtt', 'rushingYds', 'rushingAvg', 'rushingTD', 'passCmp',
            'passPct', 'passYds', 'passTD', 'passInt', 'passRate', 'recYds', 'recAtt', 'recTD', 'recAvg', 'soloTackles', 
            'tackleAssists', 'totalTackles', 'sacks', 'ints', 'intTDs', 'passDef', 'fumbles', 'fumblesForced']
featuresDiscrete = ['confACC','confPac-12','confUnknown','confSEC','confBig 12','confBig Ten','confAmerican',
            'confBig East','confPac-10','confMAC','confSun Belt','confMWC','confWAC','confCUSA','confInd','confSouthern','confMVC','confPac-8',
            'confBig West','confSWC','confSouthland','confBig 8','confSWAC','heightinchestotalNone','heightinchestotalQ1','heightinchestotalQ2',
            'heightinchestotalQ3','heightinchestotalQ4','weightNone','weightQ1','weightQ2','weightQ3','weightQ4','fortyydNone','fortyydQ1','fortyydQ2',
            'fortyydQ3','fortyydQ4','verticalNone','verticalQ1','verticalQ2','verticalQ3','verticalQ4','benchNone','benchQ1','benchQ2','benchQ3',
            'benchQ4','twentyssNone','twentyssQ1','twentyssQ2','twentyssQ3','twentyssQ4','threeconeNone','threeconeQ1','threeconeQ2','threeconeQ3',
            'threeconeQ4','broadNone','broadQ1','broadQ2','broadQ3','broadQ4','gamesNone','gamesQ1','gamesQ2','gamesQ3','gamesQ4','rushingAttNone',
            'rushingAttQ1','rushingAttQ2','rushingAttQ3','rushingAttQ4','rushingYdsNone','rushingYdsQ1','rushingYdsQ2','rushingYdsQ3','rushingYdsQ4',
            'rushingAvgNone','rushingAvgQ1','rushingAvgQ2','rushingAvgQ3','rushingAvgQ4','rushingTDNone','rushingTDQ1','rushingTDQ2','rushingTDQ3','rushingTDQ4'
            ,'passCmpNone','passCmpQ1','passCmpQ2','passCmpQ3','passCmpQ4','passPctNone','passPctQ1','passPctQ2','passPctQ3','passPctQ4','passYdsNone','passYdsQ1',
            'passYdsQ2','passYdsQ3','passYdsQ4','passTDNone','passTDQ1','passTDQ2','passTDQ3','passTDQ4','passIntNone','passIntQ1','passIntQ2','passIntQ3','passIntQ4','passRateNone',
            'passRateQ1','passRateQ2','passRateQ3','passRateQ4','recYdsNone','recYdsQ1','recYdsQ2','recYdsQ3','recYdsQ4','recAttNone','recAttQ1','recAttQ2','recAttQ3',
            'recAttQ4','recTDNone','recTDQ1','recTDQ2','recTDQ3','recTDQ4','recAvgNone','recAvgQ1','recAvgQ2','recAvgQ3','recAvgQ4','soloTacklesNone','soloTacklesQ1',
            'soloTacklesQ2','soloTacklesQ3','soloTacklesQ4','tackleAssistsNone','tackleAssistsQ1','tackleAssistsQ2','tackleAssistsQ3','tackleAssistsQ4','totalTacklesNone',
            'totalTacklesQ1','totalTacklesQ2','totalTacklesQ3','totalTacklesQ4','sacksNone','sacksQ1','sacksQ2','sacksQ3','sacksQ4','intsNone','intsQ1','intsQ2','intsQ3',
            'intsQ4','intTDsNone','intTDsQ1','intTDsQ2','intTDsQ3','intTDsQ4','passDefNone','passDefQ1','passDefQ2','passDefQ3','passDefQ4','fumblesNone','fumblesQ1',
            'fumblesQ2','fumblesQ3','fumblesQ4','fumblesForcedNone','fumblesForcedQ1','fumblesForcedQ2','fumblesForcedQ3','fumblesForcedQ4']
features = featuresStandard

In [4]:
numFeatures = len(features)

In [5]:
# get player values/draft positions based on 'nfl_draft' data
def get_player_draft_info():
    valuesMap = {}
    pickMap = {}
    MAX_VALUE = 256 # number of draft picks + 1 (7 rounds * 32 picks + 32 compensatory + 1)
    for index, row in data.iterrows():
        primaryKey = (row['name'], row['year'])
        valuesMap[primaryKey] = (MAX_VALUE - row['pick'])
        pickMap[primaryKey] = row['pick']
    return valuesMap, pickMap

In [6]:
# Creates list of positions to partition by
positionsSet = set()
for position in data['pos']:
    positionsSet.add(position)
positions = list(positionsSet)

In [7]:
draftValues, draftPositions = get_player_draft_info()

In [8]:
combineFeatures = data[['name', 'pos', 'year'] + features]

In [9]:
def get_position_maps():
    positionMaps = {}
    for position in positions:
        positionMaps[position] = combineFeatures[combineFeatures['pos'] == position]
    return positionMaps

In [10]:
positionMaps = get_position_maps()

In [11]:
# build X matrix and 
def build_data_arrays():
    X = defaultdict(list)
    Y = defaultdict(list)
    xTrain = defaultdict(list)
    yTrain = defaultdict(list)
    xTest = defaultdict(list)
    yTest = defaultdict(list)
    names = defaultdict(list)
    for position in positionMaps:
        for index, row in positionMaps[position].iterrows(): 
            X[position].append(row[3:])                  
            Y[position].append(draftValues[(row['name'], row['year'])])
            if row['year'] == TEST_YEAR:
                xTest[position].append(row[3:]) 
                yTest[position].append(draftValues[(row['name'], row['year'])])
                names[position].append(row['name'])
            if not row['year'] == TEST_YEAR:
                xTrain[position].append(row[3:])                  
                yTrain[position].append(draftValues[(row['name'], row['year'])])
    return X, Y, xTrain, yTrain, xTest, yTest, names

In [12]:
X, Y, xTrain, yTrain, xTest, yTest, names = build_data_arrays()

In [13]:
def base_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [14]:
def deep_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(numFeatures // 2, kernel_initializer='normal'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [15]:
def wide_model():
    model = Sequential()
    model.add(Dense(numFeatures, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(numFeatures * 2, kernel_initializer='normal'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    # model.compile(optimizer='rmsprop', loss='mse')
    return model

In [16]:
seed = 7
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler())) # Makes data Gaussian 
predictor = KerasRegressor(build_fn=base_model, nb_epoch=1000, batch_size=5, verbose=0)
estimators.append(('mlp', predictor))
pipeline = Pipeline(estimators)

In [17]:
numFolds = 10
kFold = KFold(n_splits=numFolds, random_state=seed)
def eval_data():
    errorMap = {}
    for p in positions:
        if len(X[p]) > numFolds:
            results = cross_val_score(pipeline, np.array(X[p]), np.array(Y[p]), cv=kFold)
            errorMap[p] = (results.mean(), results.std())
            print 'Results[{}]: mean={} (std={}) Mean Squared Error'.format(p, results.mean(), results.std())
        else:
            print 'Position: {} has too little data ({} row(s))'.format(p, len(X[p]))
    return errorMap

In [18]:
# errors = eval_data()

In [19]:
# Map players to their true and predicted draft positions 
def get_relative_error(results, test_year):
    trueDraftMap = {}      # true pick number
    predictedDraftMap = {} # relative pick based on regression
    i = 0
    for index, row in results.iterrows():
        trueDraftMap[row['name']] = draftPositions[(row['name'], test_year)] #need primary key to get draft positions 
        predictedDraftMap[row['name']] = i
        i += 1
    
    # Compute absolute_error based on relative draft positions
    i = 0
    absolute_error = 0
    errors = {}
    for key, value in sorted(trueDraftMap.iteritems(), key=lambda (k,v): (v,k)):  
        errors[key] = math.fabs(i - predictedDraftMap[key]) 
        absolute_error += errors[key]
        i += 1
        
    #Add the errors to the results Dataframe (for visualization)
    results['error'] = 0.0
    for index, row in results.iterrows():
        results.at[index, 'error'] = errors[row['name']]

    #average the error by the total number of players
    return absolute_error / float(len(results))

In [20]:
scores = {}
output = {}
relativeError = {} 
scaler = StandardScaler()
for p in positions:
    if len(xTrain[p]) > 1 and len(xTest[p]) > 1:
#         scaler.fit(np.array(xTrain[p]))
#         predictor.fit(scaler.transform(np.array(xTrain[p])), np.array(yTrain[p]))
#         prediction = predictor.predict(scaler.transform(np.array(xTest[p])))
        predictor.fit(np.array(xTrain[p]), np.array(yTrain[p]))
        prediction = predictor.predict(np.array(xTest[p]))
        output[p] = pd.DataFrame(zip(names[p], prediction), columns = ['name', 'value']).sort_values(by=['value'], ascending=False)
        # scores[p] = predictor.score(np.array(xTest[p]), np.array(yTest[p]))
        scores[p] = (mean_squared_error(np.array(yTest[p]), np.array(prediction)), 
                     r2_score(np.array(yTest[p]), np.array(prediction)))
        relativeError[p] = get_relative_error(output[p], TEST_YEAR)
    else:
        print 'Not enough data for position: {}'.format(p)

Not enough data for position: LS
Not enough data for position: K
Not enough data for position: P


In [21]:
scores

{'C': (22319.180921313739, -3.5262429130806519),
 'DB': (6377.162198762263, -0.10337768836066097),
 'DE': (15982.595983537854, -2.2226924284576022),
 'DT': (16486.286777516976, -2.2452634080419887),
 'FB': (8012.1579072414625, -2.7297293216916412),
 'G': (15799.915410982701, -3.1534701252388526),
 'LB': (10997.612005838948, -1.1206313163620116),
 'QB': (29208.742428056888, -3.572702284947276),
 'RB': (18014.087944847954, -3.1459910411857814),
 'T': (23034.432155804112, -1.9539617092240142),
 'TE': (10826.223481529025, -1.5526637499707392),
 'WR': (10978.519992709873, -0.90352651096273462)}

In [22]:
output

{'C':             name     value  error
 2   HronissGrasu  3.486539    2.0
 4      MaxGarcia  3.486539    3.0
 1      AliMarpet  3.425262    1.0
 0  CameronErving  3.387036    3.0
 3      ShaqMason  3.373101    1.0
 6   AustinReiter  3.373101    1.0
 5     AndyGallik  3.316045    1.0,
 'DB':                 name       value  error
 39     IfoEkpreOlomu  113.231590   39.0
 23        TrayWalker  112.602882   22.0
 21          JoshShaw  112.073265   19.0
 7      JaquiskiTartt  111.704742    4.0
 30          TyeSmith  111.203499   26.0
 6       JalenCollins  108.027824    1.0
 10     SenquezGolson  106.268242    4.0
 3         ByronJones  106.098999    4.0
 43         AkeemKing  106.092003   35.0
 40    DexterMcDonald  105.740150   31.0
 1       KevinJohnson  105.453987    9.0
 35      RandallEvans  105.368240   24.0
 18       JamesSample  105.046883    6.0
 24        AdrianAmos  105.009964   11.0
 17      StevenNelson  104.841377    3.0
 32      TevinMitchel  104.453720   17.0
 16        

In [23]:
relativeError

{'C': 1.7142857142857142,
 'DB': 15.652173913043478,
 'DE': 8.285714285714286,
 'DT': 6.7272727272727275,
 'FB': 1.0,
 'G': 4.631578947368421,
 'LB': 12.333333333333334,
 'QB': 1.7142857142857142,
 'RB': 4.111111111111111,
 'T': 6.5,
 'TE': 6.9,
 'WR': 9.352941176470589}