In [1]:
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
from collections import defaultdict
import math

In [2]:
# Load the CSV inputs used by this notebook (paths are relative to the
# notebook's working directory).
combine_cleaned = pd.read_csv('combine_cleaned.csv')
final_continuous = pd.read_csv('finalData.csv')
final_discrete = pd.read_csv('discreteFeatures.csv')
# NOTE(review): the four train/test frames below all read 'finalData.csv', so
# each one is a fresh copy of the same table as final_continuous (the
# *_discrete frames do not read 'discreteFeatures.csv'). This looks like a
# copy-paste placeholder -- confirm the intended file names before relying on
# these variables.
train_continuous = pd.read_csv('finalData.csv')
test_continuous = pd.read_csv('finalData.csv')
train_discrete = pd.read_csv('finalData.csv')
test_discrete = pd.read_csv('finalData.csv')

# Choose dataset to use: the cells below read everything from `data`.
data = final_continuous


In [3]:
# Columns of `data` used as model inputs. The order is significant because
# rows are later sliced positionally.
# TODO: figure out how to add 'school' and 'conference' back in.
features = [
    # measurement / drill columns (presumably NFL combine results)
    'heightinchestotal', 'weight', 'fortyyd', 'vertical',
    'bench', 'twentyss', 'threecone', 'broad',
    # games column
    'games',
    # rushing* columns
    'rushingAtt', 'rushingYds', 'rushingAvg', 'rushingTD',
    # pass* columns
    'passCmp', 'passPct', 'passYds', 'passTD', 'passInt', 'passRate',
    # rec* columns
    'recYds', 'recAtt', 'recTD', 'recAvg',
    # tackle / sack / interception / fumble columns
    'soloTackles', 'tackleAssists', 'totalTackles', 'sacks',
    'ints', 'intTDs', 'passDef', 'fumbles', 'fumblesForced',
]

In [4]:
# Year constants. MIN_YEAR / MAX_YEAR are not referenced in the cells shown
# here (presumably the range of draft years in the data -- TODO confirm).
MIN_YEAR = 1999
MAX_YEAR = 2015
# Rows whose 'year' equals TEST_YEAR form the hold-out split; every other row
# is used for training.
TEST_YEAR = 2015

In [5]:
# Build (name, year) -> draft value and (name, year) -> pick lookups from a
# table with 'name', 'year' and 'pick' columns (the notebook-level `data`
# frame by default).
def get_player_draft_info(frame=None, max_value=256):
    """Return ``(valuesMap, pickMap)``, both keyed by ``(name, year)``.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table providing the 'name', 'year' and 'pick' columns. When omitted,
        the notebook-level ``data`` frame is used, which is what the original
        zero-argument call did.
    max_value : number, optional
        Value the pick number is subtracted from, so that earlier picks get
        larger values. Defaults to 256.
        NOTE(review): the original comment derived this constant as
        "7 rounds * 32 picks + 32 compensatory + 1", which sums to 257, not
        256. The value 256 is kept so existing results do not change.

    Returns
    -------
    valuesMap : dict
        ``(name, year) -> max_value - pick``.
    pickMap : dict
        ``(name, year) -> pick``.

    If several rows share the same ``(name, year)``, the last row wins.
    """
    if frame is None:
        # Resolved at call time so re-pointing `data` in an earlier cell is
        # picked up without redefining this function.
        frame = data

    valuesMap = {}
    pickMap = {}
    # Walk the three needed columns in lockstep instead of materialising a
    # Series for every row with iterrows().
    rows = zip(frame['name'].tolist(), frame['year'].tolist(), frame['pick'].tolist())
    for name, year, pick in rows:
        primaryKey = (name, year)
        valuesMap[primaryKey] = max_value - pick
        pickMap[primaryKey] = pick
    return valuesMap, pickMap

In [6]:
# Distinct values of the 'pos' column; the players are partitioned by these.
positionsSet = set(data['pos'])
positions = list(positionsSet)

In [7]:
# draftValues: (name, year) -> draft value; draftPositions: (name, year) -> pick.
draftValues, draftPositions = get_player_draft_info()

In [8]:
# Keep the identifying columns plus the model features. To try a different
# dataset, swap the frame selected here.
# The column order matters: build_data_arrays slices each row positionally
# (row[2:]), so everything after 'name' and 'pos' is treated as model input.
combineFeatures = data[['name', 'pos', 'year'] + features]

In [9]:
def get_position_maps():
    """Split ``combineFeatures`` by position.

    Returns a dict with one entry per label in ``positions``; each value is
    the subset of ``combineFeatures`` rows whose 'pos' column equals that
    label.
    """
    return {
        label: combineFeatures[combineFeatures['pos'] == label]
        for label in positions
    }

In [10]:
# position label -> rows of combineFeatures for that position
positionMaps = get_position_maps()

In [11]:
# build X matrix and Y values
def build_data_arrays(positionFrames=None, values=None, testYear=None):
    """Build per-position feature rows and targets, plus a train/test split.

    Parameters
    ----------
    positionFrames : dict, optional
        Position label -> DataFrame. Each frame must have 'name' and 'year'
        columns; everything from the third column onward is used as the
        feature row. Defaults to the notebook-level ``positionMaps``.
    values : dict, optional
        ``(name, year) -> target value``. Defaults to the notebook-level
        ``draftValues``.
    testYear : optional
        Rows whose 'year' equals this go to the test split; all other rows go
        to the training split. Defaults to the notebook-level ``TEST_YEAR``.

    Returns
    -------
    X, Y, xTrain, yTrain, xTest, yTest, names : defaultdict(list)
        Each is keyed by position label. ``X`` / ``Y`` hold every row,
        ``xTrain`` / ``yTrain`` and ``xTest`` / ``yTest`` hold the split, and
        ``names`` holds the player names of the test rows, in the same order
        as ``xTest``.

    Raises
    ------
    KeyError
        If a row's ``(name, year)`` is missing from ``values``.
    """
    # Resolve the notebook-level defaults at call time, not definition time,
    # so re-running an earlier cell is picked up here.
    if positionFrames is None:
        positionFrames = positionMaps
    if values is None:
        values = draftValues
    if testYear is None:
        testYear = TEST_YEAR

    X = defaultdict(list)
    Y = defaultdict(list)
    xTrain = defaultdict(list)
    yTrain = defaultdict(list)
    xTest = defaultdict(list)
    yTest = defaultdict(list)
    names = defaultdict(list)
    for position, frame in positionFrames.items():
        for _, row in frame.iterrows():
            # Positional slice: drop the first two columns and keep the rest.
            # NOTE(review): with the combineFeatures layout
            # (name, pos, year, <features>) this keeps 'year' as a model
            # input. Confirm that is intended; if not, slice from 3.
            rowFeatures = row.iloc[2:]
            target = values[(row['name'], row['year'])]

            X[position].append(rowFeatures)
            Y[position].append(target)
            if row['year'] == testYear:
                xTest[position].append(rowFeatures)
                yTest[position].append(target)
                names[position].append(row['name'])
            else:
                xTrain[position].append(rowFeatures)
                yTrain[position].append(target)
    return X, Y, xTrain, yTrain, xTest, yTest, names

In [12]:
# Per-position feature rows / targets: all rows (X, Y), the non-TEST_YEAR rows
# (xTrain, yTrain), the TEST_YEAR rows (xTest, yTest) and the test players' names.
X, Y, xTrain, yTrain, xTest, yTest, names = build_data_arrays()

In [13]:
# Seed NumPy's global RNG; the same seed is also passed to KFold below.
seed = 7
np.random.seed(seed)
# Single linear-regression estimator shared by the cells below; it is re-fit
# for every position.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2. On a newer scikit-learn this line raises a TypeError; scale
# the inputs explicitly instead (StandardScaler and Pipeline are already imported).
predictor = linear_model.LinearRegression(fit_intercept=True, normalize=True, copy_X=True, n_jobs=1)

In [14]:
numFolds = 10
# random_state only has an effect when shuffle=True; without it the seed was
# silently ignored (and newer scikit-learn releases reject the combination).
kFold = KFold(n_splits=numFolds, shuffle=True, random_state=seed)
def eval_data():
    """Cross-validate ``predictor`` separately for every position.

    For each label in ``positions`` that has more than ``numFolds`` rows in
    ``X``, runs ``numFolds``-fold cross-validation on ``X[p]`` / ``Y[p]`` and
    prints the mean and standard deviation of the per-fold mean squared error.
    Positions with too few rows are reported and skipped.

    Returns
    -------
    dict
        Position label -> ``(mean MSE, std of MSE)`` across folds, with the
        MSE expressed as a positive number. Skipped positions are absent.
    """
    errorMap = {}
    for p in positions:
        numRows = len(X[p])
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('{} {}'.format(p, numRows))
        if numRows > numFolds:
            # scikit-learn scorers are "greater is better", so MSE is exposed
            # negated as 'neg_mean_squared_error' (the old
            # 'mean_squared_error' scorer name is deprecated/removed). Flip
            # the sign so the reported value really is the MSE.
            negResults = cross_val_score(predictor, np.array(X[p]), np.array(Y[p]),
                                         cv=kFold, scoring='neg_mean_squared_error')
            results = -negResults
            errorMap[p] = (results.mean(), results.std())
            print('Results[{}]: mean={} (std={}) Mean Squared Error'.format(p, results.mean(), results.std()))
        else:
            print('Position: {} has too little data ({} row(s))'.format(p, numRows))
    return errorMap

In [15]:
# errors = eval_data()

In [16]:
# errors

In [17]:
# Fit on the training years and evaluate on the TEST_YEAR hold-out, one
# position at a time.
#   scores[p] -> (mean squared error, R^2) on the hold-out rows
#   output[p] -> DataFrame of (name, predicted value), highest value first
scores = {}
output = {}
for p in positions:
    # Need at least two rows on each side: one row cannot fit a model, and
    # R^2 is not defined for a single test sample.
    if len(xTrain[p]) > 1 and len(xTest[p]) > 1:
        trainFeatures = np.array(xTrain[p])
        trainTargets = np.array(yTrain[p])
        testFeatures = np.array(xTest[p])
        testTargets = np.array(yTest[p])

        predictor.fit(trainFeatures, trainTargets)
        prediction = predictor.predict(testFeatures)
        # list(...) so the DataFrame constructor gets a concrete sequence on
        # Python 3 as well, where zip() returns an iterator.
        ranked = pd.DataFrame(list(zip(names[p], prediction)), columns=['name', 'value'])
        output[p] = ranked.sort_values(by=['value'], ascending=False)
        scores[p] = (mean_squared_error(testTargets, prediction),
                     r2_score(testTargets, prediction))
    else:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Not enough data for position: {}'.format(p))

Not enough data for position: LS
Not enough data for position: K
Not enough data for position: P


In [18]:
# Per-position (mean squared error, R^2) on the TEST_YEAR hold-out.
# NOTE(review): in the recorded output the 'RB' entry has an MSE around 3e29 and
# a hugely negative R^2, which points to a numerically degenerate fit for that
# position -- investigate before trusting the RB predictions.
scores

{'C': (3043.7999088947449, 0.38272924015262477),
 'DB': (3836.5147759834422, 0.33620556088290621),
 'DE': (9039.6770511896211, -0.82273886035641053),
 'DT': (6722.7212602597801, -0.32334233917001387),
 'FB': (10003.531210798494, -3.6567309468091098),
 'G': (3782.9059146091995, 0.0055524796039765345),
 'LB': (5013.6383524941521, 0.033237534333101948),
 'QB': (6961.6364454540044, -0.089861741206688661),
 'RB': (3.1831414482313363e+29, -7.3260860986129957e+25),
 'T': (8354.784844889251, -0.071427085924028688),
 'TE': (3056.1162278529291, 0.27941288817403076),
 'WR': (4505.3550419078165, 0.21883252302988376)}

In [19]:
# Per-position frames of (name, predicted value) for the TEST_YEAR players,
# sorted by predicted value, highest first.
output

{'C':             name       value
 4      MaxGarcia  162.144792
 0  CameronErving  160.366601
 2   HronissGrasu  148.751202
 1      AliMarpet  146.878715
 5     AndyGallik  124.835336
 3      ShaqMason   68.827499
 6   AustinReiter   68.827499, 'DB':                 name       value
 3         ByronJones  196.745811
 9        RonaldDarby  176.254383
 16        CraigMager  173.807453
 8           EricRowe  171.067323
 26       BobbyMcCain  166.149385
 0         TraeWaynes  165.762846
 1       KevinJohnson  161.337891
 21          JoshShaw  155.928957
 24        AdrianAmos  144.664892
 4   DamariousRandall  143.220844
 15        AlexCarter  143.117959
 19   ClaytonGeathers  140.749176
 33     CharlesGaines  138.697984
 5      LandonCollins  137.568757
 13        DJounSmith  135.345795
 2       MarcusPeters  130.685767
 14        PJWilliams  128.440037
 6       JalenCollins  128.308691
 17      StevenNelson  127.724295
 22        DoranGrant  127.508895
 18       JamesSample  127.178601
 