In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
from collections import defaultdict
import math

In [2]:
# Load the candidate datasets. All four CSV files are read from the current
# working directory; only the frame assigned to `data` below is used by the
# later cells visible in this notebook.
combine_cleaned = pd.read_csv('combine_cleaned.csv')
final_continuous = pd.read_csv('finalData.csv')
final_discrete = pd.read_csv('discreteData.csv')
final_beast = pd.read_csv('superbeastfire.csv')

# Choose dataset+features to use
# NOTE(review): `features` (set in the configuration cell) must name columns
# that exist in whichever frame is selected here -- confirm when switching.
data = final_continuous

In [3]:
# Feature columns that the configuration cell currently selects via
# `features = featuresStandard`.
# TODO: need to figure out how to add 'school', 'conference' back in.
# NOTE(review): unlike featuresCombined, this list has no 'passAtt' entry --
# confirm whether that omission is intentional.
featuresStandard = ['heightinchestotal', 'weight', 'fortyyd', 'vertical', 'bench', 'twentyss', 'threecone', 'broad', 
            'games', 'rushingAtt', 'rushingYds', 'rushingAvg', 'rushingTD', 'passCmp',
            'passPct', 'passYds', 'passTD', 'passInt', 'passRate', 'recYds', 'recAtt', 'recTD', 'recAvg', 'soloTackles', 
            'tackleAssists', 'totalTackles', 'sacks', 'ints', 'intTDs', 'passDef', 'fumbles', 'fumblesForced']
# Base feature columns plus columns whose names look like pairwise
# interaction terms (e.g. 'heightinchestotal * weight'). These names are used
# verbatim as DataFrame column labels.
# NOTE(review): 'rushingAtt * rushingYds' appears twice in this list, so
# selecting these columns would yield a duplicated feature -- confirm whether
# the second occurrence was meant to be a different interaction.
# NOTE(review): 'passDefs * ints' uses 'passDefs' while the base column is
# spelled 'passDef' -- verify against the CSV header.
featuresCombined = ['heightinchestotal','weight','fortyyd','twentyss','threecone','vertical','broad','bench',
                    'games','rushingAtt','rushingYds','rushingAvg','rushingTD','passCmp',
                    'passAtt','passPct','passYds','passTD','passInt','passRate','recYds','recAtt','recTD','recAvg',
                    'soloTackles','tackleAssists','totalTackles','sacks','ints','intTDs','passDef','fumbles','fumblesForced',
                    'heightinchestotal * weight','rushingAtt * rushingYds','passTD / passInt','passRate * passPct',
                    'rushingAvg * rushingTD','rushingAtt * rushingYds','recAvg * recTD','recAtt * recYds','soloTackles * tackleAssists',
                    'totalTackles * sacks','intTDs * ints','passDefs * ints','fumblesForced * totalTackles']
# Column names for the discretised dataset: one 'conf<Name>' column per
# conference, followed by five columns per numeric feature with the suffixes
# None, Q1, Q2, Q3 and Q4.
# NOTE(review): presumably indicator columns, with Q1-Q4 being quartile bins
# and 'None' marking a missing value -- confirm against the script that
# produced discreteData.csv.
featuresDiscrete = ['confACC','confPac-12','confUnknown','confSEC','confBig 12','confBig Ten','confAmerican',
            'confBig East','confPac-10','confMAC','confSun Belt','confMWC','confWAC','confCUSA','confInd','confSouthern','confMVC','confPac-8',
            'confBig West','confSWC','confSouthland','confBig 8','confSWAC','heightinchestotalNone','heightinchestotalQ1','heightinchestotalQ2',
            'heightinchestotalQ3','heightinchestotalQ4','weightNone','weightQ1','weightQ2','weightQ3','weightQ4','fortyydNone','fortyydQ1','fortyydQ2',
            'fortyydQ3','fortyydQ4','verticalNone','verticalQ1','verticalQ2','verticalQ3','verticalQ4','benchNone','benchQ1','benchQ2','benchQ3',
            'benchQ4','twentyssNone','twentyssQ1','twentyssQ2','twentyssQ3','twentyssQ4','threeconeNone','threeconeQ1','threeconeQ2','threeconeQ3',
            'threeconeQ4','broadNone','broadQ1','broadQ2','broadQ3','broadQ4','gamesNone','gamesQ1','gamesQ2','gamesQ3','gamesQ4','rushingAttNone',
            'rushingAttQ1','rushingAttQ2','rushingAttQ3','rushingAttQ4','rushingYdsNone','rushingYdsQ1','rushingYdsQ2','rushingYdsQ3','rushingYdsQ4',
            'rushingAvgNone','rushingAvgQ1','rushingAvgQ2','rushingAvgQ3','rushingAvgQ4','rushingTDNone','rushingTDQ1','rushingTDQ2','rushingTDQ3','rushingTDQ4'
            ,'passCmpNone','passCmpQ1','passCmpQ2','passCmpQ3','passCmpQ4','passPctNone','passPctQ1','passPctQ2','passPctQ3','passPctQ4','passYdsNone','passYdsQ1',
            'passYdsQ2','passYdsQ3','passYdsQ4','passTDNone','passTDQ1','passTDQ2','passTDQ3','passTDQ4','passIntNone','passIntQ1','passIntQ2','passIntQ3','passIntQ4','passRateNone',
            'passRateQ1','passRateQ2','passRateQ3','passRateQ4','recYdsNone','recYdsQ1','recYdsQ2','recYdsQ3','recYdsQ4','recAttNone','recAttQ1','recAttQ2','recAttQ3',
            'recAttQ4','recTDNone','recTDQ1','recTDQ2','recTDQ3','recTDQ4','recAvgNone','recAvgQ1','recAvgQ2','recAvgQ3','recAvgQ4','soloTacklesNone','soloTacklesQ1',
            'soloTacklesQ2','soloTacklesQ3','soloTacklesQ4','tackleAssistsNone','tackleAssistsQ1','tackleAssistsQ2','tackleAssistsQ3','tackleAssistsQ4','totalTacklesNone',
            'totalTacklesQ1','totalTacklesQ2','totalTacklesQ3','totalTacklesQ4','sacksNone','sacksQ1','sacksQ2','sacksQ3','sacksQ4','intsNone','intsQ1','intsQ2','intsQ3',
            'intsQ4','intTDsNone','intTDsQ1','intTDsQ2','intTDsQ3','intTDsQ4','passDefNone','passDefQ1','passDefQ2','passDefQ3','passDefQ4','fumblesNone','fumblesQ1',
            'fumblesQ2','fumblesQ3','fumblesQ4','fumblesForcedNone','fumblesForcedQ1','fumblesForcedQ2','fumblesForcedQ3','fumblesForcedQ4']
# Discrete columns followed by the continuous ones, as a single feature list.
featuresBeast = featuresDiscrete + featuresStandard

In [4]:
# Run configuration.
# Draft year held out as the test set: rows with this year go to the test
# split, all other rows go to the training split (see build_data_arrays).
TEST_YEAR = 2015
# features = featuresStandard if data.equals(final_continuous) else featuresDiscrete
# features = featuresCombined
# Names of the columns used as model inputs; must match the frame in `data`.
features = featuresStandard
# Forwarded to the regressor's `normalize` argument when it is constructed.
shouldNormalize = True
# shouldNormalize = True if features == featuresStandard or features == featuresCombined else False 

In [5]:
# get player values/draft positions from the draft data
def get_player_draft_info(frame=None):
    """Build per-player lookups of draft value and draft pick.

    Args:
        frame: DataFrame providing 'name', 'year' and 'pick' columns. When
            omitted, the notebook-level `data` frame is used, which keeps
            existing zero-argument calls working.

    Returns:
        A pair (valuesMap, pickMap) of dicts keyed by (name, year):
          valuesMap -- MAX_VALUE - pick, so earlier picks get larger values
          pickMap   -- the raw pick number
        If a (name, year) pair occurs in several rows, the last row wins.
    """
    if frame is None:
        frame = data

    # 7 rounds * 32 picks + 32 compensatory picks = 256, so pick 256 maps to
    # value 0.
    # NOTE(review): the earlier comment described this as "picks + 1" (257);
    # the constant is kept at 256 so computed values are unchanged -- confirm.
    MAX_VALUE = 256

    valuesMap = {}
    pickMap = {}
    # Walk only the three needed columns instead of materialising every row
    # as a Series via iterrows().
    for name, year, pick in zip(frame['name'], frame['year'], frame['pick']):
        primaryKey = (name, year)
        valuesMap[primaryKey] = MAX_VALUE - pick
        pickMap[primaryKey] = pick
    return valuesMap, pickMap

In [6]:
# Distinct values of the 'pos' column; later cells partition the data and fit
# one model per entry of `positions`.
positionsSet = set(data['pos'])
positions = list(positionsSet)

In [7]:
# Lookups keyed by (name, year): draftValues holds the regression target
# (MAX_VALUE - pick), draftPositions holds the raw pick number.
draftValues, draftPositions = get_player_draft_info()

In [8]:
# replace here with NEW dataset
# Keep the three identifier columns first, followed by the selected feature
# columns: build_data_arrays treats everything from position 3 onward as the
# feature vector, so this column order matters.
combineFeatures = data[['name', 'pos', 'year'] + features]

In [9]:
def get_position_maps():
    """Partition `combineFeatures` by position.

    Returns:
        dict mapping each entry of `positions` to the rows of
        `combineFeatures` whose 'pos' column equals that entry.
    """
    posColumn = combineFeatures['pos']
    return {label: combineFeatures[posColumn == label] for label in positions}

In [10]:
# position -> sub-frame of combineFeatures holding only that position's rows.
positionMaps = get_position_maps()

In [11]:
# build X matrix and Y values
def build_data_arrays():
    """Collect per-position feature rows and draft-value targets.

    Reads the notebook-level `positionMaps`, `draftValues` and `TEST_YEAR`.

    Returns:
        (X, Y, xTrain, yTrain, xTest, yTest, names), each a defaultdict(list)
        keyed by position:
          X, Y           -- every player of that position
          xTrain, yTrain -- players whose 'year' is not TEST_YEAR
          xTest, yTest   -- players whose 'year' equals TEST_YEAR
          names          -- names of the test players, in xTest/yTest order
    """
    X, Y = defaultdict(list), defaultdict(list)
    xTrain, yTrain = defaultdict(list), defaultdict(list)
    xTest, yTest = defaultdict(list), defaultdict(list)
    names = defaultdict(list)

    for position, frame in positionMaps.items():
        for _, player in frame.iterrows():
            # The first three entries are 'name', 'pos', 'year'; the rest are
            # the feature values.
            featureRow = player[3:]
            target = draftValues[(player['name'], player['year'])]

            X[position].append(featureRow)
            Y[position].append(target)

            if player['year'] == TEST_YEAR:
                xTest[position].append(featureRow)
                yTest[position].append(target)
                names[position].append(player['name'])
            else:
                xTrain[position].append(featureRow)
                yTrain[position].append(target)

    return X, Y, xTrain, yTrain, xTest, yTest, names

In [12]:
# Per-position lists: all rows (X, Y), the train/test split around TEST_YEAR,
# and the names of the test players.
X, Y, xTrain, yTrain, xTest, yTest, names = build_data_arrays()

In [13]:
# Single regressor instance shared by every position; each later cell calls
# fit() on it again, so it always holds the most recently fitted model.
# NOTE(review): the `normalize` keyword was deprecated and later removed in
# newer scikit-learn releases; a StandardScaler + Pipeline (both imported at
# the top) is the usual replacement, but the code below reads
# `predictor.coef_` directly -- confirm the installed version before changing.
# predictor = linear_model.LinearRegression(fit_intercept=True, normalize=shouldNormalize, copy_X=True, n_jobs=1)
predictor = linear_model.RidgeCV(alphas=[0.1, 1.0, 10], fit_intercept=True, normalize=shouldNormalize)

In [14]:
# Cross-validation configuration used by eval_data.
numFolds = 10
seed = 1
# random_state only has an effect when shuffle=True; without it the seed is
# ignored and newer scikit-learn versions raise a ValueError. Shuffling also
# keeps the folds from being contiguous slices of the row order.
kFold = KFold(n_splits=numFolds, shuffle=True, random_state=seed)
def eval_data():
    """Cross-validate `predictor` separately on each position's full data.

    Reads the notebook-level `positions`, `X`, `Y`, `predictor`, `kFold` and
    `numFolds`. Positions with no more than `numFolds` rows are skipped with a
    message.

    Returns:
        dict mapping position -> (mean, std) of the per-fold scores. The
        scorer is 'neg_mean_squared_error', so the values are negated mean
        squared errors (closer to zero is better).
    """
    errorMap = {}
    for p in positions:
        sampleCount = len(X[p])
        print('{} {}'.format(p, sampleCount))
        if sampleCount > numFolds:
            # 'mean_squared_error' is a deprecated and later removed scorer
            # name; 'neg_mean_squared_error' is the supported spelling and
            # yields the same negated values.
            results = cross_val_score(predictor, np.array(X[p]), np.array(Y[p]),
                                      cv=kFold, scoring='neg_mean_squared_error')
            errorMap[p] = (results.mean(), results.std())
            print('Results[{}]: mean={} (std={}) Mean Squared Error'.format(p, results.mean(), results.std()))
        else:
            print('Position: {} has too little data ({} row(s))'.format(p, sampleCount))
    return errorMap

In [15]:
# errors = eval_data()
# errors

In [16]:
def eval_data_train():
    """Fit `predictor` on all rows of each position and score it on those rows.

    Reads the notebook-level `positions`, `X`, `Y` and `predictor`. Positions
    with at most one row are skipped with a message.

    Returns:
        dict mapping position -> mean squared error of the fitted model on the
        same data it was trained on.
    """
    errorMap = {}
    for p in positions:
        rowCount = len(X[p])
        if rowCount <= 1:
            print('Position: {} has too little data ({} row(s))'.format(p, rowCount))
            continue
        xAll = np.array(X[p])
        yAll = np.array(Y[p])
        predictor.fit(xAll, yAll)
        fitted = predictor.predict(xAll)
        errorMap[p] = mean_squared_error(yAll, np.array(fitted))
    return errorMap

In [17]:
# Training-set mean squared error per position (model fitted and scored on the
# same rows).
trainError = eval_data_train()
trainError

Position: LS has too little data (1 row(s))


{'C': 3707.9103539624066,
 'DB': 3937.73285476502,
 'DE': 4500.539255873713,
 'DT': 5118.8840341959531,
 'FB': 2758.8939061548331,
 'G': 4163.1785564440297,
 'K': 3302.9931501792653,
 'LB': 4221.5035478587433,
 'P': 2083.4195222331928,
 'QB': 4447.6287048195527,
 'RB': 3902.704620622872,
 'T': 5238.0901323497028,
 'TE': 3349.6150724464824,
 'WR': 4634.4124289599222}

In [18]:
# Map players to their true and predicted draft positions
def get_relative_error(results, test_year, draft_positions=None):
    """Average absolute difference between predicted and true draft order.

    The predicted rank of a player is its row position in `results` (so the
    caller must pass rows already ordered best-first). The true rank is the
    player's position when the same players are ordered by actual pick number,
    ties broken by name.

    Side effect: an 'error' column holding each player's absolute rank
    difference is written to `results` in place.

    Args:
        results: DataFrame with a 'name' column, one row per player.
        test_year: year used together with the name to look up the true pick.
        draft_positions: mapping (name, year) -> pick number. Defaults to the
            notebook-level `draftPositions`.

    Returns:
        Sum of the per-player rank errors divided by the number of rows, or
        NaN when `results` has no rows (the mean is undefined there).

    Raises:
        KeyError: if a (name, test_year) pair is missing from the lookup.
    """
    if draft_positions is None:
        draft_positions = draftPositions

    playerNames = list(results['name'])

    trueDraftMap = {}       # name -> true pick number
    predictedDraftMap = {}  # name -> rank implied by row order of `results`
    for predictedRank, name in enumerate(playerNames):
        trueDraftMap[name] = draft_positions[(name, test_year)]  # need primary key to get draft positions
        predictedDraftMap[name] = predictedRank

    # Compute absolute_error based on relative draft positions.
    # items() plus an indexing key function works on both Python 2 and 3
    # (iteritems and tuple-unpacking lambdas are Python-2-only).
    trueOrder = sorted(trueDraftMap.items(), key=lambda item: (item[1], item[0]))
    errors = {}
    absolute_error = 0
    for trueRank, (name, _) in enumerate(trueOrder):
        errors[name] = math.fabs(trueRank - predictedDraftMap[name])
        absolute_error += errors[name]

    if not playerNames:
        # Nothing to average; still create the column so callers see the same
        # schema as for non-empty input.
        results['error'] = 0.0
        return float('nan')

    # Add the errors to the results DataFrame (for visualization)
    results['error'] = [errors[name] for name in playerNames]

    # average the error by the total number of players
    return absolute_error / float(len(playerNames))

In [19]:
# Fit one model per position on the non-test years and evaluate it on
# TEST_YEAR. Positions with at most one training or test row are skipped.
scores = {}         # position -> (mean squared error, R^2) on the test rows
relativeError = {}  # position -> value returned by get_relative_error
coefficients = {}   # position -> fitted coefficients, largest first
output = {}         # position -> test players ordered by predicted value, highest first
for p in positions:
    if len(xTrain[p]) <= 1 or len(xTest[p]) <= 1:
        print('Not enough data for position: {}'.format(p))
        continue

    trainX, trainY = np.array(xTrain[p]), np.array(yTrain[p])
    testX, testY = np.array(xTest[p]), np.array(yTest[p])

    predictor.fit(trainX, trainY)
    coefficientTable = pd.DataFrame(list(zip(features, predictor.coef_)), columns=['feature', 'coefficient'])
    coefficients[p] = coefficientTable.sort_values(by=['coefficient'], ascending=False)

    prediction = predictor.predict(testX)
    predictionTable = pd.DataFrame(list(zip(names[p], prediction)), columns=['name', 'value'])
    output[p] = predictionTable.sort_values(by=['value'], ascending=False)

    # Alternative kept for reference: predictor.score(testX, testY)
    scores[p] = (mean_squared_error(testY, np.array(prediction)),
                 r2_score(testY, np.array(prediction)))
    # Also adds an 'error' column to output[p] in place.
    relativeError[p] = get_relative_error(output[p], TEST_YEAR)

Not enough data for position: LS
Not enough data for position: K
Not enough data for position: P


In [20]:
# position -> (mean squared error, R^2) on the held-out TEST_YEAR rows.
scores

{'C': (4356.9601084647056, 0.11642546906005835),
 'DB': (4030.7343974443465, 0.30260164894172825),
 'DE': (4796.1989300979067, 0.032905918830494096),
 'DT': (5333.6627847872332, -0.049911414249479424),
 'FB': (5347.6737445136378, -1.489388726316319),
 'G': (3266.0811501714979, 0.1414149929933497),
 'LB': (4891.8647093227301, 0.056718722892919016),
 'QB': (5873.337396797876, 0.080514219304216872),
 'RB': (2366.1311739692605, 0.45542851352890124),
 'T': (7311.71622953521, 0.062337216534487316),
 'TE': (3575.821324325289, 0.15687409496479676),
 'WR': (4661.8305148255613, 0.19170179764867157)}

In [21]:
# position -> test players ordered by predicted value; the 'error' column is
# the per-player rank difference written by get_relative_error.
output

{'C':             name       value  error
 0  CameronErving  140.904265    0.0
 5     AndyGallik  129.840909    4.0
 1      AliMarpet  129.369249    1.0
 4      MaxGarcia  114.735856    1.0
 2   HronissGrasu  107.266736    2.0
 3      ShaqMason   77.532232    2.0
 6   AustinReiter   77.532232    0.0,
 'DB':                 name       value  error
 3         ByronJones  203.564830    3.0
 9        RonaldDarby  184.512572    8.0
 8           EricRowe  179.458359    6.0
 16        CraigMager  177.767519   13.0
 1       KevinJohnson  176.191506    3.0
 0         TraeWaynes  174.733283    5.0
 21          JoshShaw  172.298750   15.0
 26       BobbyMcCain  171.302268   19.0
 15        AlexCarter  155.446453    7.0
 33     CharlesGaines  151.272823   24.0
 24        AdrianAmos  147.284180   14.0
 4   DamariousRandall  145.417654    7.0
 6       JalenCollins  145.349869    6.0
 13        DJounSmith  145.324029    0.0
 2       MarcusPeters  140.154390   12.0
 17      StevenNelson  139.681354   

In [22]:
# position -> average absolute difference between predicted and true draft
# order among that position's test players.
relativeError

{'C': 1.4285714285714286,
 'DB': 8.304347826086957,
 'DE': 4.571428571428571,
 'DT': 6.181818181818182,
 'FB': 1.5,
 'G': 5.2631578947368425,
 'LB': 8.5,
 'QB': 2.0,
 'RB': 2.6666666666666665,
 'T': 5.4,
 'TE': 5.0,
 'WR': 8.058823529411764}

In [23]:
# position -> fitted regression coefficient per feature, sorted descending.
coefficients

{'C':               feature  coefficient
 0   heightinchestotal     4.778891
 3            vertical     1.061150
 1              weight     0.224186
 30            fumbles     0.000000
 29            passDef     0.000000
 28             intTDs     0.000000
 27               ints     0.000000
 26              sacks     0.000000
 17            passInt     0.000000
 24      tackleAssists     0.000000
 23        soloTackles     0.000000
 22             recAvg     0.000000
 21              recTD     0.000000
 20             recAtt     0.000000
 19             recYds     0.000000
 18           passRate     0.000000
 25       totalTackles     0.000000
 16             passTD     0.000000
 15            passYds     0.000000
 14            passPct     0.000000
 13            passCmp     0.000000
 12          rushingTD     0.000000
 11         rushingAvg     0.000000
 10         rushingYds     0.000000
 9          rushingAtt     0.000000
 8               games     0.000000
 31      fumblesForced 