In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
from collections import defaultdict
import math

In [2]:
# Read all candidate CSV datasets up front (in the same order as before).
(combine_cleaned,
 final_continuous,
 final_discrete,
 final_combined,
 final_beast) = (
    pd.read_csv(csvName)
    for csvName in (
        'combine_cleaned.csv',
        'finalData.csv',
        'discreteData.csv',
        'combinedFeatures.csv',
        'superbeastfire.csv',
    )
)

# Pick which of the loaded frames the rest of the notebook works on.
data = final_beast

In [3]:
# TODO: figure out how to add 'school' and 'conference' back in.

# Base (non-bucketed) column names.
featuresStandard = [
    'heightinchestotal', 'weight', 'fortyyd', 'vertical', 'bench', 'twentyss', 'threecone', 'broad',
    'games', 'rushingAtt', 'rushingYds', 'rushingAvg', 'rushingTD', 'passCmp',
    'passPct', 'passYds', 'passTD', 'passInt', 'passRate', 'recYds', 'recAtt', 'recTD', 'recAvg',
    'soloTackles', 'tackleAssists', 'totalTackles', 'sacks', 'ints', 'intTDs', 'passDef',
    'fumbles', 'fumblesForced',
]

# Base columns plus pairwise interaction columns.
# 'rushingAtt * rushingYds' used to be listed twice, which selected the same
# column twice; it now appears once.
# NOTE(review): 'passDefs * ints' is spelled with a trailing "s" although the
# base column is 'passDef' -- confirm against the header of the combined CSV.
featuresCombined = [
    'heightinchestotal', 'weight', 'fortyyd', 'twentyss', 'threecone', 'vertical', 'broad', 'bench',
    'games', 'rushingAtt', 'rushingYds', 'rushingAvg', 'rushingTD', 'passCmp',
    'passAtt', 'passPct', 'passYds', 'passTD', 'passInt', 'passRate', 'recYds', 'recAtt', 'recTD', 'recAvg',
    'soloTackles', 'tackleAssists', 'totalTackles', 'sacks', 'ints', 'intTDs', 'passDef',
    'fumbles', 'fumblesForced',
    'heightinchestotal * weight', 'rushingAtt * rushingYds', 'passTD / passInt', 'passRate * passPct',
    'rushingAvg * rushingTD', 'recAvg * recTD', 'recAtt * recYds', 'soloTackles * tackleAssists',
    'totalTackles * sacks', 'intTDs * ints', 'passDefs * ints', 'fumblesForced * totalTackles',
]

# Conference indicator columns.
_conferenceFeatures = [
    'confACC', 'confPac-12', 'confUnknown', 'confSEC', 'confBig 12', 'confBig Ten', 'confAmerican',
    'confBig East', 'confPac-10', 'confMAC', 'confSun Belt', 'confMWC', 'confWAC', 'confCUSA',
    'confInd', 'confSouthern', 'confMVC', 'confPac-8', 'confBig West', 'confSWC', 'confSouthland',
    'confBig 8', 'confSWAC',
]

# Every base column has five bucket columns: <name>None, <name>Q1 .. <name>Q4.
_bucketSuffixes = ('None', 'Q1', 'Q2', 'Q3', 'Q4')

# Conference indicators followed by the bucket columns, in featuresStandard
# order. Generated instead of hand-typed so the two lists cannot drift apart;
# anything added to featuresStandard must also have its five bucket columns
# in the dataset.
featuresDiscrete = _conferenceFeatures + list(
    baseName + bucketSuffix
    for baseName in featuresStandard
    for bucketSuffix in _bucketSuffixes
)

# Bucketed/indicator columns followed by the raw columns.
featuresBeast = featuresDiscrete + featuresStandard

In [4]:
# Rows whose 'year' equals this value form the test split; all other rows
# are used for training.
TEST_YEAR = 2015

# Forwarded to the regressor's `normalize` argument.
# Earlier rule, kept for reference:
# shouldNormalize = True if features == featuresStandard or features == featuresCombined else False
shouldNormalize = True

# Feature columns used by the model. Alternatives tried earlier:
# features = featuresStandard if data.equals(final_continuous) else featuresDiscrete
# features = featuresCombined
features = featuresBeast

In [5]:
# get player values/draft positions based on 'nfl_draft' data
def get_player_draft_info(frame=None, max_value=256):
    """Build per-player draft lookups keyed by ``(name, year)``.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table providing 'name', 'year' and 'pick' columns. When omitted, the
        notebook-level ``data`` frame is used (the original behaviour).
    max_value : number, optional
        The pick number is subtracted from this to obtain a player's value,
        so earlier picks get larger values. Defaults to 256.
        NOTE(review): the original comment described this as
        "7 rounds * 32 picks + 32 compensatory + 1", which is 257, not 256;
        with 256 a player taken at pick 256 gets value 0. Confirm which was
        intended.

    Returns
    -------
    (valuesMap, pickMap) : tuple of dict
        ``valuesMap[(name, year)] = max_value - pick`` and
        ``pickMap[(name, year)] = pick``. If a (name, year) pair occurs more
        than once, the last row wins.
    """
    if frame is None:
        frame = data
    valuesMap = {}
    pickMap = {}
    # Walk the three needed columns in lockstep instead of iterrows(), which
    # builds a full Series for every row.
    for name, year, pick in zip(frame['name'], frame['year'], frame['pick']):
        primaryKey = (name, year)
        valuesMap[primaryKey] = max_value - pick
        pickMap[primaryKey] = pick
    return valuesMap, pickMap

In [6]:
# Distinct position labels present in the data; used to partition players.
positionsSet = set(data['pos'])
positions = list(positionsSet)

In [7]:
# Lookups keyed by (name, year): draftValues holds the pick-derived value,
# draftPositions holds the raw pick number.
draftValues, draftPositions = get_player_draft_info()

In [8]:
# Keep only the three identifying columns plus the selected feature columns.
# To try a different dataset, change `data` / `features` in the cells above.
# NOTE: build_data_arrays() takes the feature values with row[3:], so
# 'name', 'pos', 'year' must remain the first three columns here.
combineFeatures = data[['name', 'pos', 'year'] + features]

In [9]:
def get_position_maps():
    """Split ``combineFeatures`` into one sub-frame per entry of ``positions``.

    Returns
    -------
    dict
        Maps each position label to the rows of ``combineFeatures`` whose
        'pos' column equals that label.
    """
    posColumn = combineFeatures['pos']
    return {label: combineFeatures[posColumn == label] for label in positions}

In [10]:
# position label -> DataFrame holding that position's rows of combineFeatures
positionMaps = get_position_maps()

In [11]:
# build X matrix and Y values
def build_data_arrays():
    """Collect feature rows and draft-value targets for every position.

    Reads the notebook-level ``positionMaps``, ``draftValues`` and
    ``TEST_YEAR``. The feature values of a row are everything after its first
    three columns.

    Returns
    -------
    tuple of seven ``defaultdict(list)`` keyed by position, in this order:
        X, Y           -- all rows and their targets
        xTrain, yTrain -- rows whose 'year' differs from TEST_YEAR
        xTest, yTest   -- rows whose 'year' equals TEST_YEAR
        names          -- player names of the test rows
    """
    X, Y = defaultdict(list), defaultdict(list)
    xTrain, yTrain = defaultdict(list), defaultdict(list)
    xTest, yTest = defaultdict(list), defaultdict(list)
    names = defaultdict(list)
    for position, players in positionMaps.items():
        for _, row in players.iterrows():
            target = draftValues[(row['name'], row['year'])]
            X[position].append(row[3:])
            Y[position].append(target)
            # Route the row to exactly one of the two splits.
            if row['year'] == TEST_YEAR:
                xSplit, ySplit = xTest, yTest
                names[position].append(row['name'])
            else:
                xSplit, ySplit = xTrain, yTrain
            xSplit[position].append(row[3:])
            ySplit[position].append(target)
    return X, Y, xTrain, yTrain, xTest, yTest, names

In [12]:
# Per-position lists: all rows (X, Y), training split, test split, and the
# player names belonging to the test rows.
X, Y, xTrain, yTrain, xTest, yTest, names = build_data_arrays()

In [13]:
# Fixed seed: applied to NumPy's global RNG here and passed to the KFold
# splitter in the evaluation cell.
seed = 7
np.random.seed(seed)
# Alternative model (ordinary least squares), kept for reference:
# predictor = linear_model.LinearRegression(fit_intercept=True, normalize=shouldNormalize, copy_X=True, n_jobs=1)
# Ridge regression with the regularisation strength picked by RidgeCV from
# the listed alphas.
# NOTE(review): the `normalize` argument was removed from scikit-learn's
# linear models in release 1.2 -- confirm the scikit-learn version this
# notebook is pinned to before re-running.
predictor = linear_model.RidgeCV(alphas=[0.1, 1.0, 10], fit_intercept=True, normalize=shouldNormalize)

In [14]:
numFolds = 10
# random_state only has an effect when shuffle=True; without it the seed was
# silently ignored (and newer scikit-learn releases reject the combination).
kFold = KFold(n_splits=numFolds, shuffle=True, random_state=seed)


def eval_data():
    """Cross-validate ``predictor`` separately for every position.

    Reads the notebook-level ``positions``, ``X``, ``Y``, ``predictor``,
    ``kFold`` and ``numFolds``. Positions with ``numFolds`` rows or fewer are
    reported and skipped.

    Returns
    -------
    dict
        position -> (mean, std) of the per-fold mean squared error, as
        positive numbers.
    """
    errorMap = {}
    for p in positions:
        rowCount = len(X[p])
        print('{} {}'.format(p, rowCount))
        if rowCount > numFolds:
            # scikit-learn scorers are "greater is better", so the MSE scorer
            # is named 'neg_mean_squared_error' and yields negated values;
            # flip the sign so the reported numbers really are MSE.
            foldMse = -cross_val_score(predictor, np.array(X[p]), np.array(Y[p]),
                                       cv=kFold, scoring='neg_mean_squared_error')
            errorMap[p] = (foldMse.mean(), foldMse.std())
            print('Results[{}]: mean={} (std={}) Mean Squared Error'.format(p, foldMse.mean(), foldMse.std()))
        else:
            print('Position: {} has too little data ({} row(s))'.format(p, rowCount))
    return errorMap

In [15]:
# errors = eval_data()

In [16]:
# errors

In [17]:
# Map players to their true and predicted draft positions
def get_relative_error(results, test_year, draft_positions=None):
    """Average absolute difference between predicted and true draft order.

    The row order of ``results`` is taken as the predicted ranking (first row
    = rank 0). The true ranking is obtained by sorting the same players by
    their actual pick number (ties broken by name).

    Parameters
    ----------
    results : pandas.DataFrame
        Must have a 'name' column. An 'error' column holding each player's
        absolute rank difference is added to it IN PLACE (used for
        visualisation).
    test_year : the year component of the ``(name, year)`` lookup key.
    draft_positions : dict, optional
        ``(name, year) -> pick``. Defaults to the notebook-level
        ``draftPositions``.

    Returns
    -------
    float
        Sum of the per-player rank errors divided by ``len(results)``;
        0.0 when ``results`` has no rows.

    Raises
    ------
    KeyError
        If a player in ``results`` has no entry for ``test_year``.
    """
    if draft_positions is None:
        draft_positions = draftPositions

    playerNames = list(results['name'])
    if not playerNames:
        # Nothing to rank; avoid dividing by zero below.
        results['error'] = 0.0
        return 0.0

    truePick = {}        # name -> true pick number
    predictedRank = {}   # name -> position in the predicted ordering
    for rank, name in enumerate(playerNames):
        truePick[name] = draft_positions[(name, test_year)]  # need primary key to get draft positions
        predictedRank[name] = rank

    # Rank players by their real pick; the name is a deterministic tie-break.
    trueOrder = sorted(truePick, key=lambda name: (truePick[name], name))
    errors = {
        name: math.fabs(trueRank - predictedRank[name])
        for trueRank, name in enumerate(trueOrder)
    }

    # Add the errors to the results DataFrame (for visualization).
    results['error'] = [errors[name] for name in playerNames]

    # Average the error by the total number of players.
    return sum(errors.values()) / float(len(results))

In [18]:
scores = {}         # position -> (test-set mean squared error, test-set R^2)
relativeError = {}  # position -> value returned by get_relative_error()
coefficients = {}   # position -> DataFrame(feature, coefficient), largest coefficient first
output = {}         # position -> DataFrame(name, value), highest predicted value first
for p in positions:
    # Fit one model per position; positions with fewer than two training or
    # two test rows are skipped.
    if len(xTrain[p]) > 1 and len(xTest[p]) > 1:
        predictor.fit(np.array(xTrain[p]), np.array(yTrain[p]))
        # list(zip(...)): zip() is a one-shot iterator on Python 3, so hand
        # the DataFrame constructor a concrete list of pairs.
        coefficients[p] = pd.DataFrame(
            list(zip(features, predictor.coef_)),
            columns=['feature', 'coefficient'],
        ).sort_values(by=['coefficient'], ascending=False)
        prediction = predictor.predict(np.array(xTest[p]))
        output[p] = pd.DataFrame(
            list(zip(names[p], prediction)),
            columns=['name', 'value'],
        ).sort_values(by=['value'], ascending=False)
        # scores[p] = predictor.score(np.array(xTest[p]), np.array(yTest[p]))
        scores[p] = (mean_squared_error(np.array(yTest[p]), prediction),
                     r2_score(np.array(yTest[p]), prediction))
        # Also adds an 'error' column to output[p] in place.
        relativeError[p] = get_relative_error(output[p], TEST_YEAR)
    else:
        print('Not enough data for position: {}'.format(p))

Not enough data for position: LS
Not enough data for position: K
Not enough data for position: P


In [19]:
scores

{'C': (3069.4544391368768, 0.37752660139512562),
 'DB': (4119.8915545537038, 0.287175662453377),
 'DE': (3578.6817134766925, 0.27840317844735318),
 'DT': (7458.9242899130386, -0.46826113048217199),
 'FB': (7144.4864949680805, -2.3258207186142181),
 'G': (3448.5251124121814, 0.093454258585230821),
 'LB': (4628.9623633886031, 0.1074132689122681),
 'QB': (4724.4674349547904, 0.260372709020669),
 'RB': (2359.7891122374899, 0.45688815618208856),
 'T': (7521.3380099621081, 0.035455081623929408),
 'TE': (3207.8356126174185, 0.24363969595082013),
 'WR': (4038.8032253865276, 0.29972628212271413)}

In [20]:
output

{'C':             name       value  error
 4      MaxGarcia  159.813477    4.0
 1      AliMarpet  154.311858    0.0
 0  CameronErving  153.305426    2.0
 2   HronissGrasu  148.507574    1.0
 5     AndyGallik  121.572279    1.0
 3      ShaqMason   62.389112    2.0
 6   AustinReiter   62.389112    0.0,
 'DB':                 name       value  error
 3         ByronJones  236.211039    3.0
 21          JoshShaw  209.926457   20.0
 1       KevinJohnson  181.591235    1.0
 4   DamariousRandall  178.680564    1.0
 29       DamianSwann  176.409677   25.0
 2       MarcusPeters  176.365592    3.0
 14        PJWilliams  175.419203    8.0
 8           EricRowe  173.677928    1.0
 0         TraeWaynes  173.573215    8.0
 9        RonaldDarby  172.104214    0.0
 16        CraigMager  170.782747    6.0
 38     GerodHolliman  165.713667   27.0
 26       BobbyMcCain  156.320758   14.0
 34   JaCoreyShepherd  156.053175   21.0
 37      QuandreDiggs  149.881118   23.0
 33     CharlesGaines  149.707303   

In [21]:
relativeError

{'C': 1.4285714285714286,
 'DB': 10.217391304347826,
 'DE': 4.571428571428571,
 'DT': 8.0,
 'FB': 1.5,
 'G': 5.7894736842105265,
 'LB': 8.333333333333334,
 'QB': 2.0,
 'RB': 2.7777777777777777,
 'T': 5.9,
 'TE': 4.5,
 'WR': 7.529411764705882}

In [22]:
coefficients

{'C':                  feature  coefficient
 50            twentyssQ2    13.014335
 41            verticalQ3    12.855771
 27   heightinchestotalQ4     7.404852
 46               benchQ3     7.328538
 54           threeconeQ1     5.168747
 31              weightQ3     5.010487
 47               benchQ4     4.687620
 59               broadQ1     4.422008
 185              fortyyd     3.846500
 184               weight     0.889666
 37             fortyydQ4     0.881837
 186             vertical     0.771614
 183    heightinchestotal     0.629917
 190                broad     0.015385
 144      tackleAssistsQ1     0.000000
 140        soloTacklesQ2     0.000000
 141        soloTacklesQ3     0.000000
 142        soloTacklesQ4     0.000000
 143    tackleAssistsNone     0.000000
 147      tackleAssistsQ4     0.000000
 145      tackleAssistsQ2     0.000000
 146      tackleAssistsQ3     0.000000
 138      soloTacklesNone     0.000000
 148     totalTacklesNone     0.000000
 149       totalTack