In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
from collections import defaultdict
import math

from KNNLib import get_draft_probability_matrix
from KNNLib import positionMLE

In [2]:
# Load the candidate datasets from CSV. (The model fitted further down is a
# linear regression, not a network.)
combine_cleaned = pd.read_csv('data/combine_cleaned.csv')
final_continuous = pd.read_csv('data/finalData.csv')
final_discrete = pd.read_csv('data/discreteData.csv')


# Choose dataset to use: every later cell reads from `data`.
data = final_continuous


In [3]:
# TODO: figure out how to add 'school' and 'conference' back in.

# Continuous feature columns.
featuresStandard = [
    'heightinchestotal', 'weight', 'fortyyd', 'vertical',
    'bench', 'twentyss', 'threecone', 'broad',
    'games',
    'rushingAtt', 'rushingYds', 'rushingAvg', 'rushingTD',
    'passCmp', 'passPct', 'passYds', 'passTD', 'passInt', 'passRate',
    'recYds', 'recAtt', 'recTD', 'recAvg',
    'soloTackles', 'tackleAssists', 'totalTackles',
    'sacks', 'ints', 'intTDs', 'passDef',
    'fumbles', 'fumblesForced',
]

# Discrete feature columns: the conference indicator columns first, then, for
# every continuous feature (in the same order as above), its five bucket
# columns <feature>None, <feature>Q1 ... <feature>Q4.
featuresDiscrete = [
    'confACC', 'confPac-12', 'confUnknown', 'confSEC', 'confBig 12',
    'confBig Ten', 'confAmerican', 'confBig East', 'confPac-10', 'confMAC',
    'confSun Belt', 'confMWC', 'confWAC', 'confCUSA', 'confInd',
    'confSouthern', 'confMVC', 'confPac-8', 'confBig West', 'confSWC',
    'confSouthland', 'confBig 8', 'confSWAC',
] + list(
    # generator expression (not a list comprehension) so that no loop variable
    # leaks into the notebook namespace under Python 2
    featureName + bucket
    for featureName in featuresStandard
    for bucket in ('None', 'Q1', 'Q2', 'Q3', 'Q4')
)


In [4]:
# Year bounds, and the draft year that is held out as the test set.
MIN_YEAR = 1999
MAX_YEAR = 2015
TEST_YEAR = 2015

# Feature set fed to the model; normalisation is requested only when the
# selected list equals the continuous feature list.
features = featuresStandard
shouldNormalize = (features == featuresStandard)

In [5]:
# get player values/draft positions based on 'nfl_draft' data
def get_player_draft_info(frame=None):
    """Build lookup tables from (name, year) to draft value and draft pick.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table with 'name', 'year' and 'pick' columns. Defaults to the
        notebook-level ``data`` frame, so existing zero-argument calls keep
        working.

    Returns
    -------
    (valuesMap, pickMap) : tuple of dict
        ``valuesMap[(name, year)]`` is ``MAX_VALUE - pick`` (earlier picks get
        larger values); ``pickMap[(name, year)]`` is the pick itself. If the
        same (name, year) occurs more than once, the last row wins.
    """
    if frame is None:
        frame = data

    # 7 rounds * 32 picks + 32 compensatory picks = 256, so the very last pick
    # is worth 0. (The earlier comment described this as "picks + 1", which
    # would be 257; the value itself is left unchanged.)
    MAX_VALUE = 256

    valuesMap = {}
    pickMap = {}
    # Walk the three needed columns in lockstep instead of materialising a
    # Series per row with iterrows().
    for name, year, pick in zip(frame['name'], frame['year'], frame['pick']):
        primaryKey = (name, year)
        valuesMap[primaryKey] = MAX_VALUE - pick
        pickMap[primaryKey] = pick
    return valuesMap, pickMap

In [6]:
# Distinct values of the 'pos' column; later cells partition players by these.
positionsSet = set(data['pos'])
positions = list(positionsSet)

In [7]:
# draftValues:    (name, year) -> draft value (larger means an earlier pick)
# draftPositions: (name, year) -> actual pick number
draftValues, draftPositions = get_player_draft_info()

In [8]:
# To switch datasets, change `data` / `features` above; this cell then picks up the new one.
# The identifying columns (name, pos, year) come first, so the feature columns
# start at position 3 -- build_data_arrays() relies on that via row[3:].
combineFeatures = data[['name', 'pos', 'year'] + features]

In [9]:
def get_position_maps():
    """Split ``combineFeatures`` into one sub-frame per entry of ``positions``.

    Returns a dict mapping each position to the rows of ``combineFeatures``
    whose 'pos' column equals that position.
    """
    posColumn = combineFeatures['pos']
    return {position: combineFeatures[posColumn == position] for position in positions}

In [10]:
# position -> DataFrame holding only that position's players
positionMaps = get_position_maps()

In [11]:
# build X matrix and Y values
def build_data_arrays():
    """Assemble per-position feature rows and targets from ``positionMaps``.

    For every player the feature row is everything after the first three
    columns (name, pos, year) and the target is the player's entry in
    ``draftValues``. Players whose year equals ``TEST_YEAR`` go to the test
    split (and have their names recorded); all others go to the train split.

    Returns
    -------
    X, Y, xTrain, yTrain, xTest, yTest, names : defaultdict(list)
        Each is keyed by position. X/Y hold every player, the train/test pairs
        hold the respective split, and ``names`` lists the test-split player
        names in the same order as xTest/yTest.
    """
    X, Y = defaultdict(list), defaultdict(list)
    xTrain, yTrain = defaultdict(list), defaultdict(list)
    xTest, yTest = defaultdict(list), defaultdict(list)
    names = defaultdict(list)

    for position, frame in positionMaps.items():
        for _, player in frame.iterrows():
            featureRow = player[3:]
            target = draftValues[(player['name'], player['year'])]

            X[position].append(featureRow)
            Y[position].append(target)

            if player['year'] == TEST_YEAR:
                xTest[position].append(featureRow)
                yTest[position].append(target)
                names[position].append(player['name'])
            else:
                xTrain[position].append(featureRow)
                yTrain[position].append(target)

    return X, Y, xTrain, yTrain, xTest, yTest, names

In [12]:
# Per-position feature rows / targets: full set, train split, test split, and test-split player names.
X, Y, xTrain, yTrain, xTest, yTest, names = build_data_arrays()

In [13]:
# Seed NumPy's global random number generator; the same seed is passed to KFold below.
seed = 7
np.random.seed(seed)
# Single linear-regression estimator shared by all positions; it is re-fitted
# for each position before being used.
# NOTE(review): the `normalize` argument is deprecated/removed in newer
# scikit-learn releases -- confirm against the installed version before upgrading.
predictor = linear_model.LinearRegression(fit_intercept=True, normalize=shouldNormalize, copy_X=True, n_jobs=1)

In [14]:
numFolds = 10
# shuffle=True is required for random_state to have any effect; without it the
# seed is ignored by older scikit-learn releases and rejected by newer ones.
kFold = KFold(n_splits=numFolds, shuffle=True, random_state=seed)
def eval_data():
    """Cross-validate ``predictor`` separately for every position.

    Positions with no more than ``numFolds`` samples are reported and skipped.

    Returns
    -------
    dict
        position -> (mean, std) of the per-fold mean squared error, reported
        as a positive quantity.
    """
    errorMap = {}
    for p in positions:
        sampleCount = len(X[p])
        print('{} {}'.format(p, sampleCount))
        if sampleCount > numFolds:
            # The scorer is named 'neg_mean_squared_error' (the un-prefixed
            # 'mean_squared_error' name was removed from scikit-learn) and
            # returns the NEGATED MSE so that "greater is better".
            negatedMse = cross_val_score(predictor, np.array(X[p]), np.array(Y[p]),
                                         cv=kFold, scoring='neg_mean_squared_error')
            # Flip the sign so the reported numbers match the printed label.
            results = -negatedMse
            errorMap[p] = (results.mean(), results.std())
            print('Results[{}]: mean={} (std={}) Mean Squared Error'.format(p, results.mean(), results.std()))
        else:
            print('Position: {} has too little data ({} row(s))'.format(p, sampleCount))
    return errorMap

In [15]:
def get_player_draft(playerName, year, draftPositions):
    """Return the pick recorded for (playerName, year), or -1 if there is none."""
    return draftPositions.get((playerName, year), -1)

In [16]:
# errors = eval_data()

In [17]:
# errors

In [18]:
# Map players to their true and predicted draft positions
def get_relative_error(results, test_year, draft_positions=None):
    """Mean absolute difference between predicted and true draft ORDER.

    ``results`` must already be sorted best-first: a player's predicted rank is
    its row position (0-based). The true rank is the player's position after
    sorting all players in ``results`` by actual pick number (ties broken by
    name). The per-player absolute rank difference is written to a new
    'error' column of ``results`` (in place, for visualisation).

    Parameters
    ----------
    results : pandas.DataFrame
        Must contain a 'name' column. Modified in place ('error' column).
    test_year : int
        Year used together with the name to look up the true pick.
    draft_positions : dict, optional
        (name, year) -> pick. Defaults to the notebook-level
        ``draftPositions`` so existing two-argument calls keep working.

    Returns
    -------
    float
        Sum of the per-player errors divided by the number of rows; 0.0 for an
        empty frame.

    Raises
    ------
    KeyError
        If a (name, test_year) pair is missing from ``draft_positions``.
    """
    if draft_positions is None:
        draft_positions = draftPositions

    playerNames = list(results['name'])
    if not playerNames:
        # Nothing to rank; still create the column so callers see a uniform schema.
        results['error'] = 0.0
        return 0.0

    trueDraftMap = {}       # true pick number
    predictedDraftMap = {}  # relative pick based on regression
    for predictedRank, playerName in enumerate(playerNames):
        trueDraftMap[playerName] = draft_positions[(playerName, test_year)]  # need primary key to get draft positions
        predictedDraftMap[playerName] = predictedRank

    # Compute absolute_error based on relative draft positions
    errors = {}
    absolute_error = 0.0
    byTruePick = sorted(trueDraftMap, key=lambda playerName: (trueDraftMap[playerName], playerName))
    for trueRank, playerName in enumerate(byTruePick):
        errors[playerName] = math.fabs(trueRank - predictedDraftMap[playerName])
        absolute_error += errors[playerName]

    # Add the errors to the results DataFrame (for visualization) in a single
    # positional column assignment rather than one .at write per row.
    results['error'] = [errors[playerName] for playerName in playerNames]

    # average the error by the total number of players
    return absolute_error / float(len(playerNames))

In [19]:
# Fit the shared regression once per position on the training split, predict
# the test split, and collect the per-position artefacts:
#   coefficients[p]  - features sorted by fitted coefficient (largest first)
#   output[p]        - test players sorted by predicted value (best first)
#   scores[p]        - (mean squared error, R^2) on the test split
#   relativeError[p] - mean absolute rank error from get_relative_error
scores = {}
relativeError = {}
coefficients = {}
output = {}
for p in positions:
    # Need more than one sample on both sides of the split.
    if len(xTrain[p]) <= 1 or len(xTest[p]) <= 1:
        print('Not enough data for position: {}'.format(p))
        continue

    predictor.fit(np.array(xTrain[p]), np.array(yTrain[p]))
    coefficientTable = pd.DataFrame(list(zip(features, predictor.coef_)), columns=['feature', 'coefficient'])
    coefficients[p] = coefficientTable.sort_values(by=['coefficient'], ascending=False)

    prediction = predictor.predict(np.array(xTest[p]))
    predictedValues = pd.DataFrame(list(zip(names[p], prediction)), columns=['name', 'value'])
    output[p] = predictedValues.sort_values(by=['value'], ascending=False)

    actualValues = np.array(yTest[p])
    scores[p] = (mean_squared_error(actualValues, np.array(prediction)),
                 r2_score(actualValues, np.array(prediction)))
    relativeError[p] = get_relative_error(output[p], TEST_YEAR)



Not enough data for position: LS
Not enough data for position: K
Not enough data for position: P


In [20]:
# Per-team positional need from KNNLib; consumed by determine_pick(), which
# indexes it as team_probability_matrix[team][position].
team_probability_matrix = get_draft_probability_matrix()



In [44]:
def determine_pick(team, team_need, player_rankings, position_likelihood):
    """Choose the player `team` should draft next.

    For every position the team has a need entry for, the best remaining
    player at that position is scored as

        (team_need[team][position] + 0.1) * position_likelihood[position] * value

    and the highest strictly-positive score wins (first one wins on ties).

    Parameters
    ----------
    team : hashable
        Key into ``team_need``.
    team_need : dict
        team -> {position -> need weight}.
    player_rankings : dict
        position -> DataFrame whose first column is the player name and whose
        second column is the predicted value. Not modified; the caller is
        responsible for dropping the returned index label afterwards.
    position_likelihood : dict
        position -> weight for the current pick.

    Returns
    -------
    (best_player, best_pos, idx_to_remove)
        Player name, position, and the index label of that player's row in
        ``player_rankings[best_pos]``.

    Raises
    ------
    KeyError
        If ``team`` is unknown, a candidate position is missing from
        ``position_likelihood``, or no candidate has a positive score.
    """
    best_player, best_pos, idx_to_remove = None, None, None
    best_score = 0

    for position, need in team_need[team].items():
        # Rank from the table that was passed in, not from a notebook global.
        if position not in player_rankings:
            continue
        candidates = player_rankings[position]
        if len(candidates) == 0:
            continue  # every player at this position has already been taken

        # Column 0 is the name, column 1 the predicted value; use explicit
        # positional access rather than integer keys on a label-indexed Series.
        values = np.asarray(candidates.iloc[:, 1], dtype=float)
        if np.isnan(values).all():
            continue
        top = int(np.nanargmax(values))  # first occurrence of the maximum
        max_score = values[top]

        # +0.1 offset on the need weight -- presumably smoothing so a zero
        # need does not zero out the score; TODO confirm.
        position_probability = need + 0.1
        score = (position_probability * position_likelihood[position]) * max_score

        if score > best_score:
            best_score = score
            best_pos = position
            idx_to_remove = candidates.index[top]
            best_player = candidates.iloc[top, 0]

    if best_pos is None:
        # Previously surfaced as an opaque `KeyError: None`.
        raise KeyError('no draftable player with a positive score for team {!r}'.format(team))

    return best_player, best_pos, idx_to_remove
        
        

In [45]:
# Re-fit and re-predict for every position. This repeats the earlier fitting
# cell so that `output` is rebuilt from scratch each time this cell runs: the
# draft simulation further down removes drafted players from `output`.
scores = {}
relativeError = {}
coefficients = {}
output = {}
for p in positions:
    # Need more than one sample on both sides of the split.
    if len(xTrain[p]) <= 1 or len(xTest[p]) <= 1:
        print('Not enough data for position: {}'.format(p))
        continue

    predictor.fit(np.array(xTrain[p]), np.array(yTrain[p]))
    coefficientTable = pd.DataFrame(list(zip(features, predictor.coef_)), columns=['feature', 'coefficient'])
    coefficients[p] = coefficientTable.sort_values(by=['coefficient'], ascending=False)

    prediction = predictor.predict(np.array(xTest[p]))
    predictedValues = pd.DataFrame(list(zip(names[p], prediction)), columns=['name', 'value'])
    output[p] = predictedValues.sort_values(by=['value'], ascending=False)

    actualValues = np.array(yTest[p])
    scores[p] = (mean_squared_error(actualValues, np.array(prediction)),
                 r2_score(actualValues, np.array(prediction)))
    relativeError[p] = get_relative_error(output[p], TEST_YEAR)
# Team abbreviations in pick order (32 picks); a team appears once per pick it
# holds, so CLE and NOR are listed twice.
draft_order = [
    'TAM', 'TEN', 'JAX', 'OAK', 'WAS', 'NYJ', 'CHI', 'ATL',   # picks 1-8
    'NYG', 'STL', 'MIN', 'CLE', 'NOR', 'MIA', 'SFO', 'HOU',   # picks 9-16
    'SDG', 'KAN', 'CLE', 'PHI', 'CIN', 'PIT', 'DET', 'ARI',   # picks 17-24
    'CAR', 'BAL', 'DAL', 'DEN', 'IND', 'GNB', 'NOR', 'NWE',   # picks 25-32
]

# Abbreviation -> hyphenated team name used as the team key downstream.
teams = {
    'PIT': 'pittsburgh-steelers',
    'CIN': 'cincinnati-bengals',
    'BAL': 'baltimore-ravens',
    'CLE': 'cleveland-browns',
    'NWE': 'new-england-patriots',
    'BUF': 'buffalo-bills',
    'MIA': 'miami-dolphins',
    'NYJ': 'new-york-jets',
    'TEN': 'tennessee-titans',
    'HOU': 'houston-texans',
    'IND': 'indianapolis-colts',
    'JAX': 'jacksonville-jaguars',
    'KAN': 'kansas-city-chiefs',
    'OAK': 'oakland-raiders',
    'DEN': 'denver-broncos',
    'SDG': 'san-diego-chargers',
    'GNB': 'green-bay-packers',
    'MIN': 'minnesota-vikings',
    'DET': 'detroit-lions',
    'CHI': 'chicago-bears',
    'DAL': 'dallas-cowboys',
    'NYG': 'new-york-giants',
    'PHI': 'philadelphia-eagles',
    'WAS': 'washington-redskins',
    'CAR': 'carolina-panthers',
    'ATL': 'atlanta-falcons',
    'NOR': 'new-orleans-saints',
    'TAM': 'tampa-bay-buccaneers',
    'SEA': 'seattle-seahawks',
    'ARI': 'arizona-cardinals',
    'STL': 'st.-louis-rams',
    'SFO': 'san-francisco-49ers',
}

# Full team name for each pick, in pick order.
draft = [teams[abbr] for abbr in draft_order]

# Simulate the picks in `draft` order and compare each simulated pick number
# with the pick at which the chosen player was actually taken.
avg_error = 0  # running SUM of per-pick errors; averaged only when printed
# Indexed below by 1-based pick number -- presumably pick -> {position: likelihood}; verify against KNNLib.
position_likelihood = positionMLE('data/draftpositions.train.csv')
for i, team in enumerate(draft):
    pick_number = i + 1
    player, pos, idx = determine_pick(team, team_probability_matrix, output, position_likelihood[pick_number])
    # Remove the drafted player so later picks cannot select him again.
    output[pos] = output[pos].drop(idx)
    # Look the player up in the held-out year (the candidates in `output` are
    # TEST_YEAR players) instead of a hard-coded literal.
    # get_player_draft returns -1 when the player has no recorded pick.
    true_result = get_player_draft(player, TEST_YEAR, draftPositions)
    error = abs(pick_number - true_result)
    # Same layout as the original comma-separated print statement.
    print('{}   \t\t{} {} {} \t{} \t{} \t{}'.format(
        ' '.join(team.split('-')), player, ' ' * (20 - len(player)), pos,
        pick_number, true_result, error))
    avg_error += error
if draft:
    # Divide by the real number of picks, as a float: `avg_error / 32` floors
    # under Python 2 when the errors are integers.
    print(avg_error / float(len(draft)))

Not enough data for position: LS
Not enough data for position: K
Not enough data for position: P
tampa bay buccaneers   		MarcusMariota         QB 	1 	2 	1
tennessee titans   		CoreyRobinson         T 	2 	240 	238
jacksonville jaguars   		TyrusThompson         T 	3 	185 	182
oakland raiders   		ByronJones            DB 	4 	27 	23
washington redskins   		ChristianCovington    DT 	5 	216 	211
new york jets   		XavierCooper          DT 	6 	96 	90
chicago bears   		RonaldDarby           DB 	7 	50 	43
atlanta falcons   		CraigMager            DB 	8 	83 	75
new york giants   		EricRowe              DB 	9 	47 	38
st. louis rams   		TraeWaynes            DB 	10 	11 	1
minnesota vikings   		MarcusHardison        DE 	11 	135 	124
cleveland browns   		JoshShaw              DB 	12 	120 	108
new orleans saints   		KevinJohnson          DB 	13 	16 	3
miami dolphins   		EddieGoldman          DT 	14 	39 	25
san francisco 49ers   		BobbyMcCain           DB 	15 	145 	130
houston texans   		AlexCarter   