In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None

Load data from pickle

In [2]:
all_players = pd.read_pickle("all_players.pkl")
reg_players = pd.read_pickle("reg_players.pkl")
hof_players = pd.read_pickle("hof_players.pkl")

In [3]:
#Helper function to convert height from a readable format to inches
def readable_to_inches(x):
    if x == 0:
        return
    inches = str(x).split('-')
    return (int(inches[0])*12)+int(inches[1])

all_players["Ht"] = all_players["Ht"].apply(readable_to_inches)
reg_players["Ht"] = reg_players["Ht"].apply(readable_to_inches)
hof_players["Ht"] = hof_players["Ht"].apply(readable_to_inches)

Clean up the data a bit by dropping positions, birth dates, colleges and their player page URL. I'll also fill in NaNs with 0s and shift the player name to be the index so all the columns will be quantitative and have a value.

In [4]:
all_players.drop(["Pos","Birth Date","Colleges","URL"],axis=1,inplace=True)
reg_players.drop(["Pos","Birth Date","Colleges","URL"],axis=1,inplace=True)
hof_players.drop(["Pos","Birth Date","Colleges","URL"],axis=1,inplace=True)
all_players.set_index('Player',inplace=True)
reg_players.set_index('Player',inplace=True)
hof_players.set_index('Player',inplace=True)
all_players.fillna(0,inplace=True)
reg_players.fillna(0,inplace=True)
hof_players.fillna(0,inplace=True)

Describe the dataset to get the average, standard deviation and max across all players. This is useful information to see the average of the career averages of all the players who've ever played in the NBA.

In [5]:
print len(all_players)
all_players.describe().loc[['mean','std','max']]

4580


Unnamed: 0,From,To,Ht,Wt,Years of Service,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Teams
mean,1985.291921,1989.475983,78.00262,208.621616,4.184061,275.614629,91.408079,15.439913,2.468013,5.83917,0.405636,0.211026,0.643144,0.152528,1.708472,3.691856,0.33453,0.338043,1.303712,1.816769,0.666403,0.737511,1.617751,2.878843,1.424956,0.415284,0.239345,0.854869,1.815022,6.446638,2.681659
std,21.073436,21.94476,3.820211,27.291328,4.498959,310.838927,201.040494,9.700809,1.810298,3.873235,0.10927,0.402833,1.105978,0.171514,1.794362,3.706936,0.213343,0.214088,1.11781,1.441973,0.196767,0.766755,1.605284,2.380207,1.364652,0.428248,0.365912,0.773852,0.845626,4.748298,1.943505
max,2018.0,2018.0,91.0,360.0,22.0,1611.0,1471.0,45.8,12.1,23.8,1.0,3.4,7.8,1.0,10.8,21.3,1.0,1.5,7.8,11.4,1.0,5.1,10.4,22.9,11.2,2.7,3.5,4.4,5.0,30.1,13.0


Get the same stats, but this time for separating between HOF and non-HOF players.

In [6]:
print len(reg_players)
reg_players.describe().loc[['mean','std','max']]

4440


Unnamed: 0,From,To,Ht,Wt,Years of Service,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Teams,AllStars,MVPs
mean,1985.810135,1989.769144,77.998198,208.647748,3.959009,257.868468,85.361036,14.913333,2.344167,5.584054,0.404084,0.212185,0.646959,0.15338,1.645878,3.573536,0.336556,0.340149,1.218784,1.70786,0.663525,0.721036,1.571351,2.737432,1.356532,0.404752,0.230248,0.83545,1.780608,6.115383,2.675901,0.228153,0.003153
std,21.041313,22.024369,3.81582,27.296752,4.324955,293.754773,186.44814,9.311707,1.653524,3.57795,0.110185,0.403022,1.107211,0.172035,1.664794,3.463517,0.211788,0.212505,0.991077,1.280241,0.198728,0.739985,1.522384,2.158248,1.279941,0.412261,0.3439,0.738876,0.829725,4.327541,1.951462,1.021936,0.082148
max,2018.0,2018.0,91.0,360.0,22.0,1471.0,1440.0,39.0,9.9,19.6,1.0,3.4,7.8,1.0,8.5,17.0,1.0,1.5,7.1,9.1,1.0,4.7,9.1,13.4,9.8,2.7,3.5,4.0,5.0,27.2,13.0,18.0,4.0


In [7]:
print len(hof_players)
hof_players.describe().loc[['mean','std','max']]

140


Unnamed: 0,From,To,Ht,Wt,Years of Service,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Teams,AllStars,MVPs
mean,1968.857143,1980.178571,78.142857,207.792857,11.321429,838.421429,283.185714,32.14,6.395714,13.93,0.454829,0.174286,0.522143,0.125493,3.693571,7.444286,0.270286,0.271257,3.997143,5.270714,0.757664,1.26,3.089286,7.363571,3.595,0.749286,0.527857,1.470714,2.906429,16.952143,2.864286,6.3,0.407143
std,14.486695,16.835757,3.96853,27.203072,4.061165,313.184389,427.867434,6.424714,2.148276,4.166908,0.05566,0.396454,1.06285,0.152188,3.669066,7.412941,0.250321,0.251175,1.502178,1.939053,0.074647,1.263591,2.937005,4.120666,2.034074,0.70563,0.73983,1.377961,0.577418,5.476838,1.667336,4.166844,1.031144
max,2003.0,2014.0,90.0,325.0,20.0,1611.0,1471.0,45.8,12.1,23.8,0.582,2.3,5.7,0.667,10.8,21.3,0.583,0.582,7.8,11.4,0.904,5.1,10.4,22.9,11.2,2.6,3.1,4.4,4.2,30.1,9.0,19.0,6.0


Preliminary analysis on the average of the players' career averagers tells us a few things. The first and the most obvious is that having a HOF selection is afforded to a very select few: only the best. Only 140 out of 4580 total players are currently voted in, or 3.1%. The NBA started inducting in 1959 - 59 years ago - so about 5 players can be expected to be inducted every 2 years. Of course, the amount of people inducted per year has risen and some NBA players from decades ago are retroactively inducted.

The second thing we can take away from this just how far out of the pack the averages of the HOF players are. They average close to 7 more years in the league(which makes sense since they also average about 580 more games given that there are 82 games in a season), and close to either double or triple the stats in every other category. 

The third thing we can see is the MVP count, a HOF averages 0.4 MVPs, while a non-HOF averages 0.003, and while a player doesn't need and MVP to be a HOF, it usually helps. So, who are the players with MVPs that *aren't* currently in the HOF?

In [8]:
for player in reg_players[reg_players['MVPs'] > 0].index:
    print player

Kobe Bryant
Stephen Curry
Tim Duncan
Kevin Durant
Kevin Garnett
LeBron James
Dirk Nowitzki
Derrick Rose
Russell Westbrook


These players here are either still active or not yet eligible to be nominated, so moving on...

HOF players were the best players, but they're still human, and that's evident in some of the stats off the court. Players' height and weight are pretty similar HOF-caliber or not, and they also play for around the same amount of distinct teams. So whether a player is a career journeyman(taking a trip through ~3 teams) or a team's stalwart cornerstone, they're probably about 6'6", 208lbs and had a career lasting ~11 years.

Mimicking basketball-reference, I will run the data through a logistic regression to try and predict future HOF-ers given their current stats. The training data will be all players from 1946, the year the league was formed, to 2015 (a player must be retired for 3 full seasons to be eligible for voting. At the time of me writing this, it is June of 2018, so 3 full seasons since 2015 has already passed.)

*NOTE*: Some of the players here, expecially the ones with a lower amount of years of service may have been enshrined due to their coaching prowess, and not their playing abilities. However, these guys still played professional basketball at some point in the NBA, so their statistics will not be removed and will be included in the model.

In [9]:
training_data = pd.concat([reg_players[reg_players['To'] <= 2015],hof_players[hof_players['To'] <= 2015]])
training_data.drop_duplicates(inplace=True)
training_data_y = np.append(np.zeros(len(reg_players[reg_players['To'] <= 2015])),(np.ones(len(hof_players[hof_players['To'] <= 2015]))))

test_data = pd.concat([reg_players[reg_players['To'] > 2015],hof_players[hof_players['To'] > 2015]])
test_data.drop_duplicates(inplace=True)

print "Training: %s, Testing: %s"%(training_data.shape[0],test_data.shape[0])
print "Total: %s"%(training_data.shape[0]+test_data.shape[0])

Training: 3867, Testing: 713
Total: 4580


3867 training and 713 testing samples are by no means shabby. Most models use 80-20 distribution, but a 85-15 will still yield good results.

In [10]:
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

There are currently 33 columns that needs to be considered by the model. This is too many, so I'm going to try to cut it down a bit.

In [11]:
logreg = LogisticRegression()
rfe = RFE(logreg)
rfe = rfe.fit(training_data, training_data_y)
print sorted(zip(training_data.columns,rfe.support_, rfe.ranking_),key=lambda x:x[2])

[('Years of Service', True, 1), ('3PA', True, 1), ('3P%', True, 1), ('2P%', True, 1), ('eFG%', True, 1), ('FT', True, 1), ('FTA', True, 1), ('ORB', True, 1), ('TRB', True, 1), ('AST', True, 1), ('STL', True, 1), ('BLK', True, 1), ('TOV', True, 1), ('Teams', True, 1), ('AllStars', True, 1), ('MVPs', True, 1), ('PF', False, 2), (u'From', False, 3), (u'To', False, 4), ('FGA', False, 5), ('2P', False, 6), ('3P', False, 7), ('FT%', False, 8), ('2PA', False, 9), (u'Wt', False, 10), ('DRB', False, 11), ('FG%', False, 12), ('MP', False, 13), ('FG', False, 14), ('PTS', False, 15), (u'Ht', False, 16), ('G', False, 17), ('GS', False, 18)]


According to the Recursive Feature Elimination, the features most likely to affect the outcome of a HOF player is the following:
- Years of Service
- 3PA
- 3P%
- 2P%
- eFG%
- FT
- FTA
- ORB
- TRB
- AST
- STL
- BLK
- TOV
- Team
- AllStars
- MVPs

This list kind of makes sense, as players that stay in the league longer tend to have a better game. Players also don't-or rather can't-stay around if they're bad either. Shooting percentages are very important and shows how effective a player is. Rebounds-especially offensive rebounds-show grit and hustle, while assists show how much of a team player a player is. Steals and blocks are important too since some players are defensive beasts while being not so gifted on the offensive side. A lower turnover amount means the player is efficient with the possession of the ball. The amount of teams played means either the player was so good or loyal that they weren't traded, or they were good that they were highly sought after. Both explanations are pretty reasonable. Lastly, AllStar selections and MVPs selections are both based off merit. This list generally makes sense, and it shortens down the list from 33 features to about half, at 16. 

The odd item on the list that I didn't mention above is 3PA, since some players are just _not_ 3pt shooters, and the 3pt line was added later as well. This stat should not be considered as important. FT and FTA are also considered but not FT%, although this might make some sense because free throws are free points and getting to the free throw line can be considered a skill. Clutch free throws are also very important to winning close games.

The rest of the features not included isn't suprising since they can either be derived (games played/started is directly related to years played) or they don't matter much since they're close to the average of all players (height and weight). One feature that I'm really shocked to see did not make the list however, is points, since you literally cannot win a game without them.

Select the columns given by RFE for both training and testing dataframes

In [12]:
training_data_x = training_data[['Years of Service', '3PA', '3P%', '2P%', 'eFG%', 'FT', 'FTA', 'ORB', 'TRB', \
                                  'AST', 'STL', 'BLK', 'TOV', 'Teams', 'AllStars', 'MVPs']]
test_data_x = test_data[['Years of Service', '3PA', '3P%', '2P%', 'eFG%', 'FT', 'FTA', 'ORB', 'TRB', \
                                  'AST', 'STL', 'BLK', 'TOV', 'Teams', 'AllStars', 'MVPs']]

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(training_data_x, training_data_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
test_data_x["HOF?"] = logreg.predict(test_data_x)
print len(test_data_x[test_data_x["HOF?"] == 1])
for player in test_data_x[test_data_x["HOF?"] == 1].index.tolist():
    print player

21
LaMarcus Aldridge
Carmelo Anthony
Chris Bosh
Kobe Bryant
Stephen Curry
Tim Duncan
Kevin Durant
Kevin Garnett
Pau Gasol
Blake Griffin
James Harden
Dwight Howard
LeBron James
Dirk Nowitzki
Tony Parker
Chris Paul
Paul Pierce
Amar'e Stoudemire
Dwyane Wade
John Wall
Russell Westbrook


This list seems somewhat short, with only 21 of the possible 713 eligible players having a 100% chance. (100% because the predict function only returns a binary value.) To add to that, 21/713 is 2.9%, which isn't that far off from the current HOF percentage of 3.1%. All of these players are also known for high levels of play and have either won MVPs and championships were close to.

Reset for the probabilities

In [15]:
test_data_x = test_data[['Years of Service', '3PA', '3P%', '2P%', 'eFG%', 'FT', 'FTA', 'ORB', 'TRB', \
                                  'AST', 'STL', 'BLK', 'TOV', 'Teams', 'AllStars', 'MVPs']]

test_data_x["HOF?"] = [x[1] for x in logreg.predict_proba(test_data_x)]
print test_data_x[test_data_x["HOF?"] > 0.95]["HOF?"].sort_values(ascending=False)

Player
LeBron James         0.999999
Tim Duncan           0.999994
Kobe Bryant          0.999991
Kevin Garnett        0.999856
Dirk Nowitzki        0.999740
Dwyane Wade          0.994220
Russell Westbrook    0.990159
Kevin Durant         0.986226
Chris Bosh           0.980685
Paul Pierce          0.960214
Carmelo Anthony      0.955126
Name: HOF?, dtype: float64


So far, this model has been pretty consistent in the results, and matches wih the basketballreference model results well. Some of the player's probability rankings are not the same, but the results are similar. There are outliers, however, Vince Carter Chris Paul, and James Harden  does not appear in my model's top 10, but they do on the one for basketballreference.

In [16]:
print test_data_x[test_data_x["HOF?"] >= 0.5]["HOF?"].sort_values(ascending=False)

Player
LeBron James         0.999999
Tim Duncan           0.999994
Kobe Bryant          0.999991
Kevin Garnett        0.999856
Dirk Nowitzki        0.999740
Dwyane Wade          0.994220
Russell Westbrook    0.990159
Kevin Durant         0.986226
Chris Bosh           0.980685
Paul Pierce          0.960214
Carmelo Anthony      0.955126
Stephen Curry        0.933087
Chris Paul           0.903754
Dwight Howard        0.886409
Tony Parker          0.877733
John Wall            0.770405
Pau Gasol            0.763726
James Harden         0.720776
Blake Griffin        0.598244
LaMarcus Aldridge    0.577846
Amar'e Stoudemire    0.570620
Name: HOF?, dtype: float64


Lowering the threshold to 0.5, which is were the predict function will round up to the binary 1, we get back the list of 21 players. Vince Carter is still not on the list, but the other 2 makes their appearances. Another oddity that I've noticed is that at rank 21, Amare'e Stoudemire has a 57% chance of being a HOF, while for basketballreference, LeMarcus Aldridge only has a 21% chance. Player name aside, it does seem that my model has a higher average % chance of players making it to the HOF.

Lastly, let's investigate Vince Carter's HOF chances using my model.

In [17]:
print test_data_x.ix["Vince Carter"]["HOF?"]

0.485976892722


Seems like he just barely missed the 0.5 mark used for rounding, let's see if here are other players who barely missed as well.

In [18]:
print test_data_x[(test_data_x["HOF?"] >= 0.45) & (test_data_x["HOF?"] < 0.5)]["HOF?"].sort_values(ascending=False)

Player
Vince Carter        0.485977
DeMarcus Cousins    0.479510
Name: HOF?, dtype: float64


Cousins also barely missed the cut, this model has him at 48%, while basektballreference has him at around 1.3%. Only time will tell now...

Before I wrap this up, I'm going to tweak the model with what I think are the best metrics for evaluating a player's HOF-worthiness. This includes taking out the 3PA and putting in PTs. I'll also throw in FG%.

In [19]:
training_data_x = training_data[['Years of Service', '3PA', 'PTS', '3P%', '2P%', 'eFG%', 'FT', 'FTA', 'ORB', 'TRB', \
                                  'AST', 'STL', 'BLK', 'TOV', 'Teams', 'AllStars', 'MVPs']]
test_data_x = test_data[['Years of Service', '3PA', 'PTS', '3P%', '2P%', 'eFG%', 'FT', 'FTA', 'ORB', 'TRB', \
                                  'AST', 'STL', 'BLK', 'TOV', 'Teams', 'AllStars', 'MVPs']]

logreg = LogisticRegression()
logreg.fit(training_data_x, training_data_y)

test_data_x["HOF?"] = [x[1] for x in logreg.predict_proba(test_data_x)]
print test_data_x[test_data_x["HOF?"] > 0.95]["HOF?"].sort_values(ascending=False)

Player
LeBron James         0.999999
Tim Duncan           0.999994
Kobe Bryant          0.999989
Kevin Garnett        0.999846
Dirk Nowitzki        0.999727
Dwyane Wade          0.993786
Russell Westbrook    0.989357
Kevin Durant         0.986331
Chris Bosh           0.979359
Paul Pierce          0.955146
Carmelo Anthony      0.955081
Name: HOF?, dtype: float64


Seems like the RFE was correct, since the results of those with 95%+ are the same.