In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import resample
import numpy as np
from scipy import spatial
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score as f1s
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold

## Read in Data and Aggregate by player

In [33]:
df = pd.read_csv('Seasons_Stats.csv')
df.columns

Index(['Unnamed: 0', 'Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP',
       'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
       'BLK%', 'TOV%', 'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'WS/48', 'blank2',
       'OBPM', 'DBPM', 'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
       '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [34]:
## select relevant columns and fill NA with 0
## remove blank rows and rows representing total counts for a traded player mid season (will be combined later on)
df_new = df[['Year','Player','Tm','G','WS','3P','TRB','AST','STL','BLK','PTS']].fillna(0).query('Year != 0 & Tm != "TOT"')

In [35]:
## create HOF class - whether a player has an asterisk in the name
df_new['HOF'] = df_new.Player.str.contains('\\*',regex=True).astype(int)

## CALL READ IN DATA

In [41]:
## create player aggregated data
df_agg = df_new.groupby('Player').agg({'G':'sum', 'Year':'nunique', 'WS':'max','3P':'sum','TRB':'sum','AST':'sum',
                                      'STL':'sum','BLK':'sum','PTS':'sum', 'HOF':'max'}).reset_index()

In [42]:
df_agg.HOF.value_counts()

0    3793
1     127
Name: HOF, dtype: int64

### cleanup and exploration

# create data set for the model

In [44]:
## only keep players who have played at least 8 seasons
df_agg = df_agg[(df_agg['Year'] >=7)]

df_agg.shape

(1267, 11)

In [38]:
df_agg.HOF.value_counts()

0    1156
1     111
Name: HOF, dtype: int64

In [45]:
# find the most recent season that a player has played
df_maxyear = df_new.groupby('Player').agg({'Year':'max'}).reset_index()

In [46]:
## add most recent season to the data
df_agg = pd.merge(df_agg,df_maxyear,on='Player')

In [47]:
# calculate stats per game
for i in range(4,10):
    df_agg.iloc[:,i] = df_agg.iloc[:,i]/df_agg.iloc[:,1]
    i+=1


## CALL AGG PLAYER

In [15]:
## create df for model -- only players who would be legible for HOF in 2016, i.e. last season before 2012
df_model = df_agg[df_agg['Year_y'] <= 2011].drop(['Year_y'],axis=1)
df_non_model = df_agg[df_agg['Year_y'] > 2011]

In [16]:
df_model.head()

Unnamed: 0,Player,G,Year_x,WS,3P,TRB,AST,STL,BLK,PTS,HOF
0,A.C. Green,1278.0,16,9.4,0.097027,7.412363,1.095462,0.808294,0.42723,9.64867,0
3,Aaron McKie,793.0,13,5.9,0.489281,3.262295,2.680958,1.150063,0.20681,7.403531,0
4,Aaron Williams,715.0,14,5.2,0.001399,3.902098,0.678322,0.394406,0.753846,5.762238,0
5,Adam Keefe,617.0,9,6.5,0.001621,4.142626,0.692058,0.497569,0.280389,5.035656,0
6,Adonal Foyle,733.0,12,4.0,0.0,4.721692,0.469304,0.360164,1.627558,4.077763,0


In [None]:
## select relevant fields that go into model fitting
df_model_fit = df_model[['Year_x','WS','3P','TRB','AST','STL','BLK','PTS','HOF','Player']]
df_non_model_fit = df_non_model[['Year_x','WS','3P','TRB','AST','STL','BLK','PTS','HOF','Player']]

# Model Fitting

## Logistic Regression

In [25]:
def getIndices(K):
    kf = KFold(n_splits=K, shuffle=True)
    train_ind = []
    test_ind = []

    for train_index, test_index in kf.split(df_model_fit):
        train_ind.append(train_index)
        test_ind.append(test_index)
    return train_ind, test_ind

In [30]:
### logistic regression CV

cv= RepeatedKFold(n_splits=10,n_repeats=10,random_state=132)
myscore = make_scorer(f1s,average = 'macro',labels=[1])

sc = []

log = LogisticRegression()
scores = cross_validate(log,df_model_fit.drop(['Player','HOF'],axis=1), df_model_fit.HOF, scoring=myscore,cv=cv, return_train_score=False)
sc.append(scores['test_score'])


In [31]:
np.mean(sc)

0.6927773245027656

In [157]:
logreg = LogisticRegression()
logreg.fit(df_model_fit.drop(['Player','HOF'],axis=1), df_model_fit.HOF)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [158]:
y_pred = logreg.predict(df_model_fit.drop(['Player','HOF'],axis=1))

### function to output prob of HOF

In [160]:
def log_predict(player):
    d = df_non_model_fit[df_non_model_fit['Player'] == player]
    pred = logreg.predict(d.drop(['Player','HOF'],axis=1))
    pred_prob = logreg.predict_proba(d.drop(['Player','HOF'],axis=1))
    return pred, pred_prob
    

In [161]:
log_predict("Blake Griffin")

(array([1]), array([[0.17835399, 0.82164601]]))

In [162]:
## print HOF players
for pl in df_non_model['Player']:
    if log_predict(pl)[0] == 1:
        print(pl + " " + str(log_predict(pl)))

Amar'e Stoudemire (array([1]), array([[0.38186754, 0.61813246]]))
Blake Griffin (array([1]), array([[0.17835399, 0.82164601]]))
Chris Paul (array([1]), array([[0.23981588, 0.76018412]]))
Deron Williams (array([1]), array([[0.46998113, 0.53001887]]))
Derrick Rose (array([1]), array([[0.18553422, 0.81446578]]))
Dirk Nowitzki (array([1]), array([[0.24445321, 0.75554679]]))
Dwight Howard (array([1]), array([[0.30193746, 0.69806254]]))
Dwyane Wade (array([1]), array([[0.29873309, 0.70126691]]))
Elton Brand (array([1]), array([[0.47525768, 0.52474232]]))
James Harden (array([1]), array([[0.2098263, 0.7901737]]))
John Wall (array([1]), array([[0.48161149, 0.51838851]]))
Kevin Durant (array([1]), array([[0.04065112, 0.95934888]]))
Kevin Garnett (array([1]), array([[0.27870852, 0.72129148]]))
Kevin Love (array([1]), array([[0.2949604, 0.7050396]]))
Kobe Bryant (array([1]), array([[0.43863303, 0.56136697]]))
LeBron James (array([1]), array([[0.02829758, 0.97170242]]))
Pau Gasol (array([1]), arra

## Random Forest

### tune the random forest

In [27]:
#lambdas = [1,.9,.8,.7,.6,.5,.4,.3,.2,.1]
#n_tree = [100,80,60,40,20]
#max_depth = [2,3,4,5,6,7]


max_features = ['auto', 'sqrt',None]
n_tree = [50,100,150,200]
min_samples_leaf = [1,2,3,4,5]


param_grid = {'n_estimators': n_tree,
               'min_samples_leaf': min_samples_leaf,
               'max_features': max_features}

myscore = make_scorer(f1s,  average = 'macro',labels=[1] )
cv= RepeatedKFold(n_splits=10,n_repeats=10,random_state=132)


rf =RandomForestClassifier()

grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = cv, n_jobs = -1,scoring = myscore)

grid_search.fit(df_model_fit.drop(['Player','HOF'],axis=1), df_model_fit.HOF)

GridSearchCV(cv=<sklearn.model_selection._split.RepeatedKFold object at 0x00000203A084F4A8>,
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [50, 100, 150, 200], 'min_samples_leaf': [1, 2, 3, 4, 5], 'max_features': ['auto', 'sqrt', None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score, average=macro, labels=[1]), verbose=0)

In [29]:
grid_search.best_score_

0.6903882487585301

In [42]:
rf = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'], 
                            max_features=grid_search.best_params_['max_features'],
                            min_samples_leaf=grid_search.best_params_['min_samples_leaf'], 
                            random_state=132)
rf.fit(df_model_fit.drop(['Player','HOF'],axis=1), df_model_fit.HOF)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=132, verbose=0, warm_start=False)

In [43]:
y_pred_rf = rf.predict(df_model_fit.drop(['Player','HOF'],axis=1))

In [45]:
def rf_predict(player):
    d = df_non_model_fit[df_non_model_fit['Player'] == player]
    pred = rf.predict(d.drop(['Player','HOF'],axis=1))
    pred_prob = rf.predict_proba(d.drop(['Player','HOF'],axis=1))
    return pred, pred_prob

In [46]:
for pl in df_non_model['Player']:
    if rf_predict(pl)[0] == 1:
        print(pl + " " + str(rf_predict(pl)))

Amar'e Stoudemire (array([1]), array([[0.2, 0.8]]))
Blake Griffin (array([1]), array([[0.42, 0.58]]))
Carmelo Anthony (array([1]), array([[0.38, 0.62]]))
Chris Paul (array([1]), array([[0.22, 0.78]]))
DeAndre Jordan (array([1]), array([[0.44, 0.56]]))
Dirk Nowitzki (array([1]), array([[0.08, 0.92]]))
Dwight Howard (array([1]), array([[0.14, 0.86]]))
Dwyane Wade (array([1]), array([[0.14, 0.86]]))
Elton Brand (array([1]), array([[0.4, 0.6]]))
Gilbert Arenas (array([1]), array([[0.46, 0.54]]))
James Harden (array([1]), array([[0.34, 0.66]]))
Kevin Durant (array([1]), array([[0.2, 0.8]]))
Kevin Garnett (array([1]), array([[0.2, 0.8]]))
Kevin Love (array([1]), array([[0.3, 0.7]]))
Kobe Bryant (array([1]), array([[0.24, 0.76]]))
LeBron James (array([1]), array([[0.1, 0.9]]))
Pau Gasol (array([1]), array([[0.22, 0.78]]))
Paul Pierce (array([1]), array([[0.26, 0.74]]))
Ray Allen (array([1]), array([[0.32, 0.68]]))
Russell Westbrook (array([1]), array([[0.3, 0.7]]))
Stephen Curry (array([1]), 

## Boosted tree

In [70]:
lambdas = [1,.9,.8,.7,.6,.5,.4,.3,.2,.1]
n_tree = [100,80,60,40,20]
max_depth = [2,3,4,5,6,7]



param_grid = {'n_estimators': n_tree,
               'learning_rate': lambdas,
               'max_depth': max_depth}

myscore = make_scorer(f1s,  average = 'macro',labels=[1] )
cv= RepeatedKFold(n_splits=10,n_repeats=20,random_state=132)


b = GradientBoostingClassifier()

grid_search = GridSearchCV(estimator = b, param_grid = param_grid, 
                          cv = cv, n_jobs = -1,scoring = myscore)

grid_search.fit(df_model_fit.drop(['Player','HOF'],axis=1), df_model_fit.HOF)





GridSearchCV(cv=<sklearn.model_selection._split.RepeatedKFold object at 0x0000022026622278>,
       error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [85, 80, 75], 'learning_rate': [0.19, 0.2], 'max_depth': [6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score, average=macro, labels=[1]), verbose=0)

In [152]:
best=grid_search.best_params_
best

{'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 85}

In [153]:
grid_search.best_score_

0.6913019473711881

#### fit model with best paramters

In [19]:
b = GradientBoostingClassifier(learning_rate=grid_search.best_params_['learning_rate'], 
                               n_estimators=grid_search.best_params_['n_estimators'],
                                                 max_depth=grid_search.best_params_['max_depth'])
b.fit(df_model_fit.drop(['Player','HOF'],axis=1), df_model_fit.HOF)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [94]:
y_pred_b = b.predict(df_model_fit.drop(['Player','HOF'],axis=1))

In [155]:
def bs_predict(player):
    
    d = df_non_model_fit[df_non_model_fit['Player'] == player]
    pred = b.predict(d.drop(['Player','HOF'],axis=1))
    pred_prob = b.predict_proba(d.drop(['Player','HOF'],axis=1))
    return pred, pred_prob

In [156]:
for pl in df_non_model['Player']:
    if bs_predict(pl)[0] == 1:
        print(pl + " " + str(bs_predict(pl)))

Amar'e Stoudemire (array([1]), array([[0.05444948, 0.94555052]]))
Blake Griffin (array([1]), array([[3.43768900e-04, 9.99656231e-01]]))
Carmelo Anthony (array([1]), array([[0.0016427, 0.9983573]]))
Chris Paul (array([1]), array([[0.00748688, 0.99251312]]))
DeAndre Jordan (array([1]), array([[0.22327803, 0.77672197]]))
DeMarcus Cousins (array([1]), array([[0.17458287, 0.82541713]]))
Dirk Nowitzki (array([1]), array([[7.65058048e-05, 9.99923494e-01]]))
Dwight Howard (array([1]), array([[8.15124469e-05, 9.99918488e-01]]))
Dwyane Wade (array([1]), array([[1.70925341e-04, 9.99829075e-01]]))
Elton Brand (array([1]), array([[0.01867171, 0.98132829]]))
Gilbert Arenas (array([1]), array([[0.04168782, 0.95831218]]))
James Harden (array([1]), array([[0.00463622, 0.99536378]]))
John Wall (array([1]), array([[0.04812249, 0.95187751]]))
Kevin Durant (array([1]), array([[2.33700177e-05, 9.99976630e-01]]))
Kevin Garnett (array([1]), array([[0.00255973, 0.99744027]]))
Kevin Love (array([1]), array([[0.

## SIMILARITY

### sample by one player

In [64]:
def player_dist(player):
    d = df_non_model_fit[df_non_model_fit['Player'] == player].drop(['HOF','Player'],axis=1)
    
    return d

In [65]:
player_dist('James Harden')

Unnamed: 0,Year_x,WS,3P,TRB,AST,STL,BLK,PTS
546,8,16.4,2.247154,5.029268,5.733333,1.469919,0.445528,22.143089


In [66]:
df_model_fit_new = df_model_fit.reset_index()
df_model_fit_new = df_model_fit_new.drop(['index'],axis=1)

In [194]:
df_model_fit_new.head()

Unnamed: 0,Year_x,WS,3P,TRB,AST,STL,BLK,PTS,HOF,Player
0,16,9.4,0.097027,7.412363,1.095462,0.808294,0.42723,9.64867,0,A.C. Green
1,13,5.9,0.489281,3.262295,2.680958,1.150063,0.20681,7.403531,0,Aaron McKie
2,14,5.2,0.001399,3.902098,0.678322,0.394406,0.753846,5.762238,0,Aaron Williams
3,9,6.5,0.001621,4.142626,0.692058,0.497569,0.280389,5.035656,0,Adam Keefe
4,12,4.0,0.0,4.721692,0.469304,0.360164,1.627558,4.077763,0,Adonal Foyle


## correlation

In [67]:
def corr_dist(top,player):
    df = player_dist(player)
    df_corr = pd.DataFrame(columns=['Player','Corr'])
    for i in range(len(df_model_fit_new['Player'])):
        df_corr = df_corr.append({'Player': df_model_fit_new['Player'][i], 
                        'Corr':np.corrcoef(df_model_fit_new.drop(['HOF','Player'],axis=1).iloc[i,], df)[0][1]}, ignore_index=True)
        i +=1
    df_corr.set_index('Player')
    df_corr_top = df_corr.sort_values(by='Corr', ascending=False).head(top)
    
    return df_corr_top

In [None]:
corr = []
df = df_model_fit_new.drop(['HOF','Player'],axis=1)
def corr_dist(top,player):    
    pl = player_dist(player)
    corr = np.corrcoef(df_model_fit_new.drop(['HOF','Player'],axis=1).iloc[i,], df)[0][1]}, ignore_index=True)
     
    return df_corr_top


In [68]:
corr_dist(10, 'Blake Griffin')

Unnamed: 0,Player,Corr
543,Larry Kenon,0.993635
176,Connie Hawkins,0.993401
100,Brad Daugherty,0.992061
342,George Yardley,0.991148
192,Dan Issel,0.987771
67,Billy Cunningham,0.987545
340,George McGinnis,0.983512
917,Yao Ming,0.983403
420,Jeff Ruland,0.982571
681,Paul Arizin,0.979078


## Cosine

In [57]:
def cos_dist(top,player):
    df = player_dist(player)
    df_cos = pd.DataFrame(columns=['Player','Cos'])
    for i in range(len(df_model_fit_new['Player'])):
        df_cos = df_cos.append({'Player': df_model_fit_new['Player'][i], 
                        'Cos':1 - spatial.distance.cosine(df, df_model_fit_new.drop(['HOF','Player'],axis=1).iloc[i,])}, ignore_index=True)
        i +=1
    df_cos.set_index('Player')
    df_cos_top = df_cos.sort_values(by='Cos', ascending=False).head(top)
    
    return df_cos_top

In [58]:
cos_dist(10,'Blake Griffin')

Unnamed: 0,Player,Cos
544,Larry Kenon,0.996862
176,Connie Hawkins*,0.996535
100,Brad Daugherty,0.995897
343,George Yardley*,0.994363
192,Dan Issel*,0.993889
67,Billy Cunningham*,0.993819
918,Yao Ming*,0.991891
341,George McGinnis,0.991358
421,Jeff Ruland,0.991241
294,Elgin Baylor*,0.98886


## Euclid

In [277]:
players = df_model_fit['Player'].tolist()
f = ['Year_x','WS','3P','TRB','AST','STL','BLK','PTS']
def euc_dist(top, player):
    
    df = df_non_model_fit[f][df_non_model.Player == player]
    df_euc = pd.DataFrame(columns=['Player','Player1','Euc'])
    
    for p in range(len(players)):
        df_euc = df_euc.append({'Player':player,'Player1': players[p], 
                        'Euc':np.linalg.norm(df - df_model_fit[f][df_model_fit.Player==players[p]].iloc[0,])}, ignore_index=True)
        p +=1
    #df_euc.set_index('Player')
    df_euc_top = df_euc.sort_values(by='Euc', ascending=True).head(top)
    df_euc_top = df_euc_top.drop(['Euc'],axis=1)
    
    return df_euc_top