In [1]:
# read in libraries
import pandas as pd
import numpy as np
import pickle
pd.options.mode.chained_assignment = None 
pd.errors.DtypeWarning = None

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from urllib.request import urlopen
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [None]:
# read in datasets

# kaggle datasets
games = pd.read_csv('games.csv') 
games_details = pd.read_csv('games_details.csv')
players = pd.read_csv('players.csv')
ranking = pd.read_csv('ranking.csv')
teams = pd.read_csv('teams.csv')

# datasets created from internet
jersey = pd.read_csv('jersey2.csv')
twok = pd.read_csv('2kRank.csv')
pacedata = {'season':[2012,2013,2014,2015,2016,2017,2018,2019,2020],
           'pace':[92.0,93.9,93.9,95.8,96.4,97.3,100.0,100.3,99.2]}
pace = pd.DataFrame(pacedata)

# load in scraped all star info
# get this from ASG Scrape
all_star_appearances = pickle.load(open('all_star_appearances.pickle', 'rb'))

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# join date to individual games stats dataset
games_details2 = pd.merge(games_details, games[['GAME_DATE_EST', 'GAME_ID']], on = 'GAME_ID', how = 'left')

# extract year and month for every row
games_details2['year'] =  pd.to_numeric(games_details2['GAME_DATE_EST'].apply(lambda x: x.split('-')[0]))
games_details2['month'] =  pd.to_numeric(games_details2['GAME_DATE_EST'].apply(lambda x: x.split('-')[1]))

# label each row with season based on year and month
games_details2['season'] = games_details2[['year','month']].apply(lambda x: x['year']-1 if (x['month']==1) else x['year'], axis=1)

# keep only game info for September through January in the 2012-2020 seasons
games_details2 = games_details2[games_details2['month'].isin([9,10,11,12,1])]
games_details3 = games_details2[(games_details2['season']<2021) & (games_details2['season']>2011)]

# get rid of 2020 finals
games_details3 = games_details3[~((games_details3['season']==2020) & (games_details3['month'].isin([9,10])))]

# drop duplicates
games_details3 = games_details3.drop_duplicates(subset=['GAME_ID','PLAYER_ID'])

In [None]:
# get rid of rows with comments (means they did not play)
games_details3 = games_details3[games_details3['COMMENT'].isnull()]

# get rid of one erroneous row
games_details3 = games_details3[games_details3['PTS'].notnull()]

# extract number of minutes played
games_details3['minutes'] =  pd.to_numeric(games_details3['MIN'].apply(lambda x: x.split(':')[0]))

# add a column to indicate they played in that game (useful for aggregation in next step)
games_details3['GP'] = 1

In [None]:
# uses groupby to summarize by player and season
grouped = games_details3.groupby(['PLAYER_ID','season']).apply(lambda s: pd.Series({ 
    "PLAYER_NAME": s["PLAYER_NAME"].mode()[0],
    "TEAM_ABBREVIATION": s["TEAM_ABBREVIATION"].mode()[0],
    "TEAM_ID": s["TEAM_ID"].mode()[0],
    "GP": s["GP"].sum(),
    "Minutes": s["minutes"].mean(),
    "FGM": s["FGM"].mean(),
    "FGA": s["FGA"].mean(),
    "FG3M": s["FG3M"].mean(),
    "FG3A": s["FG3A"].mean(),
    "FTM": s["FTM"].mean(),
    "FTA": s["FTA"].mean(),
    "OREB": s["OREB"].mean(),
    "DREB": s["DREB"].mean(),
    "AST": s["AST"].mean(),
    "STL": s["STL"].mean(),
    "BLK": s["BLK"].mean(),
    "TO": s["TO"].mean(), 
    "PF": s["PF"].mean(),
    "PTS": s["PTS"].mean(),
    "PLUS_MINUS": s["PLUS_MINUS"].mean(),
    "Starting_Position": s["START_POSITION"].mode(),
})).reset_index()

In [None]:
# Fix Starting_Position column
grouped['Starting_Position'] = grouped['Starting_Position'].apply(lambda x: "None" if len(x)==0 else x[0])

In [None]:
# merge pace data
grouped = pd.merge(grouped, pace, on = ['season'], how = 'left')

# divide pace data by 100
grouped['pace'] = grouped['pace']/100

# divide relevant statistics by pace/100
grouped[['FGM','FGA','FG3M','FG3A','FTM','FTA','OREB',
       'DREB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS']] = grouped[['FGM','FGA','FG3M','FG3A','FTM','FTA','OREB',
       'DREB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS']].div(grouped.pace, axis=0)

In [None]:
# PER
grouped['PER'] = (grouped['FGM']*85.910 + grouped['STL']*53.897 + 
grouped['FG3M']*51.757 + grouped['FTM']*46.845 + grouped['BLK']*39.190
+ grouped['OREB']*39.190+ grouped['AST']*34.677 + grouped['DREB']*14.707
- grouped['PF']*17.174 - (grouped['FTA']-grouped['FTM'])*20.091 - (grouped['FGA']-grouped['FGM'])*39.190
- grouped['TO']*53.897)*(1/grouped['Minutes'])

# True Shooting Percentage
grouped['TSP'] = grouped['PTS']/(2*(grouped['FGA']+.44*grouped['FTA']))

In [None]:
# keep only January 20 data
ranking2 = ranking[(ranking['STANDINGSDATE'].apply(lambda x: x.split('-')[1])=='01') & (ranking['STANDINGSDATE'].apply(lambda x: x.split('-')[2])=='20')]

# get season from the year
ranking2['season'] = ranking2['SEASON_ID']-20000
grouped = pd.merge(grouped, ranking2[['TEAM_ID', 'season','CONFERENCE','W_PCT']], on = ['TEAM_ID','season'], how = 'left')

In [None]:
# merge jersey data
grouped = pd.merge(grouped, jersey, on = ['season','PLAYER_ID'], how = 'left')

In [None]:
# merge 2k data
grouped = pd.merge(grouped, twok[['PLAYER_ID','season','2KRank']],on=['PLAYER_ID','season'],how='left')

In [None]:
# function that defines if a player was an All star the previous year
def was_AS_last_year(row):
    if row['season'] == 1999:
        return 1 if 1998 in all_star_appearances[row['PLAYER_NAME']] else 0
    return 1 if row['season'] in all_star_appearances[row['PLAYER_NAME']] else 0

# adds if a player was an ALl star the previous year
grouped['LastASG?'] = grouped[['PLAYER_NAME', 'season']].apply(was_AS_last_year, axis=1)

# adds the total amount of prior selections a player had as of that year (not including that year)
grouped['PriorASG'] = grouped[['PLAYER_NAME', 'season']].apply(lambda row : sum(y<=row['season'] for y in all_star_appearances[row['PLAYER_NAME']]), axis=1)

# adds wether a player was selected as an all star that year
grouped['Selected?'] = grouped[['PLAYER_NAME', 'season']].apply(lambda row : 1 if row['season']+1 in all_star_appearances[row['PLAYER_NAME']] else 0, axis=1)


In [None]:
# get total games played by each team up to that point in each season
game_count1 = games_details3.groupby(['season','GAME_ID','TEAM_ID']).GP.mean().reset_index()
game_count2 = game_count1.groupby(['season','TEAM_ID']).GP.sum().reset_index()

# merges Games Played rate with main dataset
grouped = pd.merge(grouped,game_count2,on=['season','TEAM_ID'],how='left')

# calculates percentage games played
grouped['GP_Per'] = grouped['GP_x'] / grouped['GP_y']

In [None]:
#fill null values
grouped['PLUS_MINUS'] = grouped['PLUS_MINUS'].fillna(0)
grouped['TSP'] = grouped['TSP'].fillna(0)
grouped['PER'] = grouped['PER'].fillna(0)
grouped['2KRank'] = grouped['2KRank'].fillna(500)

In [None]:
grouped.isnull().sum(axis = 0)


PLAYER_ID            0
season               0
PLAYER_NAME          0
TEAM_ABBREVIATION    0
TEAM_ID              0
GP_x                 0
Minutes              0
FGM                  0
FGA                  0
FG3M                 0
FG3A                 0
FTM                  0
FTA                  0
OREB                 0
DREB                 0
AST                  0
STL                  0
BLK                  0
TO                   0
PF                   0
PTS                  0
PLUS_MINUS           0
Starting_Position    0
pace                 0
PER                  0
TSP                  0
CONFERENCE           0
W_PCT                0
Jersey               0
2KRank               0
LastASG?             0
PriorASG             0
Selected?            0
GP_y                 0
GP_Per               0
dtype: int64

In [None]:
grouped.loc[(grouped['PLAYER_NAME']=='Dwyane Wade') & (grouped['season']==2018),'Selected?'] = 0
grouped.loc[(grouped['PLAYER_NAME']=='Dirk Nowitzki') & (grouped['season']==2018),'Selected?'] = 0

In [None]:
grouped

Unnamed: 0,PLAYER_ID,season,PLAYER_NAME,TEAM_ABBREVIATION,TEAM_ID,GP_x,Minutes,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,Starting_Position,pace,PER,TSP,CONFERENCE,W_PCT,Jersey,2KRank,LastASG?,PriorASG,Selected?,GP_y,GP_Per
0,255,2012,Grant Hill,LAC,1610612746,12,15.500000,1.992754,4.800725,0.090580,0.452899,0.996377,1.630435,0.362319,1.992754,1.086957,0.452899,0.362319,1.086957,1.902174,5.072464,0.652174,,0.920,8.279628,0.459619,West,0.780,0,88.0,0,7,0,55,0.218182
1,467,2012,Jason Kidd,NYK,1610612752,43,27.674419,2.679474,6.597573,1.971689,5.030334,0.834176,0.935288,0.707786,3.690597,4.145602,1.971689,0.353893,1.390293,1.794742,8.164813,3.089245,G,0.920,16.473405,0.582444,East,0.658,0,54.0,0,10,0,49,0.877551
2,703,2012,Kurt Thomas,NYK,1610612752,33,10.121212,1.119895,2.272727,0.000000,0.000000,0.197628,0.428195,0.922266,1.877470,0.461133,0.296443,0.362319,0.131752,1.416337,2.437418,0.644122,F,0.920,13.254729,0.495182,East,0.658,0,299.0,0,0,0,49,0.673469
3,708,2012,Kevin Garnett,BOS,1610612738,49,28.959184,6.499556,12.843833,0.000000,0.221828,2.639752,3.349601,0.931677,6.765750,2.395741,1.153505,1.086957,1.841171,2.284827,15.638864,0.652174,C,0.920,20.875268,0.546139,East,0.500,0,28.0,0,14,1,51,0.960784
4,708,2013,Kevin Garnett,BKN,1610612751,43,20.372093,3.071055,7.306140,0.000000,0.074300,0.792530,0.916363,1.040196,5.943978,1.585061,0.866830,0.619164,1.287862,2.674790,6.934641,-1.174190,F,0.939,13.316450,0.449756,East,0.436,1,29.0,1,15,0,51,0.843137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5011,1630270,2020,Xavier Sneed,CHA,1610612766,3,5.666667,0.672043,1.680108,0.672043,1.680108,0.000000,0.000000,0.336022,1.008065,0.336022,0.000000,0.000000,0.000000,1.344086,2.016129,3.696237,,0.992,12.277989,0.600000,East,0.429,0,500.0,0,0,0,24,0.125000
5012,1630271,2020,Brodric Thomas,HOU,1610612745,7,6.857143,0.720046,2.448157,0.288018,1.440092,0.864055,1.152074,0.144009,1.008065,0.720046,0.144009,0.288018,0.576037,0.864055,2.592166,-0.720046,,0.992,9.090243,0.438596,West,0.308,0,500.0,0,0,0,22,0.318182
5013,1630273,2020,Freddie Gillespie,DAL,1610612742,1,5.000000,1.008065,2.016129,0.000000,1.008065,0.000000,0.000000,0.000000,1.008065,0.000000,0.000000,0.000000,1.008065,1.008065,2.016129,-5.040323,,0.992,-1.944355,0.500000,West,0.500,0,500.0,0,0,0,23,0.043478
5014,1630278,2020,Ade Murkey,MIN,1610612750,1,3.000000,0.000000,1.008065,0.000000,0.000000,0.000000,0.000000,1.008065,0.000000,0.000000,0.000000,0.000000,0.000000,1.008065,0.000000,7.056452,,0.992,-5.770833,0.000000,West,0.231,0,500.0,0,0,0,22,0.045455


In [None]:
grouped.columns


Index(['PLAYER_ID', 'season', 'PLAYER_NAME', 'TEAM_ABBREVIATION', 'TEAM_ID',
       'GP_x', 'Minutes', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB',
       'DREB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS',
       'Starting_Position', 'pace', 'PER', 'TSP', 'CONFERENCE', 'W_PCT',
       'Jersey', '2KRank', 'LastASG?', 'PriorASG', 'Selected?', 'GP_y',
       'GP_Per'],
      dtype='object')

In [None]:
df = grouped[['PLAYER_ID', 'season', 'PLAYER_NAME', 'TEAM_ABBREVIATION', 'TEAM_ID',
       'GP_x', 'Minutes', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB',
       'DREB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS', 'PER', 'TSP',
       'W_PCT', 'Jersey', '2KRank', 'LastASG?', 'PriorASG', 'Selected?', 'Starting_Position',
       'pace', 'CONFERENCE', 'GP_y', 'GP_Per']]

In [None]:
df

Unnamed: 0,PLAYER_ID,season,PLAYER_NAME,TEAM_ABBREVIATION,TEAM_ID,GP_x,Minutes,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,PER,TSP,W_PCT,Jersey,2KRank,LastASG?,PriorASG,Selected?,Starting_Position,pace,CONFERENCE,GP_y,GP_Per
0,255,2012,Grant Hill,LAC,1610612746,12,15.500000,1.992754,4.800725,0.090580,0.452899,0.996377,1.630435,0.362319,1.992754,1.086957,0.452899,0.362319,1.086957,1.902174,5.072464,0.652174,8.279628,0.459619,0.780,0,88.0,0,7,0,,0.920,West,55,0.218182
1,467,2012,Jason Kidd,NYK,1610612752,43,27.674419,2.679474,6.597573,1.971689,5.030334,0.834176,0.935288,0.707786,3.690597,4.145602,1.971689,0.353893,1.390293,1.794742,8.164813,3.089245,16.473405,0.582444,0.658,0,54.0,0,10,0,G,0.920,East,49,0.877551
2,703,2012,Kurt Thomas,NYK,1610612752,33,10.121212,1.119895,2.272727,0.000000,0.000000,0.197628,0.428195,0.922266,1.877470,0.461133,0.296443,0.362319,0.131752,1.416337,2.437418,0.644122,13.254729,0.495182,0.658,0,299.0,0,0,0,F,0.920,East,49,0.673469
3,708,2012,Kevin Garnett,BOS,1610612738,49,28.959184,6.499556,12.843833,0.000000,0.221828,2.639752,3.349601,0.931677,6.765750,2.395741,1.153505,1.086957,1.841171,2.284827,15.638864,0.652174,20.875268,0.546139,0.500,0,28.0,0,14,1,C,0.920,East,51,0.960784
4,708,2013,Kevin Garnett,BKN,1610612751,43,20.372093,3.071055,7.306140,0.000000,0.074300,0.792530,0.916363,1.040196,5.943978,1.585061,0.866830,0.619164,1.287862,2.674790,6.934641,-1.174190,13.316450,0.449756,0.436,1,29.0,1,15,0,F,0.939,East,51,0.843137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5011,1630270,2020,Xavier Sneed,CHA,1610612766,3,5.666667,0.672043,1.680108,0.672043,1.680108,0.000000,0.000000,0.336022,1.008065,0.336022,0.000000,0.000000,0.000000,1.344086,2.016129,3.696237,12.277989,0.600000,0.429,0,500.0,0,0,0,,0.992,East,24,0.125000
5012,1630271,2020,Brodric Thomas,HOU,1610612745,7,6.857143,0.720046,2.448157,0.288018,1.440092,0.864055,1.152074,0.144009,1.008065,0.720046,0.144009,0.288018,0.576037,0.864055,2.592166,-0.720046,9.090243,0.438596,0.308,0,500.0,0,0,0,,0.992,West,22,0.318182
5013,1630273,2020,Freddie Gillespie,DAL,1610612742,1,5.000000,1.008065,2.016129,0.000000,1.008065,0.000000,0.000000,0.000000,1.008065,0.000000,0.000000,0.000000,1.008065,1.008065,2.016129,-5.040323,-1.944355,0.500000,0.500,0,500.0,0,0,0,,0.992,West,23,0.043478
5014,1630278,2020,Ade Murkey,MIN,1610612750,1,3.000000,0.000000,1.008065,0.000000,0.000000,0.000000,0.000000,1.008065,0.000000,0.000000,0.000000,0.000000,0.000000,1.008065,0.000000,7.056452,-5.770833,0.000000,0.231,0,500.0,0,0,0,,0.992,West,22,0.045455


In [None]:
# replace nan with 0
df.fillna(0, inplace = True)

In [None]:
# Create a list with numeric importatn stats
xcols = df.columns[6 : 29].to_list()

print(xcols)

['Minutes', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS', 'PER', 'TSP', 'W_PCT', 'Jersey', '2KRank', 'LastASG?', 'PriorASG']


In [None]:
# set train and test up
X_train, X_test, y_train, y_test = train_test_split(df[xcols], df['Selected?'], 
                                                    train_size = 0.8, random_state = 1)
print('training data:', X_train.shape)
print('test data:', X_test.shape)


training data: (4012, 23)
test data: (1004, 23)


In [None]:
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.fillna(0, inplace=True)
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)

In [None]:
# logistic regression
# even without solver specified, same results are shown
log_reg = LogisticRegression(solver = 'lbfgs', max_iter = 3000)

# Fit the model to the training data
clf = log_reg.fit(X_train, y_train)

# Get accuracy stats
print('training accuracy: {}'.format(clf.score(X_train, y_train).round(3)))
print('test accuracy: {}'.format(clf.score(X_test, y_test).round(3)))

training accuracy: 0.982
test accuracy: 0.985


In [None]:
# Put the coefficients into a new dataframe
coef = pd.concat([pd.DataFrame(xcols),
                  pd.DataFrame(np.transpose(clf.coef_))], 
                 axis = 1)

coef.columns = ['feature','coefficient']

coef.sort_values(by=['coefficient'], 
                 ascending = False, 
                 inplace = True)

# Examine the features/stats with the 10 largest positive and negative coefficients
print('Ten largest positive features:\n', coef.head(10), '\n')
print('Ten largest negative features:\n', coef.tail(10))


Ten largest positive features:
      feature  coefficient
18     W_PCT     1.849673
11       BLK     0.715427
14       PTS     0.681582
21  LastASG?     0.465860
19    Jersey     0.404852
9        AST     0.393917
10       STL     0.364938
8       DREB     0.327467
7       OREB     0.293951
3       FG3M     0.202758 

Ten largest negative features:
     feature  coefficient
0   Minutes     0.142660
5       FTM     0.137352
17      TSP     0.110844
20   2KRank    -0.010645
12       TO    -0.052489
6       FTA    -0.141825
4      FG3A    -0.142887
16      PER    -0.146543
2       FGA    -0.385090
13       PF    -0.507503


In [None]:
# Adding the features with the 10 largest positive and negative coefficients 
# to a new list for training purposes

xcols2 = coef.feature[0:10].to_list()
xcols2 += coef.feature[-10:].to_list()
print(xcols2)

['W_PCT', 'BLK', 'PTS', 'LastASG?', 'Jersey', 'AST', 'STL', 'DREB', 'OREB', 'FG3M', 'Minutes', 'FTM', 'TSP', '2KRank', 'TO', 'FTA', 'FG3A', 'PER', 'FGA', 'PF']


In [None]:
# test train fpr large coefficients
X_train, X_test, y_train, y_test = train_test_split(df[xcols2], 
                                                    df['Selected?'], 
                                                    train_size = 0.8, 
                                                    random_state = 1)
print('training data:', X_train.shape)
print('test data:', X_test.shape)

# fit the model to the training data
clf2 = log_reg.fit(X_train, y_train)

# get accuracy stats
print('training accuracy: {}'.format(clf2.score(X_train, y_train).round(3)))
print('test accuracy: {}'.format(clf2.score(X_test, y_test).round(3)))

training data: (4012, 20)
test data: (1004, 20)
training accuracy: 0.982
test accuracy: 0.988


In [None]:
# Regularization with logistic regression & C parameter with logistic regression

# high C value means trust the data a lot; low means ehhh
cset = [.001, .01, .1, 1, 10]

for i in cset:
    print('C =', i)
    log_reg = LogisticRegression(solver = 'lbfgs', 
                                 max_iter = 3000, 
                                 C = i)
    
    clf = log_reg.fit(X_train, y_train)
    
    print('training accuracy: {}'.format(clf.score(X_train, y_train).round(3)))
    print('test accuracy: {}'.format(clf.score(X_test, y_test).round(3)), '\n')

C = 0.001
training accuracy: 0.977
test accuracy: 0.977 

C = 0.01
training accuracy: 0.978
test accuracy: 0.979 

C = 0.1
training accuracy: 0.979
test accuracy: 0.979 

C = 1
training accuracy: 0.982
test accuracy: 0.988 

C = 10
training accuracy: 0.983
test accuracy: 0.99 



In [None]:
# Cross-validation with tuning regularization in logistic regression

for i in cset:
    print('C =', i)
    log_reg = LogisticRegression(solver = 'lbfgs', 
                                 max_iter = 3000, 
                                 C = i)
    
    scores = cross_val_score(log_reg, 
                             df[xcols2], 
                             df['Selected?'], 
                             cv = 5)
    print(scores)
    print('Accuracy: %0.3f (+/- %0.3f)' % (scores.mean(), scores.std() * 2), '\n')

C = 0.001
[0.97709163 0.96111665 0.98404786 0.98105683 0.97507478]
Accuracy: 0.976 (+/- 0.016) 

C = 0.01
[0.9750996  0.96510469 0.98404786 0.98005982 0.97906281]
Accuracy: 0.977 (+/- 0.013) 

C = 0.1
[0.9750996  0.96211366 0.98305085 0.98504487 0.97706879]
Accuracy: 0.976 (+/- 0.016) 

C = 1
[0.97011952 0.96410768 0.98404786 0.9890329  0.98005982]
Accuracy: 0.977 (+/- 0.018) 

C = 10
[0.96912351 0.96610169 0.98404786 0.98803589 0.98404786]
Accuracy: 0.978 (+/- 0.018) 



In [None]:

y_pred_test = clf.predict(X_test)
print('  Recall: {:.3f}'.format(recall_score(y_test, y_pred_test)))
print('  Precision: {:.3f}'.format(precision_score(y_test, y_pred_test)))
print('  F1 score: {:.3f}'.format(f1_score(y_test, y_pred_test)))
cm = confusion_matrix(y_test, y_pred_test)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn+fp)
print('  Specificity: {:.3f}'.format(specificity))

  Recall: 0.946
  Precision: 0.814
  F1 score: 0.875
  Specificity: 0.992


In [None]:
# Confusion matrix

cm = confusion_matrix(y_test, y_pred_test)
sample = np.array([['TN', 'FP'], ['FN', 'TP']])
print('CM key:\n', sample, '\n')
print('CM for test:\n', cm)

CM key:
 [['TN' 'FP']
 ['FN' 'TP']] 

CM for test:
 [[959   8]
 [  2  35]]


In [None]:
scaler = MinMaxScaler()

X = df[xcols2]
scaled_X = scaler.fit_transform(X)
y = df['Selected?']

In [None]:

scaled_X = scaler.fit_transform(X_train)

tuned_parameters = {'C': [0.1, 0.5, 1, 5, 10, 50, 100]}

grid = GridSearchCV(LogisticRegression(solver='liblinear'), tuned_parameters, 
                    cv = 3, scoring = 'accuracy')

grid.fit(scaled_X, y_train)

print('mean of accuracies:', grid.cv_results_['mean_test_score'])
print('std dev of accuracies:', grid.cv_results_['std_test_score'])

# print best parameter after tuning 
print('best parameters:', grid.best_params_) 

# store the best estimator (In this case it's 'C': 100)
best_logreg = grid.best_estimator_

mean of accuracies: [0.9680967  0.97208499 0.9745772  0.97856437 0.97981038 0.98030882
 0.97881331]
std dev of accuracies: [0.00461639 0.00372044 0.00339221 0.00519363 0.00521846 0.00554213
 0.00675528]
best parameters: {'C': 50}


In [None]:
# Random forest
param_grid = {'n_estimators': [10, 100, 250], 'max_samples': [.25, .5, 1]} 


# instantiate grid search object
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv = 3)

# fitting the model for grid search 
grid.fit(X, y)

# print parameters, mean, and standard deviation of scores by iteration
for z in range(0, len(grid.cv_results_['params'])):
    print('\nparams:', grid.cv_results_['params'][z])
    print('mean of accuracies:', grid.cv_results_['mean_test_score'][z])
    print('std dev of accuracies:', grid.cv_results_['std_test_score'][z])

# print best parameter after tuning 
print('\n***best parameters:', grid.best_params_)
print('best score:', grid.best_score_)

# store the best estimator (in this case it's 'max_samples': 0.5, 'n_estimators': 100)
best_rf = grid.best_estimator_


params: {'max_samples': 0.25, 'n_estimators': 10}
mean of accuracies: 0.9744816586921851
std dev of accuracies: 0.004376893620434658

params: {'max_samples': 0.25, 'n_estimators': 100}
mean of accuracies: 0.9772727272727272
std dev of accuracies: 0.005940855034191136

params: {'max_samples': 0.25, 'n_estimators': 250}
mean of accuracies: 0.9764752791068579
std dev of accuracies: 0.005826012438677226

params: {'max_samples': 0.5, 'n_estimators': 10}
mean of accuracies: 0.9730861244019139
std dev of accuracies: 0.0068366938594426885

params: {'max_samples': 0.5, 'n_estimators': 100}
mean of accuracies: 0.9774720893141945
std dev of accuracies: 0.006648059011376821

params: {'max_samples': 0.5, 'n_estimators': 250}
mean of accuracies: 0.9778708133971291
std dev of accuracies: 0.004809544689718825

params: {'max_samples': 1, 'n_estimators': 10}
mean of accuracies: 0.9533492822966507
std dev of accuracies: 0.0

params: {'max_samples': 1, 'n_estimators': 100}
mean of accuracies: 0.953349282

In [None]:
mydata = grouped[(grouped['season'] == 2020)]
mydata = mydata.drop('Selected?', axis=1, errors='coerce')
mydata

Unnamed: 0,PLAYER_ID,season,PLAYER_NAME,TEAM_ABBREVIATION,TEAM_ID,GP_x,Minutes,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,Starting_Position,pace,PER,TSP,CONFERENCE,W_PCT,Jersey,2KRank,LastASG?,PriorASG,GP_y,GP_Per
254,2544,2020,LeBron James,LAL,1610612747,23,31.695652,8.897265,18.101332,2.717391,6.530505,4.076087,5.916900,0.745091,6.837307,7.144109,0.876578,0.394460,3.813114,1.534011,24.588008,7.363254,F,0.992,28.603203,0.593776,West,0.733,1,1.0,1,16,25,0.920000
264,2546,2020,Carmelo Anthony,POR,1610612757,21,23.809524,4.032258,10.800691,1.536098,4.176267,2.256144,2.640169,0.336022,3.120200,1.104071,0.864055,0.672043,0.960061,2.016129,11.856759,-4.656298,F,0.992,14.385635,0.495586,West,0.571,0,119.0,0,10,22,0.954545
390,2730,2020,Dwight Howard,PHI,1610612755,23,17.173913,2.542076,4.207574,0.131487,0.306802,1.358696,2.892707,2.673562,5.215638,0.613604,0.438289,0.920407,2.016129,3.330996,6.574334,-1.227209,C,0.992,16.847479,0.599808,East,0.667,0,119.0,0,8,23,1.000000
429,2738,2020,Andre Iguodala,MIA,1610612748,18,20.000000,1.568100,4.312276,1.064068,3.192204,0.280018,0.504032,0.560036,3.024194,2.632168,1.120072,0.336022,1.176075,1.400090,4.480287,-2.072133,F,0.992,11.733322,0.494071,East,0.462,0,177.0,0,1,21,0.857143
534,101108,2020,Chris Paul,PHX,1610612756,20,31.000000,5.393145,12.298387,0.907258,3.175403,2.923387,3.024194,0.554435,4.082661,8.568548,1.108871,0.201613,2.620968,2.872984,14.616935,1.260081,G,0.992,20.340422,0.536243,West,0.615,0,17.0,1,10,22,0.909091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5010,1630269,2020,Keandre Cook,CHA,1610612766,1,1.000000,0.000000,2.016129,0.000000,1.008065,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-7.056452,,0.992,-79.012097,0.000000,East,0.429,0,500.0,0,0,24,0.041667
5011,1630270,2020,Xavier Sneed,CHA,1610612766,3,5.666667,0.672043,1.680108,0.672043,1.680108,0.000000,0.000000,0.336022,1.008065,0.336022,0.000000,0.000000,0.000000,1.344086,2.016129,3.696237,,0.992,12.277989,0.600000,East,0.429,0,500.0,0,0,24,0.125000
5012,1630271,2020,Brodric Thomas,HOU,1610612745,7,6.857143,0.720046,2.448157,0.288018,1.440092,0.864055,1.152074,0.144009,1.008065,0.720046,0.144009,0.288018,0.576037,0.864055,2.592166,-0.720046,,0.992,9.090243,0.438596,West,0.308,0,500.0,0,0,22,0.318182
5013,1630273,2020,Freddie Gillespie,DAL,1610612742,1,5.000000,1.008065,2.016129,0.000000,1.008065,0.000000,0.000000,0.000000,1.008065,0.000000,0.000000,0.000000,1.008065,1.008065,2.016129,-5.040323,,0.992,-1.944355,0.500000,West,0.500,0,500.0,0,0,23,0.043478


In [None]:
mydata.fillna(0, inplace = True)
mydata.replace([np.inf, -np.inf], np.nan, inplace=True)

X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.fillna(0, inplace=True)
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)

In [None]:
print(xcols)

['Minutes', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS', 'PER', 'TSP', 'W_PCT', 'Jersey', '2KRank', 'LastASG?', 'PriorASG']


In [None]:
# time to predict
pred = clf.predict(mydata[xcols2])
print(pred)

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1
 1 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [None]:
# add predictions to df
mydata['All-Star'] = pred.tolist()
print(mydata)

      PLAYER_ID  season        PLAYER_NAME  ... GP_y    GP_Per  All-Star
254        2544    2020       LeBron James  ...   25  0.920000         1
264        2546    2020    Carmelo Anthony  ...   22  0.954545         0
390        2730    2020      Dwight Howard  ...   23  1.000000         0
429        2738    2020     Andre Iguodala  ...   21  0.857143         0
534      101108    2020         Chris Paul  ...   22  0.909091         0
...         ...     ...                ...  ...  ...       ...       ...
5010    1630269    2020       Keandre Cook  ...   24  0.041667         0
5011    1630270    2020       Xavier Sneed  ...   24  0.125000         0
5012    1630271    2020     Brodric Thomas  ...   22  0.318182         0
5013    1630273    2020  Freddie Gillespie  ...   23  0.043478         0
5014    1630278    2020         Ade Murkey  ...   22  0.045455         0

[541 rows x 35 columns]


In [None]:
# see the predictions
mydata.loc[mydata['All-Star'] == 1]

Unnamed: 0,PLAYER_ID,season,PLAYER_NAME,TEAM_ABBREVIATION,TEAM_ID,GP_x,Minutes,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,Starting_Position,pace,PER,TSP,CONFERENCE,W_PCT,Jersey,2KRank,LastASG?,PriorASG,GP_y,GP_Per,All-Star
254,2544,2020,LeBron James,LAL,1610612747,23,31.695652,8.897265,18.101332,2.717391,6.530505,4.076087,5.9169,0.745091,6.837307,7.144109,0.876578,0.39446,3.813114,1.534011,24.588008,7.363254,F,0.992,28.603203,0.593776,West,0.733,1,1.0,1,16,25,0.92,1
847,201142,2020,Kevin Durant,BKN,1610612751,18,35.0,10.080645,19.433244,2.688172,6.160394,7.112455,8.176523,0.392025,6.776434,5.264337,0.78405,1.5681,3.304211,2.296147,29.961918,8.960573,F,0.992,32.405799,0.650472,East,0.563,1,5.0,0,10,24,0.75,1
1060,201566,2020,Russell Westbrook,WAS,1610612764,12,32.166667,7.476478,18.397177,1.596102,4.620296,3.44422,5.208333,1.512097,7.728495,8.988575,0.924059,0.336022,4.53629,2.856183,19.99328,-3.360215,G,0.992,21.042911,0.48319,East,0.273,1,17.0,1,9,19,0.631579,1
1316,201935,2020,James Harden,HOU,1610612745,18,36.0,7.33647,15.849014,3.024194,8.064516,6.048387,6.888441,0.616039,5.712366,10.640681,1.064068,0.616039,4.312276,2.072133,23.74552,1.176075,G,0.992,28.063338,0.628856,West,0.308,0,3.0,1,8,22,0.818182,1
1342,201939,2020,Stephen Curry,GSW,1610612744,23,32.0,9.072581,19.76683,4.470547,11.001052,4.645863,5.040323,0.525947,4.689691,5.741585,1.358696,0.131487,3.374825,2.016129,27.261571,0.745091,G,0.992,29.748717,0.620016,West,0.571,1,5.0,0,6,23,1.0,1
1662,202331,2020,Paul George,LAC,1610612746,21,31.238095,7.440476,14.976959,3.264209,7.248464,3.936252,4.36828,0.528034,5.232335,5.136329,1.152074,0.432028,3.60023,2.112135,22.081413,8.256528,G,0.992,26.025377,0.653335,West,0.733,0,11.0,0,6,24,0.875,1
1880,202681,2020,Kyrie Irving,BKN,1610612751,16,33.6875,10.143649,19.153226,2.835181,6.552419,3.402218,3.717238,0.756048,3.969254,5.859375,1.071069,0.756048,2.331149,2.835181,26.524698,9.387601,G,0.992,30.3483,0.637956,East,0.563,1,11.0,0,6,24,0.666667,1
1986,202695,2020,Kawhi Leonard,LAC,1610612746,20,31.3,8.518145,16.935484,1.915323,4.586694,4.637097,5.090726,0.856855,4.133065,4.939516,1.764113,0.554435,1.764113,1.5625,23.58871,9.475806,F,0.992,30.981064,0.615077,West,0.733,1,3.0,1,4,24,0.833333,1
1995,202696,2020,Nikola Vucevic,ORL,1610612753,25,32.0,9.032258,18.629032,2.620968,6.169355,1.532258,1.935484,2.137097,9.072581,3.346774,1.048387,0.483871,1.572581,1.814516,22.217742,-4.959677,C,0.992,27.874433,0.570252,East,0.467,0,29.0,0,1,25,1.0,1
2015,202699,2020,Tobias Harris,PHI,1610612755,20,32.2,7.711694,14.919355,2.016129,4.435484,2.066532,2.520161,1.159274,5.897177,2.721774,0.856855,0.957661,2.520161,2.016129,19.506048,7.96371,F,0.992,22.108329,0.608491,East,0.667,0,45.0,0,0,23,0.869565,1


Predicted 33 all stars. There were 24 in real life. Is there a way to limit to 24 in the model?

**Starters:**
LeBron James, Anthony Davis, Kawhi Leonard, Luka Doncic, James Harden,
Giannis Antetokounmpo, Joel Embiid, Pascal Siakam, Kemba Walker, Trae Young

**Reserves:**
Damian Lillard, Ben Simmons, Nikola Jokic, Jayson Tatum, Chris Paul, Russell Westbrook, Domantas Sabonis, Khris Middleton, Bam Adebayo, Rudy Gobert, Kyle Lowry, Brandon Ingram, Donovan Mitchell

**Starters that were left out:** Pascal Siakam, Kemba Walker

**Reserves that were left out:** Chris Paul, Kyle Lowry

**MODEL SHOWS 83% ACCURACY OVERALL**