In [1]:
# read in libraries
import pandas as pd
import numpy as np
import pickle
pd.options.mode.chained_assignment = None 
pd.errors.DtypeWarning = None

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from urllib.request import urlopen
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [3]:
# read in datasets

# kaggle datasets
games = pd.read_csv('data/games.csv') 
games_details = pd.read_csv('data/games_details.csv')
players = pd.read_csv('data/players.csv')
ranking = pd.read_csv('data/ranking.csv')
teams = pd.read_csv('data/teams.csv')

# datasets created from internet
jersey = pd.read_csv('data/jersey2.csv')
twok = pd.read_csv('data/2kRank.csv')
pacedata = {'season':[2012,2013,2014,2015,2016,2017,2018,2019,2020],
           'pace':[92.0,93.9,93.9,95.8,96.4,97.3,100.0,100.3,99.2]}
pace = pd.DataFrame(pacedata)

# load in scraped all star info
# get this from ASG Scrape
all_star_appearances = pickle.load(open('data/all_star_appearances.pickle', 'rb'))

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# join date to individual games stats dataset
games_details2 = pd.merge(games_details, games[['GAME_DATE_EST', 'GAME_ID']], on = 'GAME_ID', how = 'left')

# extract year and month for every row
games_details2['year'] =  pd.to_numeric(games_details2['GAME_DATE_EST'].apply(lambda x: x.split('-')[0]))
games_details2['month'] =  pd.to_numeric(games_details2['GAME_DATE_EST'].apply(lambda x: x.split('-')[1]))

# label each row with season based on year and month
games_details2['season'] = games_details2[['year','month']].apply(lambda x: x['year']-1 if (x['month']==1) else x['year'], axis=1)

# keep only game info for September through January in the 2012-2020 seasons
games_details2 = games_details2[games_details2['month'].isin([9,10,11,12,1])]
games_details3 = games_details2[(games_details2['season']<2021) & (games_details2['season']>2011)]

# get rid of 2020 finals
games_details3 = games_details3[~((games_details3['season']==2020) & (games_details3['month'].isin([9,10])))]

# drop duplicates
games_details3 = games_details3.drop_duplicates(subset=['GAME_ID','PLAYER_ID'])

In [5]:
# get rid of rows with comments (means they did not play)
games_details3 = games_details3[games_details3['COMMENT'].isnull()]

# get rid of one erroneous row
games_details3 = games_details3[games_details3['PTS'].notnull()]

# extract number of minutes played
games_details3['minutes'] =  pd.to_numeric(games_details3['MIN'].apply(lambda x: x.split(':')[0]))

# add a column to indicate they played in that game (useful for aggregation in next step)
games_details3['GP'] = 1

In [6]:
# uses groupby to summarize by player and season
grouped = games_details3.groupby(['PLAYER_ID','season']).apply(lambda s: pd.Series({ 
    "PLAYER_NAME": s["PLAYER_NAME"].mode()[0],
    "TEAM_ABBREVIATION": s["TEAM_ABBREVIATION"].mode()[0],
    "TEAM_ID": s["TEAM_ID"].mode()[0],
    "GP": s["GP"].sum(),
    "Minutes": s["minutes"].mean(),
    "FGM": s["FGM"].mean(),
    "FGA": s["FGA"].mean(),
    "FG3M": s["FG3M"].mean(),
    "FG3A": s["FG3A"].mean(),
    "FTM": s["FTM"].mean(),
    "FTA": s["FTA"].mean(),
    "OREB": s["OREB"].mean(),
    "DREB": s["DREB"].mean(),
    "AST": s["AST"].mean(),
    "STL": s["STL"].mean(),
    "BLK": s["BLK"].mean(),
    "TO": s["TO"].mean(), 
    "PF": s["PF"].mean(),
    "PTS": s["PTS"].mean(),
    "PLUS_MINUS": s["PLUS_MINUS"].mean(),
    "Starting_Position": s["START_POSITION"].mode(),
})).reset_index()

In [7]:
# Fix Starting_Position column
grouped['Starting_Position'] = grouped['Starting_Position'].apply(lambda x: "None" if len(x)==0 else x[0])

In [8]:
# merge pace data
grouped = pd.merge(grouped, pace, on = ['season'], how = 'left')

# divide pace data by 100
grouped['pace'] = grouped['pace']/100

# divide relevant statistics by pace/100
grouped[['FGM','FGA','FG3M','FG3A','FTM','FTA','OREB',
       'DREB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS']] = grouped[['FGM','FGA','FG3M','FG3A','FTM','FTA','OREB',
       'DREB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS']].div(grouped.pace, axis=0)

In [9]:
# PER
grouped['PER'] = (grouped['FGM']*85.910 + grouped['STL']*53.897 + 
grouped['FG3M']*51.757 + grouped['FTM']*46.845 + grouped['BLK']*39.190
+ grouped['OREB']*39.190+ grouped['AST']*34.677 + grouped['DREB']*14.707
- grouped['PF']*17.174 - (grouped['FTA']-grouped['FTM'])*20.091 - (grouped['FGA']-grouped['FGM'])*39.190
- grouped['TO']*53.897)*(1/grouped['Minutes'])

# True Shooting Percentage
grouped['TSP'] = grouped['PTS']/(2*(grouped['FGA']+.44*grouped['FTA']))

In [10]:
# keep only January 20 data
ranking2 = ranking[(ranking['STANDINGSDATE'].apply(lambda x: x.split('-')[1])=='01') & (ranking['STANDINGSDATE'].apply(lambda x: x.split('-')[2])=='20')]

# get season from the year
ranking2['season'] = ranking2['SEASON_ID']-20000
grouped = pd.merge(grouped, ranking2[['TEAM_ID', 'season','CONFERENCE','W_PCT']], on = ['TEAM_ID','season'], how = 'left')

In [11]:
# merge jersey data
grouped = pd.merge(grouped, jersey, on = ['season','PLAYER_ID'], how = 'left')

In [12]:
# merge 2k data
grouped = pd.merge(grouped, twok[['PLAYER_ID','season','2KRank']],on=['PLAYER_ID','season'],how='left')

In [13]:
# function that defines if a player was an All star the previous year
def was_AS_last_year(row):
    if row['season'] == 1999:
        return 1 if 1998 in all_star_appearances[row['PLAYER_NAME']] else 0
    return 1 if row['season'] in all_star_appearances[row['PLAYER_NAME']] else 0

# adds if a player was an ALl star the previous year
grouped['LastASG?'] = grouped[['PLAYER_NAME', 'season']].apply(was_AS_last_year, axis=1)

# adds the total amount of prior selections a player had as of that year (not including that year)
grouped['PriorASG'] = grouped[['PLAYER_NAME', 'season']].apply(lambda row : sum(y<=row['season'] for y in all_star_appearances[row['PLAYER_NAME']]), axis=1)

# adds wether a player was selected as an all star that year
grouped['Selected?'] = grouped[['PLAYER_NAME', 'season']].apply(lambda row : 1 if row['season']+1 in all_star_appearances[row['PLAYER_NAME']] else 0, axis=1)


In [14]:
# get total games played by each team up to that point in each season
game_count1 = games_details3.groupby(['season','GAME_ID','TEAM_ID']).GP.mean().reset_index()
game_count2 = game_count1.groupby(['season','TEAM_ID']).GP.sum().reset_index()

# merges Games Played rate with main dataset
grouped = pd.merge(grouped,game_count2,on=['season','TEAM_ID'],how='left')

# calculates percentage games played
grouped['GP_Per'] = grouped['GP_x'] / grouped['GP_y']

In [15]:
#fill null values
grouped['PLUS_MINUS'] = grouped['PLUS_MINUS'].fillna(0)
grouped['TSP'] = grouped['TSP'].fillna(0)
grouped['PER'] = grouped['PER'].fillna(0)
grouped['2KRank'] = grouped['2KRank'].fillna(500)

In [None]:
grouped.isnull().sum(axis = 0)


PLAYER_ID            0
season               0
PLAYER_NAME          0
TEAM_ABBREVIATION    0
TEAM_ID              0
GP_x                 0
Minutes              0
FGM                  0
FGA                  0
FG3M                 0
FG3A                 0
FTM                  0
FTA                  0
OREB                 0
DREB                 0
AST                  0
STL                  0
BLK                  0
TO                   0
PF                   0
PTS                  0
PLUS_MINUS           0
Starting_Position    0
pace                 0
PER                  0
TSP                  0
CONFERENCE           0
W_PCT                0
Jersey               0
2KRank               0
LastASG?             0
PriorASG             0
Selected?            0
GP_y                 0
GP_Per               0
dtype: int64

In [None]:
grouped.loc[(grouped['PLAYER_NAME']=='Dwyane Wade') & (grouped['season']==2018),'Selected?'] = 0
grouped.loc[(grouped['PLAYER_NAME']=='Dirk Nowitzki') & (grouped['season']==2018),'Selected?'] = 0

In [None]:
grouped

Unnamed: 0,PLAYER_ID,season,PLAYER_NAME,TEAM_ABBREVIATION,TEAM_ID,GP_x,Minutes,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,Starting_Position,pace,PER,TSP,CONFERENCE,W_PCT,Jersey,2KRank,LastASG?,PriorASG,Selected?,GP_y,GP_Per
0,255,2012,Grant Hill,LAC,1610612746,12,15.500000,1.992754,4.800725,0.090580,0.452899,0.996377,1.630435,0.362319,1.992754,1.086957,0.452899,0.362319,1.086957,1.902174,5.072464,0.652174,,0.920,8.279628,0.459619,West,0.780,0,88.0,0,7,0,55,0.218182
1,467,2012,Jason Kidd,NYK,1610612752,43,27.674419,2.679474,6.597573,1.971689,5.030334,0.834176,0.935288,0.707786,3.690597,4.145602,1.971689,0.353893,1.390293,1.794742,8.164813,3.089245,G,0.920,16.473405,0.582444,East,0.658,0,54.0,0,10,0,49,0.877551
2,703,2012,Kurt Thomas,NYK,1610612752,33,10.121212,1.119895,2.272727,0.000000,0.000000,0.197628,0.428195,0.922266,1.877470,0.461133,0.296443,0.362319,0.131752,1.416337,2.437418,0.644122,F,0.920,13.254729,0.495182,East,0.658,0,299.0,0,0,0,49,0.673469
3,708,2012,Kevin Garnett,BOS,1610612738,49,28.959184,6.499556,12.843833,0.000000,0.221828,2.639752,3.349601,0.931677,6.765750,2.395741,1.153505,1.086957,1.841171,2.284827,15.638864,0.652174,C,0.920,20.875268,0.546139,East,0.500,0,28.0,0,14,1,51,0.960784
4,708,2013,Kevin Garnett,BKN,1610612751,43,20.372093,3.071055,7.306140,0.000000,0.074300,0.792530,0.916363,1.040196,5.943978,1.585061,0.866830,0.619164,1.287862,2.674790,6.934641,-1.174190,F,0.939,13.316450,0.449756,East,0.436,1,29.0,1,15,0,51,0.843137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5011,1630270,2020,Xavier Sneed,CHA,1610612766,3,5.666667,0.672043,1.680108,0.672043,1.680108,0.000000,0.000000,0.336022,1.008065,0.336022,0.000000,0.000000,0.000000,1.344086,2.016129,3.696237,,0.992,12.277989,0.600000,East,0.429,0,500.0,0,0,0,24,0.125000
5012,1630271,2020,Brodric Thomas,HOU,1610612745,7,6.857143,0.720046,2.448157,0.288018,1.440092,0.864055,1.152074,0.144009,1.008065,0.720046,0.144009,0.288018,0.576037,0.864055,2.592166,-0.720046,,0.992,9.090243,0.438596,West,0.308,0,500.0,0,0,0,22,0.318182
5013,1630273,2020,Freddie Gillespie,DAL,1610612742,1,5.000000,1.008065,2.016129,0.000000,1.008065,0.000000,0.000000,0.000000,1.008065,0.000000,0.000000,0.000000,1.008065,1.008065,2.016129,-5.040323,,0.992,-1.944355,0.500000,West,0.500,0,500.0,0,0,0,23,0.043478
5014,1630278,2020,Ade Murkey,MIN,1610612750,1,3.000000,0.000000,1.008065,0.000000,0.000000,0.000000,0.000000,1.008065,0.000000,0.000000,0.000000,0.000000,0.000000,1.008065,0.000000,7.056452,,0.992,-5.770833,0.000000,West,0.231,0,500.0,0,0,0,22,0.045455


In [None]:
grouped.columns


Index(['PLAYER_ID', 'season', 'PLAYER_NAME', 'TEAM_ABBREVIATION', 'TEAM_ID',
       'GP_x', 'Minutes', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB',
       'DREB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS',
       'Starting_Position', 'pace', 'PER', 'TSP', 'CONFERENCE', 'W_PCT',
       'Jersey', '2KRank', 'LastASG?', 'PriorASG', 'Selected?', 'GP_y',
       'GP_Per'],
      dtype='object')

In [16]:
df = grouped[['PLAYER_ID', 'season', 'PLAYER_NAME', 'TEAM_ABBREVIATION', 'TEAM_ID',
       'GP_x', 'Minutes', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB',
       'DREB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS', 'PER', 'TSP',
       'W_PCT', 'Jersey', '2KRank', 'LastASG?', 'PriorASG', 'Selected?', 'Starting_Position',
       'pace', 'CONFERENCE', 'GP_y', 'GP_Per']]

In [None]:
df

Unnamed: 0,PLAYER_ID,season,PLAYER_NAME,TEAM_ABBREVIATION,TEAM_ID,GP_x,Minutes,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,PER,TSP,W_PCT,Jersey,2KRank,LastASG?,PriorASG,Selected?,Starting_Position,pace,CONFERENCE,GP_y,GP_Per
0,255,2012,Grant Hill,LAC,1610612746,12,15.500000,1.992754,4.800725,0.090580,0.452899,0.996377,1.630435,0.362319,1.992754,1.086957,0.452899,0.362319,1.086957,1.902174,5.072464,0.652174,8.279628,0.459619,0.780,0,88.0,0,7,0,,0.920,West,55,0.218182
1,467,2012,Jason Kidd,NYK,1610612752,43,27.674419,2.679474,6.597573,1.971689,5.030334,0.834176,0.935288,0.707786,3.690597,4.145602,1.971689,0.353893,1.390293,1.794742,8.164813,3.089245,16.473405,0.582444,0.658,0,54.0,0,10,0,G,0.920,East,49,0.877551
2,703,2012,Kurt Thomas,NYK,1610612752,33,10.121212,1.119895,2.272727,0.000000,0.000000,0.197628,0.428195,0.922266,1.877470,0.461133,0.296443,0.362319,0.131752,1.416337,2.437418,0.644122,13.254729,0.495182,0.658,0,299.0,0,0,0,F,0.920,East,49,0.673469
3,708,2012,Kevin Garnett,BOS,1610612738,49,28.959184,6.499556,12.843833,0.000000,0.221828,2.639752,3.349601,0.931677,6.765750,2.395741,1.153505,1.086957,1.841171,2.284827,15.638864,0.652174,20.875268,0.546139,0.500,0,28.0,0,14,1,C,0.920,East,51,0.960784
4,708,2013,Kevin Garnett,BKN,1610612751,43,20.372093,3.071055,7.306140,0.000000,0.074300,0.792530,0.916363,1.040196,5.943978,1.585061,0.866830,0.619164,1.287862,2.674790,6.934641,-1.174190,13.316450,0.449756,0.436,1,29.0,1,15,0,F,0.939,East,51,0.843137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5011,1630270,2020,Xavier Sneed,CHA,1610612766,3,5.666667,0.672043,1.680108,0.672043,1.680108,0.000000,0.000000,0.336022,1.008065,0.336022,0.000000,0.000000,0.000000,1.344086,2.016129,3.696237,12.277989,0.600000,0.429,0,500.0,0,0,0,,0.992,East,24,0.125000
5012,1630271,2020,Brodric Thomas,HOU,1610612745,7,6.857143,0.720046,2.448157,0.288018,1.440092,0.864055,1.152074,0.144009,1.008065,0.720046,0.144009,0.288018,0.576037,0.864055,2.592166,-0.720046,9.090243,0.438596,0.308,0,500.0,0,0,0,,0.992,West,22,0.318182
5013,1630273,2020,Freddie Gillespie,DAL,1610612742,1,5.000000,1.008065,2.016129,0.000000,1.008065,0.000000,0.000000,0.000000,1.008065,0.000000,0.000000,0.000000,1.008065,1.008065,2.016129,-5.040323,-1.944355,0.500000,0.500,0,500.0,0,0,0,,0.992,West,23,0.043478
5014,1630278,2020,Ade Murkey,MIN,1610612750,1,3.000000,0.000000,1.008065,0.000000,0.000000,0.000000,0.000000,1.008065,0.000000,0.000000,0.000000,0.000000,0.000000,1.008065,0.000000,7.056452,-5.770833,0.000000,0.231,0,500.0,0,0,0,,0.992,West,22,0.045455


In [17]:
# replace nan with 0
df.fillna(0, inplace = True)

In [18]:
# Create a list with numeric importatn stats
xcols = df.columns[6 : 29].to_list()

print(xcols)

['Minutes', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS', 'PER', 'TSP', 'W_PCT', 'Jersey', '2KRank', 'LastASG?', 'PriorASG']


In [32]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)

In [46]:
# separate 2020 data
dftest = df[df['season']==2020]
df = df[df['season']!=2020]

In [50]:
# set train and test up
X_train, X_test, y_train, y_test = train_test_split(df[xcols], df['Selected?'], 
                                                    train_size = 0.8, random_state = 1)
print('training data:', X_train.shape)
print('test data:', X_test.shape)


training data: (3580, 23)
test data: (895, 23)


In [52]:
# logistic regression
# even without solver specified, same results are shown
log_reg = LogisticRegression(solver = 'lbfgs', max_iter = 3000)

# Fit the model to the training data
clf = log_reg.fit(X_train, y_train)

# Get accuracy stats
print('training accuracy: {}'.format(clf.score(X_train, y_train).round(3)))
print('test accuracy: {}'.format(clf.score(X_test, y_test).round(3)))

training accuracy: 0.981
test accuracy: 0.985


In [53]:
# Put the coefficients into a new dataframe
coef = pd.concat([pd.DataFrame(xcols),
                  pd.DataFrame(np.transpose(clf.coef_))], 
                 axis = 1)

coef.columns = ['feature','coefficient']

coef.sort_values(by=['coefficient'], 
                 ascending = False, 
                 inplace = True)

# Examine the features/stats with the 10 largest positive and negative coefficients
print('Ten largest positive features:\n', coef.head(10), '\n')
print('Ten largest negative features:\n', coef.tail(10))


Ten largest positive features:
        feature  coefficient
18       W_PCT     1.241583
11         BLK     0.982472
14         PTS     0.528736
9          AST     0.389290
7         OREB     0.357779
10         STL     0.342824
5          FTM     0.300816
19      Jersey     0.297670
8         DREB     0.271484
15  PLUS_MINUS     0.232366 

Ten largest negative features:
     feature  coefficient
0   Minutes     0.067060
17      TSP     0.016387
20   2KRank    -0.005918
12       TO    -0.007139
3      FG3M    -0.027362
4      FG3A    -0.060541
16      PER    -0.125590
6       FTA    -0.159313
2       FGA    -0.220158
13       PF    -0.753016


In [54]:
# Adding the features with the 10 largest positive and negative coefficients 
# to a new list for training purposes

xcols2 = coef.feature[0:10].to_list()
xcols2 += coef.feature[-10:].to_list()
print(xcols2)

['W_PCT', 'BLK', 'PTS', 'AST', 'OREB', 'STL', 'FTM', 'Jersey', 'DREB', 'PLUS_MINUS', 'Minutes', 'TSP', '2KRank', 'TO', 'FG3M', 'FG3A', 'PER', 'FTA', 'FGA', 'PF']


In [55]:


# test train fpr large coefficients
X_train, X_test, y_train, y_test = train_test_split(df[xcols2], 
                                                    df['Selected?'], 
                                                    train_size = 0.8, 
                                                    random_state = 1)
print('training data:', X_train.shape)
print('test data:', X_test.shape)


# fit the model to the training data
clf2 = log_reg.fit(X_train, y_train)

# get accuracy stats
print('training accuracy: {}'.format(clf2.score(X_train, y_train).round(3)))
print('test accuracy: {}'.format(clf2.score(X_test, y_test).round(3)))

training data: (3580, 20)
test data: (895, 20)
training accuracy: 0.98
test accuracy: 0.985


In [56]:
# Regularization with logistic regression & C parameter with logistic regression

# high C value means trust the data a lot; low means ehhh
cset = [.001, .01, .1, 1, 10]

for i in cset:
    print('C =', i)
    log_reg = LogisticRegression(solver = 'lbfgs', 
                                 max_iter = 3000, 
                                 C = i)
    
    clf = log_reg.fit(X_train, y_train)
    
    print('training accuracy: {}'.format(clf.score(X_train, y_train).round(3)))
    print('test accuracy: {}'.format(clf.score(X_test, y_test).round(3)), '\n')

C = 0.001
training accuracy: 0.977
test accuracy: 0.984 

C = 0.01
training accuracy: 0.978
test accuracy: 0.988 

C = 0.1
training accuracy: 0.979
test accuracy: 0.987 

C = 1
training accuracy: 0.98
test accuracy: 0.985 

C = 10
training accuracy: 0.98
test accuracy: 0.983 



In [57]:
# Cross-validation with tuning regularization in logistic regression

for i in cset:
    print('C =', i)
    log_reg = LogisticRegression(solver = 'lbfgs', 
                                 max_iter = 3000, 
                                 C = i)
    
    scores = cross_val_score(log_reg, 
                             df[xcols2], 
                             df['Selected?'], 
                             cv = 5)
    print(scores)
    print('Accuracy: %0.3f (+/- %0.3f)' % (scores.mean(), scores.std() * 2), '\n')

C = 0.001
[0.96871508 0.96759777 0.98324022 0.98547486 0.97988827]
Accuracy: 0.977 (+/- 0.015) 

C = 0.01
[0.96424581 0.96871508 0.98547486 0.9877095  0.98547486]
Accuracy: 0.978 (+/- 0.020) 

C = 0.1
[0.96648045 0.96871508 0.98435754 0.98435754 0.98435754]
Accuracy: 0.978 (+/- 0.016) 

C = 1
[0.96312849 0.9698324  0.98435754 0.98212291 0.98435754]
Accuracy: 0.977 (+/- 0.017) 

C = 10
[0.96089385 0.96871508 0.98324022 0.98435754 0.98547486]
Accuracy: 0.977 (+/- 0.020) 



In [58]:

y_pred_test = clf.predict(X_test)
print('  Recall: {:.3f}'.format(recall_score(y_test, y_pred_test)))
print('  Precision: {:.3f}'.format(precision_score(y_test, y_pred_test)))
print('  F1 score: {:.3f}'.format(f1_score(y_test, y_pred_test)))
cm = confusion_matrix(y_test, y_pred_test)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn+fp)
print('  Specificity: {:.3f}'.format(specificity))

  Recall: 0.667
  Precision: 0.846
  F1 score: 0.746
  Specificity: 0.995


In [59]:
# Confusion matrix

cm = confusion_matrix(y_test, y_pred_test)
sample = np.array([['TN', 'FP'], ['FN', 'TP']])
print('CM key:\n', sample, '\n')
print('CM for test:\n', cm)

CM key:
 [['TN' 'FP']
 ['FN' 'TP']] 

CM for test:
 [[858   4]
 [ 11  22]]


In [60]:
scaler = MinMaxScaler()

X = df[xcols2]
scaled_X = scaler.fit_transform(X)
y = df['Selected?']

In [61]:

scaled_X = scaler.fit_transform(X_train)

tuned_parameters = {'C': [0.1, 0.5, 1, 5, 10, 50, 100]}

grid = GridSearchCV(LogisticRegression(solver='liblinear'), tuned_parameters, 
                    cv = 3, scoring = 'accuracy')

grid.fit(scaled_X, y_train)

print('mean of accuracies:', grid.cv_results_['mean_test_score'])
print('std dev of accuracies:', grid.cv_results_['std_test_score'])

# print best parameter after tuning 
print('best parameters:', grid.best_params_) 

# store the best estimator (In this case it's 'C': 100)
best_logreg = grid.best_estimator_

mean of accuracies: [0.96480353 0.97206579 0.97402141 0.97709395 0.97793241 0.97960839
 0.98016697]
std dev of accuracies: [0.00314586 0.00345327 0.00247625 0.0030925  0.00172877 0.00172824
 0.00209615]
best parameters: {'C': 100}


In [62]:
# Random forest
param_grid = {'n_estimators': [10, 100, 250], 'max_samples': [.25, .5, 1]} 


# instantiate grid search object
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv = 3)

# fitting the model for grid search 
grid.fit(X, y)

# print parameters, mean, and standard deviation of scores by iteration
for z in range(0, len(grid.cv_results_['params'])):
    print('\nparams:', grid.cv_results_['params'][z])
    print('mean of accuracies:', grid.cv_results_['mean_test_score'][z])
    print('std dev of accuracies:', grid.cv_results_['std_test_score'][z])

# print best parameter after tuning 
print('\n***best parameters:', grid.best_params_)
print('best score:', grid.best_score_)

# store the best estimator (in this case it's 'max_samples': 0.5, 'n_estimators': 100)
best_rf = grid.best_estimator_


params: {'max_samples': 0.25, 'n_estimators': 10}
mean of accuracies: 0.9743017233577215
std dev of accuracies: 0.00574807759977616

params: {'max_samples': 0.25, 'n_estimators': 100}
mean of accuracies: 0.9767604734753471
std dev of accuracies: 0.004902073830482492

params: {'max_samples': 0.25, 'n_estimators': 250}
mean of accuracies: 0.9772071511583652
std dev of accuracies: 0.004341514164882438

params: {'max_samples': 0.5, 'n_estimators': 10}
mean of accuracies: 0.9751956780899876
std dev of accuracies: 0.005219742565397671

params: {'max_samples': 0.5, 'n_estimators': 100}
mean of accuracies: 0.9776539786829407
std dev of accuracies: 0.004779748224000809

params: {'max_samples': 0.5, 'n_estimators': 250}
mean of accuracies: 0.977430564920653
std dev of accuracies: 0.004554946953148619

params: {'max_samples': 1, 'n_estimators': 10}
mean of accuracies: 0.9532961846143887
std dev of accuracies: 0.00030133312320972805

params: {'max_samples': 1, 'n_estimators': 100}
mean of accurac

In [64]:
dftest = dftest.drop('Selected?', axis=1, errors='coerce')
dftest

Unnamed: 0,PLAYER_ID,season,PLAYER_NAME,TEAM_ABBREVIATION,TEAM_ID,GP_x,Minutes,FGM,FGA,FG3M,...,W_PCT,Jersey,2KRank,LastASG?,PriorASG,Starting_Position,pace,CONFERENCE,GP_y,GP_Per
254,2544,2020,LeBron James,LAL,1610612747,23,31.695652,8.897265,18.101332,2.717391,...,0.733,1,1.0,1,16,F,0.992,West,25,0.920000
264,2546,2020,Carmelo Anthony,POR,1610612757,21,23.809524,4.032258,10.800691,1.536098,...,0.571,0,119.0,0,10,F,0.992,West,22,0.954545
390,2730,2020,Dwight Howard,PHI,1610612755,23,17.173913,2.542076,4.207574,0.131487,...,0.667,0,119.0,0,8,C,0.992,East,23,1.000000
429,2738,2020,Andre Iguodala,MIA,1610612748,18,20.000000,1.568100,4.312276,1.064068,...,0.462,0,177.0,0,1,F,0.992,East,21,0.857143
534,101108,2020,Chris Paul,PHX,1610612756,20,31.000000,5.393145,12.298387,0.907258,...,0.615,0,17.0,1,10,G,0.992,West,22,0.909091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5010,1630269,2020,Keandre Cook,CHA,1610612766,1,1.000000,0.000000,2.016129,0.000000,...,0.429,0,500.0,0,0,,0.992,East,24,0.041667
5011,1630270,2020,Xavier Sneed,CHA,1610612766,3,5.666667,0.672043,1.680108,0.672043,...,0.429,0,500.0,0,0,,0.992,East,24,0.125000
5012,1630271,2020,Brodric Thomas,HOU,1610612745,7,6.857143,0.720046,2.448157,0.288018,...,0.308,0,500.0,0,0,,0.992,West,22,0.318182
5013,1630273,2020,Freddie Gillespie,DAL,1610612742,1,5.000000,1.008065,2.016129,0.000000,...,0.500,0,500.0,0,0,,0.992,West,23,0.043478


In [None]:
print(xcols)

['Minutes', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS', 'PER', 'TSP', 'W_PCT', 'Jersey', '2KRank', 'LastASG?', 'PriorASG']


In [75]:
# time to predict
pred = clf.predict_proba(dftest[xcols2])[:,1]
print(pred)

[9.93031749e-01 7.74467107e-04 2.24705531e-04 4.64961141e-05
 1.30411778e-01 4.73682915e-05 2.05670961e-03 6.61848572e-03
 2.47040572e-03 6.66098289e-05 5.64945662e-05 6.84208294e-02
 2.29746242e-05 1.19057043e-03 9.99269031e-01 7.94666898e-03
 1.29028179e-01 2.52653286e-04 3.77437921e-04 2.48151412e-06
 4.06151485e-04 1.39757924e-03 4.73163493e-01 1.47453166e-03
 1.95151493e-03 5.27503124e-03 8.38014123e-05 7.22419669e-03
 2.24256767e-06 1.29309910e-04 5.84718614e-08 2.02488369e-02
 2.63079281e-04 8.34622979e-04 1.16960366e-03 5.87198182e-03
 1.54809244e-03 9.42191242e-01 8.83411695e-05 9.30040216e-01
 2.88717669e-01 5.49023077e-05 1.08056846e-01 2.78146251e-04
 9.92317080e-06 3.39302750e-06 6.89624375e-04 6.00126793e-04
 1.37966591e-03 3.47054337e-05 3.33303433e-05 5.05889749e-02
 1.25811914e-04 3.04733435e-04 2.60229494e-01 8.78164982e-01
 1.21337338e-07 1.35070151e-05 4.95957253e-03 1.36737893e-05
 2.54923387e-04 5.45823483e-05 6.37674436e-05 9.61764459e-01
 6.07898836e-04 5.455142

In [76]:
# add predictions to df
dftest['All-Star'] = pred.tolist()
print(mydata)

      PLAYER_ID  season        PLAYER_NAME TEAM_ABBREVIATION     TEAM_ID  \
254        2544    2020       LeBron James               LAL  1610612747   
264        2546    2020    Carmelo Anthony               POR  1610612757   
390        2730    2020      Dwight Howard               PHI  1610612755   
429        2738    2020     Andre Iguodala               MIA  1610612748   
534      101108    2020         Chris Paul               PHX  1610612756   
...         ...     ...                ...               ...         ...   
5010    1630269    2020       Keandre Cook               CHA  1610612766   
5011    1630270    2020       Xavier Sneed               CHA  1610612766   
5012    1630271    2020     Brodric Thomas               HOU  1610612745   
5013    1630273    2020  Freddie Gillespie               DAL  1610612742   
5014    1630278    2020         Ade Murkey               MIN  1610612750   

      GP_x    Minutes       FGM        FGA      FG3M  ...        PER  \
254     23  31.

In [83]:
dftest.loc[(dftest['PLAYER_NAME']=='James Harden'),'CONFERENCE'] = 'East'

In [84]:
# West Predictions

dftest[dftest['CONFERENCE']=='West'].sort_values(by='All-Star', ascending=False).head(12)

Unnamed: 0,PLAYER_ID,season,PLAYER_NAME,TEAM_ABBREVIATION,TEAM_ID,GP_x,Minutes,FGM,FGA,FG3M,...,Jersey,2KRank,LastASG?,PriorASG,Starting_Position,pace,CONFERENCE,GP_y,GP_Per,All-Star
254,2544,2020,LeBron James,LAL,1610612747,23,31.695652,8.897265,18.101332,2.717391,...,1,1.0,1,16,F,0.992,West,25,0.92,0.993032
3362,203999,2020,Nikola Jokic,DEN,1610612743,23,33.434783,9.642356,16.567321,1.358696,...,1,11.0,1,2,C,0.992,West,23,1.0,0.992695
4593,1629029,2020,Luka Doncic,DAL,1610612742,22,32.636364,9.484971,20.619501,2.107771,...,1,8.0,1,1,G,0.992,West,23,0.956522,0.991113
2200,203076,2020,Anthony Davis,LAL,1610612747,20,31.45,8.518145,16.179435,1.108871,...,1,5.0,1,7,F,0.992,West,25,0.8,0.988091
2240,203081,2020,Damian Lillard,POR,1610612757,21,33.761905,8.304531,18.337174,3.74424,...,1,8.0,1,5,G,0.992,West,22,0.954545,0.980339
1986,202695,2020,Kawhi Leonard,LAC,1610612746,20,31.3,8.518145,16.935484,1.915323,...,1,3.0,1,4,F,0.992,West,24,0.833333,0.979601
1342,201939,2020,Stephen Curry,GSW,1610612744,23,32.0,9.072581,19.76683,4.470547,...,1,5.0,0,6,G,0.992,West,23,1.0,0.93004
1662,202331,2020,Paul George,LAC,1610612746,21,31.238095,7.440476,14.976959,3.264209,...,0,11.0,0,6,G,0.992,West,24,0.875,0.878165
4775,1629630,2020,Ja Morant,MEM,1610612763,10,27.2,7.560484,13.91129,0.907258,...,1,36.0,0,0,G,0.992,West,18,0.555556,0.782388
2845,203497,2020,Rudy Gobert,UTA,1610612762,23,28.73913,4.952665,8.152174,0.0,...,0,23.0,1,1,C,0.992,West,23,1.0,0.763749


Selected Ja Morant and Christian Wood instead of Devin Booker and Chris Paul

In [85]:
# East Predictions

dftest[dftest['CONFERENCE']=='East'].sort_values(by='All-Star', ascending=False).head(12)

Unnamed: 0,PLAYER_ID,season,PLAYER_NAME,TEAM_ABBREVIATION,TEAM_ID,GP_x,Minutes,FGM,FGA,FG3M,...,Jersey,2KRank,LastASG?,PriorASG,Starting_Position,pace,CONFERENCE,GP_y,GP_Per,All-Star
847,201142,2020,Kevin Durant,BKN,1610612751,18,35.0,10.080645,19.433244,2.688172,...,1,5.0,0,10,F,0.992,East,24,0.75,0.999269
3291,203954,2020,Joel Embiid,PHI,1610612755,17,30.117647,8.716793,16.18833,1.12666,...,0,10.0,1,3,C,0.992,East,23,0.73913,0.996992
2907,203507,2020,Giannis Antetokounmpo,MIL,1610612749,20,32.15,10.282258,18.75,1.260081,...,1,1.0,1,4,F,0.992,East,22,0.909091,0.996473
1880,202681,2020,Kyrie Irving,BKN,1610612751,16,33.6875,10.143649,19.153226,2.835181,...,1,11.0,0,6,G,0.992,East,24,0.666667,0.961764
4159,1628369,2020,Jayson Tatum,BOS,1610612738,15,32.933333,9.610215,20.63172,2.822581,...,1,23.0,1,1,F,0.992,East,20,0.75,0.957656
1316,201935,2020,James Harden,HOU,1610612745,18,36.0,7.33647,15.849014,3.024194,...,0,3.0,1,8,G,0.992,East,22,0.818182,0.942191
4587,1629027,2020,Trae Young,ATL,1610612737,22,32.090909,7.01063,17.182918,2.199413,...,0,17.0,1,1,G,0.992,East,23,0.956522,0.886019
2217,203078,2020,Bradley Beal,WAS,1610612764,17,33.588235,11.681689,24.549336,2.194023,...,0,17.0,0,2,G,0.992,East,19,0.894737,0.868082
2071,202710,2020,Jimmy Butler,MIA,1610612748,7,29.142857,6.048387,13.824885,0.0,...,1,15.0,1,5,F,0.992,East,21,0.333333,0.716094
3810,1627734,2020,Domantas Sabonis,IND,1610612754,23,34.869565,7.889201,14.463534,0.78892,...,0,36.0,1,1,F,0.992,East,23,1.0,0.688088


Selected *Jayson Tatum*, Trae Young, *Jimmy Butler*, *Sabonis*, Bam Adebayo, and Tobias Harris. Did not predict *Zion*, Jalyen Brown, Zach LaVine, Randle, Vuvecic, Ben Simmons 

*replaced*

**Selected Trae Young, Jimmy Butler, Bam Adebayo, and Tobias Harris. Did not predict Jalyen Brown, Zach Lavine, Randle, Vevecic, or Ben Simmons**

Predicted 33 all stars. There were 24 in real life. Is there a way to limit to 24 in the model?

**Starters:**
LeBron James, Anthony Davis, Kawhi Leonard, Luka Doncic, James Harden,
Giannis Antetokounmpo, Joel Embiid, Pascal Siakam, Kemba Walker, Trae Young

**Reserves:**
Damian Lillard, Ben Simmons, Nikola Jokic, Jayson Tatum, Chris Paul, Russell Westbrook, Domantas Sabonis, Khris Middleton, Bam Adebayo, Rudy Gobert, Kyle Lowry, Brandon Ingram, Donovan Mitchell

**Starters that were left out:** Pascal Siakam, Kemba Walker

**Reserves that were left out:** Chris Paul, Kyle Lowry

**MODEL SHOWS 83% ACCURACY OVERALL**