In [1]:
from nba_api.stats.static import teams, players
from nba_api.stats.endpoints import playercareerstats, leaguegamefinder, playergamelog, boxscoreadvancedv3, teamestimatedmetrics, playerestimatedmetrics
from nba_api.stats.library.parameters import Season, SeasonType
import pandas as pd
import numpy as np
from datetime import date
import matplotlib.pyplot as plt

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
import statsmodels.api as sm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
import sklearn
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import MinMaxScaler

In [3]:
#Predict above/below this number
point_threshold = "27.5"
player_full_name = "Tyrese Haliburton"
player_team_abbr = "IND"
opp_team_abbrev = "BOS"
game_date = "DEC 04, 2023"

In [4]:
#Fetch player data
player_details = [p for p in players.get_players() if p["full_name"] == player_full_name][0]

career_stats = playercareerstats.PlayerCareerStats(player_id=player_details['id']).get_data_frames()[0]

seasons = career_stats['SEASON_ID']

season_data = {}
for s in seasons:
    season_data[s] = playergamelog.PlayerGameLog(player_id=player_details['id'], season=s, season_type_all_star=SeasonType.regular).get_data_frames()[0]

In [5]:
"""
For each game, we want to extract the following information for training:
1. Opponent team defensive rating
2. Player team offensive rating
3. Player offensive rating
4. Player usage rate
5. Days since last game
6. Game number
"""

'\nFor each game, we want to extract the following information for training:\n1. Opponent team defensive rating\n2. Player team offensive rating\n3. Player offensive rating\n4. Player usage rate\n5. Days since last game\n6. Game number\n7. Average ppg since start of season\n'

In [12]:
#Form training arrays
X_currseason = []
X_allseasons = []
opp_def_rating_allseasons = []
opp_def_rating_currseason = []
team_off_rating_allseasons = []
team_off_rating_currseason = []
player_off_rating_allseasons = []
player_off_rating_currseason = []
player_usage_rate_allseasons = []
player_usage_rate_currseason = []
days_since_last_game_allseasons = []
days_since_last_game_currseason = []
game_numbers_allseasons = []
game_numbers_currseason = []
Y_regr_currseason = []
Y_class_currseason = []
Y_regr_allseasons = []
Y_class_allseasons = []



In [13]:
"""
Collect data. There's a different approach here. 
We are going to build two models, one predicting exact points (regression), one
predicting purely over/under point threshold. Also, we are going to train on two 
different timeframes of data: player whole career and current season only.
"""
for season in season_data.keys():
    currSeason = season_data[season]
    for ind in currSeason.index:
        #Label regressions
        Y_regr_allseasons.append(float(currSeason['PTS'][ind]))
        
        #Get defensive rating
        metrics = teamestimatedmetrics.TeamEstimatedMetrics(league_id="00", season=season, season_type = SeasonType.regular).get_data_frames()[0]
        opp_abbrev = currSeason['MATCHUP'][ind][-3:]


        for i in teams.get_teams():
            if i['abbreviation'] == opp_abbrev:
                opp_team_id = i['id']
                
        opp_def_rating_allseasons.append(metrics.loc[metrics["TEAM_ID"] == opp_team_id]["E_DEF_RATING"])

        #Get team offensive rating
        for i in teams.get_teams():
            if i['abbreviation'] == currSeason["MATCHUP"][ind][0:3]:
                plr_team_id = i['id']

        team_off_rating_allseasons.append(metrics.loc[metrics["TEAM_ID"] == plr_team_id]["E_OFF_RATING"])
        
        #Get player offensive rating
        player_metrics = playerestimatedmetrics.PlayerEstimatedMetrics(league_id="00", season=season, season_type = SeasonType.regular).get_data_frames()[0]
        player_id = player_details['id']

        player_off_rating_allseasons.append(player_metrics.loc[player_metrics["PLAYER_ID"] == player_id]["E_OFF_RATING"])

        #Get player usage rate
        player_usage_rate_allseasons.append(player_metrics.loc[player_metrics["PLAYER_ID"] == player_id]["E_USG_PCT"])
        
        #Label classifications
        if float(currSeason['PTS'][ind]) > float(point_threshold):
            Y_class_allseasons.append(1)
        else:
            Y_class_allseasons.append(0)
            
for ind in season_data["2023-24"].index:
    #Label regressions
    Y_regr_currseason.append(float(season_data["2023-24"]['PTS'][ind]))
    
    #Get team defensive ratings
    metrics = teamestimatedmetrics.TeamEstimatedMetrics(league_id="00", season="2023-24", season_type = SeasonType.regular).get_data_frames()[0]
    opp_abbrev = season_data["2023-24"]["MATCHUP"][ind][-3:]

    for i in teams.get_teams():
        if i['abbreviation'] == opp_abbrev:
            opp_team_id = i['id']

    opp_def_rating_currseason.append(metrics.loc[metrics["TEAM_ID"] == opp_team_id]["E_DEF_RATING"])

    #Get team offensive ratings
    for i in teams.get_teams():
        if i['abbreviation'] == currSeason["MATCHUP"][ind][0:3]:
            plr_team_id = i['id']

    team_off_rating_currseason.append(metrics.loc[metrics["TEAM_ID"] == plr_team_id]["E_OFF_RATING"])

    #Get player offensive rating
    player_metrics = playerestimatedmetrics.PlayerEstimatedMetrics(league_id="00", season="2023-24", season_type = SeasonType.regular).get_data_frames()[0]
    player_id = player_details['id']

    player_off_rating_currseason.append(player_metrics.loc[player_metrics["PLAYER_ID"] == player_id]["E_OFF_RATING"])

    #Get player usage rate
    player_usage_rate_currseason.append(player_metrics.loc[player_metrics["PLAYER_ID"] == player_id]["E_USG_PCT"])
    
    #Label classifications
    if float(season_data["2023-24"]['PTS'][ind]) > float(point_threshold):
        Y_class_currseason.append(1)
    else:
        Y_class_currseason.append(0)
    

In [19]:
months = {
    "JAN": 1,
    "FEB": 2,
    "MAR": 3,
    "APR": 4,
    "MAY": 5,
    "JUN": 6,
    "JUL": 7,
    "AUG": 8,
    "OCT": 10,
    "NOV": 11,
    "DEC": 12
}
def convert_human_date(d):
    month = months[d[0:3]]
    day = int(d[4:6])
    year = int(d[8:])

    return date(year, month, day)

In [20]:
def distance_between_dates(d1, d2):
    return int((d2 - d1).days)

In [21]:
#default days of rest set for the first game of the season:
first_game_rest = 7

In [23]:
#Compute days since last game
for season in season_data.keys():
    currSeason = season_data[season]
    for ind in currSeason.index:
        if (ind + 1) == len(currSeason.index):
            days_since_last_game_allseasons.append(first_game_rest)
        else:
            currGameDate = convert_human_date(str(currSeason["GAME_DATE"][ind]))
            lastGameDate = convert_human_date(str(currSeason["GAME_DATE"][ind+1]))
            days_since_last_game_allseasons.append(distance_between_dates(lastGameDate, currGameDate))
for ind in season_data["2023-24"].index:
    if (ind + 1) == len(season_data["2023-24"].index):
        days_since_last_game_currseason.append(first_game_rest)
    else:
        currGameDate = convert_human_date(str(season_data['2023-24']["GAME_DATE"][ind]))
        lastGameDate = convert_human_date(str(season_data['2023-24']["GAME_DATE"][ind+1]))
        days_since_last_game_currseason.append(distance_between_dates(lastGameDate, currGameDate))

In [24]:
#Get the game number, in terms of the number of games that player has played
for season in season_data.keys():
    for ind in season_data[season].index:
        game_numbers_allseasons.append(len(season_data[season].index) - ind)

for ind in season_data["2023-24"].index:
    game_numbers_currseason.append(len(season_data["2023-24"].index) - ind)
        

In [25]:
#default average ppg since start of season
default_first_game_ppg = 0.0

In [34]:
#Zip together all the training input data
X_allseasons = []
X_currseason = []
for i in range(len(game_numbers_allseasons)):
    tpl = (float(opp_def_rating_allseasons[i]), float(team_off_rating_allseasons[i]), float(player_off_rating_allseasons[i]), float(player_usage_rate_allseasons[i]), float(days_since_last_game_allseasons[i]), float(game_numbers_allseasons[i]))
    X_allseasons.append(tpl)

for i in range(len(game_numbers_currseason)):
    tpl = (float(opp_def_rating_currseason[i]), float(team_off_rating_currseason[i]), float(player_off_rating_currseason[i]), float(player_usage_rate_currseason[i]), float(days_since_last_game_currseason[i]), float(game_numbers_currseason[i]))
    X_currseason.append(tpl)
    

  tpl = (float(opp_def_rating_allseasons[i]), float(team_off_rating_allseasons[i]), float(player_off_rating_allseasons[i]), float(player_usage_rate_allseasons[i]), float(days_since_last_game_allseasons[i]), float(game_numbers_allseasons[i]))
  tpl = (float(opp_def_rating_currseason[i]), float(team_off_rating_currseason[i]), float(player_off_rating_currseason[i]), float(player_usage_rate_currseason[i]), float(days_since_last_game_currseason[i]), float(game_numbers_currseason[i]))


In [36]:
#Run PCA to determine which predictor variables to include, for all seasons
pca = PCA(n_components = 6)
pca.fit(X_allseasons)
print(pca.explained_variance_ratio_)

[9.19578377e-01 5.21248442e-02 1.43829547e-02 1.14980603e-02
 2.41571099e-03 5.26953523e-08]


In [37]:
#So then let's reform the data, without game number and days since last game from the data
X_allseasons = []
X_currseason = []
for i in range(len(game_numbers_allseasons)):
    tpl = (float(opp_def_rating_allseasons[i]), float(team_off_rating_allseasons[i]), float(player_off_rating_allseasons[i]), float(player_usage_rate_allseasons[i]))
    X_allseasons.append(tpl)

for i in range(len(game_numbers_currseason)):
    tpl = (float(opp_def_rating_currseason[i]), float(team_off_rating_currseason[i]), float(player_off_rating_currseason[i]), float(player_usage_rate_currseason[i]))
    X_currseason.append(tpl)
    

  tpl = (float(opp_def_rating_allseasons[i]), float(team_off_rating_allseasons[i]), float(player_off_rating_allseasons[i]), float(player_usage_rate_allseasons[i]))
  tpl = (float(opp_def_rating_currseason[i]), float(team_off_rating_currseason[i]), float(player_off_rating_currseason[i]), float(player_usage_rate_currseason[i]))


In [40]:
#Now, let's start preprocessing the data
Y_regr_allseasons = pd.DataFrame(Y_regr_allseasons)
Y_regr_currseason = pd.DataFrame(Y_regr_currseason)
Y_class_allseasons = pd.DataFrame(Y_class_allseasons)
Y_class_currseason = pd.DataFrame(Y_class_currseason)

X_allseasons = pd.DataFrame(X_allseasons)
X_currseason = pd.DataFrame(X_currseason)

In [47]:
X_regr_allseasons_train, X_regr_allseasons_test, Y_regr_allseasons_train, Y_regr_allseasons_test = train_test_split(X_allseasons, Y_regr_allseasons, test_size=.25)
X_class_allseasons_train, X_class_allseasons_test, Y_class_allseasons_train, Y_class_allseasons_test = train_test_split(X_allseasons, Y_class_allseasons, test_size=.25)
X_regr_currseason_train, X_regr_currseason_test, Y_regr_currseason_train, Y_regr_currseason_test = train_test_split(X_currseason, Y_regr_currseason, test_size=.25)
X_class_currseason_train, X_class_currseason_test, Y_class_currseason_train, Y_class_currseason_test = train_test_split(X_currseason, Y_class_currseason, test_size=.25)

In [None]:
#Let's start with the classification models. We'll check with 5-fold cross validation

In [60]:
#all seasons SVC. Has a great cross-validation score
svc = SVC()
scores = cross_val_score(svc, X_class_allseasons_train, Y_class_allseasons_train.values.ravel(), cv=5)
print(np.mean(scores))

0.9161290322580644


In [56]:
#current season SVC. Not great. 
svc = SVC()
scores = cross_val_score(svc, X_class_currseason_train, Y_class_currseason_train.values.ravel(), cv=5)
print(scores)

[0.66666667 0.66666667 0.5        0.5        0.5       ]


In [59]:
#All seasons logistic classifier. Pretty good!
log = LogisticRegression()
scores = cross_val_score(log, X_class_allseasons_train, Y_class_allseasons_train.values.ravel(), cv=5)
print(np.mean(scores))

0.9161290322580644


In [58]:
#Current season logistic classifier. Bad
log = LogisticRegression()
scores = cross_val_score(log, X_class_currseason_train, Y_class_currseason_train.values.ravel(), cv=5)
print(scores)

[0.33333333 0.33333333 0.         0.5        0.5       ]


In [None]:
#What's happening is a lack of data problem. There have been fewer than twenty games this season for any given team. So let's use all season data now. 

In [61]:
#All season random forest classifier. Not as good as SVC or logistic classifier. 
rcf = RandomForestClassifier()
scores = cross_val_score(rcf, X_class_allseasons_train, Y_class_allseasons_train.values.ravel(), cv=5)
print(np.mean(scores))

0.8709677419354838


In [62]:
#Gradient boosting. Not as great as SVC or logistic classifier.
gbc = GradientBoostingClassifier()
scores = cross_val_score(gbc, X_class_allseasons_train, Y_class_allseasons_train.values.ravel(), cv=5)
print(np.mean(scores))

0.8709677419354838


In [65]:
#Ridge classifier. This is pretty good too. 
rc = RidgeClassifier()
scores = cross_val_score(rc, X_class_allseasons_train, Y_class_allseasons_train.values.ravel(), cv=5)
print(np.mean(scores))

0.9161290322580644


In [None]:
#Now, let's try regression on allseasons.