In [1]:
from nba_api.stats.static import teams, players
from nba_api.stats.endpoints import playercareerstats, leaguegamefinder, playergamelog, boxscoreadvancedv3, teamestimatedmetrics, playerestimatedmetrics
from nba_api.stats.library.parameters import Season, SeasonType
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import sklearn

In [3]:
# Prediction target: the player, the player's team and the opposing team
# (both given as team abbreviations).
player_full_name, player_team_abbr, opp_team_abbrev = "Tyrese Haliburton", "IND", "BOS"
# Points line: the model predicts whether the player scores above or below this number.
point_threshold = "27.5"

In [4]:
#Hyperparameters
# first_game_rest: default days of rest assigned to the first game of the season.
# default_first_game_ppg: default average points per game since the start of the season.
first_game_rest, default_first_game_ppg = 7, 0.0

In [5]:
#Fetch player data
matching_players = [p for p in players.get_players() if p["full_name"] == player_full_name]
if not matching_players:
    raise ValueError(f"No player named {player_full_name!r} in nba_api's static player list")
player_details = matching_players[0]
player_id = player_details['id']

career_stats = playercareerstats.PlayerCareerStats(player_id=player_id).get_data_frames()[0]

# De-duplicate season ids so each season's game log is downloaded only once
# (presumably a season can repeat in the career table, e.g. after a mid-season trade).
seasons = career_stats['SEASON_ID'].drop_duplicates()

season_data = {}
for s in seasons:
    season_data[s] = playergamelog.PlayerGameLog(player_id=player_id, season=s, season_type_all_star=SeasonType.regular).get_data_frames()[0]

if not season_data:
    raise ValueError(f"No career seasons found for {player_full_name!r}")

# Season ids are assumed to be "YYYY-YY" strings (the format used for the prediction
# season elsewhere in this notebook), so the lexicographic max is the most recent season.
current_season = max(season_data)

# Abbreviation -> team id lookup, built once instead of scanning teams.get_teams() per game.
team_id_by_abbr = {t['abbreviation']: t['id'] for t in teams.get_teams()}

#Form training arrays
opp_def_rating_allseasons = []
team_off_rating_allseasons = []
player_off_rating_allseasons = []
player_usage_rate_allseasons = []
Y_regr_allseasons = []
Y_class_allseasons = []
game_seasons = []  # season id of every kept game, used to carve out the current-season subset

# Collect data. There's a different approach here.
# We are going to build two models, one predicting exact points (regression), one
# predicting purely over/under point threshold. Also, we are going to train on two
# different timeframes of data: player whole career and current season only.
for season, currSeason in season_data.items():
    if currSeason.empty:
        continue

    # The estimated metrics depend only on the season, so request them once per season
    # rather than once per game; per-game requests are what made this cell time out.
    metrics = teamestimatedmetrics.TeamEstimatedMetrics(league_id="00", season=season, season_type = SeasonType.regular).get_data_frames()[0]
    player_metrics = playerestimatedmetrics.PlayerEstimatedMetrics(league_id="00", season=season, season_type = SeasonType.regular).get_data_frames()[0]

    def_rating_by_team = dict(zip(metrics["TEAM_ID"], metrics["E_DEF_RATING"]))
    off_rating_by_team = dict(zip(metrics["TEAM_ID"], metrics["E_OFF_RATING"]))

    player_rows = player_metrics.loc[player_metrics["PLAYER_ID"] == player_id]
    if player_rows.empty:
        # No estimated metrics for the player this season: no features can be built.
        continue
    season_player_off_rating = float(player_rows["E_OFF_RATING"].iloc[0])
    season_player_usage_rate = float(player_rows["E_USG_PCT"].iloc[0])

    for matchup, pts in zip(currSeason['MATCHUP'], currSeason['PTS']):
        # MATCHUP starts with the player's team abbreviation and ends with the opponent's.
        plr_team_id = team_id_by_abbr.get(matchup[0:3])
        opp_team_id = team_id_by_abbr.get(matchup[-3:])

        # Skip games whose teams cannot be resolved instead of silently reusing
        # the ids left over from the previous game.
        if plr_team_id not in off_rating_by_team or opp_team_id not in def_rating_by_team:
            continue

        opp_def_rating_allseasons.append(float(def_rating_by_team[opp_team_id]))
        team_off_rating_allseasons.append(float(off_rating_by_team[plr_team_id]))
        player_off_rating_allseasons.append(season_player_off_rating)
        player_usage_rate_allseasons.append(season_player_usage_rate)

        #Label regression (exact points) and classification (over/under the threshold)
        points = float(pts)
        Y_regr_allseasons.append(points)
        Y_class_allseasons.append(1 if points > float(point_threshold) else 0)
        game_seasons.append(season)

#Zip together the data
X_allseasons = list(zip(opp_def_rating_allseasons, team_off_rating_allseasons, player_off_rating_allseasons, player_usage_rate_allseasons))

#Current-season subset of the same rows
curr_rows = [i for i, s in enumerate(game_seasons) if s == current_season]
X_currseason = [X_allseasons[i] for i in curr_rows]
Y_regr_currseason = [Y_regr_allseasons[i] for i in curr_rows]
Y_class_currseason = [Y_class_allseasons[i] for i in curr_rows]

if len(X_allseasons) < 2 or len(X_currseason) < 2:
    raise ValueError(
        f"Not enough usable games to split: {len(X_allseasons)} career, "
        f"{len(X_currseason)} in season {current_season}"
    )

#Now, let's start preprocessing the data
Y_regr_allseasons = pd.DataFrame(Y_regr_allseasons)
Y_regr_currseason = pd.DataFrame(Y_regr_currseason)
Y_class_allseasons = pd.DataFrame(Y_class_allseasons)
Y_class_currseason = pd.DataFrame(Y_class_currseason)

X_allseasons = pd.DataFrame(X_allseasons)
X_currseason = pd.DataFrame(X_currseason)

# Fixed seed so the splits (and everything trained on them) are reproducible across runs.
split_seed = 42
X_regr_allseasons_train, X_regr_allseasons_test, Y_regr_allseasons_train, Y_regr_allseasons_test = train_test_split(X_allseasons, Y_regr_allseasons, test_size=.25, random_state=split_seed)
X_class_allseasons_train, X_class_allseasons_test, Y_class_allseasons_train, Y_class_allseasons_test = train_test_split(X_allseasons, Y_class_allseasons, test_size=.25, random_state=split_seed)
X_regr_currseason_train, X_regr_currseason_test, Y_regr_currseason_train, Y_regr_currseason_test = train_test_split(X_currseason, Y_regr_currseason, test_size=.25, random_state=split_seed)
X_class_currseason_train, X_class_currseason_test, Y_class_currseason_train, Y_class_currseason_test = train_test_split(X_currseason, Y_class_currseason, test_size=.25, random_state=split_seed)

ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)

In [None]:
#Train logistic model
log = LogisticRegression()
# The labels are held as a single-column table; flatten them to the 1-D array
# fit() expects, which avoids sklearn's column-vector DataConversionWarning.
log.fit(X_class_allseasons_train, np.ravel(Y_class_allseasons_train))

In [None]:
#Collect data for desired prediction
prediction_season = "2023-24"

# Resolve both team ids up front; an unknown abbreviation raises KeyError here
# instead of leaving a stale or undefined id for the lookups below.
team_id_by_abbr = {t['abbreviation']: t['id'] for t in teams.get_teams()}
opp_team_id = team_id_by_abbr[opp_team_abbrev]
player_team_id = team_id_by_abbr[player_team_abbr]

# One request serves both team ratings: it is the same endpoint, season and season type.
metrics = teamestimatedmetrics.TeamEstimatedMetrics(league_id="00", season=prediction_season, season_type = SeasonType.regular).get_data_frames()[0]

#First, we find the opposing team's defensive rating this season.
# .iloc[0] takes the scalar out of the one-row selection so the model receives plain numbers.
def_rating = float(metrics.loc[metrics["TEAM_ID"] == opp_team_id, "E_DEF_RATING"].iloc[0])

#Second, we find the player's team offensive rating
team_off_rating = float(metrics.loc[metrics["TEAM_ID"] == player_team_id, "E_OFF_RATING"].iloc[0])

#Third, we find the player's offensive rating
player_metrics = playerestimatedmetrics.PlayerEstimatedMetrics(league_id="00", season=prediction_season, season_type = SeasonType.regular).get_data_frames()[0]
player_id = player_details['id']
player_row = player_metrics.loc[player_metrics["PLAYER_ID"] == player_id]

player_off_rating = float(player_row["E_OFF_RATING"].iloc[0])

#Fourth, we find the player's usage rate
player_usage_rate = float(player_row["E_USG_PCT"].iloc[0])

#Accumulate the data into a tuple
# Same feature order as the training rows: opponent defense, team offense,
# player offense, player usage.
input = (def_rating, team_off_rating, player_off_rating, player_usage_rate)



In [None]:
#Make the prediction
# predict() takes a batch of samples, so the single feature tuple is wrapped in a list.
sample_rows = [input]
log.predict(sample_rows)