In [1]:
#API and numerical imports
from nba_api.stats.static import teams, players
from nba_api.stats.endpoints import playercareerstats, leaguegamefinder, playergamelog, boxscoreadvancedv3
from nba_api.stats.library.parameters import Season, SeasonType
import pandas as pd
import numpy as np
from datetime import date
import matplotlib.pyplot as plt

In [2]:
#Machine learning imports
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import sklearn
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import MinMaxScaler

In [3]:
#Predict above/below this number
point_threshold = '30.0' 
player_full_name = "LeBron James"
player_team_abbrev2 = "CLE"
opp_abbrev2 = "GSW"
game_date = "NOV 29, 2023" 

In [4]:
#Retrieve player info (id, first name, last name, is_active, etc.
player_details = [p for p in players.get_players() if p["full_name"] == player_full_name][0]

In [5]:
#Retrieve all the seasons the player has been in the league
career_stats = playercareerstats.PlayerCareerStats(player_id=player_details['id']).get_data_frames()[0]
seasons = career_stats['SEASON_ID']

In [6]:
#Fix type of season
season_type = SeasonType.regular

In [7]:
#Retrieve game-by-game stats for player 
season_data = {}
for s in seasons:
    season_data[s] = playergamelog.PlayerGameLog(player_id=player_details['id'], season=s, season_type_all_star=season_type).get_data_frames()[0]

In [8]:
"""
For each game, we want to extract the following information for training:
1. Opponent
2. Player's team
3. Days since last game
4. Game number
5. Average PPG since start of season
"""


"\nFor each game, we want to extract the following information for training:\n1. Opponent\n2. Player's team\n3. Days since last game\n4. Game number\n5. Average PPG since start of season\n"

In [9]:
#Form training arrays
#Entries in each tuple agree with the above
X = []
#Y will be 1 if player scores above threshold, 0 if below
Y = []

In [10]:
#Label training data
for season in season_data.keys():
    currSeason = season_data[season]
    for ind in currSeason.index:
        if float(currSeason['PTS'][ind]) > float(point_threshold):
            Y.append(1)
        else:
            Y.append(0)

In [11]:
print(len(Y))

1439


In [12]:
def correct_id(abbrev):
    if abbrev == "NJN":
        return "BKN"
    elif abbrev == "NOH" or abbrev == "NOK":
        return "NOP"
    elif abbrev == "SEA":
        return "OKC"
    elif abbrev == "PHO":
        return "PHX"
    elif abbrev == "GOS":
        return "GSW"
    else:
        return abbrev

In [13]:
def get_team_id_from_abbrev(abbrev):
    abbrev = correct_id(abbrev)
    return [t for t in teams.get_teams() if t['abbreviation'] == abbrev][0]['id']

In [14]:
#Collect player team and opponent team IDs for training data
player_team_ids = []
opp_team_ids = []
for season in season_data.keys():
    currSeason = season_data[season]
    for ind in currSeason.index:
        matchup = str(currSeason['MATCHUP'][ind])
        player_team_abbrev = matchup[0:3]

        #@ index
        if "@" in matchup:
            opp_team_abbrev = matchup[6:]
        elif "vs." in matchup:
            opp_team_abbrev = matchup[8:]
        else:
            raise Exception("Unconforming matchup string: " + matchup)

        player_team_ids.append(get_team_id_from_abbrev(player_team_abbrev))
        opp_team_ids.append(get_team_id_from_abbrev(opp_team_abbrev))

In [15]:
months = {
    "JAN": 1,
    "FEB": 2,
    "MAR": 3,
    "APR": 4,
    "MAY": 5,
    "JUN": 6,
    "JUL": 7,
    "AUG": 8,
    "OCT": 10,
    "NOV": 11,
    "DEC": 12
}
def convert_human_date(d):
    month = months[d[0:3]]
    day = int(d[4:6])
    year = int(d[8:])

    return date(year, month, day)
    

In [16]:
def distance_between_dates(d1, d2):
    return int((d2 - d1).days)

In [17]:
#default days of rest set for the first game of the season:
first_game_rest = 7

In [18]:
#Compute days since last game
days_since_last_game = []
for season in season_data.keys():
    currSeason = season_data[season]
    for ind in currSeason.index:
        if (ind + 1) == len(currSeason.index):
            days_since_last_game.append(first_game_rest)
        else:
            currGameDate = convert_human_date(str(currSeason["GAME_DATE"][ind]))
            lastGameDate = convert_human_date(str(currSeason["GAME_DATE"][ind+1]))
            days_since_last_game.append(distance_between_dates(lastGameDate, currGameDate))
            

In [19]:
#Get the game number, in terms of the number of games that player has played in the season
game_numbers = []
for season in season_data.keys():
    for ind in season_data[season].index:
        game_numbers.append(len(season_data[season].index) - ind)

In [20]:
#default average ppg since start of season for player
default_first_game_ppg = 0.0

In [21]:
#Compute average ppg since start of season
average_ppg = []
for season in season_data.keys():
    currSeason = season_data[season]
    for ind in currSeason.index:
        if (ind + 1) == len(currSeason.index):
            average_ppg.append(default_first_game_ppg)
        else:
            sum = 0.0
            for pts in currSeason["PTS"][ind+1:len(currSeason.index)]:
                sum += float(pts)
            average_ppg.append(sum / (len(currSeason.index) - ind - 1))

In [22]:
#Zip together all the training input data
for i in range(len(game_numbers)):
    X.append([float(player_team_ids[i]), float(opp_team_ids[i]), float(days_since_last_game[i]), float(game_numbers[i]), average_ppg[i]])

In [23]:
"""
Below we start doing preprocessing
"""

'\nBelow we start doing preprocessing\n'

In [24]:
Y = pd.DataFrame(Y)

In [25]:
X = pd.DataFrame(X)

In [26]:
print(len(Y.index))

1439


In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

In [28]:
"""
Next, we begin constructing the classification model
"""

'\nNext, we begin constructing the classification model\n'

In [29]:
svc = SVC()
svc.fit(X_train, Y_train.values.ravel())
svc.score(X_test, Y_test.values.ravel())

0.675

In [30]:
log = LogisticRegression()
log.fit(X_train, Y_train.values.ravel())
log.score(X_test, Y_test)

0.675

In [31]:
rcf = RandomForestClassifier()
rcf.fit(X_train, Y_train.values.ravel())

In [32]:
gbc = GradientBoostingClassifier()

In [33]:
gbc.fit(X_train, Y_train.values.ravel())

In [34]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, Y_train.values.ravel())

In [35]:
"""
Next, we have tools to calculate the desired input data for a player, to be used for predictions
"""

'\nNext, we have tools to calculate the desired input data for a player, to be used for predictions\n'

In [36]:
opp_id, player_team_id = get_team_id_from_abbrev(opp_abbrev2), get_team_id_from_abbrev(player_team_abbrev2)
days_elapsed = (convert_human_date(game_date) - convert_human_date(season_data["2023-24"]["GAME_DATE"][0])).days
game_number = len(season_data["2023-24"].index) + 1
average_ppg = 0.0
currSeason = season_data['2023-24']
for ind in currSeason.index:
        if (ind + 1) == len(currSeason.index):
            average_ppg = (default_first_game_ppg)
        else:
            sum = 0.0
            for pts in currSeason["PTS"][ind+1:len(currSeason.index)]:
                sum += float(pts)
            average_ppg = (sum / (len(currSeason.index) - ind - 1))
input = pd.DataFrame([[float(player_team_id), float(opp_id), float(days_elapsed), float(game_number), float(average_ppg)]])

In [37]:
accuracies = []
predictions = []

In [46]:
score = svc.score(X_test, Y_test.values.ravel())
prediction = svc.predict(input)[0]
predictions.append(prediction)
accuracies.append(score)
print("SVC Accuracy: ")
print(score)
print("SVC Prediction: ")
print(prediction)

SVC Accuracy: 
0.675
SVC Prediction: 
0


In [47]:
score = log.score(X_test, Y_test.values.ravel())
prediction = log.predict(input)[0]
predictions.append(prediction)
accuracies.append(score)
print("Logistic Regression Accuracy: ")
print(score)
print("Logistic Regression Prediction: ")
print(prediction)

Logistic Regression Accuracy: 
0.675
Logistic Regression Prediction: 
0


In [48]:
score = rcf.score(X_test, Y_test.values.ravel())
prediction = rcf.predict(input)[0]
predictions.append(prediction)
accuracies.append(score)
print("Random Forest Accuracy: ")
print(score)
print("Random Forest Prediction: ")
print(prediction)

Random Forest Accuracy: 
0.6111111111111112
Random Forest Prediction: 
1


In [49]:
score = gbc.score(X_test, Y_test.values.ravel())
prediction = gbc.predict(input)[0]
predictions.append(prediction)
accuracies.append(score)
print("Gradient Boosting Accuracy: ")
print(score)
print("Gradient Boosting Prediction: ")
print(prediction)

Gradient Boosting Accuracy: 
0.6472222222222223
Gradient Boosting Prediction: 
1


In [50]:
score = dtc.score(X_test, Y_test.values.ravel())
prediction = dtc.predict(input)[0]
predictions.append(prediction)
accuracies.append(score)
print("Decision Tree Accuracy: ")
print(score)
print("Decision Tree Prediction: ")
print(prediction)

Decision Tree Accuracy: 
0.5583333333333333
Decision Tree Prediction: 
1


In [43]:
#Betting tools

In [60]:
cume = 0.0
over_probabilities = []
under_probabilities = []
for i in range(len(predictions)):
    if int(predictions[i]) == 1:
        cume += accuracies[i]
        over_probabilities.append(accuracies[i])
        under_probabilities.append(1-accuracies[i])
    else:
        cume -= accuracies[i]
        over_probabilities.append(1-accuracies[i])
        under_probabilities.append(accuracies[i])

if cume < 0.0:
    print("Under probs with confidence " + str(cume))
else:
    print("Over probs with confidence " + str(cume))

sum_of_overs = 0.0
for i in over_probabilities:
    sum_of_overs += i
    
sum_of_unders = 0.0
for i in under_probabilities:
    sum_of_unders += i

print("Average of all probabilities over: " + str(sum_of_overs / len(over_probabilities)))
print("Average of all probabilities under: " + str(sum_of_unders / len(under_probabilities)))
        
    

Over with confidence 0.4666666666666667
Average of all probabilities over: 0.49333333333333335
Average of all probabilities under: 0.5066666666666666
