In [7]:
# A fast-growing method of sports betting in the U.S. is 
#   "player props", where the bettor wagers on certain players 
#   going over or under a line on a certain statistic.

# This analysis intends to develop a model for betting player props.

# The below code will not run unless you have installed the nba_api package:
#    pip install nba_api

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from nba_api.stats.static import players
from nba_api.stats.endpoints import playergamelogs

# Show every column when a DataFrame is displayed (None = no column limit).
pd.options.display.max_columns = None

# get_players returns a list of dictionaries, each representing a player.
nba_players = players.get_players()

test_player_name = "LeBron James"

# Exact match on full name. Take the first hit, and fail with a readable
#   message (instead of a bare IndexError) when the name is misspelled or
#   missing from the static player list.
chosen_player = next(
    (player for player in nba_players if player['full_name'] == test_player_name),
    None,
)
if chosen_player is None:
    raise ValueError(
        f"No player with full_name == {test_player_name!r} in players.get_players(); "
        "check the spelling of test_player_name."
    )

chosen_player_id = chosen_player['id']

print('Test player:', chosen_player['full_name'], '-', chosen_player_id)

In [None]:
# Only use the last two seasons.
# The reason for this is that, in my experience from the sports betting industry,
#   props are most often wrong in two cases:
#   1) young players whose numbers can make large leaps year-over-year as they improve;
#   2) aging players whose numbers tend to drop rapidly toward the end of their careers.
# Using just the current season and the previous one minimizes the chances of
#   incorrectly estimating a player due to these effects.

seasons = ['2022-23', '2023-24']


def fetch_game_logs(season_list, player_id=None):
    """Download PlayerGameLogs one season at a time and stack them into one DataFrame.

    The endpoint's season parameter is documented as a single 'YYYY-YY' string,
    so each season is requested separately rather than passing the whole list
    and depending on how a list happens to be URL-encoded.

    Parameters
    ----------
    season_list : iterable of str
        Season labels such as '2023-24'. Must contain at least one season.
    player_id : optional
        When given, passed as ``player_id_nullable`` to restrict the logs to
        that player. When None, the argument is omitted (all players).

    Returns
    -------
    pandas.DataFrame
        Rows of the 'PlayerGameLogs' result set for every requested season,
        concatenated in the order of ``season_list`` with a fresh index.
    """
    season_frames = []
    for season in season_list:
        query = {'season_nullable': season}
        if player_id is not None:
            query['player_id_nullable'] = player_id
        raw_json = playergamelogs.PlayerGameLogs(**query).get_normalized_json()
        season_frames.append(pd.DataFrame(json.loads(raw_json)['PlayerGameLogs']))
    return pd.concat(season_frames, ignore_index=True)


# Get the test player's data.

testGames = fetch_game_logs(seasons, player_id=chosen_player_id)

# We also need a data set of all players, to know how the counting stats are distributed.

all_recent_player_games = fetch_game_logs(seasons)

all_recent_player_games.head(10)

In [None]:
# Keep only player-games with more than 24 minutes played
#   (i.e., the player was on the floor for more than half the game).
# This should drop early-injury exits and "garbage time" appearances
#   at the end of blowouts.

played_over_half = all_recent_player_games['MIN'] > 24.0
player_games_df = all_recent_player_games[played_over_half]

player_games_df['PTS'].describe()

In [None]:
# Some research suggests that point scoring is Gaussian distributed,
#   but as this graph suggests, it can be skewed.

points_per_game = player_games_df['PTS']
pgpts_mean = points_per_game.mean()

# Bar heights are shares of all qualifying player-games, not raw counts.
sns.histplot(points_per_game, stat="probability")
plt.show()

In [None]:
# Simulate the probability that the test player goes over 34.5 points

from scipy.stats import norm

sim_size = 1000000

# Seeded generator so that Restart & Run All reproduces the same draws
#   (and therefore the same probabilities quoted in later cells).
pts_rng = np.random.default_rng(2024)

# Model the player's points as Normal(mean, std) of his own recent games.
pts_sim = norm.rvs(
    loc=testGames['PTS'].mean(),
    scale=testGames['PTS'].std(),
    size=sim_size,
    random_state=pts_rng,
)

# Share of simulated games above the line (plain Python float).
p_pts_over = np.count_nonzero(pts_sim > 34.5) / pts_sim.size
p_pts_over

In [None]:
# Now, rebounds.
# My experience from similar metrics (i.e., both goals and corner kicks in soccer)
#  suggests that rebounds should be distributed as a Poisson variable.

from scipy.stats import poisson

pgreb_mean = player_games_df['REB'].mean()

# Poisson pmf at k = 0..29 using the league-wide mean as the rate.
pgreb_pos = pd.DataFrame(
    {"x": range(0, 30),
     "expected": poisson.pmf(np.arange(30), pgreb_mean)}
)

# discrete=True gives one unit-width bar centred on each integer count, so each
#   bar height is the empirical P(REB = k) and is directly comparable to the pmf.
#   Automatic binning can merge or split integer values and distort that comparison.
reb_ax = sns.histplot(player_games_df['REB'], stat="probability", discrete=True)
reb_ax.plot(pgreb_pos.x, pgreb_pos.expected, color="tab:orange", marker="o",
            markersize=3, label="Poisson(mean REB)")
reb_ax.set_title("Rebounds per player-game vs. Poisson")
reb_ax.legend()
plt.show()

In [None]:
# Assists, made 3-point shots, steals, and blocks should be similar
# Assists:

pgast_mean = player_games_df['AST'].mean()

# Poisson pmf at k = 0..29 using the league-wide mean as the rate.
pgast_pos = pd.DataFrame(
    {"x": range(0, 30),
     "expected": poisson.pmf(np.arange(30), pgast_mean)}
)

# discrete=True gives one unit-width bar centred on each integer count, so each
#   bar height is the empirical P(AST = k) and is directly comparable to the pmf.
ast_ax = sns.histplot(player_games_df['AST'], stat="probability", discrete=True)
ast_ax.plot(pgast_pos.x, pgast_pos.expected, color="tab:orange", marker="o",
            markersize=3, label="Poisson(mean AST)")
ast_ax.set_title("Assists per player-game vs. Poisson")
ast_ax.legend()
plt.show()

In [None]:
# Made 3-pointers:

pg3pt_mean = player_games_df['FG3M'].mean()

# Poisson pmf at k = 0..29 using the league-wide mean as the rate.
pg3pt_pos = pd.DataFrame(
    {"x": range(0, 30),
     "expected": poisson.pmf(np.arange(30), pg3pt_mean)}
)

# discrete=True gives one unit-width bar centred on each integer count, so each
#   bar height is the empirical P(FG3M = k) and is directly comparable to the pmf.
fg3m_ax = sns.histplot(player_games_df['FG3M'], stat="probability", discrete=True)
fg3m_ax.plot(pg3pt_pos.x, pg3pt_pos.expected, color="tab:orange", marker="o",
             markersize=3, label="Poisson(mean FG3M)")
fg3m_ax.set_title("Made 3-pointers per player-game vs. Poisson")
fg3m_ax.legend()
plt.show()

In [None]:
# Steals:

pgstl_mean = player_games_df['STL'].mean()

# Poisson pmf at k = 0..29 using the league-wide mean as the rate.
pgstl_pos = pd.DataFrame(
    {"x": range(0, 30),
     "expected": poisson.pmf(np.arange(30), pgstl_mean)}
)

# discrete=True gives one unit-width bar centred on each integer count, so each
#   bar height is the empirical P(STL = k) and is directly comparable to the pmf.
stl_ax = sns.histplot(player_games_df['STL'], stat="probability", discrete=True)
stl_ax.plot(pgstl_pos.x, pgstl_pos.expected, color="tab:orange", marker="o",
            markersize=3, label="Poisson(mean STL)")
stl_ax.set_title("Steals per player-game vs. Poisson")
stl_ax.legend()
plt.show()

In [None]:
# Blocked shots:

pgblk_mean = player_games_df['BLK'].mean()

# Poisson pmf at k = 0..29 using the league-wide mean as the rate.
pgblk_pos = pd.DataFrame(
    {"x": range(0, 30),
     "expected": poisson.pmf(np.arange(30), pgblk_mean)}
)

# discrete=True gives one unit-width bar centred on each integer count, so each
#   bar height is the empirical P(BLK = k) and is directly comparable to the pmf.
blk_ax = sns.histplot(player_games_df['BLK'], stat="probability", discrete=True)
blk_ax.plot(pgblk_pos.x, pgblk_pos.expected, color="tab:orange", marker="o",
            markersize=3, label="Poisson(mean BLK)")
blk_ax.set_title("Blocked shots per player-game vs. Poisson")
blk_ax.legend()
plt.show()

In [None]:
# Steals, Threes made, and Blocks are almost perfectly Poisson.
# Rebounds is close to Poisson, but weak in spots.

# So let's simulate the probability of the test player scoring above 34.5 points
#   along with more than 9.5 rebounds and 9.5 assists:

# One seeded generator shared by both draws: results are reproducible on
#   Restart & Run All, and the two samples come from successive (not repeated)
#   stretches of the same random stream.
count_rng = np.random.default_rng(2025)

reb_sim = poisson.rvs(testGames['REB'].mean(), size=sim_size, random_state=count_rng)
p_reb_over = np.count_nonzero(reb_sim > 9.5) / reb_sim.size

ast_sim = poisson.rvs(testGames['AST'].mean(), size=sim_size, random_state=count_rng)
p_ast_over = np.count_nonzero(ast_sim > 9.5) / ast_sim.size

# Multiplying the three marginal probabilities treats points, rebounds and
#   assists as independent of each other.
print((p_pts_over * p_reb_over * p_ast_over) * 100, "%")

In [None]:
# Just barely above 1%. Now check this with the ACTUAL number of
#   similar games that the test player has had in the last two years:

# The test player's game log lives in `testGames`; no `playerGames` frame is
#   ever created in this notebook, so referring to it fails on a fresh kernel.
hit_all_three = (
    (testGames['PTS'] > 34.5)
    & (testGames['REB'] > 9.5)
    & (testGames['AST'] > 9.5)
)

# 1 when all three lines were cleared in the same game, else 0.
testGames['PROP_FLAG'] = hit_all_three.astype(int)

# Observed share of the test player's games that cleared all three lines.
p_prop = testGames['PROP_FLAG'].sum() / len(testGames)
p_prop

In [None]:
# Literally hasn't happened in the current season to date,
#   or the entirety of the previous one.
# Of course the test player is LeBron James, who has done this many times
#   in his career, so it's not impossible.
# That fact is reflected in the very small probability returned
#   by using the simulation method.

# Another common prop is sum(Points, Rebounds, Assists) > N.
#   In our test player example, the "over" would be at 35+10+10 = 55

# Element-wise total of the three simulated stat lines.
pra_sim = pts_sim + reb_sim + ast_sim

# Fraction of simulated games whose combined total clears 54.5.
pra_over_count = np.count_nonzero(pra_sim > 54.5)
p_pra_over = pra_over_count / pra_sim.size
p_pra_over

In [None]:
# And how many times has LeBron James actually exceeded 54.5 in the last two years?

# The test player's game log is `testGames` (no `playerGames` frame exists).
testGames['PROP_CHECK_2'] = testGames['PTS'] + testGames['REB'] + testGames['AST']
testGames['PROP_FLAG_2'] = testGames['PROP_CHECK_2'] > 54.5

# Use PROP_FLAG_2 (the combined-total prop), not PROP_FLAG from the earlier
#   points/rebounds/assists triple, when computing the observed frequency.
p_prop2 = testGames['PROP_FLAG_2'].sum() / len(testGames)
p_prop2