In [76]:
import pandas as pd
import numpy as np
from nba_api.stats.static import teams
import plotly.express as px

In [62]:
def get_teams_playoff_matchup(matchup_str):
    return [matchup_str[0:3], matchup_str[-3:]]

In [61]:
def invert_matchup(matchup_str):
    first_team, second_team = get_teams_playoff_matchup(matchup_str)
    return second_team + "," + first_team

In [None]:
def identify_series(team_playoff_df):
    matchup_strings = []
    for index, row in team_playoff_df:
        matchup_strings.append(get_teams_playoff_matchup(row["MATCHUP"]))

In [2]:
#Get z score
def compute_z_score(element, column):
    mean = np.mean(column)
    z_score = (element - mean) / np.std(column)
    return z_score

In [3]:
#generate string labels for seasons (ex: 2023-24)
def generate_nba_seasons(start_year, end_year):
    seasons = []
    for year in range(start_year, end_year):
        next_year = str(year + 1)[-2:]  # Get last two digits of the next year
        season = f"{year}-{next_year}"
        seasons.append(season)
    return seasons

In [8]:
season_ids = generate_nba_seasons(2010, 2022)

In [9]:
data = []
for season in season_ids:
    data.append(pd.read_csv("Data/Raw/BENCH_POINTS_REG_SEASON_" + season + ".csv"))


In [28]:
print(data[0])

      Unnamed: 0     TEAM_ID   GAME_ID   SEASON  BENCH_PTS  OFF_RTG  \
0              0  1610612748  21000001  2010-11       23.0    109.3   
1              1  1610612738  21000001  2010-11       26.0    104.0   
2              2  1610612756  21000002  2010-11       29.0    107.0   
3              3  1610612757  21000002  2010-11       32.0    105.6   
4              4  1610612745  21000003  2010-11       30.0    108.0   
...          ...         ...       ...      ...        ...      ...   
2453        2453  1610612746  21001228  2010-11       33.0    101.8   
2454        2454  1610612759  21001229  2010-11       59.0    109.4   
2455        2455  1610612756  21001229  2010-11       29.0    107.0   
2456        2456  1610612747  21001230  2010-11       21.0    107.9   
2457        2457  1610612758  21001230  2010-11       29.0    100.6   

      SEASON_W_PCT  
0            0.707  
1            0.683  
2            0.488  
3            0.585  
4            0.524  
...            ...  


In [11]:
#Accumulate game-by-game data into team season data (i.e. get average bench points for each team season)
seasonal_cume_data = []
for season_data in data:
    team_ids = []
    seasons = []
    avg_bench_pts = []
    off_rtg = []
    win_pcts = []
    
    #Current season
    curr_season = season_data["SEASON"].tolist()[0]
    
    for team in np.unique(season_data["TEAM_ID"]):
        
        team_ids.append(team)
        seasons.append(curr_season)
        
        #Compute average bench points for team
        team_avg_bench_pts = np.mean(season_data[season_data["TEAM_ID"] == team]["BENCH_PTS"])
        avg_bench_pts.append(team_avg_bench_pts)
        
        #get season offensive rating and win pct for team
        off_rtg.append(season_data[season_data["TEAM_ID"] == team]["OFF_RTG"].tolist()[0])
        win_pcts.append(season_data[season_data["TEAM_ID"] == team]["SEASON_W_PCT"].tolist()[0])
    
    out_dict = {
        "TEAM_ID": team_ids,
        "SEASON": seasons,
        "AVG_BENCH_PTS": avg_bench_pts,
        "OFF_RTG": off_rtg,
        "W_PCT": win_pcts
    }
    
    seasonal_cume_data.append(pd.DataFrame(out_dict))

In [15]:
print(seasonal_cume_data[0])

       TEAM_ID   SEASON  AVG_BENCH_PTS  OFF_RTG  W_PCT
0   1610612737  2010-11      30.939024    103.2  0.537
1   1610612738  2010-11      26.740741    104.0  0.683
2   1610612739  2010-11      35.646341     99.5  0.232
3   1610612740  2010-11      29.621951    103.8  0.561
4   1610612741  2010-11      27.524390    105.5  0.756
5   1610612742  2010-11      40.518519    107.6  0.695
6   1610612743  2010-11      37.219512    109.5  0.610
7   1610612744  2010-11      24.207317    105.5  0.439
8   1610612745  2010-11      32.426829    108.0  0.524
9   1610612746  2010-11      27.085366    101.8  0.390
10  1610612747  2010-11      28.243902    107.9  0.695
11  1610612748  2010-11      21.939024    109.3  0.707
12  1610612749  2010-11      30.963415     99.0  0.427
13  1610612750  2010-11      34.530864    101.1  0.207
14  1610612751  2010-11      31.271605    100.2  0.293
15  1610612752  2010-11      26.914634    108.3  0.512
16  1610612753  2010-11      27.268293    105.7  0.634
17  161061

In [21]:
#Calculate z scores and accumulate all data into a single dataframe
team_ids = []
seasons = []
bench_pts_z_scores = []
off_rtg_z_scores = []
w_pct_z_scores = []

for season_data in seasonal_cume_data:
    
    #Collect data from all teams to use for computing z-scores
    seasonal_bench_pts = season_data["AVG_BENCH_PTS"].tolist()
    seasonal_off_rtgs = season_data["OFF_RTG"].tolist()
    seasonal_w_pcts = season_data["W_PCT"].tolist()
    
    for index, row in season_data.iterrows():
        team_ids.append(row["TEAM_ID"])
        seasons.append(row["SEASON"])
    
        #Compute z scores
        bench_pts_z_scores.append(compute_z_score(row["AVG_BENCH_PTS"], seasonal_bench_pts))
        off_rtg_z_scores.append(compute_z_score(row["OFF_RTG"], seasonal_off_rtgs))
        w_pct_z_scores.append(compute_z_score(row["W_PCT"], seasonal_w_pcts))
        

out_dict = {
    "TEAM_ID": team_ids,
    "SEASON": seasons,
    "REL_BENCH_PTS": bench_pts_z_scores,
    "REL_OFF_RTG": off_rtg_z_scores,
    "REL_W_PCT": w_pct_z_scores
}

relative_data = pd.DataFrame(out_dict)

In [22]:
print(relative_data)

        TEAM_ID   SEASON  REL_BENCH_PTS  REL_OFF_RTG  REL_W_PCT
0    1610612737  2010-11      -0.015224    -0.404078   0.234234
1    1610612738  2010-11      -0.850603    -0.152838   1.158506
2    1610612739  2010-11       0.921444    -1.566064  -1.696610
3    1610612740  2010-11      -0.277296    -0.215648   0.386169
4    1610612741  2010-11      -0.694671     0.318238   1.620643
..          ...      ...            ...          ...        ...
355  1610612762  2021-22       0.727985     1.485015   0.707263
356  1610612763  2021-22       0.918643     0.715245   1.320288
357  1610612764  2021-22       1.024471    -0.150747  -0.526000
358  1610612765  2021-22       0.902854    -1.946877  -1.586172
359  1610612766  2021-22       0.221519     0.747318   0.173570

[360 rows x 5 columns]


In [23]:
relative_data.to_csv("Data/Polished/RELATIVE_REG_SEASON_DATA.csv")

In [None]:
"""
Next, get the z-scores of avg proportions of points that come from the bench
"""

In [26]:
#Read total points data 
total_points = pd.read_csv("Data/Raw/TOTAL_GAME_POINTS_DATA.csv")

In [27]:
print(total_points)

       Unnamed: 0   GAME_ID     TEAM_ID  PTS
0               0  21000001  1610612748   80
1               1  21000001  1610612738   88
2               2  21000002  1610612756   92
3               3  21000002  1610612757  106
4               4  21000003  1610612745  110
...           ...       ...         ...  ...
28393        2455  22101220  1610612743  141
28394        2456  22101226  1610612752  105
28395        2457  22101217  1610612766  124
28396        2458  22101225  1610612740  107
28397        2459  22101229  1610612756  109

[28398 rows x 4 columns]


In [29]:
#For each game, get the proportion of team points that came from the bench
for season_data in data:
    bench_pts_prop = []
    for index, row in season_data.iterrows():
        game_id = row["GAME_ID"]
        team_id = row["TEAM_ID"]
        
        game_team_tot_pts = total_points[total_points["GAME_ID"] == game_id]
        game_team_tot_pts = game_team_tot_pts[game_team_tot_pts["TEAM_ID"] == team_id]["PTS"].tolist()[0]
        
        bench_pts_prop.append(row["BENCH_PTS"] / game_team_tot_pts)
    
    season_data["BENCH_PTS_PROP"] = bench_pts_prop

In [35]:
#Compute team averages
bench_prop_cume_data = []
for season_data in data:
    team_ids = []
    seasons = []
    bench_prop = []
    off_rtg = []
    win_pcts = []
    
    #Current season
    curr_season = season_data["SEASON"].tolist()[0]
    
    for team in np.unique(season_data["TEAM_ID"]):
        
        team_ids.append(team)
        seasons.append(curr_season)
        
        #Compute average bench points for team
        team_avg_bench_prop = np.mean(season_data[season_data["TEAM_ID"] == team]["BENCH_PTS_PROP"])
        bench_prop.append(team_avg_bench_prop)
        
        #get season offensive rating and win pct for team
        off_rtg.append(season_data[season_data["TEAM_ID"] == team]["OFF_RTG"].tolist()[0])
        win_pcts.append(season_data[season_data["TEAM_ID"] == team]["SEASON_W_PCT"].tolist()[0])
    
    out_dict = {
        "TEAM_ID": team_ids,
        "SEASON": seasons,
        "BENCH_PROP": bench_prop,
        "OFF_RTG": off_rtg,
        "W_PCT": win_pcts
    }
    
    bench_prop_cume_data.append(pd.DataFrame(out_dict))

In [36]:
print(bench_prop_cume_data[0])

       TEAM_ID   SEASON  BENCH_PROP  OFF_RTG  W_PCT
0   1610612737  2010-11    0.329232    103.2  0.537
1   1610612738  2010-11    0.276372    104.0  0.683
2   1610612739  2010-11    0.377224     99.5  0.232
3   1610612740  2010-11    0.311895    103.8  0.561
4   1610612741  2010-11    0.277325    105.5  0.756
5   1610612742  2010-11    0.402258    107.6  0.695
6   1610612743  2010-11    0.342916    109.5  0.610
7   1610612744  2010-11    0.239653    105.5  0.439
8   1610612745  2010-11    0.305174    108.0  0.524
9   1610612746  2010-11    0.277505    101.8  0.390
10  1610612747  2010-11    0.277450    107.9  0.695
11  1610612748  2010-11    0.213378    109.3  0.707
12  1610612749  2010-11    0.338461     99.0  0.427
13  1610612750  2010-11    0.343905    101.1  0.207
14  1610612751  2010-11    0.334436    100.2  0.293
15  1610612752  2010-11    0.254109    108.3  0.512
16  1610612753  2010-11    0.272687    105.7  0.634
17  1610612754  2010-11    0.339077    101.9  0.451
18  16106127

In [42]:
#Calculate z scores and accumulate all data into a single dataframe
team_ids = []
seasons = []
bench_pts_prop_z_scores = []
off_rtg_z_scores = []
w_pct_z_scores = []

for season_data in bench_prop_cume_data:
    
    #Collect data from all teams to use for computing z-scores
    seasonal_bench_props = season_data["BENCH_PROP"].tolist()
    seasonal_off_rtgs = season_data["OFF_RTG"].tolist()
    seasonal_w_pcts = season_data["W_PCT"].tolist()
    
    for index, row in season_data.iterrows():
        team_ids.append(row["TEAM_ID"])
        seasons.append(row["SEASON"])
    
        #Compute z scores
        bench_pts_prop_z_scores.append(compute_z_score(row["BENCH_PROP"], seasonal_bench_props))
        off_rtg_z_scores.append(compute_z_score(row["OFF_RTG"], seasonal_off_rtgs))
        w_pct_z_scores.append(compute_z_score(row["W_PCT"], seasonal_w_pcts))
        

out_dict = {
    "TEAM_ID": team_ids,
    "SEASON": seasons,
    "REL_BENCH_PROP": bench_pts_prop_z_scores,
    "REL_OFF_RTG": off_rtg_z_scores,
    "REL_W_PCT": w_pct_z_scores
}

rel_prop_data = pd.DataFrame(out_dict)

In [43]:
print(rel_prop_data)

        TEAM_ID   SEASON  REL_BENCH_PROP  REL_OFF_RTG  REL_W_PCT
0    1610612737  2010-11        0.335107    -0.404078   0.234234
1    1610612738  2010-11       -0.726431    -0.152838   1.158506
2    1610612739  2010-11        1.298882    -1.566064  -1.696610
3    1610612740  2010-11       -0.013052    -0.215648   0.386169
4    1610612741  2010-11       -0.707288     0.318238   1.620643
..          ...      ...             ...          ...        ...
355  1610612762  2021-22        0.467584     1.485015   0.707263
356  1610612763  2021-22        0.401514     0.715245   1.320288
357  1610612764  2021-22        1.197849    -0.150747  -0.526000
358  1610612765  2021-22        1.408451    -1.946877  -1.586172
359  1610612766  2021-22       -0.209506     0.747318   0.173570

[360 rows x 5 columns]


In [44]:
rel_prop_data.to_csv("Data/Polished/REL_BENCH_PTS_PROP_REG_SEASON_DATA.csv")

In [None]:
#Process playoff data

In [45]:
#Fetch raw playoff data
raw_playoff_data = [pd.read_csv("Data/Raw/PLAYOFFS_" + season + ".csv") for season in season_ids]
raw_playoff_data = pd.concat(raw_playoff_data)

In [46]:
print(raw_playoff_data)

    Unnamed: 0  TOTAL_PTS  TOTAL_BENCH_PTS  WL  MATCHUP     TEAM_ID   SEASON
0            0        556              125   1  CHI,ATL  1610612741  2010-11
1            1        488              120   1  CHI,IND  1610612741  2010-11
2            2        436              108   0  CHI,MIA  1610612741  2010-11
3            0        566              181   0  SAS,MEM  1610612759  2010-11
4            0        477               95   1  MIA,BOS  1610612748  2010-11
..         ...        ...              ...  ..      ...         ...      ...
21           1        669              109   1  PHI,TOR  1610612755  2021-22
22           0        619              170   0  TOR,PHI  1610612761  2021-22
23           0        594              169   0  UTA,DAL  1610612762  2021-22
24           0        476              103   0  CHI,MIL  1610612741  2021-22
25           0        550              168   0  DEN,GSW  1610612743  2021-22

[356 rows x 7 columns]


In [70]:
reg_season_bench_pts_prop = []
series_bench_pts_prop = []
reg_season_playoff_change = []
team_opp_tot_bench_diff = []
team_opp_bench_prop_change = []
w_or_l = []
tot_pts_diff = []

for index, row in raw_playoff_data.iterrows():
    
    #Get opponent data
    opp_abbrev = row["MATCHUP"][-3:]
    
    opp_id = teams.find_team_by_abbreviation(opp_abbrev)
    
    if opp_id:
        opp_id = opp_id["id"]
        
        #Find opponent data from series
        opp_series_data = raw_playoff_data[raw_playoff_data["TEAM_ID"] == opp_id]
        opp_series_data = opp_series_data[opp_series_data["SEASON"] == row["SEASON"]]
        print(row["MATCHUP"])
        print(invert_matchup(row["MATCHUP"]))
        opp_series_data = opp_series_data[opp_series_data["MATCHUP"] == invert_matchup(row["MATCHUP"])]
        if len(opp_series_data) > 0:
            
            #Get opponent total bench points
            opp_tot_bench_pts = opp_series_data["TOTAL_BENCH_PTS"].tolist()[0]

            #Get opponent bench point proportion
            opp_bench_pt_prop = opp_tot_bench_pts / opp_series_data["TOTAL_PTS"].tolist()[0]

            #Series bench pt prop
            series_prop = row["TOTAL_BENCH_PTS"] / row["TOTAL_PTS"]
            series_bench_pts_prop.append(series_prop)

            #Get regular season bench point proportion
            reg_season_data = rel_prop_data[rel_prop_data["TEAM_ID"] == row["TEAM_ID"]]
            reg_season_data = reg_season_data[reg_season_data["SEASON"] == row["SEASON"]]
            reg_season_prop = reg_season_data["REL_BENCH_PROP"].tolist()[0]
            reg_season_bench_pts_prop.append(reg_season_prop)

            #Get percent change from regular season bench point prop 
            reg_season_playoff_change.append((series_prop - reg_season_prop) / series_prop)


            #Get team/opponent difference in raw bench points
            team_opp_tot_bench_diff.append(row["TOTAL_BENCH_PTS"] - opp_tot_bench_pts)

            #Get team/opponent change in proportion of points from bench
            team_opp_bench_prop_change.append((series_prop - opp_bench_pt_prop) / opp_bench_pt_prop)

            #Get win or loss
            w_or_l.append(row["WL"])
            
            #Get total points difference
            tot_pts_diff.append(row["TOTAL_PTS"] - opp_series_data["TOTAL_PTS"].tolist()[0])


polished_playoff_data = {}
polished_playoff_data["REG_SEASON_BENCH_PROP"] = reg_season_bench_pts_prop
polished_playoff_data["SERIES_BENCH_PROP"] = series_bench_pts_prop
polished_playoff_data["REG_SEASON_PLAYOFF_PROP_CHANGE"] = reg_season_playoff_change
polished_playoff_data["TEAM_OPP_BENCH_PTS_DIFF"] = team_opp_tot_bench_diff
polished_playoff_data["TEAM_OPP_BENCH_PROP_CHG"] = team_opp_bench_prop_change
polished_playoff_data["WL"] = w_or_l
polished_playoff_data["TOT_PTS_DIFF"] = tot_pts_diff

polished_playoff_data = pd.DataFrame(polished_playoff_data)
    

CHI,ATL
ATL,CHI
CHI,IND
IND,CHI
CHI,MIA
MIA,CHI
SAS,MEM
MEM,SAS
MIA,BOS
BOS,MIA
MIA,CHI
CHI,MIA
MIA,DAL
DAL,MIA
MIA,PHI
PHI,MIA
LAL,DAL
DAL,LAL
BOS,MIA
MIA,BOS
BOS,NYK
NYK,BOS
DAL,LAL
LAL,DAL
DAL,MIA
MIA,DAL
DAL,OKC
OKC,DAL
DAL,POR
POR,DAL
ORL,ATL
ATL,ORL
OKC,DAL
DAL,OKC
OKC,DEN
DEN,OKC
OKC,MEM
MEM,OKC
ATL,CHI
CHI,ATL
ATL,ORL
ORL,ATL
DEN,OKC
OKC,DEN
NYK,BOS
BOS,NYK
POR,DAL
DAL,POR
NOH,LAL
LAL,NOH
PHI,MIA
MIA,PHI
IND,CHI
CHI,IND
MEM,OKC
OKC,MEM
MEM,SAS
SAS,MEM
SAS,LAC
LAC,SAS
SAS,OKC
OKC,SAS
SAS,UTA
UTA,SAS
CHI,PHI
PHI,CHI
MIA,BOS
BOS,MIA
MIA,IND
IND,MIA
MIA,NYK
NYK,MIA
MIA,OKC
OKC,MIA
OKC,DAL
DAL,OKC
OKC,LAL
LAL,OKC
OKC,MIA
MIA,OKC
OKC,SAS
SAS,OKC
LAL,DEN
DEN,LAL
LAL,OKC
OKC,LAL
IND,MIA
MIA,IND
IND,ORL
ORL,IND
BOS,ATL
ATL,BOS
BOS,MIA
MIA,BOS
BOS,PHI
PHI,BOS
MEM,LAC
LAC,MEM
LAC,MEM
MEM,LAC
LAC,SAS
SAS,LAC
ATL,BOS
BOS,ATL
DEN,LAL
LAL,DEN
ORL,IND
IND,ORL
DAL,OKC
OKC,DAL
NYK,MIA
MIA,NYK
PHI,BOS
BOS,PHI
PHI,CHI
CHI,PHI
UTA,SAS
SAS,UTA
MIA,CHI
CHI,MIA
MIA,IND
IND,MIA
MIA,MIL
MIL,MIA
MIA,SAS


In [71]:
print(polished_playoff_data)

     REG_SEASON_BENCH_PROP  SERIES_BENCH_PROP  REG_SEASON_PLAYOFF_PROP_CHANGE  \
0                -0.707288           0.224820                        4.146017   
1                -0.707288           0.245902                        3.876305   
2                -0.707288           0.247706                        3.855348   
3                 0.802179           0.319788                       -1.508471   
4                -1.991467           0.199161                       10.999262   
..                     ...                ...                             ...   
346              -1.761815           0.162930                       11.813340   
347              -2.164769           0.274637                        8.882306   
348               0.467584           0.284512                       -0.643459   
349              -2.050428           0.216387                       10.475767   
350               0.061673           0.305455                        0.798096   

     TEAM_OPP_BENCH_PTS_DIF

In [68]:
polished_playoff_data.to_csv("Data/Polished/POLISHED_PLAYOFF_DATA.csv")

In [None]:
"""
Next let's compute probabilities of winning for intervals of independent variables
"""

In [72]:
#First, let's look at TEAM_OPP_BENCH_PROP_CHG
print(np.min(polished_playoff_data["TEAM_OPP_BENCH_PROP_CHG"]))
print(np.max(polished_playoff_data["TEAM_OPP_BENCH_PROP_CHG"]))

-0.6337456513335911
1.730343007915567


In [90]:
#Split it up into bins of .25 from -.5 to 1.75
intervals = []
probs = []

lower_bound = -.5
while lower_bound <= 1.5:
    upper_bound = lower_bound + .25
    
    matches = polished_playoff_data[polished_playoff_data["TEAM_OPP_BENCH_PROP_CHG"] >= lower_bound]
    matches = polished_playoff_data[polished_playoff_data["TEAM_OPP_BENCH_PROP_CHG"] < upper_bound]
    
    if len(matches) > 0:
        intervals.append("[" + str(lower_bound) + ", " + str(upper_bound) + ")")
        
        num_wins = len(matches[matches["WL"] == 1])
        probs.append(num_wins / len(matches))
    
    lower_bound += 0.25
    
TEAM_OPP_BENCH_PROP_CHG_PROBS = pd.DataFrame({"INTERVAL": intervals, "PROB_WIN_SERIES": probs})

In [91]:
TEAM_OPP_BENCH_PROP_CHG_PROBS.to_csv("Results/TEAM_OPP_BENCH_PROP_CHG_PROBS.csv")

In [92]:
print(TEAM_OPP_BENCH_PROP_CHG_PROBS)

        INTERVAL  PROB_WIN_SERIES
0  [-0.5, -0.25)         0.618421
1   [-0.25, 0.0)         0.568182
2    [0.0, 0.25)         0.540856
3    [0.25, 0.5)         0.522727
4    [0.5, 0.75)         0.510511
5    [0.75, 1.0)         0.504373
6    [1.0, 1.25)         0.501441
7    [1.25, 1.5)         0.498567
8    [1.5, 1.75)         0.498575


In [93]:
px.scatter(intervals, probs)

In [83]:
#Second, let's look at REG_SEASON_PLAYOFF_PROP_CHANGE
print(np.min(polished_playoff_data["REG_SEASON_PLAYOFF_PROP_CHANGE"]))
print(np.max(polished_playoff_data["REG_SEASON_PLAYOFF_PROP_CHANGE"]))

-5.680189936802384
16.385670314717768


In [84]:
#Let's do intervals of .5 from -6 to 16
intervals = []
probs = []

lower_bound = -6
while lower_bound <= 15.5:
    upper_bound = lower_bound + .5
    
    matches = polished_playoff_data[polished_playoff_data["REG_SEASON_PLAYOFF_PROP_CHANGE"] >= lower_bound]
    matches = polished_playoff_data[polished_playoff_data["REG_SEASON_PLAYOFF_PROP_CHANGE"] < upper_bound]
    
    if len(matches) > 0:
        intervals.append("[" + str(lower_bound) + ", " + str(upper_bound) + ")")
        
        num_wins = len(matches[matches["WL"] == 1])
        probs.append(num_wins / len(matches))
    
    lower_bound += 0.5
    
REG_SEASON_PLAYOFF_PROP_CHANGE_PROBS = pd.DataFrame({"INTERVAL": intervals, "PROB_WIN_SERIES": probs})

In [85]:
px.scatter(intervals, probs)

In [88]:
REG_SEASON_PLAYOFF_PROP_CHANGE_PROBS.to_csv("Results/REG_SEASON_PLAYOFF_PROP_CHANGE_PROBS.csv")