In [1]:
import pandas as pd
import numpy as np

In [2]:
#Get z score
def compute_z_score(element, column):
    """Standardize `element` against the values in `column`.

    Parameters
    ----------
    element : number (or array-like that supports arithmetic broadcasting)
        The value to standardize.
    column : sequence of numbers
        The reference sample that supplies the mean and standard deviation.

    Returns
    -------
    The distance of `element` from the mean of `column`, measured in
    standard deviations of `column` (np.std with its default settings).
    """
    spread = np.std(column)
    center = np.mean(column)
    return (element - center) / spread

In [3]:
#generate string labels for seasons (ex: 2023-24)
def generate_nba_seasons(start_year, end_year):
    """Build season labels for every starting year in [start_year, end_year).

    Each label is the starting year, a dash, and the last two digits of the
    following year, e.g. 2023 -> "2023-24". `end_year` itself is excluded.
    Returns a list of strings in ascending order of starting year.
    """
    return [
        f"{first_year}-{str(first_year + 1)[-2:]}"
        for first_year in range(start_year, end_year)
    ]

In [8]:
# Season labels for the seasons starting in 2010 through 2021
# (generate_nba_seasons treats the end year as exclusive).
season_ids = generate_nba_seasons(2010, 2022)

In [9]:
# Load one raw game-by-game bench-points CSV per season, in season order.
data = [
    pd.read_csv("Data/Raw/BENCH_POINTS_REG_SEASON_" + season_label + ".csv")
    for season_label in season_ids
]


In [11]:
#Accumulate game-by-game data into team season data (i.e. get average bench points for each team season)
def _summarize_team_season(season_games):
    """Collapse one season of game-by-game rows into one row per team.

    Parameters
    ----------
    season_games : pandas.DataFrame
        Rows for a single season with the columns TEAM_ID, SEASON, BENCH_PTS,
        OFF_RTG and SEASON_W_PCT.

    Returns
    -------
    pandas.DataFrame
        One row per TEAM_ID (ascending), with the columns TEAM_ID, SEASON,
        AVG_BENCH_PTS, OFF_RTG and W_PCT, and a fresh 0..n-1 index.

    Raises
    ------
    IndexError
        If `season_games` has no rows (there is no season label to read).
    """
    # The season label is read from the first row only; this assumes every row
    # in the frame belongs to the same season.
    curr_season = season_games["SEASON"].iloc[0]

    def first_value(values):
        # Take the first row's value as-is (missing values are NOT skipped,
        # unlike the built-in "first" aggregation).
        return values.iloc[0]

    # A single grouped pass replaces re-filtering the full frame for every team.
    # groupby sorts the team ids ascending, matching np.unique's ordering.
    team_summary = (
        season_games
        .groupby("TEAM_ID")
        .agg(
            AVG_BENCH_PTS=("BENCH_PTS", "mean"),
            # presumably season-level figures repeated on every game row, so
            # the first occurrence is representative -- verify against the raw data
            OFF_RTG=("OFF_RTG", first_value),
            W_PCT=("SEASON_W_PCT", first_value),
        )
        .reset_index()
    )
    team_summary["SEASON"] = curr_season

    # Fix the column order expected by the downstream cells.
    return team_summary[["TEAM_ID", "SEASON", "AVG_BENCH_PTS", "OFF_RTG", "W_PCT"]]


# One summary frame per season, in the same order as `data`.
seasonal_cume_data = [_summarize_team_season(season_data) for season_data in data]

In [15]:
# Spot-check the per-team summary for the first season in the list.
print(seasonal_cume_data[0])

       TEAM_ID   SEASON  AVG_BENCH_PTS  OFF_RTG  W_PCT
0   1610612737  2010-11      30.939024    103.2  0.537
1   1610612738  2010-11      26.740741    104.0  0.683
2   1610612739  2010-11      35.646341     99.5  0.232
3   1610612740  2010-11      29.621951    103.8  0.561
4   1610612741  2010-11      27.524390    105.5  0.756
5   1610612742  2010-11      40.518519    107.6  0.695
6   1610612743  2010-11      37.219512    109.5  0.610
7   1610612744  2010-11      24.207317    105.5  0.439
8   1610612745  2010-11      32.426829    108.0  0.524
9   1610612746  2010-11      27.085366    101.8  0.390
10  1610612747  2010-11      28.243902    107.9  0.695
11  1610612748  2010-11      21.939024    109.3  0.707
12  1610612749  2010-11      30.963415     99.0  0.427
13  1610612750  2010-11      34.530864    101.1  0.207
14  1610612751  2010-11      31.271605    100.2  0.293
15  1610612752  2010-11      26.914634    108.3  0.512
16  1610612753  2010-11      27.268293    105.7  0.634
17  161061

In [21]:
#Calculate z scores and accumulate all data into a single dataframe
# Each statistic is standardized within its own season, so values are
# comparable across seasons (0 = that season's average across teams).
relative_columns = ["TEAM_ID", "SEASON", "REL_BENCH_PTS", "REL_OFF_RTG", "REL_W_PCT"]
relative_frames = []

for season_data in seasonal_cume_data:

    # Passing the whole column as `element` lets compute_z_score broadcast, so
    # each season's mean and standard deviation are computed once per column
    # rather than once per team row. The reference sample is handed over as a
    # NumPy array so np.mean / np.std run on plain values (np.std defaults to
    # the population standard deviation, ddof=0).
    bench_pts = season_data["AVG_BENCH_PTS"]
    off_rtgs = season_data["OFF_RTG"]
    w_pcts = season_data["W_PCT"]

    relative_frames.append(pd.DataFrame({
        "TEAM_ID": season_data["TEAM_ID"],
        "SEASON": season_data["SEASON"],
        "REL_BENCH_PTS": compute_z_score(bench_pts, bench_pts.to_numpy()),
        "REL_OFF_RTG": compute_z_score(off_rtgs, off_rtgs.to_numpy()),
        "REL_W_PCT": compute_z_score(w_pcts, w_pcts.to_numpy()),
    }))

# pd.concat raises on an empty list, so fall back to an empty frame that still
# has the expected columns.
if relative_frames:
    # ignore_index gives one continuous 0..n-1 index across all seasons.
    relative_data = pd.concat(relative_frames, ignore_index=True)
else:
    relative_data = pd.DataFrame(columns=relative_columns)

In [22]:
# Inspect the combined per-season z-score table for all teams and seasons.
print(relative_data)

        TEAM_ID   SEASON  REL_BENCH_PTS  REL_OFF_RTG  REL_W_PCT
0    1610612737  2010-11      -0.015224    -0.404078   0.234234
1    1610612738  2010-11      -0.850603    -0.152838   1.158506
2    1610612739  2010-11       0.921444    -1.566064  -1.696610
3    1610612740  2010-11      -0.277296    -0.215648   0.386169
4    1610612741  2010-11      -0.694671     0.318238   1.620643
..          ...      ...            ...          ...        ...
355  1610612762  2021-22       0.727985     1.485015   0.707263
356  1610612763  2021-22       0.918643     0.715245   1.320288
357  1610612764  2021-22       1.024471    -0.150747  -0.526000
358  1610612765  2021-22       0.902854    -1.946877  -1.586172
359  1610612766  2021-22       0.221519     0.747318   0.173570

[360 rows x 5 columns]


In [23]:
# Save the z-score table. to_csv is called with its defaults, so the row index
# is written as the first (unnamed) column of the file.
relative_data.to_csv("Data/Polished/RELATIVE_REG_SEASON_DATA.csv")