In [2]:
from scipy.stats import truncnorm, multivariate_normal, norm
import numpy as np
import matplotlib.pyplot as plt
import time
import seaborn as sns
import pandas as pd


# Hyperparameteres

In [3]:
# Parameters
mu_1 = 25
mu_2 = 25
sigma_1 = 25/3
sigma_2 = 25/3
sigma_t = 25/6
iterations = 800

# Loading Dataset

In [4]:
df = pd.read_csv('nba.csv', nrows=3000)
unique_teams = pd.concat([df['team1'], df['team1']], ignore_index=True).unique().tolist() # list
unique_teams.sort()
# unique_teams = np.array(unique_teams)
unique_teams # 0, Atlanta

['Blackhawks',
 'Bombers',
 'Bullets',
 'Capitols',
 'Celtics',
 'East',
 'Hawks',
 'Knicks',
 'Lakers',
 'Nationals',
 'Nuggets',
 'Olympians',
 'Packers',
 'Pistons',
 'Redskins',
 'Royals',
 'Stags',
 'Warriors']

In [5]:
team_mapping = {}
for i, team in enumerate(unique_teams):
    team_mapping[team] = i
team_mapping # Atlanta, 0

{'Blackhawks': 0,
 'Bombers': 1,
 'Bullets': 2,
 'Capitols': 3,
 'Celtics': 4,
 'East': 5,
 'Hawks': 6,
 'Knicks': 7,
 'Lakers': 8,
 'Nationals': 9,
 'Nuggets': 10,
 'Olympians': 11,
 'Packers': 12,
 'Pistons': 13,
 'Redskins': 14,
 'Royals': 15,
 'Stags': 16,
 'Warriors': 17}

In [6]:
df['team1_idx'] = df['team1'].map(team_mapping)
df['team2_idx'] = df['team2'].map(team_mapping)
df

Unnamed: 0,team1,team2,team1_score,team2_score,team1_leader_name,team1_leader_points,team1_leader_rebounds,team1_leader_assists,team1_leader_blocks,team1_leader_steals,...,period_Q6_team2,period_Q7_team1,period_Q7_team2,period_Q8_team1,period_Q8_team2,period_Q9_team1,period_Q9_team2,verdict,team1_idx,team2_idx
0,Bombers,Blackhawks,72,51,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.0
1,Royals,Redskins,108,75,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,15,14.0
2,Olympians,Nuggets,71,64,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,10.0
3,Stags,Knicks,87,89,,0,0,0,0,0,...,0,0,0,0,0,0,0,1,16,7.0
4,Capitols,Bullets,66,61,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,Hawks,Celtics,90,115,,0,0,0,0,0,...,0,0,0,0,0,0,0,1,6,4.0
2996,Knicks,Hawks,112,95,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,6.0
2997,Pistons,Celtics,94,105,,0,0,0,0,0,...,0,0,0,0,0,0,0,1,13,4.0
2998,Nationals,Warriors,103,85,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9,17.0


In [11]:
score_difference = df['team1_score'] - df['team2_score']

# Create a new DataFrame with the desired columns
result_df = pd.DataFrame({
    'team1_idx': df['team1_idx'],
    'team2_idx': df['team2_idx'],
    'score_difference': np.where(df['team1_score'] - df['team2_score'] > 0, 1, 
                                   np.where(df['team1_score'] - df['team2_score'] < 0, -1, 0))
})

# Remove rows where score_difference is zero
result_df = result_df[result_df['score_difference'] != 0]

result_df

Unnamed: 0,team1_idx,team2_idx,score_difference
0,1,0.0,1
1,15,14.0,1
2,11,10.0,1
3,16,7.0,-1
4,3,2.0,1
...,...,...,...
2995,6,4.0,-1
2996,7,6.0,1
2997,13,4.0,-1
2998,9,17.0,1


In [12]:
# Convert the DataFrame to a NumPy array
result_array = result_df.to_numpy()
# result_array 

In [13]:
np.random.seed(42)
np.random.shuffle(result_array)
result_array

array([[ 9., 12.,  1.],
       [15., 17.,  1.],
       [15.,  8., -1.],
       ...,
       [11.,  8.,  1.],
       [ 9.,  4.,  1.],
       [17.,  4.,  1.]])

In [14]:
# MU and Sigma for each team
skills_dist = [[mu_1, sigma_1] for _ in unique_teams]
skills_dist

[[25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334]]

# GIBBS SAMPLER

In [15]:
# Helper function to sample from a truncated normal distribution
def truncated_normal(mean, std, lower, upper):
    a, b = (lower - mean) / std, (upper - mean) / std
    return truncnorm.rvs(a, b, loc=mean, scale=std)

In [16]:
# Gibbs Sampler
def gibbs_sampler(mu_1, mu_2, sigma_1, sigma_2 ,y, iterations):
    # Initialize t
    # inital t ???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????
    if y == 1:
        t = np.abs(np.random.randn())
    else:
        t = -np.abs(np.random.randn())

    
    samples = []
    
    for i in range(iterations):
        # p(s1,s2|t,y)
        # p(S|t,y)
        # QUESTION 3.1
        # Step 1: Draw s_1 and s_2 from the conditional distribution N(mean_s|t, cov_s|t) -
        
        
        
        sigma_s_inv = np.array([[1.0/(sigma_1 * sigma_1), 0.0],
                                [0.0, 1.0/(sigma_2 * sigma_2)]])
        
        sigma_t_s_inv = 1.0 / (sigma_t * sigma_t)
        mu_s = np.array([mu_1, mu_2]).T.reshape(2,1) # 2 x 1 (2, 1)
        A = np.array([1.0, -1.0]).reshape(1, 2) # 1 x 2
        
        # (2x2)
        cov_s_t = np.linalg.inv(sigma_s_inv + (A.T @ (sigma_t_s_inv * A) ) )
        
         # (2, )
        mean_s_t = cov_s_t @ ( (sigma_s_inv @ mu_s) + (A.T * sigma_t_s_inv * t) )
 
       
        # Draw from the multivariate normal distribution
        mean_s_t = mean_s_t.reshape(2,)
        s_1, s_2 = multivariate_normal.rvs(mean=mean_s_t, cov=cov_s_t)

        #DO WE NEED TO UPDATE: mu_1 +mu_2, sigma_1, sigma_2 ????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????

        #p(t | s_1, s_2, y)
        #p(t | S, y)

        # QUESTION 3.2
        # Step 2: Draw t from the conditional distribution 
        mean_t = s_1 - s_2 
        # print(s_1, s_2)
        if y == 1:
            t = truncated_normal(mean_t, sigma_t, 0, np.inf)  # For y = 1, t > 0
        else:
            t = truncated_normal(mean_t, sigma_t, -np.inf, 0)  # For y = -1, t < 0

        # Store samples
        samples.append((s_1, s_2, t))

    # burn_in = int(0.3 * iterations)
    burn_in = 25
    samples = np.array(samples)
    mean = np.mean(samples[burn_in:, :2], axis = 0)
    covariance = np.cov(samples, rowvar = False)
    return mean[0], mean[1], np.sqrt(covariance[0,0]), np.sqrt(covariance[1, 1])


# Run

In [18]:
for team_1, team_2, y in result_array:
    try:
        team_1 = int(team_1)
        team_2 = int(team_2)
    except ValueError as e:
        print(f"Skipping invalid data: {team_1}, {team_2} due to error: {e}")
        continue  # Skip this iteration if there's a conversion error
    
    mu_1 = skills_dist[team_1][0]
    sigma_1 = skills_dist[team_1][1]
    mu_2 = skills_dist[team_2][0]
    sigma_2 = skills_dist[team_2][1]
    # print(mu_1, mu_2, sigma_1, sigma_2)
    mu_1, mu_2, sigma_1, sigma_2 = gibbs_sampler(mu_1=mu_1,
                                                    mu_2=mu_2, 
                                                    sigma_1=sigma_1,
                                                    sigma_2=sigma_2,
                                                    y=y,
                                                    iterations=iterations)
    skills_dist[team_1][0] = mu_1
    skills_dist[team_2][0] = mu_2
    skills_dist[team_1][1] = sigma_1
    skills_dist[team_2][1] = sigma_2
    
    


Skipping invalid data: 5, nan due to error: cannot convert float NaN to integer
Skipping invalid data: 5, nan due to error: cannot convert float NaN to integer
Skipping invalid data: 5, nan due to error: cannot convert float NaN to integer
Skipping invalid data: 5, nan due to error: cannot convert float NaN to integer
Skipping invalid data: 5, nan due to error: cannot convert float NaN to integer
Skipping invalid data: 5, nan due to error: cannot convert float NaN to integer
Skipping invalid data: 5, nan due to error: cannot convert float NaN to integer


In [19]:
for idx, (m, s) in enumerate(skills_dist):
    print(f"{unique_teams[idx]}: mean = {m:.3f}, std dev = {s:.3f}")

Blackhawks: mean = 24.011, std dev = 0.482
Bombers: mean = 23.617, std dev = 0.655
Bullets: mean = 23.278, std dev = 0.262
Capitols: mean = 24.010, std dev = 0.464
Celtics: mean = 25.522, std dev = 0.180
East: mean = 25.000, std dev = 8.333
Hawks: mean = 24.270, std dev = 0.438
Knicks: mean = 26.167, std dev = 0.149
Lakers: mean = 26.431, std dev = 0.231
Nationals: mean = 26.066, std dev = 0.227
Nuggets: mean = 20.087, std dev = 0.792
Olympians: mean = 24.891, std dev = 0.290
Packers: mean = 25.373, std dev = 0.856
Pistons: mean = 25.329, std dev = 0.227
Redskins: mean = 22.560, std dev = 0.609
Royals: mean = 25.648, std dev = 0.138
Stags: mean = 25.486, std dev = 0.667
Warriors: mean = 24.910, std dev = 0.193


In [20]:
# ranked based on mu - 3 * sigma
ranked_teams = sorted(enumerate(skills_dist), key=lambda x: x[1][0] - 3.0 * x[1][1], reverse=True)
# Print ranked teams with their skills
for rank, (idx, (m, s)) in enumerate(ranked_teams, start=1):
    print(f"Rank {rank}: Team {unique_teams[idx]} - Skill: {m}, Std Dev: {s}")

Rank 1: Team Lakers - Skill: 26.43073722802069, Std Dev: 0.23128717701201146
Rank 2: Team Knicks - Skill: 26.167035058661995, Std Dev: 0.14887959074459114
Rank 3: Team Nationals - Skill: 26.06559371987283, Std Dev: 0.22669146600508808
Rank 4: Team Royals - Skill: 25.64807984323357, Std Dev: 0.13768975048067406
Rank 5: Team Celtics - Skill: 25.522327882894466, Std Dev: 0.1795782067215027
Rank 6: Team Pistons - Skill: 25.329050028898216, Std Dev: 0.22714239894527727
Rank 7: Team Warriors - Skill: 24.90960481470147, Std Dev: 0.19348241188039192
Rank 8: Team Olympians - Skill: 24.890666553896413, Std Dev: 0.2904634304940674
Rank 9: Team Stags - Skill: 25.485999108269546, Std Dev: 0.6665875709822141
Rank 10: Team Hawks - Skill: 24.26974281279161, Std Dev: 0.4378893641958013
Rank 11: Team Packers - Skill: 25.37252698147806, Std Dev: 0.8562847548879562
Rank 12: Team Capitols - Skill: 24.010272251565688, Std Dev: 0.46350649382004006
Rank 13: Team Blackhawks - Skill: 24.011317853247906, Std Dev