In [101]:
from scipy.stats import truncnorm, multivariate_normal, norm
import numpy as np
import matplotlib.pyplot as plt
import time
import seaborn as sns
import pandas as pd

# Hyperparameters

In [142]:
# Model hyperparameters used by the cells below.
# Both sides of a match start from the same Gaussian skill prior.
mu_1 = mu_2 = 25              # prior skill mean
sigma_1 = sigma_2 = 25 / 3    # prior skill standard deviation
sigma_t = 25 / 6              # std of the outcome variable t given the two skills
iterations = 800              # Gibbs draws per match (burn-in draws included)

# Loading Dataset

In [149]:
# Load the match results; each row names the two sides as `team1` / `team2`.
df = pd.read_csv('SerieA.csv')

# Collect every team that appears on EITHER side of a fixture.  Concatenating
# `team1` with itself would miss any team that only ever shows up as `team2`,
# which later turns into NaN (float) indices when the names are mapped.
unique_teams = sorted(
    pd.concat([df['team1'], df['team2']], ignore_index=True).unique().tolist()
)
unique_teams  # list position == team index

['Atalanta',
 'Bologna',
 'Cagliari',
 'Chievo',
 'Empoli',
 'Fiorentina',
 'Frosinone',
 'Genoa',
 'Inter',
 'Juventus',
 'Lazio',
 'Milan',
 'Napoli',
 'Parma',
 'Roma',
 'Sampdoria',
 'Sassuolo',
 'Spal',
 'Torino',
 'Udinese']

In [145]:
# Invert the sorted team list into a name -> index lookup table.
team_mapping = {team: i for i, team in enumerate(unique_teams)}
team_mapping

{'Blackhawks': 0,
 'Bombers': 1,
 'Bullets': 2,
 'Capitols': 3,
 'Celtics': 4,
 'East': 5,
 'Hawks': 6,
 'Knicks': 7,
 'Lakers': 8,
 'Nationals': 9,
 'Nuggets': 10,
 'Olympians': 11,
 'Packers': 12,
 'Pistons': 13,
 'Redskins': 14,
 'Royals': 15,
 'Stags': 16,
 'Warriors': 17}

In [146]:
# Attach the integer team index for each side of every fixture.
# Names missing from team_mapping come back from .map() as NaN.
for side in ('team1', 'team2'):
    df[f'{side}_idx'] = df[side].map(team_mapping)
df

Unnamed: 0,team1,team2,team1_score,team2_score,team1_leader_name,team1_leader_points,team1_leader_rebounds,team1_leader_assists,team1_leader_blocks,team1_leader_steals,...,period_Q6_team2,period_Q7_team1,period_Q7_team2,period_Q8_team1,period_Q8_team2,period_Q9_team1,period_Q9_team2,verdict,team1_idx,team2_idx
0,Bombers,Blackhawks,72,51,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.0
1,Royals,Redskins,108,75,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,15,14.0
2,Olympians,Nuggets,71,64,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,10.0
3,Stags,Knicks,87,89,,0,0,0,0,0,...,0,0,0,0,0,0,0,1,16,7.0
4,Capitols,Bullets,66,61,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,Hawks,Celtics,90,115,,0,0,0,0,0,...,0,0,0,0,0,0,0,1,6,4.0
2996,Knicks,Hawks,112,95,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,6.0
2997,Pistons,Celtics,94,105,,0,0,0,0,0,...,0,0,0,0,0,0,0,1,13,4.0
2998,Nationals,Warriors,103,85,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9,17.0


In [148]:
# Signed score margin from team1's point of view.
score_difference = df['score1'] - df['score2']

# Collapse the margin into an outcome label: 1 = team1 won, -1 = team1 lost, 0 = level.
outcome = np.select([score_difference > 0, score_difference < 0], [1, -1], default=0)

# Keep only what the sampler consumes: both team indices and the outcome label.
result_df = pd.DataFrame({
    'team1_idx': df['team1_idx'],
    'team2_idx': df['team2_idx'],
    'score_difference': outcome,
})

# Rows labelled 0 (no winner) are dropped.
result_df = result_df.loc[result_df['score_difference'] != 0]

result_df

Unnamed: 0,team1_idx,team2_idx,score_difference
0,1,0.0,1
1,15,14.0,1
2,11,10.0,1
3,16,7.0,-1
4,3,2.0,1
...,...,...,...
2995,6,4.0,-1
2996,7,6.0,1
2997,13,4.0,-1
2998,9,17.0,1


In [128]:
# Convert the DataFrame to a NumPy array.
# Columns keep result_df's order (team1_idx, team2_idx, score_difference);
# later cells unpack each row positionally in exactly that order.
result_array = result_df.to_numpy()
# result_array 

In [129]:
# Seed NumPy's global RNG so the shuffle below is repeatable.
np.random.seed(42)
# Shuffle the match rows in place.
# NOTE(review): because result_array is mutated, re-running only this cell
# shuffles an already-shuffled array; re-run the conversion cell first.
np.random.shuffle(result_array)
result_array

array([[16, 12,  0],
       [ 3, 11, -1],
       [ 6, 18, -1],
       ...,
       [16, 15, -1],
       [12,  2,  1],
       [ 5, 14,  0]])

In [130]:
# Per-team skill belief stored as [mu, sigma]; every team starts from the same prior.
# Each team gets its own inner list so later per-team overwrites stay independent.
skills_dist = []
for _ in unique_teams:
    skills_dist.append([mu_1, sigma_1])
skills_dist

[[25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334]]

# GIBBS SAMPLER

In [131]:
def truncated_normal(mean, std, lower, upper):
    """Draw one sample from a normal(mean, std) restricted to [lower, upper].

    ``scipy.stats.truncnorm`` takes its clip points in standardised form
    (offset from ``loc`` in units of ``scale``), so both bounds are converted
    before sampling.
    """
    z_low = (lower - mean) / std
    z_high = (upper - mean) / std
    return truncnorm.rvs(z_low, z_high, loc=mean, scale=std)

In [111]:
# # Gibbs Sampler
# def gibbs_sampler(mu_1, mu_2, sigma_1, sigma_2 ,y, iterations):
#     # Initialize t
#     # inital t ???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????
#     if y == 1:
#         t = np.abs(np.random.randn())
#     elif y == 0:
#         t = 0.001
#     else:
#         t = -np.abs(np.random.randn())

#     samples = []
    
#     for i in range(iterations):
#         # p(s1,s2|t,y)
#         # p(S|t,y)
#         # QUESTION 3.1
#         # Step 1: Draw s_1 and s_2 from the conditional distribution N(mean_s|t, cov_s|t) -
        
        
        
#         sigma_s_inv = np.array([[1.0/(sigma_1 * sigma_1), 0.0],
#                                 [0.0, 1.0/(sigma_2 * sigma_2)]])
        
#         sigma_t_s_inv = 1.0 / (sigma_t * sigma_t)
#         mu_s = np.array([mu_1, mu_2]).T.reshape(2,1) # 2 x 1 (2, 1)
#         A = np.array([1.0, -1.0]).reshape(1, 2) # 1 x 2
        
#         # (2x2)
#         cov_s_t = np.linalg.inv(sigma_s_inv + (A.T @ (sigma_t_s_inv * A) ) )
        
#          # (2, )
#         mean_s_t = cov_s_t @ ( (sigma_s_inv @ mu_s) + (A.T * sigma_t_s_inv * t) )
 
       
#         # Draw from the multivariate normal distribution
#         mean_s_t = mean_s_t.reshape(2,)
#         s_1, s_2 = multivariate_normal.rvs(mean=mean_s_t, cov=cov_s_t)

#         #DO WE NEED TO UPDATE: mu_1 +mu_2, sigma_1, sigma_2 ????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????

#         #p(t | s_1, s_2, y)
#         #p(t | S, y)

#         # QUESTION 3.2
#         # Step 2: Draw t from the conditional distribution 
#         mean_t = s_1 - s_2 
#         # print(s_1, s_2)
#         if y == 1:
#             t = truncated_normal(mean_t, sigma_t, 0, np.inf)  # For y = 1, t > 0
#         elif y == 0:
#             t = truncated_normal(mean_t, sigma_t, -0.5, 0.5)  # For y = 0
#         else:
#             t = truncated_normal(mean_t, sigma_t, -np.inf, 0)  # For y = -1, t < 0

#         # Store samples
#         samples.append((s_1, s_2, t))

#     # burn_in = int(0.3 * iterations)
#     burn_in = 25
#     samples = np.array(samples)
#     mean = np.mean(samples[burn_in:, :2], axis = 0)
#     covariance = np.cov(samples, rowvar = False)
#     return mean[0], mean[1], np.sqrt(covariance[0,0]), np.sqrt(covariance[1, 1])


# Run

In [112]:
# for team_1, team_2, y in result_array:
#     # print(team_1, team_2)
#     mu_1 = skills_dist[team_1][0]
#     sigma_1 = skills_dist[team_1][1]
#     mu_2 = skills_dist[team_2][0]
#     sigma_2 = skills_dist[team_2][1]
#     # print(mu_1, mu_2, sigma_1, sigma_2)
#     mu_1, mu_2, sigma_1, sigma_2 = gibbs_sampler(mu_1=mu_1,
#                                                     mu_2=mu_2, 
#                                                     sigma_1=sigma_1,
#                                                     sigma_2=sigma_2,
#                                                     y=y,
#                                                     iterations=iterations)
#     skills_dist[team_1][0] = mu_1
#     skills_dist[team_2][0] = mu_2
#     skills_dist[team_1][1] = sigma_1
#     skills_dist[team_2][1] = sigma_2
    
    


In [113]:
# Report each team's current skill belief, in team-index order.
for team_idx, (mean_skill, std_skill) in enumerate(skills_dist):
    team_name = unique_teams[team_idx]
    print(f"{team_name}: mean = {mean_skill:.3f}, std dev = {std_skill:.3f}")

Atalanta: mean = 27.214, std dev = 0.925
Bologna: mean = 24.675, std dev = 0.891
Cagliari: mean = 24.590, std dev = 0.878
Chievo: mean = 22.491, std dev = 1.020
Empoli: mean = 23.817, std dev = 0.974
Fiorentina: mean = 24.928, std dev = 0.871
Frosinone: mean = 22.864, std dev = 0.918
Genoa: mean = 24.858, std dev = 0.951
Inter: mean = 27.544, std dev = 0.911
Juventus: mean = 29.120, std dev = 1.036
Lazio: mean = 25.955, std dev = 0.908
Milan: mean = 26.651, std dev = 0.913
Napoli: mean = 28.607, std dev = 1.126
Parma: mean = 24.789, std dev = 0.870
Roma: mean = 27.172, std dev = 0.847
Sampdoria: mean = 25.660, std dev = 1.007
Sassuolo: mean = 24.536, std dev = 0.893
Spal: mean = 24.801, std dev = 0.966
Torino: mean = 26.834, std dev = 0.938
Udinese: mean = 24.713, std dev = 0.879


In [114]:
# Rank teams by the conservative skill estimate mu - 3 * sigma, best first.
def _conservative_skill(entry):
    """Sort key for an (index, [mu, sigma]) pair: mu minus three sigmas."""
    belief = entry[1]
    return belief[0] - 3.0 * belief[1]

ranked_teams = sorted(enumerate(skills_dist), key=_conservative_skill, reverse=True)

# Print the ranking together with each team's raw mu and sigma.
for position, (team_idx, (mu, sigma)) in enumerate(ranked_teams, start=1):
    print(f"Rank {position}: Team {unique_teams[team_idx]} - Skill: {mu}, Std Dev: {sigma}")

Rank 1: Team Juventus - Skill: 29.119687850962695, Std Dev: 1.036048889725767
Rank 2: Team Napoli - Skill: 28.606554393323893, Std Dev: 1.1259938786136665
Rank 3: Team Inter - Skill: 27.543875399093913, Std Dev: 0.9110328227896657
Rank 4: Team Roma - Skill: 27.172481724250545, Std Dev: 0.8473000817636326
Rank 5: Team Atalanta - Skill: 27.21372473292239, Std Dev: 0.9245500941609169
Rank 6: Team Torino - Skill: 26.834426033646846, Std Dev: 0.9379148099135334
Rank 7: Team Milan - Skill: 26.6505285508516, Std Dev: 0.9127983763130172
Rank 8: Team Lazio - Skill: 25.95482767420138, Std Dev: 0.9078340667530229
Rank 9: Team Sampdoria - Skill: 25.660054751080196, Std Dev: 1.0067234386918236
Rank 10: Team Fiorentina - Skill: 24.92804603285979, Std Dev: 0.8707827266321985
Rank 11: Team Parma - Skill: 24.789219150409146, Std Dev: 0.8696423676730425
Rank 12: Team Udinese - Skill: 24.713135681189005, Std Dev: 0.8787918919076151
Rank 13: Team Genoa - Skill: 24.85825320662382, Std Dev: 0.95069437573623

In [138]:
def truncated_normal(mean, std, lower, upper):
    """Sample once from a normal(mean, std) truncated to the interval [lower, upper]."""
    # truncnorm wants its bounds as z-scores: distance from `mean` in units of `std`.
    bounds = [(edge - mean) / std for edge in (lower, upper)]
    return truncnorm.rvs(bounds[0], bounds[1], loc=mean, scale=std)

def get_team_skills(team_A, team_B, skills_dist=skills_dist):
    """Look up the current [mu, sigma] belief of two teams.

    ``team_A`` / ``team_B`` are used as indices into ``skills_dist``.
    Returns the flat tuple ``(mu_A, sigma_A, mu_B, sigma_B)``.

    The default ``skills_dist`` is the table object that existed when this
    cell was executed, not whatever the global name points to at call time.
    """
    (mu_A, sigma_A), (mu_B, sigma_B) = skills_dist[team_A], skills_dist[team_B]
    return mu_A, sigma_A, mu_B, sigma_B

def set_team_skills(team_A, mu_A, sigma_A, team_B, mu_B, sigma_B, skills_dist=skills_dist):
    """Overwrite the stored [mu, sigma] entry of both teams in ``skills_dist`` (in place)."""
    updates = ((team_A, mu_A, sigma_A), (team_B, mu_B, sigma_B))
    for team, mu, sigma in updates:
        skills_dist[team] = [mu, sigma]


def phi_value(value, mu_A, sigma_A, mu_B, sigma_B, sigma_t=sigma_t):
    """Return Phi((value + mu_B - mu_A) / sqrt(sigma_A^2 + sigma_B^2 + sigma_t^2)).

    Phi is the standard normal CDF, so the result is the probability that a
    normal variable with mean ``mu_A - mu_B`` and variance
    ``sigma_A^2 + sigma_B^2 + sigma_t^2`` is at most ``value``.
    """
    total_variance = sigma_A * sigma_A + sigma_B * sigma_B + sigma_t * sigma_t
    z = (value + mu_B - mu_A) / np.sqrt(total_variance)
    return norm.cdf(z)

def predict_winner(mu_A, sigma_A, mu_B, sigma_B):
    """Predict the most probable result from team A's point of view.

    Returns 1 if a win for A is most likely, -1 for a loss, 0 for a draw.
    The outcome variable is split at +/-0.5: above 0.5 counts as a win,
    below -0.5 as a loss, and the band in between as a draw.  On exact ties
    the earlier entry of (win, loss, draw) is chosen.
    """
    p_win = 1 - phi_value(0.5, mu_A, sigma_A, mu_B, sigma_B)
    p_loss = phi_value(-0.5, mu_A, sigma_A, mu_B, sigma_B)
    p_draw = 1 - p_win - p_loss

    # Labels listed in the same order as the probabilities handed to argmax.
    labels = (1, -1, 0)
    best = np.argmax(np.array([p_win, p_loss, p_draw]))
    return labels[best]

In [139]:
# Gibbs Sampler
def gibbs_sampler(mu_1, sigma_1, mu_2, sigma_2, y, iterations, burn_in=25):
    """Estimate the posterior skills of two teams after one observed match.

    Alternates between the two full conditionals
      * p(s_1, s_2 | t): a bivariate normal combining the independent priors
        N(mu_i, sigma_i^2) with the likelihood t ~ N(s_1 - s_2, sigma_t^2);
      * p(t | s_1, s_2, y): N(s_1 - s_2, sigma_t^2) truncated to the region
        that matches the observed result ``y``.
    ``sigma_t`` is read from the notebook-level global of that name.

    Parameters
    ----------
    mu_1, sigma_1 : prior mean / std of the first team's skill.
    mu_2, sigma_2 : prior mean / std of the second team's skill.
    y : observed result; 1 keeps t > 0, 0 keeps t in [-0.5, 0.5], any other
        value is treated as a loss for the first team (t < 0).
    iterations : total number of Gibbs sweeps, burn-in included.
    burn_in : number of initial sweeps discarded before summarising.

    Returns
    -------
    (mean_1, mean_2, std_1, std_2)
        Posterior sample means of both skills followed by their sample
        standard deviations.  Note the order: both means come first.

    Raises
    ------
    ValueError
        If fewer than two sweeps remain after burn-in, since a mean and a
        standard deviation cannot be estimated from them.
    """
    if iterations - burn_in < 2:
        raise ValueError(
            f"iterations ({iterations}) must exceed burn_in ({burn_in}) by at least 2"
        )

    # Start t on the side of zero that agrees with the observed result.
    if y == 1:
        t = np.abs(np.random.randn())
    elif y == 0:
        t = 0.001
    else:
        t = -np.abs(np.random.randn())

    # Truncation interval for t implied by the result; fixed for the whole run.
    # NOTE(review): draws use |t| <= 0.5 while wins/losses split at 0, whereas
    # predict_winner splits all three outcomes at +/-0.5 -- confirm which
    # margin convention is intended.
    if y == 1:
        t_lower, t_upper = 0, np.inf
    elif y == 0:
        t_lower, t_upper = -0.5, 0.5
    else:
        t_lower, t_upper = -np.inf, 0

    # Everything below depends only on the priors and sigma_t, not on the
    # current t, so it is computed once instead of on every sweep.
    sigma_s_inv = np.array([[1.0 / (sigma_1 * sigma_1), 0.0],
                            [0.0, 1.0 / (sigma_2 * sigma_2)]])   # prior precision (2x2)
    sigma_t_s_inv = 1.0 / (sigma_t * sigma_t)                     # likelihood precision
    mu_s = np.array([mu_1, mu_2]).reshape(2, 1)                   # prior mean (2x1)
    A = np.array([1.0, -1.0]).reshape(1, 2)                       # t = A @ s + noise
    cov_s_t = np.linalg.inv(sigma_s_inv + (A.T @ (sigma_t_s_inv * A)))
    prior_term = sigma_s_inv @ mu_s

    samples = []
    for _ in range(iterations):
        # Step 1: draw (s_1, s_2) from the bivariate normal p(s | t).
        mean_s_t = (cov_s_t @ (prior_term + (A.T * sigma_t_s_inv * t))).reshape(2,)
        s_1, s_2 = multivariate_normal.rvs(mean=mean_s_t, cov=cov_s_t)

        # Step 2: draw t from the truncated normal p(t | s, y).
        t = truncated_normal(s_1 - s_2, sigma_t, t_lower, t_upper)

        samples.append((s_1, s_2, t))

    # Summarise the skill draws only, and only those after burn-in, so that
    # the mean and the spread are estimated from the same set of samples.
    kept = np.array(samples)[burn_in:, :2]
    mean = np.mean(kept, axis=0)
    covariance = np.cov(kept, rowvar=False)
    return mean[0], mean[1], np.sqrt(covariance[0, 0]), np.sqrt(covariance[1, 1])

In [140]:
# Display the shuffled match array that the evaluation loop iterates over.
result_array

array([[16, 12,  0],
       [ 3, 11, -1],
       [ 6, 18, -1],
       ...,
       [16, 15, -1],
       [12,  2,  1],
       [ 5, 14,  0]])

In [141]:
# Online evaluation: for every match, predict from the current beliefs FIRST,
# then update those beliefs with the observed result.
predicted_y = []
for team_A, team_B, y in result_array:
    mu_A, sigma_A, mu_B, sigma_B = get_team_skills(team_A, team_B)
    # The prediction is made before this match's result is used for any update.
    predicted_y.append(predict_winner(mu_A, sigma_A, mu_B, sigma_B))


    # gibbs_sampler returns (mean_A, mean_B, std_A, std_B): both means first.
    mu_A_new, mu_B_new, sigma_A_new, sigma_B_new = gibbs_sampler(mu_A, sigma_A, mu_B, sigma_B,
                                                    y=y,
                                                    iterations=iterations)

    # Store the sampled posterior so it is read back as the prior next time.
    set_team_skills(team_A, mu_A_new, sigma_A_new, team_B, mu_B_new, sigma_B_new)

# Observed outcomes are the last column of result_array.
true_y = result_array[:, -1]
# predicted_y (a list) is compared element-wise against the true_y array.
correct_guesses = np.sum(predicted_y == true_y)
r = correct_guesses / len(true_y)  # fraction of matches predicted correctly
print(f"r is {r:.2f}")

r is 0.51


In [119]:
# Display the observed outcomes (last column of result_array).
true_y

array([ 0, -1, -1,  1,  1,  0,  0,  1,  1,  0,  1, -1,  1,  0,  1,  1, -1,
        0, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1,  0,  1, -1, -1,
        0,  0,  1,  1, -1,  1,  0,  0,  1,  0,  0,  0,  0, -1, -1,  1, -1,
        0,  1,  1,  0,  1,  1, -1,  0,  0, -1,  1,  0,  1,  1, -1,  0,  1,
        1, -1,  1, -1, -1,  1,  0,  0,  1,  1,  1, -1, -1,  1,  1, -1,  0,
        1, -1,  0,  0,  1, -1,  1,  0,  1, -1,  0, -1, -1,  1,  1,  1,  1,
        0,  1, -1, -1,  0,  0,  1,  0,  0, -1,  1,  0,  0, -1,  1,  0, -1,
        1,  0,  1, -1,  0,  1, -1,  1, -1,  1,  1,  1,  0,  1, -1,  1, -1,
        1, -1,  0, -1, -1, -1,  1,  0,  1, -1,  1,  0,  0, -1, -1,  1,  1,
        1,  1,  1,  0,  1,  1,  1,  0,  1,  0,  1,  1,  1,  1,  0,  1, -1,
       -1, -1,  0,  1,  1,  1,  0, -1,  0,  0,  1,  0, -1,  0, -1,  1,  1,
        0, -1,  0,  0,  1, -1,  0, -1,  0,  1,  1, -1,  0,  1,  1, -1,  1,
       -1, -1,  0, -1,  1,  1,  1, -1, -1,  1,  0, -1,  0,  1, -1,  0,  1,
        1,  0,  1,  0,  1

In [120]:
# Display the per-match predictions collected by the evaluation loop.
predicted_y

[-1,
 -1,
 -1,
 1,
 1,
 -1,
 -1,
 0,
 1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 0,
 -1,
 0,
 0,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 1,
 1,
 0,
 -1,
 -1,
 0,
 0,
 1,
 1,
 -1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 -1,
 -1,
 0,
 1,
 -1,
 1,
 1,
 0,
 1,
 -1,
 0,
 -1,
 0,
 -1,
 0,
 1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 0,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 0,
 1,
 1,
 1,
 -1,
 -1,
 0,
 1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 0,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 1,
 0,
 0,
 -1,
 1,
 1,
 -1,
 1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 1,
 0,
 1,
 -1,
 -1,
 -1,
 -1,
 0,
 -1,
 1,
 1,
 1,
 1,
 -1,
 -1,
 0,
 0,
 1,
 -1,
 1,
 -1,
 1,
 0,
 1,
 1,
 1,
 1,
 -1,
 1,
 0,
 1,
 -1,
 1,
 0,
 0,
 1,
 -1,
 1,
 0,
 -1,
 0,
 1,
 -1,
 0,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 0,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 0,
 -1,
 1,
 -1,
 -1,
 0,
 -1,
 0,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 1,
 0,
 1,
 0,
 -1,
 -1,
 1,
 1,
 1,
 0,
 0,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 0,
 1,
 -1,
 -1,
 1,
 1,
 