In [335]:
from scipy.stats import truncnorm, multivariate_normal, norm
import numpy as np
import matplotlib.pyplot as plt
import time
import seaborn as sns
import pandas as pd

# Hyperparameteres

In [367]:
# Parameters
mu_1 = 25
mu_2 = 25
sigma_1 = 25/3
sigma_2 = 25/3
sigma_t = 25/6
iterations = 800

# Loading Dataset

In [368]:
df = pd.read_csv('SerieA.csv')
unique_teams = pd.concat([df['team1'], df['team1']], ignore_index=True).unique().tolist() # list
unique_teams.sort()
# unique_teams = np.array(unique_teams)
unique_teams # 0, Atlanta

['Atalanta',
 'Bologna',
 'Cagliari',
 'Chievo',
 'Empoli',
 'Fiorentina',
 'Frosinone',
 'Genoa',
 'Inter',
 'Juventus',
 'Lazio',
 'Milan',
 'Napoli',
 'Parma',
 'Roma',
 'Sampdoria',
 'Sassuolo',
 'Spal',
 'Torino',
 'Udinese']

In [369]:
team_mapping = {}
for i, team in enumerate(unique_teams):
    team_mapping[team] = i
team_mapping # Atlanta, 0

{'Atalanta': 0,
 'Bologna': 1,
 'Cagliari': 2,
 'Chievo': 3,
 'Empoli': 4,
 'Fiorentina': 5,
 'Frosinone': 6,
 'Genoa': 7,
 'Inter': 8,
 'Juventus': 9,
 'Lazio': 10,
 'Milan': 11,
 'Napoli': 12,
 'Parma': 13,
 'Roma': 14,
 'Sampdoria': 15,
 'Sassuolo': 16,
 'Spal': 17,
 'Torino': 18,
 'Udinese': 19}

In [370]:
df['team1_idx'] = df['team1'].map(team_mapping)
df['team2_idx'] = df['team2'].map(team_mapping)
df

Unnamed: 0,yyyy-mm-dd,HH:MM,team1,team2,score1,score2,team1_idx,team2_idx
0,2018-08-18,18:00,Chievo,Juventus,2,3,3,9
1,2018-08-18,20:30,Lazio,Napoli,1,2,10,12
2,2018-08-19,18:00,Torino,Roma,0,1,18,14
3,2018-08-19,20:30,Sassuolo,Inter,1,0,16,8
4,2018-08-19,20:30,Parma,Udinese,2,2,13,19
...,...,...,...,...,...,...,...,...
375,2019-05-26,20:30,Roma,Parma,2,1,14,13
376,2019-05-26,20:30,Inter,Empoli,2,1,8,4
377,2019-05-26,20:30,Fiorentina,Genoa,0,0,5,7
378,2019-05-26,20:30,Cagliari,Udinese,1,2,2,19


In [371]:
score_difference = df['score1'] - df['score2']

# Create a new DataFrame with the desired columns
result_df = pd.DataFrame({
    'team1_idx': df['team1_idx'],
    'team2_idx': df['team2_idx'],
    'score_difference': np.where(df['score1'] - df['score2'] > 0, 1, 
                                   np.where(df['score1'] - df['score2'] < 0, -1, 0))
})

# Remove rows where score_difference is zero
result_df = result_df[result_df['score_difference'] != 0]

result_df

Unnamed: 0,team1_idx,team2_idx,score_difference
0,3,9,-1
1,10,12,-1
2,18,14,-1
3,16,8,1
5,4,2,1
...,...,...,...
374,17,11,-1
375,14,13,1
376,8,4,1
378,2,19,-1


In [372]:
# Convert the DataFrame to a NumPy array
result_array = result_df.to_numpy()
# result_array 

In [373]:
np.random.seed(42)
np.random.shuffle(result_array)
result_array

array([[13,  2,  1],
       [ 7,  0,  1],
       [ 9,  2,  1],
       [19,  2,  1],
       [10, 13,  1],
       [12, 10,  1],
       [ 2,  9, -1],
       [14, 10,  1],
       [16,  9, -1],
       [12,  2,  1],
       [ 9, 19,  1],
       [ 8,  4,  1],
       [ 3, 12, -1],
       [ 3, 16, -1],
       [ 7, 10,  1],
       [18,  1, -1],
       [15,  7,  1],
       [10,  9, -1],
       [ 4,  1,  1],
       [11,  3,  1],
       [ 2, 17,  1],
       [17, 11, -1],
       [ 9, 12,  1],
       [ 9,  1,  1],
       [ 3, 11, -1],
       [17, 13,  1],
       [ 8, 13, -1],
       [11,  8, -1],
       [ 5,  4,  1],
       [ 6, 15, -1],
       [10,  2,  1],
       [ 3,  0, -1],
       [ 1,  6, -1],
       [14, 19,  1],
       [18, 16,  1],
       [ 0,  6,  1],
       [ 5, 13, -1],
       [13, 10, -1],
       [15,  3,  1],
       [ 9, 14,  1],
       [ 9, 16,  1],
       [ 8, 12,  1],
       [ 3, 17, -1],
       [15, 12,  1],
       [12,  4,  1],
       [ 0,  1,  1],
       [11,  9, -1],
       [ 8, 1

In [374]:
# MU and Sigma for each team
skills_dist = [[mu_1, sigma_1] for _ in unique_teams]
skills_dist

[[25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334]]

# GIBBS SAMPLER

In [375]:
# Helper function to sample from a truncated normal distribution
def truncated_normal(mean, std, lower, upper):
    a, b = (lower - mean) / std, (upper - mean) / std
    return truncnorm.rvs(a, b, loc=mean, scale=std)

In [376]:
# Gibbs Sampler
def gibbs_sampler(mu_1, mu_2, sigma_1, sigma_2 ,y, iterations):
    # Initialize t
    # inital t ???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????
    if y == 1:
        t = np.abs(np.random.randn())
    else:
        t = -np.abs(np.random.randn())

    
    samples = []
    
    for i in range(iterations):
        # p(s1,s2|t,y)
        # p(S|t,y)
        # QUESTION 3.1
        # Step 1: Draw s_1 and s_2 from the conditional distribution N(mean_s|t, cov_s|t) -
        
        
        
        sigma_s_inv = np.array([[1.0/(sigma_1 * sigma_1), 0.0],
                                [0.0, 1.0/(sigma_2 * sigma_2)]])
        
        sigma_t_s_inv = 1.0 / (sigma_t * sigma_t)
        mu_s = np.array([mu_1, mu_2]).T.reshape(2,1) # 2 x 1 (2, 1)
        A = np.array([1.0, -1.0]).reshape(1, 2) # 1 x 2
        
        # (2x2)
        cov_s_t = np.linalg.inv(sigma_s_inv + (A.T @ (sigma_t_s_inv * A) ) )
        
         # (2, )
        mean_s_t = cov_s_t @ ( (sigma_s_inv @ mu_s) + (A.T * sigma_t_s_inv * t) )
 
       
        # Draw from the multivariate normal distribution
        mean_s_t = mean_s_t.reshape(2,)
        s_1, s_2 = multivariate_normal.rvs(mean=mean_s_t, cov=cov_s_t)

        #DO WE NEED TO UPDATE: mu_1 +mu_2, sigma_1, sigma_2 ????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????

        #p(t | s_1, s_2, y)
        #p(t | S, y)

        # QUESTION 3.2
        # Step 2: Draw t from the conditional distribution 
        mean_t = s_1 - s_2 
        # print(s_1, s_2)
        if y == 1:
            t = truncated_normal(mean_t, sigma_t, 0, np.inf)  # For y = 1, t > 0
        else:
            t = truncated_normal(mean_t, sigma_t, -np.inf, 0)  # For y = -1, t < 0

        # Store samples
        samples.append((s_1, s_2, t))

    # burn_in = int(0.3 * iterations)
    burn_in = 25
    samples = np.array(samples)
    mean = np.mean(samples[burn_in:, :2], axis = 0)
    covariance = np.cov(samples, rowvar = False)
    return mean[0], mean[1], np.sqrt(covariance[0,0]), np.sqrt(covariance[1, 1])


# Run

In [378]:
for team_1, team_2, y in result_array:
    # print(team_1, team_2)
    mu_1 = skills_dist[team_1][0]
    sigma_1 = skills_dist[team_1][1]
    mu_2 = skills_dist[team_2][0]
    sigma_2 = skills_dist[team_2][1]
    # print(mu_1, mu_2, sigma_1, sigma_2)
    mu_1, mu_2, sigma_1, sigma_2 = gibbs_sampler(mu_1=mu_1,
                                                    mu_2=mu_2, 
                                                    sigma_1=sigma_1,
                                                    sigma_2=sigma_2,
                                                    y=y,
                                                    iterations=iterations)
    skills_dist[team_1][0] = mu_1
    skills_dist[team_2][0] = mu_2
    skills_dist[team_1][1] = sigma_1
    skills_dist[team_2][1] = sigma_2
    
    


In [379]:
for idx, (m, s) in enumerate(skills_dist):
    print(f"{unique_teams[idx]}: mean = {m:.3f}, std dev = {s:.3f}")

Atalanta: mean = 27.634, std dev = 1.229
Bologna: mean = 25.189, std dev = 1.314
Cagliari: mean = 23.401, std dev = 1.304
Chievo: mean = 17.624, std dev = 2.362
Empoli: mean = 23.423, std dev = 1.359
Fiorentina: mean = 23.118, std dev = 1.486
Frosinone: mean = 20.928, std dev = 1.474
Genoa: mean = 23.670, std dev = 1.386
Inter: mean = 27.862, std dev = 1.194
Juventus: mean = 30.662, std dev = 1.694
Lazio: mean = 26.057, std dev = 1.329
Milan: mean = 28.786, std dev = 1.256
Napoli: mean = 31.297, std dev = 1.443
Parma: mean = 23.084, std dev = 1.340
Roma: mean = 28.320, std dev = 1.215
Sampdoria: mean = 25.643, std dev = 1.234
Sassuolo: mean = 23.725, std dev = 1.548
Spal: mean = 24.481, std dev = 1.214
Torino: mean = 28.241, std dev = 1.479
Udinese: mean = 24.166, std dev = 1.188


In [380]:
# ranked based on mu - 3 * sigma
ranked_teams = sorted(enumerate(skills_dist), key=lambda x: x[1][0] - 3.0 * x[1][1], reverse=True)
# Print ranked teams with their skills
for rank, (idx, (m, s)) in enumerate(ranked_teams, start=1):
    print(f"Rank {rank}: Team {unique_teams[idx]} - Skill: {m}, Std Dev: {s}")

Rank 1: Team Napoli - Skill: 31.297346530129918, Std Dev: 1.4425089928871946
Rank 2: Team Juventus - Skill: 30.66203454627061, Std Dev: 1.6942966928489058
Rank 3: Team Milan - Skill: 28.785776404929365, Std Dev: 1.2559560927657034
Rank 4: Team Roma - Skill: 28.320310184097277, Std Dev: 1.2147029654082766
Rank 5: Team Inter - Skill: 27.861631179097078, Std Dev: 1.194272891633878
Rank 6: Team Atalanta - Skill: 27.633721250244953, Std Dev: 1.2291048070015043
Rank 7: Team Torino - Skill: 28.24146048795454, Std Dev: 1.478821894047611
Rank 8: Team Lazio - Skill: 26.057061867965295, Std Dev: 1.3288288169873779
Rank 9: Team Sampdoria - Skill: 25.643499933426007, Std Dev: 1.2343168189992302
Rank 10: Team Bologna - Skill: 25.18904220384615, Std Dev: 1.314040165959835
Rank 11: Team Spal - Skill: 24.480909633982733, Std Dev: 1.2142177513966985
Rank 12: Team Udinese - Skill: 24.166063727615622, Std Dev: 1.1881028208083941
Rank 13: Team Genoa - Skill: 23.669841204463175, Std Dev: 1.3857479650144615
