In [304]:
from scipy.stats import truncnorm, multivariate_normal, norm
import numpy as np
import matplotlib.pyplot as plt
import time
import seaborn as sns
import pandas as pd

# Hyperparameteres

In [324]:
# Parameters
mu_1 = 25
mu_2 = 25
sigma_1 = 25/3
sigma_2 = 25/3
sigma_t = 25/6
iterations = 800

# Loading Dataset

In [306]:
df = pd.read_csv('SerieA.csv')
unique_teams = pd.concat([df['team1'], df['team1']], ignore_index=True).unique().tolist() # list
unique_teams.sort()
# unique_teams = np.array(unique_teams)
unique_teams # 0, Atlanta

['Atalanta',
 'Bologna',
 'Cagliari',
 'Chievo',
 'Empoli',
 'Fiorentina',
 'Frosinone',
 'Genoa',
 'Inter',
 'Juventus',
 'Lazio',
 'Milan',
 'Napoli',
 'Parma',
 'Roma',
 'Sampdoria',
 'Sassuolo',
 'Spal',
 'Torino',
 'Udinese']

In [307]:
team_mapping = {}
for i, team in enumerate(unique_teams):
    team_mapping[team] = i
team_mapping # Atlanta, 0

{'Atalanta': 0,
 'Bologna': 1,
 'Cagliari': 2,
 'Chievo': 3,
 'Empoli': 4,
 'Fiorentina': 5,
 'Frosinone': 6,
 'Genoa': 7,
 'Inter': 8,
 'Juventus': 9,
 'Lazio': 10,
 'Milan': 11,
 'Napoli': 12,
 'Parma': 13,
 'Roma': 14,
 'Sampdoria': 15,
 'Sassuolo': 16,
 'Spal': 17,
 'Torino': 18,
 'Udinese': 19}

In [308]:
df['team1_idx'] = df['team1'].map(team_mapping)
df['team2_idx'] = df['team2'].map(team_mapping)
df

Unnamed: 0,yyyy-mm-dd,HH:MM,team1,team2,score1,score2,team1_idx,team2_idx
0,2018-08-18,18:00,Chievo,Juventus,2,3,3,9
1,2018-08-18,20:30,Lazio,Napoli,1,2,10,12
2,2018-08-19,18:00,Torino,Roma,0,1,18,14
3,2018-08-19,20:30,Sassuolo,Inter,1,0,16,8
4,2018-08-19,20:30,Parma,Udinese,2,2,13,19
...,...,...,...,...,...,...,...,...
375,2019-05-26,20:30,Roma,Parma,2,1,14,13
376,2019-05-26,20:30,Inter,Empoli,2,1,8,4
377,2019-05-26,20:30,Fiorentina,Genoa,0,0,5,7
378,2019-05-26,20:30,Cagliari,Udinese,1,2,2,19


In [309]:
score_difference = df['score1'] - df['score2']

# Create a new DataFrame with the desired columns
result_df = pd.DataFrame({
    'team1_idx': df['team1_idx'],
    'team2_idx': df['team2_idx'],
    'score_difference': np.where(df['score1'] - df['score2'] > 0, 1, 
                                   np.where(df['score1'] - df['score2'] < 0, -1, 0))
})

# Remove rows where score_difference is zero
result_df = result_df[result_df['score_difference'] != 0]

result_df

Unnamed: 0,team1_idx,team2_idx,score_difference
0,3,9,-1
1,10,12,-1
2,18,14,-1
3,16,8,1
5,4,2,1
...,...,...,...
374,17,11,-1
375,14,13,1
376,8,4,1
378,2,19,-1


In [310]:
# Convert the DataFrame to a NumPy array
result_array = result_df.to_numpy()
result_array 

array([[ 3,  9, -1],
       [10, 12, -1],
       [18, 14, -1],
       [16,  8,  1],
       [ 4,  2,  1],
       [ 1, 17, -1],
       [ 0,  6,  1],
       [ 9, 10,  1],
       [12, 11,  1],
       [17, 13,  1],
       [19, 15,  1],
       [ 7,  4,  1],
       [ 5,  3,  1],
       [11, 14,  1],
       [ 1,  8, -1],
       [13,  9, -1],
       [ 5, 19,  1],
       [18, 17,  1],
       [16,  7,  1],
       [15, 12,  1],
       [10,  6,  1],
       [ 0,  2, -1],
       [ 8, 13, -1],
       [12,  5,  1],
       [ 6, 15, -1],
       [ 9, 16,  1],
       [ 7,  1,  1],
       [ 4, 10, -1],
       [17,  0,  1],
       [16,  4,  1],
       [13,  2,  1],
       [ 5, 17,  1],
       [15,  8, -1],
       [18, 12, -1],
       [10,  7,  1],
       [ 3, 19, -1],
       [ 1, 14,  1],
       [ 6,  9, -1],
       [ 8,  5,  1],
       [19, 10, -1],
       [14,  6,  1],
       [12, 13,  1],
       [ 9,  1,  1],
       [ 7,  3,  1],
       [17, 16, -1],
       [14, 10,  1],
       [ 9, 12,  1],
       [ 8,  

In [325]:
# MU and Sigma for each team
skills_dist = [[mu_1, sigma_1] for _ in unique_teams]
skills_dist

[[25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334],
 [25, 8.333333333333334]]

# GIBBS SAMPLER

In [312]:
# Helper function to sample from a truncated normal distribution
def truncated_normal(mean, std, lower, upper):
    a, b = (lower - mean) / std, (upper - mean) / std
    return truncnorm.rvs(a, b, loc=mean, scale=std)

In [318]:
# Gibbs Sampler
def gibbs_sampler(mu_1, mu_2, sigma_1, sigma_2 ,y, iterations):
    # Initialize t
    # inital t ???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????
    if y == 1:
        t = np.abs(np.random.randn())
    else:
        t = -np.abs(np.random.randn())

    
    samples = []
    
    for i in range(iterations):
        # p(s1,s2|t,y)
        # p(S|t,y)
        # QUESTION 3.1
        # Step 1: Draw s_1 and s_2 from the conditional distribution N(mean_s|t, cov_s|t) -
        
        
        
        sigma_s_inv = np.array([[1.0/(sigma_1 * sigma_1), 0.0],
                                [0.0, 1.0/(sigma_2 * sigma_2)]])
        
        sigma_t_s_inv = 1.0 / (sigma_t * sigma_t)
        mu_s = np.array([mu_1, mu_2]).T.reshape(2,1) # 2 x 1 (2, 1)
        A = np.array([1.0, -1.0]).reshape(1, 2) # 1 x 2
        
        # (2x2)
        cov_s_t = np.linalg.inv(sigma_s_inv + (A.T @ (sigma_t_s_inv * A) ) )
        
         # (2, )
        mean_s_t = cov_s_t @ ( (sigma_s_inv @ mu_s) + (A.T * sigma_t_s_inv * t) )
 
       
        # Draw from the multivariate normal distribution
        mean_s_t = mean_s_t.reshape(2,)
        s_1, s_2 = multivariate_normal.rvs(mean=mean_s_t, cov=cov_s_t)

        #DO WE NEED TO UPDATE: mu_1 +mu_2, sigma_1, sigma_2 ????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????

        #p(t | s_1, s_2, y)
        #p(t | S, y)

        # QUESTION 3.2
        # Step 2: Draw t from the conditional distribution 
        mean_t = s_1 - s_2 
        # print(s_1, s_2)
        if y == 1:
            t = truncated_normal(mean_t, sigma_t, 0, np.inf)  # For y = 1, t > 0
        else:
            t = truncated_normal(mean_t, sigma_t, -np.inf, 0)  # For y = -1, t < 0

        # Store samples
        samples.append((s_1, s_2, t))

    # burn_in = int(0.3 * iterations)
    burn_in = 25
    samples = np.array(samples)
    mean = np.mean(samples[burn_in:, :2], axis = 0)
    covariance = np.cov(samples, rowvar = False)
    return mean[0], mean[1], covariance[0,0], covariance[1, 1]


# Run

In [326]:
for team_1, team_2, y in result_array:
    print(team_1, team_2)
    mu_1 = skills_dist[team_1][0]
    sigma_1 = skills_dist[team_1][1]
    mu_2 = skills_dist[team_2][0]
    sigma_2 = skills_dist[team_2][1]
    print(mu_1, mu_2, sigma_1, sigma_2)
    mu_1, mu_2, sigma_1, sigma_2 = gibbs_sampler(mu_1=mu_1,
                                                    mu_2=mu_2, 
                                                    sigma_1=sigma_1,
                                                    sigma_2=sigma_2,
                                                    y=y,
                                                    iterations=iterations)
    skills_dist[team_1][0] = mu_1
    skills_dist[team_2][0] = mu_2
    skills_dist[team_1][1] = sigma_1
    skills_dist[team_2][1] = sigma_2
    
    


3 9
25 25 8.333333333333334 8.333333333333334
10 12
25 25 8.333333333333334 8.333333333333334
18 14
25 25 8.333333333333334 8.333333333333334
16 8
25 25 8.333333333333334 8.333333333333334
4 2
25 25 8.333333333333334 8.333333333333334
1 17
25 25 8.333333333333334 8.333333333333334
0 6
25 25 8.333333333333334 8.333333333333334
9 10
29.73269455994669 20.441458716540037 58.728635630123044 51.6290496550443
12 11
29.856347307429008 25 51.22518233742342 8.333333333333334
17 13
29.002345655713068 25 48.52415003949262 8.333333333333334
19 15
25 25 8.333333333333334 8.333333333333334
7 4
25 29.502809938713398 8.333333333333334 44.909528534615106
5 3
25 20.404850392245798 8.333333333333334 54.494058618161354
11 14
24.07309270796229 29.111129399107227 64.06112222530095 51.43115471333715
1 8
20.652603398239368 20.407330295766506 47.71400595496527 49.541958489747685
13 9
24.18481190405172 42.75513524529001 70.09089325477215 1591.4019555406894
5 19
26.502605370945272 29.75859647553627 69.07227362437

LinAlgError: Singular matrix