In [2]:
from scipy.stats import truncnorm, multivariate_normal, norm
import numpy as np
import matplotlib.pyplot as plt
import time
import seaborn as sns
import pandas as pd

# Hyperparameters

In [3]:
# Gaussian prior on skill: both players of a match start from the same
# mean and standard deviation.
mu_1 = mu_2 = 25
sigma_1 = sigma_2 = 25/3
# Standard deviation of the outcome variable t around s_1 - s_2.
sigma_t = 25/6
# Number of Gibbs iterations per match and how many leading samples to drop.
iterations = 1200
burn_in = 25 # Q4
# for Q5.2, changing the order of the matches.
num_seeds = 42

# Loading Dataset

In [4]:
df = pd.read_csv('SerieA.csv')
# Collect every team that shows up on either side of a fixture. Both columns
# are needed: a team that only ever appears as team2 would otherwise get no
# index and map to NaN further down.
unique_teams = sorted(
    pd.concat([df['team1'], df['team2']], ignore_index=True).unique().tolist()
)
# Sorted list of names; a team's position in it is used as its integer index
# (e.g. position 0 is the alphabetically first team).
unique_teams

['Atalanta',
 'Bologna',
 'Cagliari',
 'Chievo',
 'Empoli',
 'Fiorentina',
 'Frosinone',
 'Genoa',
 'Inter',
 'Juventus',
 'Lazio',
 'Milan',
 'Napoli',
 'Parma',
 'Roma',
 'Sampdoria',
 'Sassuolo',
 'Spal',
 'Torino',
 'Udinese']

In [5]:
# Reverse lookup of unique_teams: team name -> its position in the sorted list.
team_mapping = {name: position for position, name in enumerate(unique_teams)}
team_mapping # e.g. Atalanta -> 0

{'Atalanta': 0,
 'Bologna': 1,
 'Cagliari': 2,
 'Chievo': 3,
 'Empoli': 4,
 'Fiorentina': 5,
 'Frosinone': 6,
 'Genoa': 7,
 'Inter': 8,
 'Juventus': 9,
 'Lazio': 10,
 'Milan': 11,
 'Napoli': 12,
 'Parma': 13,
 'Roma': 14,
 'Sampdoria': 15,
 'Sassuolo': 16,
 'Spal': 17,
 'Torino': 18,
 'Udinese': 19}

In [16]:
# Translate both team-name columns into integer indices via team_mapping and
# store them next to the original columns.
home_idx = df['team1'].map(team_mapping)
away_idx = df['team2'].map(team_mapping)
df['team1_idx'], df['team2_idx'] = home_idx, away_idx
df

Unnamed: 0,yyyy-mm-dd,HH:MM,team1,team2,score1,score2,team1_idx,team2_idx
0,2018-08-18,18:00,Chievo,Juventus,2,3,3,9
1,2018-08-18,20:30,Lazio,Napoli,1,2,10,12
2,2018-08-19,18:00,Torino,Roma,0,1,18,14
3,2018-08-19,20:30,Sassuolo,Inter,1,0,16,8
4,2018-08-19,20:30,Parma,Udinese,2,2,13,19
...,...,...,...,...,...,...,...,...
375,2019-05-26,20:30,Roma,Parma,2,1,14,13
376,2019-05-26,20:30,Inter,Empoli,2,1,8,4
377,2019-05-26,20:30,Fiorentina,Genoa,0,0,5,7
378,2019-05-26,20:30,Cagliari,Udinese,1,2,2,19


In [17]:
# Goal difference from team1's point of view (positive: team1 scored more).
score_difference = df['score1'] - df['score2']
goal_diff = score_difference.to_numpy()

# Create a new DataFrame with the desired columns. The goal difference is
# collapsed to the match outcome: 1 = team1 won, -1 = team1 lost, 0 = draw.
# The difference is computed once above and reused for both comparisons.
result_df = pd.DataFrame({
    'team1_idx': df['team1_idx'],
    'team2_idx': df['team2_idx'],
    'score_difference': np.select([goal_diff > 0, goal_diff < 0], [1, -1], default=0)
})

# Draws are kept. To drop them instead, filter with:
# result_df = result_df[result_df['score_difference'] != 0]

result_df

Unnamed: 0,team1_idx,team2_idx,score_difference
0,3,9,-1
1,10,12,-1
2,18,14,-1
3,16,8,1
4,13,19,0
...,...,...,...
375,14,13,1
376,8,4,1
377,5,7,0
378,2,19,-1


In [18]:
# Convert the DataFrame to a NumPy array; each row holds the columns of
# result_df in order: (team1_idx, team2_idx, score_difference).
result_array = result_df.to_numpy()
# result_array

In [19]:
# np.random.seed(42)
# np.random.shuffle(result_array)
# result_array

In [20]:
# Per-team skill belief stored as [mean, std], indexed by team index.
# Every team starts from the same prior (mu_1, sigma_1).
n_teams = len(unique_teams)
skills_dist = [[mu_1, sigma_1] for _ in range(n_teams)]
skills_dist[0]

[25, 8.333333333333334]

# GIBBS SAMPLER

In [67]:

def truncated_normal(mean, std, lower, upper):
    """Draw one sample from N(mean, std**2) restricted to [lower, upper].

    ``lower`` / ``upper`` are given on the original scale; either may be
    infinite for a one-sided truncation.
    """
    # scipy's truncnorm takes its clipping points in standardized units,
    # i.e. as offsets from the mean measured in standard deviations.
    lower_z = (lower - mean) / std
    upper_z = (upper - mean) / std
    return truncnorm.rvs(lower_z, upper_z, loc=mean, scale=std)

# Gibbs Sampler
def gibbs_sampler(mu_A, sigma_A, mu_B, sigma_B ,y, iterations, epsilon = sigma_t/15):
    """Gibbs-sample the two skills and the outcome variable t for one match.

    Each iteration first draws (s_1, s_2) from the Gaussian conditional
    p(s_1, s_2 | t) and then draws t from a normal centred on s_1 - s_2 with
    standard deviation ``sigma_t`` (module-level constant), truncated to the
    region allowed by the observed result ``y``.

    Parameters
    ----------
    mu_A, sigma_A : prior mean and standard deviation of the first skill.
    mu_B, sigma_B : prior mean and standard deviation of the second skill.
    y : observed result; 1 restricts t to (epsilon, inf), -1 restricts t to
        (-inf, -epsilon), any other value is treated as a draw.
    iterations : number of Gibbs iterations (one stored sample per iteration).
    epsilon : draw margin. For a draw t is confined to [0, epsilon] when
        mu_A >= mu_B and to [-epsilon, 0] otherwise.

    Returns
    -------
    numpy.ndarray with one row (s_1, s_2, t) per iteration. Burn-in samples
    are NOT removed here; the caller is expected to discard them.
    """
    # Truncation window for t, fixed by the observed outcome for the whole run.
    # NOTE(review): the draw window is one-sided and picked from the prior
    # means, as in the original code; a symmetric [-epsilon, epsilon] window
    # is the more common draw model -- confirm which one is intended.
    if y == 1:
        lower, upper = epsilon, np.inf
    elif y == -1:
        lower, upper = -np.inf, -epsilon
    elif mu_A >= mu_B:
        lower, upper = 0, epsilon
    else:
        lower, upper = -epsilon, 0

    # Initial t, placed inside the window above so that the very first skill
    # draw is conditioned on a value the outcome actually allows.
    if y == 1:
        t = epsilon + np.abs(np.random.randn())
    elif y == -1:
        t = -epsilon - np.abs(np.random.randn())
    elif mu_A >= mu_B:
        t = np.random.uniform(0, epsilon)
    else:
        t = - np.random.uniform(0, epsilon)

    # Everything below depends only on the priors and sigma_t, not on the
    # current sample, so it is computed once instead of on every iteration.
    sigma_s_inv = np.array([[1.0/(sigma_A * sigma_A), 0.0],
                            [0.0, 1.0/(sigma_B * sigma_B)]])  # prior precision (2x2)
    sigma_t_given_s_inv = 1.0 / (sigma_t * sigma_t)           # precision of t | s
    mu_s = np.array([mu_A, mu_B]).reshape(2, 1)                # prior mean (2x1)
    A = np.array([1.0, -1.0]).reshape(1, 2)                    # t = A @ s + noise (1x2)

    # Covariance of p(s_1, s_2 | t); it does not depend on the value of t. (2x2)
    cov_s_given_t = np.linalg.inv(sigma_s_inv + (A.T @ (sigma_t_given_s_inv * A)))
    # Prior contribution to the conditional mean. (2x1)
    prior_term = sigma_s_inv @ mu_s

    samples = []

    for _ in range(iterations):
        # QUESTION 3.1
        # Step 1: draw s_1 and s_2 from N(mean_s|t, cov_s|t); only the mean
        # changes with the current t.
        mean_s_given_t = cov_s_given_t @ (prior_term + (A.T * sigma_t_given_s_inv * t))
        mean_s_given_t = mean_s_given_t.reshape(2,)
        s_1, s_2 = multivariate_normal.rvs(mean=mean_s_given_t, cov=cov_s_given_t)

        # QUESTION 3.2
        # Step 2: draw t from N(s_1 - s_2, sigma_t^2) truncated to the window
        # implied by the outcome.
        t = truncated_normal(s_1 - s_2, sigma_t, lower, upper)

        # Store samples
        samples.append((s_1, s_2, t))

    return np.array(samples)

In [68]:
def get_team_skills(team_A, team_B, skills_dist):
    """Look up the current skill parameters of two teams.

    ``skills_dist`` is indexed by team index and holds a two-element
    (mean, std) entry per team. Returns the flat tuple
    (mu_A, sigma_A, mu_B, sigma_B).
    """
    (mean_a, std_a), (mean_b, std_b) = skills_dist[team_A], skills_dist[team_B]
    return mean_a, std_a, mean_b, std_b

def set_team_skills(team_A, mu_A, sigma_A, team_B, mu_B, sigma_B, skills_dist):
    """Overwrite the stored [mean, std] entries of two teams in place.

    ``skills_dist`` is mutated; nothing is returned. Team A is written first,
    so if both indices are equal the values for team B are the ones that stay.
    """
    updates = ((team_A, mu_A, sigma_A), (team_B, mu_B, sigma_B))
    for team_idx, new_mean, new_std in updates:
        skills_dist[team_idx] = [new_mean, new_std]


In [69]:
def phi_value(mu_A, sigma_A, mu_B, sigma_B, sigma_t=sigma_t):
    """Standard normal CDF evaluated at (mu_B - mu_A) / sqrt(total variance).

    The total variance is sigma_A^2 + sigma_B^2 + sigma_t^2. Equivalently,
    this is the probability that a normal variable with mean mu_A - mu_B and
    that variance falls below zero.
    """
    total_variance = sigma_A * sigma_A + sigma_B * sigma_B + sigma_t * sigma_t
    # Standardized gap between the two means, from B's side.
    z = (mu_B - mu_A) / np.sqrt(total_variance)
    return norm.cdf(z)

def predict_winner(epsilon, mu_A, sigma_A, mu_B, sigma_B):
    """Predict a match outcome from the two skill beliefs.

    Returns 1 (first team wins) when the win probability exceeds
    0.5 + epsilon, -1 (first team loses) when it is below 0.5 - epsilon,
    and 0 (draw) otherwise.
    """
    p_win = 1 - phi_value(mu_A, sigma_A, mu_B, sigma_B)
    win_threshold, loss_threshold = 0.5 + epsilon, 0.5 - epsilon
    if p_win > win_threshold:
        return 1 # win
    # Inside the band around 0.5 the prediction is a draw.
    return -1 if p_win < loss_threshold else 0

In [70]:
predicted_y = []
limit = len(result_array)
# Half-width of the band around p = 0.5 inside which a draw is predicted.
prediction_margin = 0.01
# MU and Sigma for each team, reset to the prior so that re-running this cell
# does not continue from a previous run's posteriors.
skills_dist = [[mu_1, sigma_1] for _ in unique_teams]

for team_A, team_B, y in result_array[:limit]:
    mu_A, sigma_A, mu_B, sigma_B = get_team_skills(team_A, team_B, skills_dist)
    # Predict from the current beliefs BEFORE this match's result is used.
    predicted_y.append(predict_winner(prediction_margin, mu_A, sigma_A, mu_B, sigma_B))
    samples = gibbs_sampler(mu_A, sigma_A, mu_B, sigma_B,
                            y=y,
                            iterations=iterations)
    # Drop the burn-in once and estimate BOTH moments from the same draws of
    # (s_1, s_2); the t column is not needed for the skill update.
    skill_samples = samples[burn_in:, :2]
    mean = np.mean(skill_samples, axis = 0)
    covariance = np.cov(skill_samples, rowvar = False)
    mu_A_new, sigma_A_new, mu_B_new, sigma_B_new =  mean[0], np.sqrt(covariance[0,0]), mean[1], np.sqrt(covariance[1, 1])
    # The sampled posterior becomes the prior for each team's next match.
    set_team_skills(team_A, mu_A_new, sigma_A_new, team_B, mu_B_new, sigma_B_new, skills_dist)

# compare to the result in the dataset
true_y = result_array[:, -1]
correct_guesses = np.sum(np.asarray(predicted_y) == true_y[:limit])
r = correct_guesses / limit
print(f"r is {r:.2f}")

In [None]:
# Side-by-side look at the first predictions and the actual outcomes.
preview_count = 20
print(predicted_y[:preview_count])
print(true_y[:preview_count])

[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, -1, -1, 1, 1, -1, -1, -1, 0]
[-1 -1 -1  1  0  1 -1  1  1  1  1  1  0  1  0  1  0  0  1 -1]


In [None]:
# Rank teams by the conservative skill estimate mu - 3 * sigma, best first.
def _conservative_skill(indexed_entry):
    """Sort key for an (index, [mu, sigma]) pair: mu - 3 * sigma."""
    params = indexed_entry[1]
    return params[0] - 3.0 * params[1]

ranked_teams = sorted(enumerate(skills_dist), key=_conservative_skill, reverse=True)
# Print ranked teams with their skills (variance is sigma squared)
for rank, (idx, (m, s)) in enumerate(ranked_teams, start=1):
    print(f"Rank {rank}: Team {unique_teams[idx]} - Skill: {m}, Var: {s*s}")

Rank 1: Team Juventus - Skill: 32.41004778839751, Var: 4.523147884407655
Rank 2: Team Inter - Skill: 29.285924558377207, Var: 2.042985457922453
Rank 3: Team Napoli - Skill: 29.80482655832546, Var: 2.6730549013412706
Rank 4: Team Milan - Skill: 27.715465280695994, Var: 2.021787735036866
Rank 5: Team Lazio - Skill: 27.893259047076867, Var: 2.3474901486469433
Rank 6: Team Sampdoria - Skill: 26.442628335496384, Var: 1.496909904166898
Rank 7: Team Sassuolo - Skill: 26.6363726081421, Var: 1.8591474669824972
Rank 8: Team Fiorentina - Skill: 26.119133999992403, Var: 1.6829755871055017
Rank 9: Team Cagliari - Skill: 25.303945669125834, Var: 1.6051048482770718
Rank 10: Team Genoa - Skill: 25.39901700782373, Var: 2.011479663828978
Rank 11: Team Roma - Skill: 25.04177910389712, Var: 1.7130688720969014
Rank 12: Team Torino - Skill: 25.11172286514545, Var: 1.9577056189475017
Rank 13: Team Atalanta - Skill: 24.471022826430666, Var: 1.838144176814654
Rank 14: Team Parma - Skill: 24.94855426179111, Var