In [22]:
import pandas as pd
import numpy as np

# Load every Liga F fixture from the combined CSV export
liga_f_df = pd.read_csv('data/spain/liga_f_all_fixtures_2025-10-31.csv')

print("Original dataset shape:", liga_f_df.shape)
print("\nAvailable seasons:")
print(liga_f_df['season'].value_counts().sort_index())

# Keep only the 2022-2023 rows and replace missing goal counts with 0.0.
# `.loc` with a boolean mask plus `.fillna` yields a frame independent of liga_f_df.
liga_f_2022_2023 = (
    liga_f_df
    .loc[liga_f_df['season'] == '2022-2023']
    .fillna({'home_score': 0.0, 'away_score': 0.0})
)

print(f"\nFiltered dataset for 2022-2023 season shape: {liga_f_2022_2023.shape}")
print(f"Number of matches: {len(liga_f_2022_2023)}")

# Last expression of the cell: rich preview of the filtered frame
liga_f_2022_2023.head()

Original dataset shape: (960, 20)

Available seasons:
season
2022-2023    240
2023-2024    240
2024-2025    240
2025-2026    240
Name: count, dtype: int64

Filtered dataset for 2022-2023 season shape: (240, 20)
Number of matches: 240


Unnamed: 0,season,gameweek,date,day_of_week,start_time,home_team,away_team,home_score,away_score,score,home_xg,away_xg,venue,attendance,referee,status,result,home_team_url,away_team_url,match_report_url
720,2022-2023,2,2022-09-17,Sat,12:00 (03:00),Barcelona,Tenerife,2.0,0.0,2–0,1.9,0.4,Estadi Johan Cruyff,,María Dolores Martínez Madrona,completed,home_win,https://fbref.com/en/squads/15f49df1/2022-2023...,https://fbref.com/en/squads/4c088abe/2022-2023...,https://fbref.com/en/matches/4df3a732/Barcelon...
721,2022-2023,2,2022-09-17,Sat,12:00 (03:00),Alavés,Madrid CFF,1.0,2.0,1–2,1.1,1.2,Ciudad Deportiva José Luis Compañón,,Manuel Pascali,completed,away_win,https://fbref.com/en/squads/aa11fb42/Alaves-Wo...,https://fbref.com/en/squads/89818574/2022-2023...,https://fbref.com/en/matches/87c755cd/Alaves-M...
722,2022-2023,2,2022-09-17,Sat,16:00 (07:00),Real Madrid,Valencia,2.0,0.0,2–0,1.6,0.8,Estadio Alfredo Di Stéfano,,Marta Huerta de Aza,completed,home_win,https://fbref.com/en/squads/54582b93/2022-2023...,https://fbref.com/en/squads/f96ff499/2022-2023...,https://fbref.com/en/matches/d0329f46/Real-Mad...
723,2022-2023,2,2022-09-17,Sat,16:00 (07:00),Real Sociedad,Villarreal,2.0,0.0,2–0,0.7,0.3,Instalaciones de Zubieta,,Alicia Espinosa Ríos,completed,home_win,https://fbref.com/en/squads/c21f25d3/2022-2023...,https://fbref.com/en/squads/7a7bef84/2022-2023...,https://fbref.com/en/matches/abfde9d9/Real-Soc...
724,2022-2023,2,2022-09-17,Sat,18:00 (09:00),Sevilla,Atlético Madrid,1.0,3.0,1–3,1.1,1.4,Estadio Jesús Navas,,Bruno Gallo,completed,away_win,https://fbref.com/en/squads/215d9026/2022-2023...,https://fbref.com/en/squads/b56c2667/2022-2023...,https://fbref.com/en/matches/f4452586/Sevilla-...


In [23]:
# Training data: the first half of the 2022-2023 fixtures, taken in the frame's
# existing row order (assumes rows are chronological -- TODO confirm).
train_liga_f_2022_2023 = liga_f_2022_2023.head(len(liga_f_2022_2023) // 2)

# =========================
# === VALIDATION BLOCK ===
# =========================

# Hardcoded ground-truth final standings, best team first (replace with real ones).
# Names must match the `home_team` / `away_team` spellings in the fixtures data
# exactly, because validation looks teams up by name.
# --------------------------------------- WOMEN ------------------------------------------------------------------------
trueLigaF2223 = [
    'Barcelona', 'Real Madrid', 'Levante', 'Atlético Madrid', 'Madrid CFF', 'Tenerife', 'Sevilla', 'Real Sociedad',
    'Valencia', 'Athletic Club', 'Levante Planas', 'Real Betis', 'Sporting Huelva', 'Villarreal', 'Alhama', 'Alavés'
]
trueLigaF2324 = [
    'Barcelona', 'Real Madrid', 'Atlético Madrid', 'Levante', 'Madrid CFF', 'Athletic Club', 'Sevilla', 'Real Sociedad',
    'Tenerife', 'Eibar', 'Real Betis', 'Valencia', 'Levante Planas', 'Granada', 'Villarreal', 'Sporting Huelva'
]
# NOTE(review): 'Lavante Badalona' was a misspelling and is corrected to
# 'Levante Badalona' -- confirm against the 2024-2025 team names in the data.
trueLigaF2425 = [
    'Barcelona', 'Real Madrid', 'Atlético Madrid', 'Athletic Club', 'Granada', 'Tenerife', 'Real Sociedad', 'Eibar',
    'Sevilla', 'Madrid CFF', 'Espanyol', 'Levante', 'Levante Badalona', 'Dep La Coruña', 'Valencia', 'Real Betis'
]
# Run model
# Make sure you load your `league` data before this point
# team_ratings, sorted_teams = Colley(league, weighting=0)

# Example only: to use for validation once the Colley function returns sorted_teams
def validate_ranking(sorted_teams, rankingTrueNames):
    """Measure how far a predicted ranking is from a ground-truth ranking.

    Parameters
    ----------
    sorted_teams : iterable of hashable
        Predicted ranking, best team first. Any iterable is accepted (list,
        tuple, generator, ...); it is materialised once.
    rankingTrueNames : iterable of hashable
        Ground-truth ranking, best team first.

    Returns
    -------
    errorTrain : numpy.float64
        Euclidean norm of the per-team absolute rank errors.
    meanPercentDiff : numpy.float64
        Mean of ``abs(rank error) / true rank`` over the ground-truth teams.

    Notes
    -----
    A ground-truth team that is absent from ``sorted_teams`` is assigned the
    last predicted position, ``len(sorted_teams)``.
    """
    predicted = list(sorted_teams)
    truth = list(rankingTrueNames)

    # 1-based predicted position per team; setdefault keeps the FIRST
    # occurrence so duplicates resolve exactly like list.index would.
    predicted_rank = {}
    for position, name in enumerate(predicted, start=1):
        predicted_rank.setdefault(name, position)

    # Teams the model never ranked are assumed to sit in the last position.
    worst_rank = len(predicted)
    rankingTrainIndices = [predicted_rank.get(name, worst_rank) for name in truth]

    # True positions are simply 1..len(truth), in the order given.
    rankingTrueIndices = list(range(1, len(truth) + 1))

    # Error vector and its two summary statistics
    errorVector = [abs(rt - rp) for rt, rp in zip(rankingTrueIndices, rankingTrainIndices)]
    errorTrain = np.linalg.norm(errorVector)
    percentDiff = [ev / rt for ev, rt in zip(errorVector, rankingTrueIndices)]
    meanPercentDiff = np.mean(percentDiff)

    return errorTrain, meanPercentDiff

# Example usage (assuming you already have `league` dataframe):
# team_ratings, sorted_teams = Colley(league, weighting=0)
# validate_ranking(sorted_teams, rankingTrueNames)

In [24]:
# Search grids for the three weighting coefficients.
#
# Each grid is built from integer hundredths and divided once, instead of
# np.arange with a float step: a float step accumulates rounding error
# (e.g. 1.0000000000000002 instead of 1.0) and needs a fudged stop value.

# Home-win coefficients: 0.70, 0.75, ..., 1.00
home_win_coeffs = np.arange(70, 101, 5) / 100.0

# Away-win coefficients: 1.00, 1.05, ..., 1.30
away_win_coeffs = np.arange(100, 131, 5) / 100.0

# Margin coefficients: 0.00, 0.25, ..., 1.25
margin_coeffs = np.arange(0, 126, 25) / 100.0

print("Home Win Coefficients:", home_win_coeffs)
print("Away Win Coefficients:", away_win_coeffs)
print("Margin Coefficients:", margin_coeffs)

Home Win Coefficients: [0.7  0.75 0.8  0.85 0.9  0.95 1.  ]
Away Win Coefficients: [1.   1.05 1.1  1.15 1.2  1.25 1.3 ]
Margin Coefficients: [0.   0.25 0.5  0.75 1.   1.25]


In [25]:
# Running optimum for the coefficient sweep; nothing has been evaluated yet,
# so the errors start at +inf and the winning coefficients are unset.
best_error = float('inf')
best_mpd = float('inf')
best_home_coeff = None
best_away_coeff = None
best_margin_coeff = None  # tracked alongside the home/away coefficients

# Skeleton of the sweep over every (home, away, margin) combination.
for home_coeff in home_win_coeffs:
    for away_coeff in away_win_coeffs:
        for margin_c in margin_coeffs:
            # The actual running of Colley and validation will happen in the next subtask
            pass  # Placeholder for the next step

In [26]:
import numpy as np
import pandas as pd
# Assumes trueLigaF2223 is already defined in an earlier cell

# Define the Colley function, specifically for weighting mode 4 optimization
def Colley_weighted_optimized(league, home_win_coeff, away_win_coeff, margin_coeff):
    """Rate teams with a weighted Colley system (weighting mode 4).

    Every game with a parseable score contributes to the linear system
    ``A r = b`` with weight ``margin_weight * venue_weight`` where

    * ``margin_weight = max(margin_coeff, abs(home_goals - away_goals))``
    * ``venue_weight`` is ``home_win_coeff`` for a home win,
      ``away_win_coeff`` for an away win and ``1`` for a draw.

    ``A`` starts as ``2 * I`` and ``b`` as a vector of ones. A draw adds its
    weight to ``A`` only and leaves ``b`` untouched, so with
    ``margin_coeff == 0`` a draw has weight 0 and changes nothing.

    Parameters
    ----------
    league : pandas.DataFrame
        Must provide ``home_team``, ``away_team`` and ``score`` columns.
        ``score`` is text such as ``'2-1'`` or ``'2–1'`` (en dash).
    home_win_coeff, away_win_coeff, margin_coeff : float
        Weighting coefficients described above.

    Returns
    -------
    team_ratings : dict
        Team name -> Colley rating.
    sorted_teams : list
        Team names ordered from highest to lowest rating.

    Notes
    -----
    Rows whose score is not exactly two integers separated by a dash (for
    example missing values) are skipped.
    """
    def parse_score(raw):
        """Return ``(home_goals, away_goals)``, or None if `raw` is not 'X-Y'."""
        parts = str(raw).replace('–', '-').split('-')
        if len(parts) != 2:
            return None
        try:
            return int(parts[0]), int(parts[1])
        except ValueError:
            # Non-numeric text such as 'nan': treat as "no result".
            return None

    teams = sorted(set(league['home_team']).union(league['away_team']))
    teamIndex = {team: idx for idx, team in enumerate(teams)}

    A = 2 * np.eye(len(teams))
    b = np.ones(len(teams))

    # One pass over the games; plain column iteration avoids building a row
    # Series per game with .iloc.
    for home, away, raw_score in zip(league['home_team'], league['away_team'], league['score']):
        parsed = parse_score(raw_score)
        if parsed is None:
            continue
        hScore, aScore = parsed

        # Margin weight, floored at margin_coeff
        weight = max(margin_coeff, abs(hScore - aScore))
        if hScore > aScore:
            weight = weight * home_win_coeff  # home win: less credit
        elif aScore > hScore:
            weight = weight * away_win_coeff  # away win: more credit

        hi = teamIndex[home]
        ai = teamIndex[away]

        # Colley matrix
        A[hi, ai] -= weight
        A[ai, hi] -= weight
        A[hi, hi] += weight
        A[ai, ai] += weight

        # Vector b updates (ties: no change)
        if hScore > aScore:
            b[hi] += 0.5 * weight
            b[ai] -= 0.5 * weight
        elif aScore > hScore:
            b[ai] += 0.5 * weight
            b[hi] -= 0.5 * weight

    r = np.linalg.solve(A, b)
    team_ratings = {team: r[idx] for idx, team in enumerate(teams)}
    sorted_teams = sorted(team_ratings, key=team_ratings.get, reverse=True)
    return team_ratings, sorted_teams

# Reset the running optimum before sweeping the Colley coefficient grid
best_error, best_mpd = float('inf'), float('inf')
best_home_coeff = best_away_coeff = best_margin_coeff = None

for home_coeff in home_win_coeffs:
    for away_coeff in away_win_coeffs:
        for margin_coeff in margin_coeffs:
            # Rate the teams with this coefficient triple, then score the ranking
            team_ratings, sorted_teams = Colley_weighted_optimized(
                train_liga_f_2022_2023, home_coeff, away_coeff, margin_coeff
            )
            err, mpd = validate_ranking(sorted_teams, trueLigaF2223)

            # Skip unless this beats the incumbent: strictly lower error, or an
            # (approximately) equal error with a lower mean percentage difference.
            if not (err < best_error or (np.isclose(err, best_error) and mpd < best_mpd)):
                continue

            best_error, best_mpd = err, mpd
            best_home_coeff, best_away_coeff, best_margin_coeff = home_coeff, away_coeff, margin_coeff
            print(f"New best found: Home Coeff={best_home_coeff:.2f}, Away Coeff={best_away_coeff:.2f}, Margin Coeff={best_margin_coeff:.2f}, Error={best_error:.4f}, Mean%Diff={best_mpd:.4f}")
            print("predicted ranked teams: ", sorted_teams)
            print("actual ranked teams:    ", trueLigaF2223)
            print("--------------------------------------------------------------------------------")


# Final summary of the winning combination
print("\nOptimization Complete:")
print(f"Best Home Win Coefficient: {best_home_coeff:.2f}")
print(f"Best Away Win Coefficient: {best_away_coeff:.2f}")
print(f"Best Margin Coefficient: {best_margin_coeff:.2f}")
print(f"Best Error: {best_error:.4f}")
print(f"Best Mean Percentage Difference: {best_mpd:.4f}")

New best found: Home Coeff=0.70, Away Coeff=1.00, Margin Coeff=0.00, Error=8.6023, Mean%Diff=0.1752
predicted ranked teams:  ['Barcelona', 'Real Madrid', 'Levante', 'Atlético Madrid', 'Real Sociedad', 'Madrid CFF', 'Valencia', 'Sevilla', 'Athletic Club', 'Sporting Huelva', 'Real Betis', 'Tenerife', 'Levante Planas', 'Alavés', 'Alhama', 'Villarreal']
actual ranked teams:     ['Barcelona', 'Real Madrid', 'Levante', 'Atlético Madrid', 'Madrid CFF', 'Tenerife', 'Sevilla', 'Real Sociedad', 'Valencia', 'Athletic Club', 'Levante Planas', 'Real Betis', 'Sporting Huelva', 'Villarreal', 'Alhama', 'Alavés']
--------------------------------------------------------------------------------
New best found: Home Coeff=0.70, Away Coeff=1.00, Margin Coeff=0.50, Error=7.8740, Mean%Diff=0.1596
predicted ranked teams:  ['Barcelona', 'Real Madrid', 'Levante', 'Atlético Madrid', 'Real Sociedad', 'Madrid CFF', 'Valencia', 'Sevilla', 'Athletic Club', 'Sporting Huelva', 'Tenerife', 'Real Betis', 'Levante Planas

In [27]:
import re

def Massey_weighted_optimized(league, home_win_coeff, away_win_coeff, margin_coeff):
    """Rate teams with a weighted Massey system (weighting mode 4).

    Every game with a parseable score contributes to the linear system
    ``M r = b`` with weight ``margin_weight * venue_weight`` where

    * ``margin_weight = max(margin_coeff, abs(home_goals - away_goals))``
    * ``venue_weight`` is ``home_win_coeff`` for a home win,
      ``away_win_coeff`` for an away win and ``1`` for a draw.

    The weight is added to the two diagonal entries and subtracted from the
    two off-diagonal entries of ``M``; ``weight * (home_goals - away_goals)``
    is added to the home team's entry of ``b`` and subtracted from the away
    team's. The last row of ``M`` is then replaced by ones and the last entry
    of ``b`` by 0, which constrains the ratings to sum to zero.

    Parameters
    ----------
    league : pandas.DataFrame
        Must provide ``home_team``, ``away_team`` and ``score`` columns.
        ``score`` is text such as ``'2-1'`` or ``'2–1'`` (en dash).
    home_win_coeff, away_win_coeff, margin_coeff : float
        Weighting coefficients described above.

    Returns
    -------
    team_ratings : dict
        Team name -> Massey rating.
    sorted_teams : list
        Team names ordered from highest to lowest rating.

    Raises
    ------
    numpy.linalg.LinAlgError
        Propagated from ``np.linalg.solve`` if the adjusted matrix is singular.

    Notes
    -----
    Rows whose score is not exactly two integers separated by a dash (for
    example missing values) are skipped. The same parser now decides both the
    weight and whether the game is counted, so the two can no longer disagree.
    """
    def parse_score(raw):
        """Return ``(home_goals, away_goals)``, or None if `raw` is not 'X-Y'."""
        parts = re.split(r'[-–]', str(raw))
        if len(parts) != 2:
            return None
        try:
            return int(parts[0]), int(parts[1])
        except ValueError:
            # Non-numeric text such as 'nan': treat as "no result".
            return None

    # Map team names to indices
    teams = sorted(set(league['home_team']).union(league['away_team']))
    totalTeams = len(teams)
    teamIndex = {team: idx for idx, team in enumerate(teams)}

    # Massey matrix and score vector
    M = np.zeros((totalTeams, totalTeams))
    b = np.zeros(totalTeams)

    # One pass over the games; plain column iteration avoids building a row
    # Series per game with .iloc.
    for home, away, raw_score in zip(league['home_team'], league['away_team'], league['score']):
        parsed = parse_score(raw_score)
        if parsed is None:
            continue  # Skip if invalid score
        hScore, aScore = parsed
        diff = hScore - aScore

        # Margin weight, floored at margin_coeff
        weight = max(margin_coeff, abs(diff))
        if hScore > aScore:
            weight = weight * home_win_coeff  # home win: less credit
        elif aScore > hScore:
            weight = weight * away_win_coeff  # away win: more credit

        hIndex = teamIndex[home]
        aIndex = teamIndex[away]

        # Update matrix
        M[hIndex, hIndex] += weight
        M[aIndex, aIndex] += weight
        M[hIndex, aIndex] -= weight
        M[aIndex, hIndex] -= weight

        # Update score difference vector
        b[hIndex] += weight * diff
        b[aIndex] -= weight * diff

    # Massey system adjustment (last row to all 1's, last b to 0)
    M[-1, :] = 1
    b[-1] = 0

    # Solve for ratings
    r = np.linalg.solve(M, b)

    team_ratings = {team: r[idx] for team, idx in teamIndex.items()}
    sorted_teams = sorted(team_ratings, key=team_ratings.get, reverse=True)

    return team_ratings, sorted_teams

# Reset the running optimum before sweeping the Massey coefficient grid
best_error, best_mpd = float('inf'), float('inf')
best_home_coeff = best_away_coeff = best_margin_coeff = None

for home_coeff in home_win_coeffs:
    for away_coeff in away_win_coeffs:
        for margin_coeff in margin_coeffs:
            # Rate the teams with this coefficient triple, then score the ranking
            team_ratings, sorted_teams = Massey_weighted_optimized(
                train_liga_f_2022_2023, home_coeff, away_coeff, margin_coeff
            )
            err, mpd = validate_ranking(sorted_teams, trueLigaF2223)

            # Skip unless this beats the incumbent: strictly lower error, or an
            # (approximately) equal error with a lower mean percentage difference.
            if not (err < best_error or (np.isclose(err, best_error) and mpd < best_mpd)):
                continue

            best_error, best_mpd = err, mpd
            best_home_coeff, best_away_coeff, best_margin_coeff = home_coeff, away_coeff, margin_coeff
            print(f"New best found: Home Coeff={best_home_coeff:.2f}, Away Coeff={best_away_coeff:.2f}, Margin Coeff={best_margin_coeff:.2f}, Error={best_error:.4f}, Mean%Diff={best_mpd:.4f}")
            print("predicted ranked teams: ", sorted_teams)
            print("actual ranked teams:    ", trueLigaF2223)
            print("--------------------------------------------------------------------------------")


# Final summary of the winning combination
print("\nOptimization Complete:")
print(f"Best Home Win Coefficient: {best_home_coeff:.2f}")
print(f"Best Away Win Coefficient: {best_away_coeff:.2f}")
print(f"Best Margin Coefficient: {best_margin_coeff:.2f}")
print(f"Best Error: {best_error:.4f}")
print(f"Best Mean Percentage Difference: {best_mpd:.4f}")

New best found: Home Coeff=0.70, Away Coeff=1.00, Margin Coeff=0.00, Error=8.1240, Mean%Diff=0.2182
predicted ranked teams:  ['Barcelona', 'Levante', 'Real Madrid', 'Atlético Madrid', 'Real Sociedad', 'Athletic Club', 'Sevilla', 'Madrid CFF', 'Valencia', 'Tenerife', 'Sporting Huelva', 'Levante Planas', 'Alhama', 'Real Betis', 'Villarreal', 'Alavés']
actual ranked teams:     ['Barcelona', 'Real Madrid', 'Levante', 'Atlético Madrid', 'Madrid CFF', 'Tenerife', 'Sevilla', 'Real Sociedad', 'Valencia', 'Athletic Club', 'Levante Planas', 'Real Betis', 'Sporting Huelva', 'Villarreal', 'Alhama', 'Alavés']
--------------------------------------------------------------------------------
New best found: Home Coeff=0.70, Away Coeff=1.00, Margin Coeff=0.75, Error=7.7460, Mean%Diff=0.2209
predicted ranked teams:  ['Barcelona', 'Levante', 'Real Madrid', 'Atlético Madrid', 'Real Sociedad', 'Sevilla', 'Athletic Club', 'Madrid CFF', 'Valencia', 'Tenerife', 'Sporting Huelva', 'Levante Planas', 'Alhama', '