In [5]:
import json
from pprint import pp

import numpy as np
import pandas as pd
import tqdm
from IPython.display import HTML, display

In [17]:
# Load the team table. The file is opened with an explicit UTF-8 encoding
# (club names contain non-ASCII characters) and closed by the context manager
# instead of being left to the garbage collector.
with open("teams.json", encoding="utf-8") as teams_file:
    teams_raw = json.load(teams_file)

# Embed each team's own name in its record so that code receiving only the
# record (not the dict key) can still identify the team.
teams = {name: {"name": name, **data} for name, data in teams_raw.items()}
# pp(teams)

In [18]:
# Load the fixture list. The rest of the notebook unpacks every entry as
# [turn, home, away, result], where result is None or a "H-A" score string.
# Explicit encoding + context manager: team names are non-ASCII and the file
# handle must not be leaked.
with open("matches.json", encoding="utf-8") as matches_file:
    matches = json.load(matches_file)

print(f"Number of matches: {len(matches)}")

# Sanity check 1: every fixture must have exactly four fields.
for match in matches:
    if len(match) != 4:
        print("EERRROR", match)

# Sanity check 2: every team must appear as home side in 4 distinct turns
# and as away side in 4 distinct turns.
for team in teams:
    # Turn numbers (field 0) of the fixtures where the team is the home side.
    home = [m[0] for m in matches if m[1] == team]
    # Turn numbers of the fixtures where the team is the away side (and not
    # already counted as home).
    away = [m[0] for m in matches if m[1] != team and m[2] == team]

    if len(set(home)) != 4:
        print("ERROR home", team, home)

    if len(set(away)) != 4:
        print("ERROR away", team, away)

Number of matches: 144


In [48]:
def coeff_based_prob(team1, team2):
    """Return three normalised probabilities derived from two team coefficients.

    The raw weights are ``c1 / (c1 + c2)``, ``1 / (c1 + c2)`` and
    ``c2 / (c1 + c2)``; they are then divided by their sum so the result
    adds up to 1. Presumably the order is (team1 wins, draw, team2 wins),
    matching the outcome order used by ``simulate_match`` -- confirm.

    Args:
        team1: mapping with a numeric ``"coefficient"`` entry.
        team2: mapping with a numeric ``"coefficient"`` entry.

    Returns:
        A list of three floats.

    Raises:
        ZeroDivisionError: if the two coefficients sum to zero.
    """
    c1, c2 = team1["coefficient"], team2["coefficient"]
    total = c1 + c2

    weights = [c1 / total, 1 / total, c2 / total]
    norm = sum(weights)
    return [w / norm for w in weights]

def coeff_win_prob(team1, team2):
    """Return each team's share of the combined coefficient.

    Args:
        team1: mapping with a numeric ``"coefficient"`` entry.
        team2: mapping with a numeric ``"coefficient"`` entry.

    Returns:
        ``[share1, share2]`` -- two floats that sum to 1. When both
        coefficients are zero there is nothing to compare, so the result is
        ``[0.5, 0.5]`` (same fallback as ``coeff_pper_match``) instead of a
        ``ZeroDivisionError``.
    """
    team1_coef = team1["coefficient"]
    team2_coef = team2["coefficient"]

    # No information about either side: treat them as evenly matched.
    if team1_coef == 0 and team2_coef == 0:
        return [0.5, 0.5]

    total = team1_coef + team2_coef
    p = [team1_coef / total, team2_coef / total]

    # Re-normalise to absorb any floating-point rounding in the two divisions.
    norm = sum(p)
    return [x / norm for x in p]

def simulate_goals(p, mean_total_goals=3):
    """Draw a random score line for one match.

    Each side's goal count is sampled from a Poisson distribution whose mean
    is ``mean_total_goals * p[i]``. Only the first two entries of ``p`` are
    used.

    Args:
        p: sequence whose first two entries are the home and away strength
            shares (e.g. the output of ``coeff_win_prob``).
        mean_total_goals: scale applied to each share to obtain the Poisson
            mean. Defaults to 3, the value that was previously hard-coded.

    Returns:
        ``[home_goals, away_goals]``.
    """
    # Home side is drawn first, then away, so the RNG stream is consumed in
    # a fixed order.
    return [np.random.poisson(mean_total_goals * p[i]) for i in range(2)]

def coeff_pper_match_factory(matches):
    """Build a probability function based on points-per-match form.

    The returned function rates each team by the average points it earned in
    the matches of ``matches`` that already have a result (3 for a win, 1 for
    a draw, with points earned away multiplied by 1.1) and returns the two
    teams' shares of the combined rating.

    Args:
        matches: list of ``[turn, home, away, result]`` entries, where
            ``result`` is ``None`` or a ``"H-A"`` score string. The list is
            read on every call of the returned function, not copied.

    Returns:
        A function ``coeff_pper_match(team1, team2) -> [share1, share2]``
        taking two mappings with a ``"name"`` entry.
    """
    on_win = 3
    on_draw = 1
    away_mod = 1.1  # bonus factor for points earned away from home

    def _form_coefficient(team_name):
        """Average (away-weighted) points per played match for ``team_name``.

        Returns 0.0 when the team has no played match yet, instead of
        dividing by zero.
        """
        points = 0
        played = 0
        for m in matches:
            if m[-1] is None or not (m[1] == team_name or m[2] == team_name):
                continue
            played += 1
            goals_h, goals_a = [int(x) for x in m[-1].split("-")]
            if m[1] == team_name:
                own, other, weight = goals_h, goals_a, 1
            else:
                own, other, weight = goals_a, goals_h, away_mod
            if own > other:
                points += on_win * weight
            elif own == other:
                points += on_draw * weight
        return points / played if played else 0.0

    def coeff_pper_match(team1, team2):
        t1_coef = _form_coefficient(team1["name"])
        t2_coef = _form_coefficient(team2["name"])

        # Neither side has earned anything (or played at all): call it even.
        if t1_coef == 0 and t2_coef == 0:
            return [0.5, 0.5]

        total = t1_coef + t2_coef
        p = [t1_coef / total, t2_coef / total]
        # Re-normalise to absorb floating-point rounding.
        norm = sum(p)
        return [x / norm for x in p]

    return coeff_pper_match


In [49]:
def simulate_match(p=None):
    """Randomly pick a match outcome and return the points for both sides.

    Args:
        p: optional sequence of three probabilities for (win, draw, lose)
            from the first side's point of view. ``None`` (the default)
            makes the three outcomes equally likely. The previous default
            of ``0.5`` was a scalar, which ``np.random.choice`` rejects, so
            calling the function without arguments always failed.

    Returns:
        ``(3, 0)`` on a win, ``(1, 1)`` on a draw, ``(0, 3)`` on a loss.
    """
    result = np.random.choice(["win", "draw", "lose"], p=p)

    if result == "win":
        return 3, 0
    elif result == "draw":
        return 1, 1
    else:
        return 0, 3


def sort_standing(df):
    """Order a standings frame in place and hand the same frame back.

    Rows are ranked by "Points" (descending), then "Goals Done"
    (descending), then "Goals Conceded" (ascending).
    """
    ranking_keys = ["Points", "Goals Done", "Goals Conceded"]
    ascending_flags = [False, False, True]
    # inplace=True: callers may rely on the frame they passed being reordered.
    df.sort_values(by=ranking_keys, ascending=ascending_flags, inplace=True)
    return df

def simulate_matches(teams, matches, p_fn=coeff_win_prob, up_to=None):
    """Play through the fixture list once, simulating every unplayed match.

    Matches that already have a result use it; matches whose result is
    ``None`` get a random score from ``simulate_goals`` using the
    probabilities returned by ``p_fn``.

    Args:
        teams: mapping of team name -> team record (passed to ``p_fn``).
        matches: list of ``[turn, home, away, result]`` entries; ``result``
            is ``None`` or a ``"H-A"`` score string. The list is NOT
            modified: a sorted copy is iterated instead of sorting the
            caller's list in place.
        p_fn: function ``(home_record, away_record) -> probabilities``.
        up_to: if given, matches with ``turn > up_to`` are ignored.

    Returns:
        Dict of team name -> ``{"pts", "sim", "real", "g_done", "g_conc"}``:
        points, number of simulated matches, number of real matches, goals
        scored and goals conceded.
    """
    classification = {
        t: {"pts": 0, "sim": 0, "real": 0, "g_done": 0, "g_conc": 0} for t in teams
    }

    # Turn order is needed so that the `up_to` cut-off can stop at the first
    # later turn.
    for turn, home, away, result in sorted(matches, key=lambda m: m[0]):
        if up_to is not None and turn > up_to:
            break

        if result is None:
            p = p_fn(teams[home], teams[away])
            goals_h, goals_a = simulate_goals(p=p)
            kind = "sim"
        else:
            goals_h, goals_a = [int(x) for x in result.split("-")]
            kind = "real"

        classification[home][kind] += 1
        classification[away][kind] += 1

        if goals_h > goals_a:
            score1, score2 = 3, 0
        elif goals_h == goals_a:
            score1, score2 = 1, 1
        else:
            score1, score2 = 0, 3

        classification[home]["g_done"] += goals_h
        classification[home]["g_conc"] += goals_a
        classification[away]["g_done"] += goals_a
        classification[away]["g_conc"] += goals_h

        classification[home]["pts"] += score1
        classification[away]["pts"] += score2
    return classification


def simulate(runs, teams, matches, p_fn=coeff_win_prob, up_to=None):
    """Repeat ``simulate_matches`` ``runs`` times and stack the outcomes.

    Returns:
        A DataFrame with one row per (run, team) and the columns "Team",
        "Points", "Simulated", "Real", "Goals Done", "Goals Conceded".
    """
    # Keys of each team's stats dict, in the order of the output columns.
    stat_keys = ("pts", "sim", "real", "g_done", "g_conc")

    records = []
    for _ in tqdm.tqdm(range(runs)):
        outcome = simulate_matches(teams, matches, p_fn, up_to)
        records.extend(
            [team, *(stats[key] for key in stat_keys)]
            for team, stats in outcome.items()
        )

    return pd.DataFrame(
        records,
        columns=["Team", "Points", "Simulated", "Real", "Goals Done", "Goals Conceded"],
    )


def current_standings(teams, matches):
    """Build the standings table from the matches that already have a result.

    Args:
        teams: iterable of team names (e.g. the ``teams`` dict).
        matches: list of ``[turn, home, away, result]`` entries; entries
            whose ``result`` is ``None`` are skipped. The list is not
            modified: the totals do not depend on match order, so the
            former in-place sort of the caller's list has been dropped.

    Returns:
        A DataFrame indexed by "Team" with the columns "Points", "Matches",
        "Goals Done" and "Goals Conceded", ordered by ``sort_standing``.
    """
    standing = {
        t: {"pts": 0, "played": 0, "g_done": 0, "g_conc": 0} for t in teams
    }

    for _turn, home, away, result in matches:
        if result is None:
            continue
        goals_h, goals_a = [int(x) for x in result.split("-")]

        standing[home]["played"] += 1
        standing[away]["played"] += 1

        if goals_h > goals_a:
            score1, score2 = 3, 0
        elif goals_h == goals_a:
            score1, score2 = 1, 1
        else:
            score1, score2 = 0, 3

        standing[home]["g_done"] += goals_h
        standing[home]["g_conc"] += goals_a
        standing[away]["g_done"] += goals_a
        standing[away]["g_conc"] += goals_h

        standing[home]["pts"] += score1
        standing[away]["pts"] += score2

    # Pick the stats by key rather than by dict order so the values cannot
    # drift out of step with the column names.
    df = pd.DataFrame(
        [
            [t, v["pts"], v["played"], v["g_done"], v["g_conc"]]
            for t, v in standing.items()
        ],
        columns=["Team", "Points", "Matches", "Goals Done", "Goals Conceded"],
    )
    df = sort_standing(df)
    df.set_index("Team", inplace=True)
    return df

In [50]:
def aggregate_results(results):
    """Summarise per-run simulation rows into one row per team.

    Points and goal columns are averaged over the runs; the "Simulated" and
    "Real" match counters take their maximum. The frame is ordered with
    ``sort_standing`` and gains two derived columns: "Matches"
    (real + simulated) and "Goals Diff" (scored minus conceded).

    Returns:
        A DataFrame indexed by "Team" with the columns "Points", "Matches",
        "Goals Diff", "Goals Done", "Goals Conceded", "Simulated", "Real".
    """
    how_to_aggregate = {
        "Points": "mean",
        "Goals Done": "mean",
        "Goals Conceded": "mean",
        "Simulated": "max",
        "Real": "max",
    }
    per_team = sort_standing(results.groupby("Team").agg(how_to_aggregate))

    per_team = per_team.assign(
        **{
            "Matches": per_team["Real"] + per_team["Simulated"],
            "Goals Diff": per_team["Goals Done"] - per_team["Goals Conceded"],
        }
    )

    column_order = [
        "Points",
        "Matches",
        "Goals Diff",
        "Goals Done",
        "Goals Conceded",
        "Simulated",
        "Real",
    ]
    return per_team[column_order]


def print_standings(results, hide_cols=None):
    """Render a standings table as HTML in the notebook output.

    Args:
        results: iterable of 5-item rows
            ``(team, points, matches, goals_done, goals_conceded)``, already
            in ranking order. Points are rounded to the nearest integer.
        hide_cols: optional list naming any of "Matches", "Goals Done",
            "Goals Conceded" to leave out. "Pos", "Team" and "Points" are
            always shown; previously naming one of them removed only its
            header cell and misaligned the table.

    The position number is coloured: default ``#000`` for positions 1-8,
    ``#FFBF00`` for 9-24 and ``#E3735E`` from 25 on (presumably the
    qualification / play-off / elimination zones -- confirm).
    """
    if hide_cols is None:
        hide_cols = []

    optional_cols = ["Matches", "Goals Done", "Goals Conceded"]
    shown_optional = [c for c in optional_cols if c not in hide_cols]

    rows = []
    for i, (t, pts, matches, g_done, g_conc) in enumerate(results):
        pos = i + 1
        optional_values = {
            "Matches": matches,
            "Goals Done": g_done,
            "Goals Conceded": g_conc,
        }

        if pos >= 25:
            color = "#E3735E"
        elif pos >= 9:
            color = "#FFBF00"
        else:
            color = "#000"

        cells = [t, round(pts)] + [optional_values[c] for c in shown_optional]
        rows.append((pos, color, cells))

    content = "".join(
        f"<tr><td style='color:{color}'>{pos}</td>"
        + "".join(f"<td>{cell}</td>" for cell in cells)
        + "</tr>"
        for pos, color, cells in rows
    )

    cols = ["Pos", "Team", "Points"] + shown_optional
    cols_content = "".join(f"<th>{c}</th>" for c in cols)
    # The header row now has its opening <tr>; it used to emit only the
    # closing tag, producing malformed table markup.
    display(HTML(f"<table><tr>{cols_content}</tr>{content}</table>"))

In [51]:
# Real standings from the matches played so far, rendered as an HTML table.
current = current_standings(teams, matches)
standing = current.loc[
    :, ["Points", "Matches", "Goals Done", "Goals Conceded"]
].reset_index()
print_standings(standing.to_numpy())

0,1,2,3,4,5
1,Inter,13,5,7,0
2,Barcelona,12,5,18,5
3,Liverpool,12,4,10,1
4,Atalanta,11,5,11,1
5,Bayer Leverkusen,10,5,11,5
6,Monaco,10,4,10,4
7,Sporting CP,10,5,10,7
8,Brest,10,5,9,6
9,Arsenal,10,5,8,2
10,Borussia Dortmund,9,4,13,6


In [52]:
focus = "Juventus"

# Fixtures involving the focus team that have no result yet, in turn order.
matches_to_play = sorted(
    (m for m in matches if m[-1] is None and (m[1] == focus or m[2] == focus)),
    key=lambda m: m[0],
)
matches_to_play

[[5, 'Aston Villa', 'Juventus', None],
 [6, 'Juventus', 'Manchester City', None],
 [7, 'Club Brugge', 'Juventus', None],
 [8, 'Juventus', 'Benfica', None]]

In [53]:
# Form-based strength shares for each remaining fixture of the focus team.
p_fn = coeff_pper_match_factory(matches)

for match in matches_to_play:
    home_name, away_name = match[1], match[2]
    probs = p_fn(teams[home_name], teams[away_name])
    print(home_name, round(probs[0], 2), away_name, round(probs[1], 2))

Aston Villa 0.56 Juventus 0.44
Juventus 0.53 Manchester City 0.47
Club Brugge 0.46 Juventus 0.54
Juventus 0.54 Benfica 0.46


In [57]:
runs = 100_000
seed = 42

# The simulation draws from NumPy's global random state (np.random.poisson
# in simulate_goals); seeding it makes the projected table reproducible on
# Restart & Run All.
np.random.seed(seed)

results = simulate(runs, teams, matches, p_fn=coeff_pper_match_factory(matches))

100%|██████████| 100000/100000 [02:30<00:00, 663.48it/s]


In [58]:
# Collapse the per-run simulation rows into one summary row per team.
agg_results = aggregate_results(results)
# agg_results

In [59]:
# Projected final table: show only position, team and average points.
standing = agg_results.loc[
    :, ["Points", "Matches", "Goals Done", "Goals Conceded"]
].reset_index()
print_standings(
    standing.to_numpy(),
    hide_cols=["Matches", "Goals Done", "Goals Conceded"],
)

0,1,2
1,Liverpool,21
2,Inter,18
3,Sporting CP,17
4,Atalanta,17
5,Borussia Dortmund,17
6,Barcelona,17
7,Aston Villa,16
8,Monaco,16
9,Brest,16
10,Atlético Madrid,15
