In [49]:
import json
from pprint import pp

import numpy as np
import pandas as pd
import tqdm
from IPython.display import HTML, display

In [50]:
teams = json.load(open('teams.json'))
# pp(teams)

In [51]:
matches = json.load(open('matches.json'))

print(f"Number of matches: {len(matches)}")

for match in matches:
    if len(match) != 4:
        print("EERRROR", match)

for team in teams.keys():
    home = []
    away = []
    for match in matches:
        if team == match[1]:
            home.append(match[0])
        elif team == match[2]:
            away.append(match[0])
    if len(set(home)) != 4:
        print("ERROR home", team, home)

    if len(set(away)) != 4:
        print("ERROR away", team, away)

Number of matches: 144


In [52]:
def coeff_based_prob(team1, team2):
    team1_coef = team1["coefficient"]
    team2_coef = team2["coefficient"]
    p = [
        team1_coef / (team1_coef + team2_coef),
        1 / (team1_coef + team2_coef),
        team2_coef / (team1_coef + team2_coef),
    ]
    p = [x / sum(p) for x in p]

    return p

def coeff_win_prob(team1, team2):
    team1_coef = team1["coefficient"]
    team2_coef = team2["coefficient"]
    p = [
        team1_coef / (team1_coef + team2_coef),
        team2_coef / (team1_coef + team2_coef),
    ]
    p = [x / sum(p) for x in p]

    return p

def simulate_goals(p):
    goals = [0, 0]
    for i in range(2):
        goals[i] = np.random.poisson(3 * p[i])
    return goals

In [75]:
def simulate_match(p=0.5):
    result = np.random.choice(["win", "draw", "lose"], p=p)

    if result == "win":
        return 3, 0
    elif result == "draw":
        return 1, 1
    else:
        return 0, 3


def sort_standing(df):
    df.sort_values(["Points", "Goals Done", "Goals Conceded"], ascending=[False, False, True], inplace=True)
    return df

def simulate_matches(teams, matches, p_fn=coeff_win_prob, up_to=None):
    classification = {
        t: {"pts": 0, "sim": 0, "real": 0, "g_done": 0, "g_conc": 0} for t in teams
    }
    matches.sort(key=lambda x: x[0])

    for match in matches:

        turn, home, away, result = match
        if up_to is not None and turn > up_to:
            break

        if result is None:
            p = p_fn(teams[home], teams[away])
            goals_h, goals_a = simulate_goals(p=p)

            classification[home]["sim"] += 1
            classification[away]["sim"] += 1
        else:
            goals_h, goals_a = [int(x) for x in result.split("-")]

            classification[home]["real"] += 1
            classification[away]["real"] += 1

        if goals_h > goals_a:
            score1, score2 = 3, 0
        elif goals_h == goals_a:
            score1, score2 = 1, 1
        else:
            score1, score2 = 0, 3

        classification[home]["g_done"] += goals_h
        classification[home]["g_conc"] += goals_a
        classification[away]["g_done"] += goals_a
        classification[away]["g_conc"] += goals_h

        classification[home]["pts"] += score1
        classification[away]["pts"] += score2
    return classification


def simulate(runs, teams, matches, p_fn=coeff_win_prob, up_to=None):

    rows = []
    for _ in tqdm.tqdm(range(runs)):
        results_dict = simulate_matches(teams, matches, p_fn, up_to)
        for r in results_dict:
            rows.append(
                [
                    r,
                    results_dict[r]["pts"],
                    results_dict[r]["sim"],
                    results_dict[r]["real"],
                    results_dict[r]["g_done"],
                    results_dict[r]["g_conc"],
                ]
            )
    df = pd.DataFrame(
        rows,
        columns=["Team", "Points", "Simulated", "Real", "Goals Done", "Goals Conceded"],
    )
    return df


def current_standings(teams, matches):
    standing = {
        t: {"pts": 0, "played": 0, "g_done": 0, "g_conc": 0} for t in teams
    }
    matches.sort(key=lambda x: x[0])

    for match in matches:

        turn, home, away, result = match

        if result is not None:
            goals_h, goals_a = [int(x) for x in result.split("-")]

            standing[home]["played"] += 1
            standing[away]["played"] += 1

        if goals_h > goals_a:
            score1, score2 = 3, 0
        elif goals_h == goals_a:
            score1, score2 = 1, 1
        else:
            score1, score2 = 0, 3

        standing[home]["g_done"] += goals_h
        standing[home]["g_conc"] += goals_a
        standing[away]["g_done"] += goals_a
        standing[away]["g_conc"] += goals_h

        standing[home]["pts"] += score1
        standing[away]["pts"] += score2

    df = pd.DataFrame(
        [[t, *list(v.values())] for t, v in standing.items()],
        columns=["Team", "Points", "Matches", "Goals Done", "Goals Conceded"],
    )
    df = sort_standing(df)
    df.set_index("Team", inplace=True)
    return df

In [92]:
def aggregate_results(results):
    agg_results = results.groupby("Team").agg(
        {
            "Points": "mean",
            "Goals Done": "mean",
            "Goals Conceded": "mean",
            "Simulated": "max",
            "Real": "max",
        }
    )
    agg_results = sort_standing(agg_results)
    agg_results["Matches"] = agg_results["Real"] + agg_results["Simulated"]
    agg_results["Goals Diff"] = (
        agg_results["Goals Done"] - agg_results["Goals Conceded"]
    )
    agg_results = agg_results[
        [
            "Points",
            "Matches",
            "Goals Diff",
            "Goals Done",
            "Goals Conceded",
            "Simulated",
            "Real",
        ]
    ]
    return agg_results


def print_standings(results, hide_cols=None):
    rows = []
    if hide_cols is None:
        hide_cols = []
    for i, (t, pts, matches, g_done, g_conc) in enumerate(results):
        pos = i + 1
    
        row = [pos, t, round(pts)]

        if "Matches" not in hide_cols:
            row.append(matches)
        if "Goals Done" not in hide_cols:
            row.append(g_done)
        if "Goals Conceded" not in hide_cols:
            row.append(g_conc)

        row.append("#000")

        if pos >= 25:
            row[-1] = "#E3735E"
        elif i >= 8:
            row[-1] = "#FFBF00"
        rows.append(row)

    content = "".join(
        [
            f"<tr><td style='color:{row[-1]}'>{row[0]}</td>{''.join(['<td>'+str(r)+'</td>' for r in row[1:-1]])}</tr>"
            for row in rows
        ]
    )

    cols = [c for c in ["Pos", "Team", "Points", "Matches", "Goals Done", "Goals Conceded"] if c not in hide_cols]
    cols_content = "".join([f"<th>{c}</th>" for c in cols])
    display(HTML(f"<table>{cols_content}</tr>{content}</table>"))

In [94]:
current = current_standings(teams, matches)
standing = current[['Points', 'Matches', 'Goals Done', 'Goals Conceded']].reset_index()
print_standings(standing.values)
# current

0,1,2,3,4,5
1,Aston Villa,14,3,11,5
2,Liverpool,14,3,11,6
3,Manchester City,12,3,14,5
4,Monaco,12,3,14,9
5,Brest,12,3,12,7
6,Bayer Leverkusen,12,3,11,6
7,Inter,12,3,10,5
8,Sporting CP,12,3,10,6
9,Arsenal,12,3,8,5
10,Borussia Dortmund,11,3,17,11


In [96]:
runs = 10_000

results = simulate(runs, teams, matches, p_fn=coeff_win_prob)

100%|██████████| 10000/10000 [00:03<00:00, 2723.87it/s]


In [101]:
agg_results = aggregate_results(results)
# agg_results

In [100]:
standing = agg_results[['Points', 'Matches', 'Goals Done', 'Goals Conceded']].reset_index()
# print(standing)
print_standings(standing.values, hide_cols=['Matches', 'Goals Done', 'Goals Conceded'])

0,1,2
1,Liverpool,19
2,Manchester City,17
3,Real Madrid,17
4,Inter,17
5,Borussia Dortmund,17
6,Arsenal,16
7,Bayer Leverkusen,15
8,Barcelona,15
9,Atalanta,14
10,Benfica,14
