In [1]:
import numpy as np
import pandas as pd
import csv
import math
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from joblib import Parallel, delayed

import sys
sys.path.append("src")
from world_cup_simulator import *

In [2]:
def run_group_stage_simulation(n, j):
    teams_pd = pd.read_csv("data/roster.csv")
    
    for i in range(n):
        games = read_games("data/matches.csv")
        teams = {}
    
        for row in [item for item in csv.DictReader(open("data/roster.csv"))]:
            teams[row['team']] = {'name': row['team'], 'rating': float(row['rating']), 'points': 0}
    
        simulate_group_stage(games, teams, ternary=True)
    
        collector = []
        for key in teams.keys():
            collector.append({"team": key, f"simulation{i+1}": teams[key]['points']})

        temp = pd.DataFrame(collector)
        teams_pd = pd.merge(teams_pd, temp)
    
    sim_cols = [a for a in teams_pd.columns if "simulation" in a]
    teams_pd[f"avg_pts_{j+1}"] = teams_pd[sim_cols].mean(axis=1)
    not_sim = [b for b in teams_pd.columns if "simulation" not in b]
    simulation_result = teams_pd[not_sim]
    
    return simulation_result

### Simulate group stage

#### The gist is to read from two files: One defining the match schedule, the other with teams and their relative strengths (given by Elo ratings prior to the start of the event)

In [3]:
# Reads in the matches and teams as dictionaries and proceeds with that data type
n = 100 # How many simulations to run
m = 10000 # How many simulation results to collect

roster_pd = Parallel(n_jobs=5)(delayed(run_group_stage_simulation)(n, j) for j in tqdm(range(m)))

for t in tqdm(range(m)):
    if t == 0:
        roster = pd.merge(roster_pd[t], roster_pd[t+1])
    elif t >= 2:
        roster = pd.merge(roster, roster_pd[t])
    else:
        pass

100%|██████████| 10000/10000 [15:38<00:00, 10.66it/s]
100%|██████████| 10000/10000 [00:39<00:00, 251.83it/s]


In [4]:
sim_cols = [i for i in roster.columns if "avg_pts" in i]

In [5]:
roster['avg_sim_pts'] = roster[sim_cols].mean(axis=1)
roster['99%CI_low'] = roster[sim_cols].quantile(q=0.005, axis=1)
roster['99%CI_high'] = roster[sim_cols].quantile(q=0.995, axis=1)

In [6]:
not_sim = [j for j in roster.columns if "avg_pts" not in j]

In [7]:
roster[not_sim].sort_values(by=['group', 'avg_sim_pts'], ascending=False)

Unnamed: 0,group,team,rating,avg_sim_pts,99%CI_low,99%CI_high
5,H,Portugal,2006,5.834141,5.39995,6.26
9,H,Uruguay,1936,4.302759,3.8,4.79
21,H,South Korea,1786,4.102976,3.63,4.58
31,H,Ghana,1567,1.5974,1.29,1.93
0,G,Brazil,2169,6.098759,5.71,6.48
12,G,Switzerland,1902,4.24178,3.79,4.7
13,G,Serbia,1898,2.936524,2.5,3.4
30,G,Cameroon,1610,2.335378,2.04,2.66
4,F,Belgium,2007,6.124347,5.61,6.59
10,F,Croatia,1927,4.579835,4.03995,5.11005


### Simulating playoffs

In [None]:
playoff_games_pd = pd.read_csv("data/playoff_matches.csv")
playoff_teams_pd = pd.read_csv("data/playoff_roster.csv")

In [None]:
# Now, doing the Monte Carlo simulations
n = 10000
playoff_results_teams = []
playoff_results_stage = []

for i in tqdm(range(n)):
    overall_result_teams = dict()
    overall_result_stage = dict()
    games = read_games("data/playoff_matches.csv")
    teams = {}
    
    for row in [item for item in csv.DictReader(open("data/playoff_roster.csv"))]:
        teams[row['team']] = {'name': row['team'], 'rating': float(row['rating'])}
    
    simulate_playoffs(games, teams, ternary=True)
    
    playoff_pd = pd.DataFrame(games)
    
    # This is for collecting results of simulations per team
    for key in teams.keys():
        overall_result_teams[key] = collect_playoff_results(key, playoff_pd)
    playoff_results_teams.append(overall_result_teams)
    
    # Now, collecting results from stage-perspective
    overall_result_stage['whole_bracket'] = playoff_pd['advances'].to_list()
    overall_result_stage['Quarterfinals'] = playoff_pd.loc[playoff_pd['stage'] == 'eigths_finals', 'advances'].to_list()
    overall_result_stage['Semifinals'] = playoff_pd.loc[playoff_pd['stage'] == 'quarterfinals', 'advances'].to_list()
    overall_result_stage['Final'] = playoff_pd.loc[playoff_pd['stage'] == 'semifinals', 'advances'].to_list()
    overall_result_stage['third_place_match'] = playoff_pd.loc[playoff_pd['stage'] == 'semifinals', 'loses'].to_list()
    overall_result_stage['fourth_place'] = playoff_pd.loc[playoff_pd['stage'] == 'third_place', 'loses'].to_list()
    overall_result_stage['third_place'] = playoff_pd.loc[playoff_pd['stage'] == 'third_place', 'advances'].to_list()
    overall_result_stage['second_place'] = playoff_pd.loc[playoff_pd['stage'] == 'final', 'loses'].to_list()
    overall_result_stage['Champion'] = playoff_pd.loc[playoff_pd['stage'] == 'final', 'advances'].to_list()
    overall_result_stage['match8'] = list(playoff_pd.loc[8, ['home_team', 'away_team']])
    overall_result_stage['match9'] = list(playoff_pd.loc[9, ['home_team', 'away_team']])
    overall_result_stage['match10'] = list(playoff_pd.loc[10, ['home_team', 'away_team']])
    overall_result_stage['match11'] = list(playoff_pd.loc[11, ['home_team', 'away_team']])
    overall_result_stage['match12'] = list(playoff_pd.loc[12, ['home_team', 'away_team']])
    overall_result_stage['match13'] = list(playoff_pd.loc[13, ['home_team', 'away_team']])
    overall_result_stage['match14'] = list(playoff_pd.loc[14, ['home_team', 'away_team']])
    overall_result_stage['match15'] = list(playoff_pd.loc[15, ['home_team', 'away_team']])
    
    playoff_results_stage.append(overall_result_stage)

In [None]:
results_teams = pd.DataFrame(playoff_results_teams)

In [None]:
results_teams['France'].value_counts()

In [None]:
results_stage = pd.DataFrame(playoff_results_stage)

In [None]:
results_stage['match9'].value_counts()