In [1]:
import numpy as np
import pandas as pd
import csv
import math
import random
import matplotlib.pyplot as plt
from tqdm import tqdm

import sys
sys.path.append("src")
from world_cup_simulator import *

### Simulate group stage

#### The gist is to read from two files: One defining the match schedule, the other with teams and their relative strengths (given by Elo ratings prior to the start of the event)

In [2]:
# However, this cell is for illustrative purposes
games_pd = pd.read_csv("data/matches.csv")
teams_pd = pd.read_csv("data/roster.csv")

In [3]:
# Reads in the matches and teams as dictionaries and proceeds with that data type
n = 100   # How many simulations to run

for i in tqdm(range(n)):
    games = read_games("data/matches.csv")
    teams = {}
    
    for row in [item for item in csv.DictReader(open("data/roster.csv"))]:
        teams[row['team']] = {'name': row['team'], 'rating': float(row['rating']), 'points': 0}
    
    simulate_group_stage(games, teams, ternary=False)
    
    for key in teams.keys():
        f = teams_pd['team'] == key
        teams_pd.loc[f, f"simulation{i+1}"] = teams[key]['points']
    
    teams_pd[f"simulation{i+1}"] = teams_pd.groupby('group')[f"simulation{i+1}"].rank(ascending=False)

  teams_pd.loc[f, f"simulation{i+1}"] = teams[key]['points']
  teams_pd.loc[f, f"simulation{i+1}"] = teams[key]['points']
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 47.04it/s]


In [4]:
sim_cols = [i for i in teams_pd.columns if "simulation" in i]

In [5]:
teams_pd['avg_pos'] = teams_pd[sim_cols].mean(axis=1)
teams_pd['median_pos'] = teams_pd[sim_cols].median(axis=1)
teams_pd['std_pos'] = teams_pd[sim_cols].std(axis=1)

In [6]:
not_sim = [j for j in teams_pd.columns if "simulation" not in j]

In [7]:
teams_pd[not_sim].sort_values(by=['group', 'avg_pos'])

Unnamed: 0,group,team,rating,avg_pos,median_pos,std_pos
3,A,Netherlands,2040,1.41,1.0,0.645028
14,A,Ecuador,1833,2.38,2.0,0.879394
28,A,Qatar,1680,2.93,3.0,0.788234
27,A,Senegal,1687,3.28,3.5,0.847814
11,B,England,1920,1.88,2.0,0.907655
19,B,Wales,1790,2.58,3.0,0.96064
18,B,Iran,1797,2.75,3.0,0.973124
17,B,United States,1798,2.79,3.0,0.992853
1,C,Argentina,2143,1.3,1.0,0.564076
15,C,Poland,1814,2.57,3.0,0.84692


### Simulating playoffs

In [8]:
playoff_games_pd = pd.read_csv("data/playoff_matches.csv")
playoff_teams_pd = pd.read_csv("data/playoff_roster.csv")

In [9]:
# Now, doing the Monte Carlo simulations
n = 10000
playoff_results_teams = []
playoff_results_stage = []

for i in tqdm(range(n)):
    overall_result_teams = dict()
    overall_result_stage = dict()
    games = read_games("data/playoff_matches.csv")
    teams = {}
    
    for row in [item for item in csv.DictReader(open("data/playoff_roster.csv"))]:
        teams[row['team']] = {'name': row['team'], 'rating': float(row['rating'])}
    
    simulate_playoffs(games, teams, ternary=True)
    
    playoff_pd = pd.DataFrame(games)
    
    # This is for collecting results of simulations per team
    for key in teams.keys():
        overall_result_teams[key] = collect_playoff_results(key, playoff_pd)
    playoff_results_teams.append(overall_result_teams)
    
    # Now, collecting results from stage-perspective
    overall_result_stage['Quarterfinals'] = playoff_pd.loc[playoff_pd['stage'] == 'eigths_finals', 'advances'].to_list()
    overall_result_stage['Semifinals'] = playoff_pd.loc[playoff_pd['stage'] == 'quarterfinals', 'advances'].to_list() + [None]*4
    overall_result_stage['Final'] = playoff_pd.loc[playoff_pd['stage'] == 'semifinals', 'advances'].to_list() + [None]*6
    overall_result_stage['third_place_match'] = playoff_pd.loc[playoff_pd['stage'] == 'semifinals', 'loses'].to_list() + [None]*6
    overall_result_stage['fourth_place'] = playoff_pd.loc[playoff_pd['stage'] == 'third_place', 'loses'].to_list() + [None]*7
    overall_result_stage['third_place'] = playoff_pd.loc[playoff_pd['stage'] == 'third_place', 'advances'].to_list() + [None]*7
    overall_result_stage['second_place'] = playoff_pd.loc[playoff_pd['stage'] == 'final', 'loses'].to_list() + [None]*7
    overall_result_stage['Champion'] = playoff_pd.loc[playoff_pd['stage'] == 'final', 'advances'].to_list() + [None]*7
    
    playoff_results_stage.append(overall_result_stage)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [02:37<00:00, 63.49it/s]


In [10]:
results_teams = pd.DataFrame(playoff_results_teams)

In [11]:
results_teams['Brazil'].value_counts(normalize=True)

Quarterfinals    0.3778
Champion         0.1770
Round_of_16      0.1758
Third_place      0.1640
Final            0.0566
Fourth_place     0.0488
Name: Brazil, dtype: float64

In [12]:
results_stage = pd.DataFrame(playoff_results_stage)

In [13]:
results_stage['Quarterfinals'].value_counts(normalize=True)

[Netherlands, Argentina, Spain, Brazil, England, Denmark, Belgium, Portugal]         0.0555
[Netherlands, Argentina, Spain, Brazil, England, Denmark, Germany, Portugal]         0.0435
[Netherlands, Argentina, Spain, Brazil, Ecuador, Denmark, Belgium, Portugal]         0.0335
[Netherlands, Argentina, Spain, Brazil, England, Denmark, Belgium, Switzerland]      0.0294
[Netherlands, Argentina, Spain, Brazil, Ecuador, Denmark, Germany, Portugal]         0.0269
                                                                                      ...  
[United States, France, Croatia, Uruguay, England, Denmark, Belgium, Switzerland]    0.0001
[United States, France, Croatia, Uruguay, England, Poland, Germany, Switzerland]     0.0001
[United States, Argentina, Croatia, Uruguay, England, Poland, Germany, Portugal]     0.0001
[United States, France, Croatia, Brazil, England, Poland, Belgium, Switzerland]      0.0001
[United States, France, Spain, Uruguay, Ecuador, Poland, Belgium, Switzerland]  