In [1]:
import numpy as np
import pandas as pd
import csv
import math
import random
import matplotlib.pyplot as plt
from tqdm import tqdm

import sys
sys.path.append("src")
from world_cup_simulator import *

### Simulate group stage

#### The gist is to read from two files: one defining the match schedule, the other listing the teams and their relative strengths (given by Elo ratings prior to the start of the event).

In [2]:
# Load the match schedule and the team roster as DataFrames.
# games_pd is only here for illustration (it is not referenced again in the visible
# cells), but teams_pd is needed later: the simulation loop merges its per-simulation
# points into it, so this cell must run before the simulation cell.
games_pd = pd.read_csv("data/matches.csv")
teams_pd = pd.read_csv("data/roster1.csv")

In [3]:
# Monte Carlo simulation of the group stage.
# Every run starts from a fresh copy of the schedule and of the roster (as plain
# dictionaries), lets simulate_group_stage update the teams' points, and records the
# final points of every team under the column name "simulation<k>".
n = 10000 # How many simulations to run

# The roster file is the same for every run, so parse it once. The `with` block also
# guarantees the file handle is closed (the bare open() inside the loop leaked one
# handle per iteration).
with open("data/roster1.csv", newline="") as roster_file:
    roster_rows = list(csv.DictReader(roster_file))

# column name -> {team name: points at the end of that simulation}
simulated_points = {}

for i in tqdm(range(n)):
    # The schedule is re-read on every run because the simulator may modify it in place.
    games = read_games("data/matches.csv")
    # Fresh team dictionaries for every run, so points never carry over between runs.
    teams = {
        row['team']: {'name': row['team'], 'rating': float(row['rating']), 'points': float(row['points'])}
        for row in roster_rows
    }

    simulate_group_stage(games, teams, ternary=False)

    simulated_points[f"simulation{i+1}"] = {name: team['points'] for name, team in teams.items()}

# Build all simulation columns at once and merge a single time; merging inside the loop
# copied the ever-growing frame on every iteration (quadratic in n).
if simulated_points:
    simulation_pd = pd.DataFrame(simulated_points).rename_axis("team").reset_index()
    teams_pd = pd.merge(teams_pd, simulation_pd, on="team")
    
#     teams_pd[f"simulation{i+1}"] = teams_pd.groupby('group')[f"simulation{i+1}"].rank(ascending=False)

100%|██████████| 10000/10000 [00:42<00:00, 235.25it/s]


In [4]:
# Names of the per-simulation result columns ("simulation1", "simulation2", ...).
sim_cols = [column for column in teams_pd.columns if "simulation" in column]

In [5]:
# Per-team summary statistics across all simulation columns (row-wise, axis=1).
sim_points_pd = teams_pd[sim_cols]
teams_pd['average_points'] = sim_points_pd.mean(axis=1)
teams_pd['median_points'] = sim_points_pd.median(axis=1)
teams_pd['standard_deviation'] = sim_points_pd.std(axis=1)

In [6]:
# Every column that is not a per-simulation result (roster data plus summary statistics).
not_sim = [column for column in teams_pd.columns if "simulation" not in column]

In [7]:
# Show the summary columns only, sorted descending by group and then by average points.
teams_pd.loc[:, not_sim].sort_values(by=['group', 'average_points'], ascending=[False, False])

Unnamed: 0,group,team,rating,points,average_points,median_points,standard_deviation
5,H,Portugal,2010,3,7.1991,9.0,1.999415
9,H,Uruguay,1924,1,4.7785,4.0,1.777851
21,H,South Korea,1798,1,4.0477,4.0,1.79219
31,H,Ghana,1563,0,0.9747,0.0,1.575979
0,G,Brazil,2185,3,8.3811,9.0,1.266263
12,G,Switzerland,1911,3,5.1465,6.0,1.932618
13,G,Serbia,1882,0,3.8754,3.0,1.907522
30,G,Cameroon,1601,0,0.597,0.0,1.241348
4,F,Belgium,2020,3,7.3278,9.0,1.937143
10,F,Croatia,1914,1,4.1959,4.0,2.038857


### Simulating playoffs

In [None]:
# Load the playoff schedule and the playoff roster as DataFrames for inspection.
# Neither frame is referenced again in the visible cells; the simulation below reads
# the same two files itself.
playoff_games_pd = pd.read_csv("data/playoff_matches.csv")
playoff_teams_pd = pd.read_csv("data/playoff_roster.csv")

In [None]:
# Now, doing the Monte Carlo simulations of the playoffs.
# Two views of every run are collected:
#   playoff_results_teams - one dict per run: team name -> collect_playoff_results(...)
#   playoff_results_stage - one dict per run: result label -> list of team names
n = 10000
playoff_results_teams = []
playoff_results_stage = []

# The roster file is the same for every run, so parse it once. The `with` block also
# guarantees the file handle is closed (the bare open() inside the loop leaked one
# handle per iteration).
with open("data/playoff_roster.csv", newline="") as roster_file:
    playoff_roster_rows = list(csv.DictReader(roster_file))

# (result label, value of the 'stage' column, column to read, number of None entries appended)
# NOTE(review): the padding looks like it brings every list to 8 entries, assuming
# 8/4/2/1/1 games per stage -- confirm against data/playoff_matches.csv.
# NOTE(review): 'eigths_finals' is presumably spelled this way in the schedule data;
# it must match the data exactly, so it is left untouched.
stage_specs = [
    ('Quarterfinals', 'eigths_finals', 'advances', 0),
    ('Semifinals', 'quarterfinals', 'advances', 4),
    ('Final', 'semifinals', 'advances', 6),
    ('third_place_match', 'semifinals', 'loses', 6),
    ('fourth_place', 'third_place', 'loses', 7),
    ('third_place', 'third_place', 'advances', 7),
    ('second_place', 'final', 'loses', 7),
    ('Champion', 'final', 'advances', 7),
]

for i in tqdm(range(n)):
    # The schedule is re-read on every run because the simulator may modify it in place.
    games = read_games("data/playoff_matches.csv")
    # Fresh team dictionaries for every run.
    teams = {
        row['team']: {'name': row['team'], 'rating': float(row['rating'])}
        for row in playoff_roster_rows
    }

    simulate_playoffs(games, teams, ternary=True)

    playoff_pd = pd.DataFrame(games)

    # This is for collecting results of simulations per team
    overall_result_teams = {key: collect_playoff_results(key, playoff_pd) for key in teams}
    playoff_results_teams.append(overall_result_teams)

    # Now, collecting results from stage-perspective
    overall_result_stage = {
        label: playoff_pd.loc[playoff_pd['stage'] == stage, column].to_list() + [None] * padding
        for label, stage, column, padding in stage_specs
    }
    playoff_results_stage.append(overall_result_stage)

In [None]:
# One row per simulation, one column per team.
results_teams = pd.DataFrame(data=playoff_results_teams)

In [None]:
# Relative frequency of each collected playoff result for the United States.
results_teams.loc[:, 'United States'].value_counts(normalize=True)

In [None]:
# One row per simulation, one column per result label; every cell holds a list of teams.
results_stage = pd.DataFrame(data=playoff_results_stage)

In [None]:
# Every cell of 'Quarterfinals' is a list of teams (one list per simulation). Lists are
# unhashable, so calling value_counts() on the column directly raises a TypeError.
# Flatten to one team per row first, then divide by the number of simulations to get
# the share of simulations in which each team reached the quarterfinals.
results_stage['Quarterfinals'].explode().value_counts() / len(results_stage)