In [2]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the tournament inputs. The relative paths use forward slashes so they
# resolve on Windows, macOS and Linux alike (a backslash is an ordinary
# filename character outside Windows), matching the export path used later
# in this notebook.
group_stage_table = pd.read_csv('CSVs/Group_Stage_Table.csv', encoding='latin-1')
world_cup_matches = pd.read_csv('CSVs/World_Cup_Matches.csv', encoding='latin-1')
world_cup_teams = pd.read_csv('CSVs/World_Cup_Teams.csv', encoding='latin-1')
squad_predictions = pd.read_csv('CSVs/Predictions/Squad_Strength_Final.csv', encoding='latin-1')

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Final Simulation Code Chunk
#
# Simulates the whole tournament `num_sims` times:
#   1. every group-stage match is drawn `num_sims` times up front,
#   2. for each simulation the group tables are ranked and the knockout
#      bracket (R16 -> R8 -> Semi-Finals -> Finals) is played out.
# Every match is a single Bernoulli draw (Squad 1 wins or Squad 2 wins), so
# draws are not modelled and a group-stage win is worth 3 points.
#
# Outputs left in the namespace for later cells: group_match_sims,
# group_results, r_16_sims, r_8_sims, r_4_sims, final_sims.

#######################

# Simulation Setup

np.random.seed(22)
num_sims = 100000

POINTS_TAKEN_COL = '% of Points Taken'


def log5_probability(s1_strength, s2_strength):
    """Return the probability that Squad 1 beats Squad 2 (Log5 formula).

    Both arguments are values of the '% of Points Taken' rating (scalars or
    pandas Series; the arithmetic is element-wise). The result is NaN when
    the denominator is zero, e.g. when both ratings are 0 or both are 1.
    """
    return (s1_strength - (s1_strength * s2_strength)) / (
        s1_strength + s2_strength - (2 * s1_strength * s2_strength)
    )


def lookup_strength(matches, squad_col, ratings):
    """Return the rating of the squad named in `squad_col` for every match.

    The values come back as a plain array in the row order of `matches`, so
    the caller can assign them positionally instead of relying on index
    labels. Squads missing from `ratings` yield NaN; a squad listed more than
    once in `ratings` raises pandas' MergeError (validate='many_to_one').
    """
    merged = pd.merge(
        matches[[squad_col]],
        ratings[['Squad', POINTS_TAKEN_COL]],
        how='left',
        left_on=squad_col,
        right_on='Squad',
        validate='many_to_one',
    )
    return merged[POINTS_TAKEN_COL].to_numpy()


def attach_entrant(fixtures, entrants, side):
    """Swap the seed placeholder in `Squad {side}` for the qualified team.

    `entrants` must provide 'team', 'team_percent' and 'seed' columns. The
    placeholder column is replaced by `Squad {side}` (team name), `S{side}%`
    (its rating) and `Squad {side} Seed` (the placeholder it filled).
    """
    squad_col = f'Squad {side}'
    merged = pd.merge(
        fixtures,
        entrants.loc[:, ['team', 'team_percent', 'seed']],
        left_on=squad_col,
        right_on='seed',
    )
    merged = merged.drop(columns=[squad_col])
    return merged.rename(columns={
        'team': squad_col,
        'team_percent': f'S{side}%',
        'seed': f'{squad_col} Seed',
    })


def play_round(fixtures, entrants, sim_number):
    """Fill one knockout round with its entrants and draw every match once.

    Returns the fixtures with both squads resolved plus 'S1_Prob' (Log5),
    'S1_wins' (1 if Squad 1 won, else 0) and 'Simulation' (`sim_number`).
    """
    results = attach_entrant(fixtures, entrants, 1)
    results = attach_entrant(results, entrants, 2)
    results['S1_Prob'] = log5_probability(results['S1%'], results['S2%'])
    # One draw per match, in row order, so the seeded random stream is
    # consumed exactly one value per fixture.
    results['S1_wins'] = [np.random.binomial(n=1, p=prob, size=1)[0] for prob in results.S1_Prob]
    results['Simulation'] = sim_number
    return results


def round_winners(results):
    """Return the winners of a played round as entrants for the next one.

    Each winner is seeded as 'M<Match>' so it can be matched against the
    placeholders used by the next round's fixtures.
    """
    s1_won = results['S1_wins'] == 1
    return pd.DataFrame({
        'seed': 'M' + results['Match'].astype(str),
        'team': np.where(s1_won, results['Squad 1'], results['Squad 2']),
        'team_percent': np.where(s1_won, results['S1%'], results['S2%']),
    })


## Creating round dataframes with match information

# reset_index gives the group-stage frame positions 0..n-1 (and makes it an
# independent copy), so list positions in group_match_sims and row labels
# are guaranteed to agree wherever the group rows sit in the source CSV.
group_stage_matches = world_cup_matches[(world_cup_matches['Stage'] == 'Group')].reset_index(drop=True)
r_16_matches = world_cup_matches[(world_cup_matches['Group'] == 'R16')]
r_8_matches = world_cup_matches[(world_cup_matches['Group'] == 'R8')]
r_4_matches = world_cup_matches[(world_cup_matches['Group'] == 'Semi-Finals')]
final_match = world_cup_matches[(world_cup_matches['Group'] == 'Finals')]

group_results = []
r_16_sims = []
r_8_sims = []
r_4_sims = []
final_sims = []

# Group Stage Simulation

## Need to lookup Squad percentages from Squad_Predictions DF

group_stage_matches['S1%'] = lookup_strength(group_stage_matches, 'Squad 1', squad_predictions)
group_stage_matches['S2%'] = lookup_strength(group_stage_matches, 'Squad 2', squad_predictions)

## Calculate Probability of S1 winning based on Log 5 Formula

group_stage_matches['S1_Prob'] = log5_probability(group_stage_matches['S1%'], group_stage_matches['S2%'])

## Create Group Stage Match Sims

# One array of num_sims outcomes per group match (1 = Squad 1 won).
group_match_sims = [
    np.random.binomial(n=1, p=s1_prob, size=num_sims)
    for s1_prob in group_stage_matches['S1_Prob']
]

## Add match sim information into group_results

for _, squad in squad_predictions.iterrows():
    team = squad['Squad']
    group = squad['Group']
    team_percent = squad[POINTS_TAKEN_COL]

    is_squad_1 = (group_stage_matches['Squad 1'] == team).to_numpy()
    is_squad_2 = (group_stage_matches['Squad 2'] == team).to_numpy()

    # A match array records Squad 1's wins; flip it when the team is Squad 2.
    team_wins = [
        group_match_sims[pos] if is_squad_1[pos] else 1 - group_match_sims[pos]
        for pos in np.flatnonzero(is_squad_1 | is_squad_2)
    ]

    # Wins per simulation, summed over the team's group matches.
    team_results = np.sum(team_wins, axis=0)

    group_results.append({'team': team, 'group': group, 'team_percent': team_percent, 'wins': team_results, 'points': team_results * 3})


# Knockout Simulation

###############################

# Report roughly every 1% of the run; never let the step fall to zero.
progress_step = max(num_sims // 100, 1)

for z in range(num_sims):
    if (z > 0) and (z % progress_step == 0):
        print(f"Simulation {z} Complete.")

    # R16 Simulation
    # Rank each group by points, breaking ties with the team rating; the
    # `% 4` assumes exactly four teams per group. Seeds look like '1A', '2A'.
    r16_df = pd.DataFrame([
        {'team': x['team'], 'group': x['group'], 'team_percent': x['team_percent'], 'wins': x['wins'][z], 'points': x['points'][z]}
        for x in group_results
    ])
    r16_df = r16_df.sort_values(by=['group', 'points', 'team_percent'], ascending=[True, False, False]).reset_index(drop=True)
    r16_df['rank'] = (r16_df.index % 4) + 1
    r16_df['rank'] = r16_df['rank'].astype(str)
    r16_df['seed'] = r16_df['rank'] + r16_df['group']

    r_16_results = play_round(r_16_matches, r16_df, z + 1)
    r_16_sims.append(r_16_results)

    # R8 Simulation
    r8_df = round_winners(r_16_results)
    r_8_results = play_round(r_8_matches, r8_df, z + 1)
    r_8_sims.append(r_8_results)

    # Semi-finals Simulation
    r_4_df = round_winners(r_8_results)
    r_4_results = play_round(r_4_matches, r_4_df, z + 1)
    r_4_sims.append(r_4_results)

    # Finals Simulation
    finals_df = round_winners(r_4_results)
    final_results = play_round(final_match, finals_df, z + 1)
    final_sims.append(final_results)

# Printed only once every simulation has actually finished.
print(f"Simulation Complete - The world cup was simulated {num_sims} times")

Simulation 1000 Complete.
Simulation 2000 Complete.
Simulation 3000 Complete.
Simulation 4000 Complete.
Simulation 5000 Complete.
Simulation 6000 Complete.
Simulation 7000 Complete.
Simulation 8000 Complete.
Simulation 9000 Complete.
Simulation 10000 Complete.
Simulation 11000 Complete.
Simulation 12000 Complete.
Simulation 13000 Complete.
Simulation 14000 Complete.
Simulation 15000 Complete.
Simulation 16000 Complete.
Simulation 17000 Complete.
Simulation 18000 Complete.
Simulation 19000 Complete.
Simulation 20000 Complete.
Simulation 21000 Complete.
Simulation 22000 Complete.
Simulation 23000 Complete.
Simulation 24000 Complete.
Simulation 25000 Complete.
Simulation 26000 Complete.
Simulation 27000 Complete.
Simulation 28000 Complete.
Simulation 29000 Complete.
Simulation 30000 Complete.
Simulation 31000 Complete.
Simulation 32000 Complete.
Simulation 33000 Complete.
Simulation 34000 Complete.
Simulation 35000 Complete.
Simulation 36000 Complete.
Simulation 37000 Complete.
Simulation

In [4]:
final_sims[0]

Unnamed: 0,Match,Squad 1 Key,Squad 2 Key,Group,Stage,Squad 1,S1%,Squad 1 Seed,Squad 2,S2%,Squad 2 Seed,S1_Prob,S1_wins,Simulation
0,63,Knockout Round,Knockout Round,Finals,Knockout,Germany,0.819723,M61,Denmark,0.778587,M62,0.563903,1,1


In [None]:
group_match_sims[0]

In [None]:
type(final_sims)

In [None]:
len(world_cup_teams)

In [None]:
type(group_results)

In [None]:
group_results[0]['wins']

In [None]:
group_team_stats

In [None]:
empty_dict = {}

In [None]:
keys = 

In [None]:
# Print the name of every team that has a simulated group-stage record.
# Iterating group_results directly keeps the loop in step with the list it
# reads, rather than with the length of a different table.
for team_result in group_results:
    print(team_result['team'])


In [None]:
# Average group-stage points of the first team across all simulations.
# Taking the mean of the array itself avoids depending on the global
# num_sims, which another cell in this notebook rebinds.
group_results[0]['points'].mean()

In [None]:
# Average group-stage wins of the first team across all simulations; the
# array's own mean stays correct even if the global num_sims is rebound.
test = group_results[0]['wins'].mean()

In [None]:
test

In [None]:
for i in range(len(world_cup_teams)):
    

In [None]:
group_stage_team_wins = []
group_stage_team_wins = 

In [None]:
type(knockout_sims)

In [None]:
# Scratch check of the progress-message cadence. It uses its own counter
# name so that running this cell does not rebind the global num_sims that
# the simulation results were generated with.
progress_demo_sims = 10000

for z in range(progress_demo_sims):
    if (z > 0) and (z % (progress_demo_sims / 10) == 0):
        print(f"Simulation {z} Complete.")
        
    elif z == (progress_demo_sims - 1):
        print(f"Simulation Complete - The world cup was simulated {progress_demo_sims} times")

In [5]:
# Champion of each simulated tournament: every entry of final_sims holds one
# row (the final), and S1_wins == 1 means Squad 1 took the title.
winners = [
    final['Squad 1'].values[0] if final.S1_wins.values[0] == 1 else final['Squad 2'].values[0]
    for final in final_sims
]

In [6]:
from collections import Counter
Counter(winners).most_common()

[('Netherlands', 13920),
 ('England', 10104),
 ('Spain', 9598),
 ('Belgium', 8772),
 ('Germany', 8391),
 ('Brazil', 8105),
 ('Portugal', 5891),
 ('France', 5031),
 ('Morocco', 4856),
 ('Denmark', 4496),
 ('Senegal', 4409),
 ('Mexico', 3599),
 ('South Korea', 2468),
 ('Argentina', 1758),
 ('Cameroon', 1664),
 ('Iran', 1377),
 ('United States', 840),
 ('Switzerland', 776),
 ('Croatia', 722),
 ('Serbia', 618),
 ('Australia', 583),
 ('Poland', 454),
 ('Tunisia', 409),
 ('Saudi Arabia', 314),
 ('Qatar', 210),
 ('Canada', 210),
 ('Japan', 152),
 ('Uruguay', 109),
 ('Wales', 74),
 ('Ecuador', 31),
 ('Ghana', 31),
 ('Costa Rica', 28)]

In [None]:
group_team_stats = []

#Counter(group_results[0]['wins'])

In [None]:
# Distribution of Iran's group-stage win totals across the simulations.
# The record is selected by team name rather than by list position, so the
# tally is right whatever order the squads were loaded in. Raises
# StopIteration if no record is named 'Iran'.
Iran_wins = Counter(next(result['wins'] for result in group_results if result['team'] == 'Iran'))

In [None]:
group_team_stats.append(Iran_wins)

In [None]:
group_team_stats

In [13]:
import pickle

In [None]:
# NOTE(review): `knockout_sims` is not defined by any cell visible in this
# notebook, so this raises NameError on a fresh kernel; if the name does
# exist, this rebinds r_16_sims away from the list built by the simulation
# cell. Confirm whether this cell is still needed.
r_16_sims = knockout_sims

In [14]:
# Persist the per-team group-stage simulation records to disk.
with open('group_results.pickle', 'wb') as pickle_file:
    pickle.dump(group_results, pickle_file)

In [None]:
group_results[0]

In [None]:
type(group_results)

In [None]:
winners

In [None]:
# Semi-final results of the last simulation; a negative index works for any
# simulation count instead of assuming exactly 100000 runs.
r_4_sims[-1]

In [7]:
# Stack the per-simulation result frames of each knockout round into one
# frame per round, renumbering the rows from zero.
r_16_df, r_8_df, r_4_df, finals_df = (
    pd.concat(round_sims, ignore_index=True)
    for round_sims in (r_16_sims, r_8_sims, r_4_sims, final_sims)
)

# Round frames in bracket order, ready to be concatenated into one table.
match_df = [r_16_df, r_8_df, r_4_df, finals_df]




In [8]:
all_knockout_matches = pd.concat(match_df, ignore_index=True)

In [None]:
r_4_df.reset_index(inplace = True)

In [None]:
r_4_df.drop(['index'], axis=1, inplace=True)

In [None]:
r_4_df

In [None]:
final_sims[0]

In [None]:
finals_df = pd.concat(final_sims, ignore_index=True)

In [None]:
finals_df

In [None]:
knockout_sims[0]

In [None]:
r_16_df = pd.concat(r_16_sims, ignore_index=True)

In [None]:
r_16_df

In [None]:
r_8_df = pd.concat(r_8_sims, ignore_index=True)

In [None]:
match_df = [r_16_df, r_8_df, r_4_df, finals_df]

In [9]:
all_knockout_matches.shape

(1500000, 14)

In [11]:
# Write every simulated knockout match to CSV (row index omitted).
# NOTE(review): the execution counts show this cell ran as In [11], after the
# following cell (In [10]) added the 'Match Winner' column, so the file
# written in that session presumably included it; a top-to-bottom run writes
# the file without that column. Confirm which output is intended.
all_knockout_matches.to_csv('CSVs/Predictions/all_knockout_matches.csv', index=False)

In [10]:
# Name the winner of every simulated knockout match: Squad 1 when S1_wins is
# 1, otherwise Squad 2.
squad_1_won = all_knockout_matches['S1_wins'] == 1
all_knockout_matches['Match Winner'] = np.where(
    squad_1_won,
    all_knockout_matches['Squad 1'],
    all_knockout_matches['Squad 2'],
)

In [None]:
# Small independent sample for experimenting; the explicit copy means later
# column assignments on test_df never touch (or warn about) the full table.
test_df = all_knockout_matches.head(5).copy()

In [None]:
# Every simulated knockout match in which the United States played, on
# either side of the fixture.
involves_united_states = all_knockout_matches[['Squad 1', 'Squad 2']].eq('United States').any(axis=1)
US_Matches = all_knockout_matches.loc[involves_united_states]

In [None]:
US_Matches

In [None]:
US_Matches.loc[(US_Matches['Group'] == 'Finals')].value_counts()

In [None]:
test_df

In [None]:
US_Matches.loc[(US_Matches['Group'] == 'Finals')]

In [None]:
test_df['Match Winner'] = np.where(test_df['S1_wins'] == 1, test_df['Squad 1'], test_df['Squad 2'])

In [None]:
test_df