### Setup and Preprocessing

In [146]:
import pandas as pd
import numpy as np
import random as rand
from itertools import combinations
from collections import Counter

In [147]:
# Read in merged CSV file
merge = pd.read_csv('merged_files.csv')
print(merge.columns)
print(merge['batting_team'].unique())
merge.head()

Index(['Unnamed: 0', 'match_id', 'season', 'start_date', 'venue', 'innings',
       'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker',
       'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes',
       'legbyes', 'penalty', 'wicket_type', 'player_dismissed'],
      dtype='object')
['England' 'Australia' 'New Zealand' 'South Africa' 'Pakistan' 'Sri Lanka'
 'West Indies' 'India' 'Kenya' 'Scotland' 'Zimbabwe' 'Bangladesh'
 'Bermuda' 'Netherlands' 'Ireland' 'Afghanistan' 'Canada' 'Nepal'
 'Hong Kong' 'United Arab Emirates' 'Papua New Guinea' 'Oman'
 'ICC World XI' 'Philippines' 'Vanuatu' 'United States of America'
 'Germany' 'Italy' 'Ghana' 'Namibia' 'Uganda' 'Botswana' 'Nigeria'
 'Guernsey' 'Denmark' 'Norway' 'Jersey' 'Thailand' 'Malaysia' 'Maldives'
 'Singapore' 'Qatar' 'Kuwait' 'Cayman Islands' 'Portugal' 'Spain'
 'Gibraltar' 'Bhutan' 'Saudi Arabia' 'Bahrain' 'Iran' 'Belgium'
 'Luxembourg' 'Czech Republic' 'Isle of Man' 'Bulgaria' 'Romania'
 'Austria' 'Greece' 

Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed
0,0,211028,2005,2005-06-13,The Rose Bowl,1,0.1,England,Australia,ME Trescothick,GO Jones,B Lee,0,0,,,,,,,
1,1,211028,2005,2005-06-13,The Rose Bowl,1,0.2,England,Australia,ME Trescothick,GO Jones,B Lee,1,0,,,,,,,
2,2,211028,2005,2005-06-13,The Rose Bowl,1,0.3,England,Australia,GO Jones,ME Trescothick,B Lee,0,0,,,,,,,
3,3,211028,2005,2005-06-13,The Rose Bowl,1,0.4,England,Australia,GO Jones,ME Trescothick,B Lee,0,0,,,,,,,
4,4,211028,2005,2005-06-13,The Rose Bowl,1,0.5,England,Australia,GO Jones,ME Trescothick,B Lee,0,0,,,,,,,


In [148]:
# We will use only data from the last 4 years (2020-2024)
merge[['start-year', 'start-day', 'start-month']] = merge['start_date'].str.split('-', expand=True)
recent = merge.loc[merge['start-year'].astype(int) >= 2020]
merge = merge.drop(['start-year', 'start-day', 'start-month'], axis=1)
print(recent.shape)
recent.head()

(295732, 24)


Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,start-year,start-day,start-month
169922,0,1185313,2019/20,2020-02-12,Buffalo Park,1,0.1,South Africa,England,T Bavuma,Q de Kock,MM Ali,1,0,,,,,,,,2020,2,12
169923,1,1185313,2019/20,2020-02-12,Buffalo Park,1,0.2,South Africa,England,Q de Kock,T Bavuma,MM Ali,1,0,,,,,,,,2020,2,12
169924,2,1185313,2019/20,2020-02-12,Buffalo Park,1,0.3,South Africa,England,T Bavuma,Q de Kock,MM Ali,1,0,,,,,,,,2020,2,12
169925,3,1185313,2019/20,2020-02-12,Buffalo Park,1,0.4,South Africa,England,Q de Kock,T Bavuma,MM Ali,0,0,,,,,,,,2020,2,12
169926,4,1185313,2019/20,2020-02-12,Buffalo Park,1,0.5,South Africa,England,Q de Kock,T Bavuma,MM Ali,1,0,,,,,,,,2020,2,12


In [149]:
# Create dataframe of matches only from teams participating in the 2024 World Cup
# teams are organized in their groups for the first round, ordered by pre-tournament seed within groups
# Ex: [A1, A2, ..., D4, D5]
with open('teams.txt') as file:
    teams_list = file.read().splitlines()
wc20=recent.loc[(recent['bowling_team'].isin(teams_list)) | (recent['batting_team'].isin(teams_list))]
print(wc20.shape)
wc20.head()

(154678, 24)


Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,start-year,start-day,start-month
169922,0,1185313,2019/20,2020-02-12,Buffalo Park,1,0.1,South Africa,England,T Bavuma,Q de Kock,MM Ali,1,0,,,,,,,,2020,2,12
169923,1,1185313,2019/20,2020-02-12,Buffalo Park,1,0.2,South Africa,England,Q de Kock,T Bavuma,MM Ali,1,0,,,,,,,,2020,2,12
169924,2,1185313,2019/20,2020-02-12,Buffalo Park,1,0.3,South Africa,England,T Bavuma,Q de Kock,MM Ali,1,0,,,,,,,,2020,2,12
169925,3,1185313,2019/20,2020-02-12,Buffalo Park,1,0.4,South Africa,England,Q de Kock,T Bavuma,MM Ali,0,0,,,,,,,,2020,2,12
169926,4,1185313,2019/20,2020-02-12,Buffalo Park,1,0.5,South Africa,England,Q de Kock,T Bavuma,MM Ali,1,0,,,,,,,,2020,2,12


In [150]:
%%capture
# How many unique teams are there in wc20 competition with World Cup teams?
unique_teams = pd.unique(wc20[['batting_team', 'bowling_team']].values.ravel('K'))
for team in unique_teams:
    print(f"\"{team}\"",":",",")

In [151]:
#ICC T20 rating for all teams who have played against or are a World Cup team
team_rating = {
    "India" : 266,
    "England" : 256,
    "Australia" : 255,
    "New Zealand" : 254,
    "South Africa" : 249,
    "Pakistan" : 249,
    "West Indies" : 245,
    "Sri Lanka" : 234,
    "Bangladesh" : 227,
    "Afghanistan" : 218,
    "Namibia" : 196,
    "Ireland" : 194,
    "Zimbabwe" : 193,
    "Scotland" : 190,
    "Netherlands" : 183,
    "Nepal" : 175,
    "United Arab Emirates" : 174,
    "Oman" : 154,
    "Papua New Guinea" : 143,
    "Canada" : 140,
    "Hong Kong" : 139,
    "Uganda" : 132,
    "United States of America" : 131,
    "Jersey" : 128,
    "Malaysia" : 125,
    "Kuwait" : 118,
    "Bahrain" : 115,
    "Kenya" : 107,
    "Bermuda" : 107,
    "Saudi Arabia" : 104,
    "Italy" : 101,
    "Tanzania" : 93,
    "Germany" : 92,
    "Singapore" : 76,
    "Nigeria" : 75,
    "Cayman Islands" : 72,
    "Denmark" : 71,
    "Vanuatu" : 60,
    "Austria" : 58,
    "Botswana" : 53,
    "Japan" : 49,
    "Philippines" : 41,
    "Argentina" : 39,
    "Malawi" : 36,
    "Mozambique" : 36,
    "Rwanda" : 32,
    "Ghana" : 31,
    "Bahamas" : 24,
    "Belize" : 22,
    "Panama" : 19,
    "Lesotho" : 9,
    "Maldives" : 3,
    "Thailand" : 0,
    "Swaziland" : 0,
    "Seychelles" : 0,
    "Mongolia" : 0,
}

In [152]:
# TODO: MAKE THIS

tiered_team_rating = {
    "India" : 266,
    "England" : 256,
    "Australia" : 255,
    "New Zealand" : 254,
    "South Africa" : 249,
    "Pakistan" : 249,
    "West Indies" : 245,
    "Sri Lanka" : 234,
    "Bangladesh" : 227,
    "Afghanistan" : 218,
    "Namibia" : 196,
    "Ireland" : 194,
    "Zimbabwe" : 193,
    "Scotland" : 190,
    "Netherlands" : 183,
    "Nepal" : 175,
    "United Arab Emirates" : 174,
    "Oman" : 154,
    "Papua New Guinea" : 143,
    "Canada" : 140,
    "Hong Kong" : 139,
    "Uganda" : 132,
    "United States of America" : 131,
    "Jersey" : 128,
    "Malaysia" : 125,
    "Kuwait" : 118,
    "Bahrain" : 115,
    "Kenya" : 107,
    "Bermuda" : 107,
    "Saudi Arabia" : 104,
    "Italy" : 101,
    "Tanzania" : 93,
    "Germany" : 92,
    "Singapore" : 76,
    "Nigeria" : 75,
    "Cayman Islands" : 72,
    "Denmark" : 71,
    "Vanuatu" : 60,
    "Austria" : 58,
    "Botswana" : 53,
    "Japan" : 49,
    "Philippines" : 41,
    "Argentina" : 39,
    "Malawi" : 36,
    "Mozambique" : 36,
    "Rwanda" : 32,
    "Ghana" : 31,
    "Bahamas" : 24,
    "Belize" : 22,
    "Panama" : 19,
    "Lesotho" : 9,
    "Maldives" : 3,
    "Thailand" : 0,
    "Swaziland" : 0,
    "Seychelles" : 0,
    "Mongolia" : 0,
}

### Calculate Stats

In [153]:
%%capture
# Calculate adjusted runs scored
team_ratings_mapped = wc20['bowling_team'].map(team_rating)
wc20['adjusted_team_ratings'] = team_ratings_mapped
wc20['adj_runs_scored'] = wc20['runs_off_bat'] * (wc20['adjusted_team_ratings'] / 266)

# Calculate adjusted conceded runs
team_ratings_mapped2 = wc20['batting_team'].map(team_rating)
wc20['adjusted_team_ratings2'] = team_ratings_mapped2 
wc20.loc[wc20['adjusted_team_ratings2'] < 50, 'adjusted_team_ratings2'] = 50
wc20['adj_conceded_runs'] = wc20['runs_off_bat'] * ( 266 / wc20['adjusted_team_ratings2'])

In [154]:
# Calculate adj runs per bowl for strikers (extras not included)
# Extras included in conceded runs for bowlers

df = (wc20
       .loc[:, ['striker', 'adj_runs_scored', 'batting_team']]
       .groupby(['striker', 'batting_team'], as_index = False)
       .sum())
df_sorted = df.sort_values(by = 'adj_runs_scored', ascending = False)
df_sorted.head(10)
 

Unnamed: 0,striker,batting_team,adj_runs_scored
757,Mohammad Rizwan,Pakistan,2522.421053
194,Babar Azam,Pakistan,2069.703008
1019,SA Yadav,India,1935.890977
511,JC Buttler,England,1561.943609
376,GD Phillips,New Zealand,1505.657895
795,N Pooran,West Indies,1449.68797
301,DJ Malan,England,1345.139098
1197,V Kohli,India,1261.804511
314,DP Conway,New Zealand,1234.789474
936,RG Sharma,India,1226.492481


In [155]:
num_bowls = (wc20
       .loc[:, ['striker', 'bowling_team', 'batting_team']]
       .groupby(['striker', 'batting_team'], as_index = False)
       .count()
       .rename(columns = {'bowling_team' : 'n_bowls'})
        )
dfA = df_sorted.merge(num_bowls, on = ['striker', 'batting_team'])
dfA = dfA.sort_values(by = 'adj_runs_scored', ascending = False)
# print(dfA)

In [156]:
dfA['adj_runs_per_bowl'] = (dfA['adj_runs_scored'] / dfA['n_bowls'])
dfA = dfA.sort_values(by = 'adj_runs_per_bowl', ascending = False)
dfA.head(10)

Unnamed: 0,striker,batting_team,adj_runs_scored,n_bowls,adj_runs_per_bowl
670,Shahnawaz Dhani,Pakistan,16.0,6,2.666667
903,Umran Malik,India,4.699248,2,2.349624
909,Mukesh Kumar,India,4.605263,2,2.302632
652,Rakibul Hasan,Bangladesh,17.744361,8,2.218045
739,O Muzondo,Zimbabwe,11.052632,5,2.210526
792,AM Fernando,Sri Lanka,8.533835,4,2.133459
1027,Aziz Sualley,Ghana,1.984962,1,1.984962
1034,CB Sole,Scotland,1.909774,1,1.909774
712,SC Kuggeleijn,New Zealand,13.105263,7,1.87218
814,Aminul Islam Biplob,Bangladesh,7.488722,4,1.87218


In [157]:
# Calculate adj runs conceded per bowl for bowlers (extras included)
df = (wc20
       .loc[:, ['bowler', 'bowling_team', 'adj_conceded_runs', 'extras']]
       .groupby(['bowler', 'bowling_team'], as_index = False)
       .sum())
df_sorted = df.sort_values(by='adj_conceded_runs', ascending=False)
df_sorted.head(10)

Unnamed: 0,bowler,bowling_team,adj_conceded_runs,extras
282,H Ssenyondo,Uganda,3826.84158,42
204,DM Nakrani,Uganda,2247.115561,51
503,MR Adair,Ireland,2228.86199,140
128,Bilal Hassun,Uganda,2152.425148,119
298,Haris Rauf,Pakistan,2105.100956,114
317,IS Sodhi,New Zealand,2067.358027,86
231,F Nsubuga,Uganda,2011.768068,24
339,J Little,Ireland,1953.855053,115
41,AR Ramjani,Uganda,1884.615645,27
403,K Waiswa,Uganda,1851.712674,53


In [158]:
num_bowls = (wc20
       .loc[:, ['bowler', 'bowling_team', 'batting_team']]
       .groupby(['bowler', 'bowling_team'], as_index = False)
       .count()
       .rename(columns = {'batting_team' : 'n_bowls'})
        )
dfB = df_sorted.merge(num_bowls, on = ['bowler', 'bowling_team'])
dfB = dfB.sort_values(by = 'adj_conceded_runs', ascending = False)
# print(dfB)

In [159]:
dfB['adj_runs_conceded_per_bowl'] = ((dfB['adj_conceded_runs'] + dfB['extras']) / dfB['n_bowls'])
dfB = dfB.sort_values(by = 'adj_runs_conceded_per_bowl', ascending = False)
dfB.head(10)

Unnamed: 0,bowler,bowling_team,adj_conceded_runs,extras,n_bowls,adj_runs_conceded_per_bowl
799,KK Tillett,Belize,24.366412,5,3,9.788804
505,E Frosler,South Africa,95.76,0,12,7.98
535,Gurdeep Singh,Kenya,82.621212,0,12,6.885101
716,Yusuf Ebrahim,Panama,39.9,0,6,6.65
548,NS Dhaliwal,Canada,79.8,2,13,6.292308
312,TO Carmichael,United States of America,223.44,1,37,6.065946
717,B Terbish,Mongolia,39.52,1,7,5.788571
490,L Botha,South Africa,101.08,0,18,5.615556
745,Umair Tariq,Austria,33.6,0,6,5.6
699,L Ntanzi,South Africa,42.56,2,8,5.57


In [160]:
players = pd.read_csv('players.csv')

# Add runs per bowl values to players.csv 
# Only RPB values for hitters and all_rounders will be used

players = players.merge(dfA[['striker', 'adj_runs_per_bowl']], how='left', left_on='name',right_on='striker')
players = players.drop(columns = ['striker'])

# Add runs conceded per bowl values to players.csv 
# Only RCPB values for bowlers and all_rounders will be used

players = players.merge(dfB[['bowler', 'adj_runs_conceded_per_bowl']], how='left', left_on='name',right_on='bowler')
players = players.drop(columns = ['bowler'])

players.head(7)

Unnamed: 0,name,position,country,adj_runs_per_bowl,adj_runs_conceded_per_bowl
0,Pargat Singh,hitter,Canada,0.379431,3.295069
1,NR Kirton,hitter,Canada,0.471596,4.203396
2,NS Dhaliwal,hitter,Canada,0.376332,6.292308
3,Saad Bin Zafar,all_rounder,Canada,0.613306,2.749941
4,N Dutta,bowler,Canada,0.6944,2.299036
5,Kaleem Sana,bowler,Canada,0.132946,2.123171
6,JOA Gordon,bowler,Canada,0.201128,2.912078


### Simulate a Match

In [161]:
bowlers = players[((players['position'] == 'bowler') | (players['position'] == 'all_rounder'))]
hitters = players[((players['position'] == 'hitter') | (players['position'] == 'all_rounder'))]

# Remove players for which there is no data
bowlers = bowlers.dropna(subset=['adj_runs_conceded_per_bowl'])
hitters = hitters.dropna(subset=['adj_runs_per_bowl'])

# Remove invalid rows from bowlers df
bowlers = bowlers.drop([74,76,77])

# We have 80 bowlers and 80 hitters,
# with all-rounders listed in both dataframes.
print(bowlers.shape)
print(hitters.shape)

bowlers.head(8)

(80, 5)
(80, 5)


Unnamed: 0,name,position,country,adj_runs_per_bowl,adj_runs_conceded_per_bowl
3,Saad Bin Zafar,all_rounder,Canada,0.613306,2.749941
4,N Dutta,bowler,Canada,0.6944,2.299036
5,Kaleem Sana,bowler,Canada,0.132946,2.123171
6,JOA Gordon,bowler,Canada,0.201128,2.912078
10,NK Patel,all_rounder,United States of America,0.725881,2.212665
11,SN Netravalkar,bowler,United States of America,0.470462,2.001088
12,Ali Khan,bowler,United States of America,0.0,2.100031
13,NP Kenjige,bowler,United States of America,,1.6625


In [162]:
# How we determine which score to use determines our results! 
# The distribution of RPB and RPCB could be decided on a per-match basis 
# if there is enough data for a player

def get_runs(rpb, rcpb):
    return rand.uniform(rpb,rcpb)

In [163]:
def run_match(country1, country2, print_score=False, update_nrr = True):
    global cup

    # make a vector for each team's batters/bowlers 
    bowlers1 = bowlers.loc[bowlers['country'] == country1]
    bowlers1 = bowlers1['name'].to_numpy()

    bowlers2 = bowlers.loc[bowlers['country'] == country2]
    bowlers2 = bowlers2['name'].to_numpy()

    hitters1 = hitters.loc[hitters['country'] == country1]
    hitters1 = hitters1['name'].to_numpy()

    hitters2 = hitters.loc[hitters['country'] == country2]
    hitters2 = hitters2['name'].to_numpy()

    arr_list = [bowlers1, bowlers2, hitters1, hitters2]

    # shuffle order of bowlers and hitters for added variability
    for arr in arr_list:
        rand.shuffle(arr)

    # base scores
    country1_score = 0.0
    country2_score = 0.0 
    
    # 2 x 4 hitters x 4 bowlers x 6 bowls
    # 192 total bowls (except for US)

    #first team score:
    for hitter in hitters1:
        for bowler in bowlers2:
            for x in range(6):
                rpb = (players.loc[players['name'] == hitter, 'adj_runs_per_bowl'].values)[0]
                rcpb = (players.loc[players['name'] == bowler, 'adj_runs_conceded_per_bowl'].values)[0]
                country1_score += get_runs(rpb, rcpb)
              
    
    #second team score:
    for hitter in hitters2:
        for bowler in bowlers1:
            for x in range(6):
                rpb = (players.loc[players['name'] == hitter, 'adj_runs_per_bowl'].values)[0]
                rcpb = (players.loc[players['name'] == bowler, 'adj_runs_conceded_per_bowl'].values)[0]
                country2_score += get_runs(rpb, rcpb)

    if(print_score):
        print(country1, "score:", country1_score)
        print(country2, "score:", country2_score)

    # In reality, all scores will be whole numbers, and the number of overs will vary.
    # For the purposes of calculating NRR in this simulation, we will not round scores
    # and treat all teams as bowling 16 overs in each match. In that way, for every team:
    # Total Overs Faced = Total Overs Bowled = 16 * Number of Matches Played 
    # (Since the number of overs will always be 16, this speeds up the calculation!)
        
    if(update_nrr):
        # NRR = (Total Runs Scored ÷ Total Overs Faced) – (Total Runs Conceded ÷ Total Overs Bowled)
        # For country1
        cup.loc[cup['Country'] == country1, 'total_runs_scored'] += country1_score
        cup.loc[cup['Country'] == country1, 'total_runs_conceded'] += country2_score
        cup.loc[cup['Country'] == country1, 'num_matches_played'] += 1
        cup.loc[cup['Country'] == country1, 'NRR'] = ((cup.loc[cup['Country'] == country1, 'total_runs_scored'].values[0] -
                                                                         cup.loc[cup['Country'] == country1, 'total_runs_conceded'].values[0]) / 
                                                                         (16*cup.loc[cup['Country'] == country1, 'num_matches_played'].values[0]))
        # For country2
        cup.loc[cup['Country'] == country2, 'total_runs_scored'] += country2_score
        cup.loc[cup['Country'] == country2, 'total_runs_conceded'] += country1_score
        cup.loc[cup['Country'] == country2, 'num_matches_played'] += 1
        cup.loc[cup['Country'] == country2, 'NRR'] = ((cup.loc[cup['Country'] == country2, 'total_runs_scored'].values[0] -
                                                                         cup.loc[cup['Country'] == country2, 'total_runs_conceded'].values[0]) / 
                                                                         (16*cup.loc[cup['Country'] == country2, 'num_matches_played'].values[0]))

    if country1_score > country2_score:
        return country1
    elif country2_score > country1_score:
        return country2
    else:
        return 'tie'

In [164]:
# Change these to try a different matchup!
teamA = 'Australia'
teamB = 'New Zealand'

# One match simulated with each function call
# Add 'True' to run_match to print scores
winner = run_match(teamA, teamB, True, False)
print("Winner:", winner)

Australia score: 127.6511651330198
New Zealand score: 127.81409202132103
Winner: New Zealand


In [165]:
# Change num_iterations and run this cell to simulate 
# many matches for the pair given in the previous cell

num_iterations = 100

A_victories = 0
for i in range(num_iterations):
    if (run_match(teamA, teamB, False, False) == teamA):
        A_victories += 1
print("Proportion that", teamA, "won:", A_victories/num_iterations)

Proportion that Australia won: 0.92


### Simulate the World Cup

In [166]:
# placements = ["Group stage", "Super 8 stage", "Semifinalist", "Finalist", "Champion"]

cup = pd.DataFrame({'Country': teams_list, 'Group': 5*['A']+5*['B']+5*['C']+5*['D']})
cup['id'] = cup.index

A_matches= tuple(combinations(cup['Country'].loc[cup['Group'] == 'A'], 2))
B_matches = tuple(combinations(cup['Country'].loc[cup['Group'] == 'B'], 2))
C_matches = tuple(combinations(cup['Country'].loc[cup['Group'] == 'C'], 2))
D_matches = tuple(combinations(cup['Country'].loc[cup['Group'] == 'D'], 2))

In [167]:
def round_robin(group_list, match_record):
    global cup
    for group in group_list:
        for pairing in group:
            team1 = pairing[0]
            team2 = pairing[1]
            winner = run_match(team1,team2)

            # 2 points for win, 1 for tie
            if (winner == team1):
                cup.loc[cup['Country'] == team1, 'Points'] += 2
                match_record.append([team1, team2, 'w'])
            elif (winner == team2):
                cup.loc[cup['Country'] == team2, 'Points'] += 2
                match_record.append([team2, team1, 'w'])
            else: # if tie, but this is very rare in practice
                cup.loc[cup['Country'] == team1, 'Points'] += 1
                cup.loc[cup['Country'] == team2, 'Points'] += 1
                match_record.append([team1, team2, 't'])
    return match_record

In [168]:
"""
In the Super Eight round, Teams seeded first and second in their groups in the 
first round will retain that seeding in the Super Eight, provided they qualify.
(https://www.icc-cricket.com/news/fixtures-revealed-for-historic-icc-men-s-t20-world-cup-2024-in-west-indies-and-the-usa)

For simplicity, all teams' Super 8 placements will be determined 
by their pre-tournament seeds in the cup simulation.
"""

def world_cup(match_record=[]):
    global cup

    # Reset dataframe values
    cup = cup.assign(Points=0)
    cup = cup.assign(Result="")

    cup = cup.assign(total_runs_scored=0.0)
    cup = cup.assign(total_runs_conceded=0.0)
    cup = cup.assign(num_matches_played=0)
    cup = cup.assign(NRR=0.0)

    for i in range(0,5):
        cup.loc[[i], 'Group'] = 'A'
    for i in range(5,10):
        cup.loc[[i], 'Group'] = 'B'
    for i in range(10,15):
        cup.loc[[i], 'Group'] = 'C'
    for i in range(15,20):
        cup.loc[[i], 'Group'] = 'D'


    """ GROUP STAGE """
    # Group stage is round robin
    round_robin([A_matches, B_matches, C_matches, D_matches], match_record)
               
    A_top = (cup['Points'].loc[cup['Group'] == 'A'].nlargest(2, keep='all').sort_index())
    B_top = (cup['Points'].loc[cup['Group'] == 'B'].nlargest(2, keep='all').sort_index())
    C_top = (cup['Points'].loc[cup['Group'] == 'C'].nlargest(2, keep='all').sort_index())
    D_top = (cup['Points'].loc[cup['Group'] == 'D'].nlargest(2, keep='all').sort_index())

    A1 = cup['Country'].loc[[A_top.index[0]]].values[0]
    B1 = cup['Country'].loc[[B_top.index[0]]].values[0]
    C1 = cup['Country'].loc[[C_top.index[0]]].values[0]
    D1 = cup['Country'].loc[[D_top.index[0]]].values[0]
    A2 = cup['Country'].loc[[A_top.index[1]]].values[0]
    B2 = cup['Country'].loc[[B_top.index[1]]].values[0]
    C2 = cup['Country'].loc[[C_top.index[1]]].values[0]
    D2 = cup['Country'].loc[[D_top.index[1]]].values[0]

    S8_G1 = [A1,B2,C1,D2]
    S8_G2 = [A2,B1,C2,D1]

    for team in S8_G1:
        cup.loc[cup['Country'] == team, 'Group'] = 'S8_G1'
    for team in S8_G2:
        cup.loc[cup['Country'] == team, 'Group'] = 'S8_G2'

    s8_in = set(np.concatenate((A_top.index,B_top.index,C_top.index,D_top.index)))
    all = set(range(0, 20))
    out = all.symmetric_difference(s8_in)

    for i in out:
        cup.loc[[i], 'Result'] = "Group stage"

    """ SUPER 8 STAGE """
    G1_matches = tuple(combinations(S8_G1, 2))
    G2_matches = tuple(combinations(S8_G2, 2))
    round_robin([G1_matches, G2_matches], match_record)

    # Sorted by points here (not index)
    G1_largest = (cup['Points'].loc[cup['Group'] == 'S8_G1'].nlargest(2, keep='all'))
    G2_largest = (cup['Points'].loc[cup['Group'] == 'S8_G2'].nlargest(2, keep='all'))

    # NRR comes into effect here to break ties
    # Could also use (and keep track) of head-to-head result
    # The rules are unclear whether or not head-to-head would be used over NRR 
    # when there is a tie between two teams.
    # For now, we will use NRR, as it could apply to ties of any size.

    G1_top = cup.loc[cup['id'].isin(G1_largest.index.values), ['Country', 'Points', 'NRR']]
    G2_top = cup.loc[cup['id'].isin(G2_largest.index.values), ['Country', 'Points', 'NRR']]
    G1_top = G1_top.sort_values(by=['Points', 'NRR'], ascending=[False, False], kind='stable')
    G2_top = G2_top.sort_values(by=['Points', 'NRR'], ascending=[False, False], kind='stable')

    g1winner = cup['Country'].loc[[G1_top.index[0]]].values[0]
    g1runner = cup['Country'].loc[[G1_top.index[1]]].values[0]
    g2winner = cup['Country'].loc[[G2_top.index[0]]].values[0]
    g2runner = cup['Country'].loc[[G2_top.index[1]]].values[0]

    SF1 = [g1winner, g2runner]
    SF2 = [g2winner, g1runner]

    sf_in = set(np.concatenate((G1_top.index.values[0:2],G2_top.index.values[0:2])))
    out = s8_in.symmetric_difference(sf_in)

    for i in out:
        cup.loc[[i], 'Result'] = "Super 8 stage"

    """ KNOCKOUT STAGE """
    # No ties - this should not be an issue
    # For reference, points are not being awarded at this stage
    SF1_winner = run_match(SF1[0], SF1[1])
    if (SF1_winner == SF1[0]):
        cup.loc[cup['Country'] == SF1[1], 'Result'] = "Semifinalist"
        match_record.append([cup.loc[cup['Country'] == SF1[0], 'Country'].values[0], 
                                      cup.loc[cup['Country'] == SF1[1], 'Country'].values[0], 'w'])
    else:
        cup.loc[cup['Country'] == SF1[0], 'Result'] = "Semifinalist"
        match_record.append([cup.loc[cup['Country'] == SF1[1], 'Country'].values[0], 
                                      cup.loc[cup['Country'] == SF1[0], 'Country'].values[0], 'w'])

    SF2_winner = run_match(SF2[0], SF2[1])
    if (SF2_winner == SF2[0]):
        cup.loc[cup['Country'] == SF2[1], 'Result'] = "Semifinalist"
        match_record.append([cup.loc[cup['Country'] == SF2[0], 'Country'].values[0], 
                                      cup.loc[cup['Country'] == SF2[1], 'Country'].values[0], 'w'])
    else:
        cup.loc[cup['Country'] == SF2[0], 'Result'] = "Semifinalist"
        match_record.append([cup.loc[cup['Country'] == SF2[1], 'Country'].values[0], 
                                      cup.loc[cup['Country'] == SF2[0], 'Country'].values[0], 'w'])

    cup_winner = run_match(SF1_winner, SF2_winner)
    if (cup_winner == SF1_winner):
        cup.loc[cup['Country'] == SF1_winner, 'Result'] = "Champion"
        cup.loc[cup['Country'] == SF2_winner, 'Result'] = "Finalist"
        match_record.append([cup.loc[cup['Country'] == SF1_winner, 'Country'].values[0], 
                                      cup.loc[cup['Country'] == SF2_winner, 'Country'].values[0], 'w'])
    else:
        cup.loc[cup['Country'] == SF2_winner, 'Result'] = "Champion"
        cup.loc[cup['Country'] == SF1_winner, 'Result'] = "Finalist"
        match_record.append([cup.loc[cup['Country'] == SF2_winner, 'Country'].values[0], 
                                      cup.loc[cup['Country'] == SF1_winner, 'Country'].values[0], 'w'])

    return cup, match_record

In [169]:
# Run to print the results of a single simulation
# One simulation should take about 3 seconds
sim_result = world_cup()[0].drop(columns = ['id'])
sim_result

Unnamed: 0,Country,Group,Points,Result,total_runs_scored,total_runs_conceded,num_matches_played,NRR
0,India,S8_G1,14,Champion,1255.324071,1014.23053,9,1.674261
1,Pakistan,S8_G2,8,Super 8 stage,963.580482,811.22205,7,1.360343
2,Ireland,A,4,Group stage,518.645218,474.84069,4,0.684446
3,Canada,A,0,Group stage,393.494905,685.135063,4,-4.556877
4,United States of America,A,2,Group stage,442.389516,564.810412,4,-1.912826
5,England,S8_G2,12,Semifinalist,1144.032798,965.736804,8,1.392937
6,Australia,S8_G1,12,Finalist,1242.942685,1073.103199,9,1.179441
7,Namibia,B,2,Group stage,459.386395,544.189759,4,-1.325053
8,Scotland,B,4,Group stage,473.767106,547.6747,4,-1.154806
9,Oman,B,0,Group stage,439.583724,590.215131,4,-2.353616


In [170]:
# Two-dimensional numpy array of simulation results 
# with each row as a simulation, each column as country, entry as placement 
# team index is same as in teams_list, starting with India as 0

# TODO: test with tiers

n_simulations = 10  # Change this!
match_records = pd.DataFrame()  # Will be reset every time this cell is run
simulation_results = np.empty((n_simulations, 20), dtype=object)
for i in range(0, n_simulations):
    match_records = pd.concat([match_records, pd.DataFrame(world_cup([])[1])], axis=0)
    placements = pd.Series(cup['Result'])
    simulation_results[i] = placements.to_numpy()

### Analyze Results

In [174]:
# Ex: Print results for all countries
for i in range(0,20):
    print(teams_list[i], "results:", dict(Counter(simulation_results[:,i])))

India results: {'Champion': 10}
Pakistan results: {'Super 8 stage': 10}
Ireland results: {'Group stage': 10}
Canada results: {'Group stage': 10}
United States of America results: {'Group stage': 10}
England results: {'Finalist': 7, 'Semifinalist': 3}
Australia results: {'Super 8 stage': 6, 'Finalist': 3, 'Semifinalist': 1}
Namibia results: {'Group stage': 10}
Scotland results: {'Group stage': 10}
Oman results: {'Group stage': 10}
New Zealand results: {'Semifinalist': 6, 'Super 8 stage': 4}
West Indies results: {'Semifinalist': 10}
Afghanistan results: {'Group stage': 10}
Uganda results: {'Group stage': 10}
Papua New Guinea results: {'Group stage': 10}
South Africa results: {'Super 8 stage': 10}
Sri Lanka results: {'Group stage': 8, 'Super 8 stage': 2}
Bangladesh results: {'Super 8 stage': 8, 'Group stage': 2}
Netherlands results: {'Group stage': 10}
Nepal results: {'Group stage': 10}


In [172]:
# Display all match results (win/tie) from simulation set.
# This way, we can easily calculate the proportion of matches 
# won by one team versus another across the whole simulation set.
# The outcome column is included only in the rare case a tie occurs

match_records = match_records.rename({0:'winner', 1:'loser', 2:'outcome'}, axis=1)
match_records.head(55)

Unnamed: 0,winner,loser,outcome
0,India,Pakistan,w
1,India,Ireland,w
2,India,Canada,w
3,India,United States of America,w
4,Pakistan,Ireland,w
5,Pakistan,Canada,w
6,Pakistan,United States of America,w
7,Ireland,Canada,w
8,Ireland,United States of America,w
9,United States of America,Canada,w


In [173]:
# TODO: Visualize results based on simulation_results array and match_records dataframe