### Setup and Preprocessing

In [141]:
import pandas as pd
import numpy as np
import random as rand
from itertools import combinations
from collections import Counter

In [142]:
# Read in merged CSV file
merge = pd.read_csv('merged_files.csv')
print(merge.columns)
print(merge['batting_team'].unique())
merge.head()

Index(['Unnamed: 0', 'match_id', 'season', 'start_date', 'venue', 'innings',
       'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker',
       'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes',
       'legbyes', 'penalty', 'wicket_type', 'player_dismissed'],
      dtype='object')
['England' 'Australia' 'New Zealand' 'South Africa' 'Pakistan' 'Sri Lanka'
 'West Indies' 'India' 'Kenya' 'Scotland' 'Zimbabwe' 'Bangladesh'
 'Bermuda' 'Netherlands' 'Ireland' 'Afghanistan' 'Canada' 'Nepal'
 'Hong Kong' 'United Arab Emirates' 'Papua New Guinea' 'Oman'
 'ICC World XI' 'Philippines' 'Vanuatu' 'United States of America'
 'Germany' 'Italy' 'Ghana' 'Namibia' 'Uganda' 'Botswana' 'Nigeria'
 'Guernsey' 'Denmark' 'Norway' 'Jersey' 'Thailand' 'Malaysia' 'Maldives'
 'Singapore' 'Qatar' 'Kuwait' 'Cayman Islands' 'Portugal' 'Spain'
 'Gibraltar' 'Bhutan' 'Saudi Arabia' 'Bahrain' 'Iran' 'Belgium'
 'Luxembourg' 'Czech Republic' 'Isle of Man' 'Bulgaria' 'Romania'
 'Austria' 'Greece' 

Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,...,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed
0,0,211028,2005,2005-06-13,The Rose Bowl,1,0.1,England,Australia,ME Trescothick,...,B Lee,0,0,,,,,,,
1,1,211028,2005,2005-06-13,The Rose Bowl,1,0.2,England,Australia,ME Trescothick,...,B Lee,1,0,,,,,,,
2,2,211028,2005,2005-06-13,The Rose Bowl,1,0.3,England,Australia,GO Jones,...,B Lee,0,0,,,,,,,
3,3,211028,2005,2005-06-13,The Rose Bowl,1,0.4,England,Australia,GO Jones,...,B Lee,0,0,,,,,,,
4,4,211028,2005,2005-06-13,The Rose Bowl,1,0.5,England,Australia,GO Jones,...,B Lee,0,0,,,,,,,


In [143]:
# We will use only data from the last 4 years (2020-2024)
merge[['start-year', 'start-day', 'start-month']] = merge['start_date'].str.split('-', expand=True)
recent = merge.loc[merge['start-year'].astype(int) >= 2020]
merge = merge.drop(['start-year', 'start-day', 'start-month'], axis=1)
print(recent.shape)
recent.head()

(285767, 24)


Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,...,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,start-year,start-day,start-month
169922,0,1185313,2019/20,2020-02-12,Buffalo Park,1,0.1,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169923,1,1185313,2019/20,2020-02-12,Buffalo Park,1,0.2,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169924,2,1185313,2019/20,2020-02-12,Buffalo Park,1,0.3,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169925,3,1185313,2019/20,2020-02-12,Buffalo Park,1,0.4,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169926,4,1185313,2019/20,2020-02-12,Buffalo Park,1,0.5,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12


In [144]:
# Create dataframe of matches only from teams participating in the 2024 World Cup
# teams are organized in their groups for the first round, ordered by pre-tournament seed within groups
# Ex: [A1, A2, ..., D4, D5]
with open('teams.txt') as file:
    teams_list = file.read().splitlines()
wc20=recent.loc[(recent['bowling_team'].isin(teams_list)) | (recent['batting_team'].isin(teams_list))]
print(wc20.shape)
wc20.head()

(147566, 24)


Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,...,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,start-year,start-day,start-month
169922,0,1185313,2019/20,2020-02-12,Buffalo Park,1,0.1,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169923,1,1185313,2019/20,2020-02-12,Buffalo Park,1,0.2,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169924,2,1185313,2019/20,2020-02-12,Buffalo Park,1,0.3,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169925,3,1185313,2019/20,2020-02-12,Buffalo Park,1,0.4,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169926,4,1185313,2019/20,2020-02-12,Buffalo Park,1,0.5,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12


In [145]:
%%capture
# How many unique teams are there in wc20 competition with World Cup teams?
unique_teams = pd.unique(wc20[['batting_team', 'bowling_team']].values.ravel('K'))
for team in unique_teams:
    print(f"\"{team}\"",":",",")

In [146]:
#ICC T20 rating for all teams who have played against or are a World Cup team
team_rating = {
    "South Africa" : 249,
    "England" : 256,
    "Australia" : 255,
    "New Zealand" : 254,
    "India" : 266,
    "Pakistan" : 249,
    "Sri Lanka" : 234,
    "Ireland" : 194,
    "West Indies" : 245,
    "Bangladesh" : 227,
    "Zimbabwe" : 193,
    "Maldives" : 3,
    "Oman" : 154,
    "Afghanistan" : 218,
    "Hong Kong" : 139,
    "Nepal" : 175,
    "Thailand" : 0,
    "Uganda" : 132,
    "Namibia" : 196,
    "Netherlands" : 183,
    "Malaysia" : 125,
    "Scotland" : 190,
    "Papua New Guinea" : 143,
    "Nigeria" : 75,
    "Kenya" : 107,
    "United Arab Emirates" : 174,
    "Malawi" : 36,
    "Rwanda" : 32,
    "Lesotho" : 9,
    "Swaziland" : 0,
    "Ghana" : 31,
    "Seychelles" : 0,
    "Belize" : 22,
    "United States of America" : 131,
    "Canada" : 140,
    "Bermuda" : 107,
    "Argentina" : 39,
    "Bahamas" : 24,
    "Panama" : 19,
    "Tanzania" : 93,
    "Philippines" : 41,
    "Bahrain" : 115,
    "Germany" : 92,
    "Jersey" : 128,
    "Singapore" : 76,
    "Botswana" : 53,
    "Mozambique" : 36,
    "Saudi Arabia" : 104,
    "Italy" : 101,
    "Denmark" : 71,
    "Austria" : 58,
    "Vanuatu" : 60,
    "Japan" : 49,
    "Kuwait" : 118,
    "Mongolia" : 0, 
    "Cayman Islands" : 72,
}

In [147]:
%%capture
# Store level of each team in dictionary
# Based on ICC T20 rankings
# Not used for now
"""
tier_list = {
    "India" : 5,
    "England" : 5,
    "Australia" : 5,
    "New Zealand" : 5,
    "Pakistan" : 4,
    "South Africa" : 4,
    "West Indies" : 3,
    "Sri Lanka" : 3,
    "Bangladesh" : 3,
    "Afghanistan" : 3,
    "Namibia" : 2,
    "Ireland" : 2,
    "Scotland" : 2,
    "Netherlands" : 2,
    "Nepal" : 1,
    "Oman" : 1,
    "Papua New Guinea" : 1,
    "Canada" : 0,
    "Uganda" : 0,
    "United States of America" : 0
}
"""

### Calculate Stats

In [148]:
# Calculate adjusted runs scored
# wc20['adj_runs_scored'] = wc20['runs_off_bat'] * (wc20['bowling_team'].map(team_rating) / 266)

# # Calculate adjusted conceded runs
# cond = wc20['batting_team'].map(team_rating) < 50
# wc20['adj_conceded_runs'] = wc20['runs_off_bat'] * (cond * 50 + ~cond * 266 / wc20['batting_team'].map(team_rating))

In [149]:
# TODO: make sure this cell is correct
team_ratings_mapped = wc20['bowling_team'].map(team_rating)
wc20['adjusted_team_ratings'] = team_ratings_mapped # trm2?
wc20['adj_runs_scored'] = wc20['runs_off_bat'] * (wc20['adjusted_team_ratings'] / 266)

team_ratings_mapped2 = wc20['batting_team'].map(team_rating)
wc20['adjusted_team_ratings2'] = team_ratings_mapped2 
wc20.loc[wc20['adjusted_team_ratings2'] < 50, 'adjusted_team_ratings2'] = 50
wc20['adj_conceded_runs'] = wc20['runs_off_bat'] * ( 266 / wc20['adjusted_team_ratings2'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wc20['adjusted_team_ratings'] = team_ratings_mapped # trm2?
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wc20['adj_runs_scored'] = wc20['runs_off_bat'] * (wc20['adjusted_team_ratings'] / 266)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wc20['adjusted_team_ratings2'] = team_ratings_mapped2
A va

In [150]:
#checks out
"""
df = (wc20
       .loc[:, ['striker', 'runs_off_bat', 'batting_team', 'extras', 'wides', 'noballs','byes','legbyes']]
       .groupby(['striker', 'batting_team'], as_index = False)
       .sum())
df_sorted = df.sort_values(by='runs_off_bat', ascending=False)
"""
df = (wc20
       .loc[:, ['striker', 'adj_runs_scored', 'batting_team']]
       .groupby(['striker', 'batting_team'], as_index = False)
       .sum())
df_sorted = df.sort_values(by='adj_runs_scored', ascending=False)
print(df_sorted.head())
 

             striker batting_team  adj_runs_scored
730  Mohammad Rizwan     Pakistan      2522.421053
186       Babar Azam     Pakistan      2069.703008
979         SA Yadav        India      1935.890977
491       JC Buttler      England      1561.943609
365      GD Phillips  New Zealand      1505.657895


In [151]:
num_bowls = (wc20
       .loc[:, ['striker', 'bowling_team', 'batting_team']]
       .groupby(['striker', 'batting_team'], as_index = False)
       .count()
       .rename(columns = {'bowling_team' : 'n_bowls'})
        )
dfA = df_sorted.merge(num_bowls, on = ['striker', 'batting_team'])
# dfA = dfA.sort_values(by = 'runs_off_bat', ascending = False)
dfA = dfA.sort_values(by = 'adj_runs_scored', ascending = False)
print(dfA)

              striker batting_team  adj_runs_scored  n_bowls
0     Mohammad Rizwan     Pakistan      2522.421053     2220
1          Babar Azam     Pakistan      2069.703008     1808
2            SA Yadav        India      1935.890977     1296
3          JC Buttler      England      1561.943609     1143
4         GD Phillips  New Zealand      1505.657895     1155
...               ...          ...              ...      ...
1151    Rizwan Haider     Malaysia         0.000000        1
1150    Garret Banner       Belize         0.000000        3
1149    Muhammad Khan    Hong Kong         0.000000       12
1148        H Fennell    Argentina         0.000000        1
1218         R Mondol   Bangladesh         0.000000        7

[1219 rows x 4 columns]


In [152]:
# Calculate runs per bowl for strikers (extras not included)
# probably better to include extras in conceded runs for bowlers
"""
dfA['runs_per_bowl'] = (dfA['runs_off_bat'] / dfA['n_bowls'])
dfA = dfA.sort_values(by = 'runs_per_bowl', ascending = False)
"""
dfA['adj_runs_per_bowl'] = (dfA['adj_runs_scored'] / dfA['n_bowls'])
dfA = dfA.sort_values(by = 'adj_runs_per_bowl', ascending = False)
dfA.head(10)

Unnamed: 0,striker,batting_team,adj_runs_scored,n_bowls,adj_runs_per_bowl
642,Shahnawaz Dhani,Pakistan,16.0,6,2.666667
872,Umran Malik,India,4.699248,2,2.349624
878,Mukesh Kumar,India,4.605263,2,2.302632
623,Rakibul Hasan,Bangladesh,17.744361,8,2.218045
770,AM Fernando,Sri Lanka,8.533835,4,2.133459
984,Aziz Sualley,Ghana,1.984962,1,1.984962
992,CB Sole,Scotland,1.909774,1,1.909774
791,Aminul Islam Biplob,Bangladesh,7.488722,4,1.87218
691,SC Kuggeleijn,New Zealand,13.105263,7,1.87218
909,W Barresi,Netherlands,3.684211,2,1.842105


In [153]:
# Calculate runs conceded per bowl for bowlers (extras included)
"""
df = (wc20
       .loc[:, ['bowler', 'bowling_team', 'runs_off_bat', 'extras']]
       .groupby(['bowler', 'bowling_team'], as_index = False)
       .sum())
df_sorted = df.sort_values(by='runs_off_bat', ascending=False)
"""
df = (wc20
       .loc[:, ['bowler', 'bowling_team', 'adj_conceded_runs', 'extras']]
       .groupby(['bowler', 'bowling_team'], as_index = False)
       .sum())
df_sorted = df.sort_values(by='adj_conceded_runs', ascending=False)
df_sorted.head()

Unnamed: 0,bowler,bowling_team,adj_conceded_runs,extras
273,H Ssenyondo,Uganda,3615.800538,42
481,MR Adair,Ireland,2127.586761,134
289,Haris Rauf,Pakistan,2105.100956,114
123,Bilal Hassun,Uganda,2085.062664,115
306,IS Sodhi,New Zealand,2067.358027,86


In [154]:
num_bowls = (wc20
       .loc[:, ['bowler', 'bowling_team', 'batting_team']]
       .groupby(['bowler', 'bowling_team'], as_index = False)
       .count()
       .rename(columns = {'batting_team' : 'n_bowls'})
        )
dfB = df_sorted.merge(num_bowls, on = ['bowler', 'bowling_team'])
# dfB = dfB.sort_values(by = 'runs_off_bat', ascending = False)
dfB = dfB.sort_values(by = 'adj_conceded_runs', ascending = False)
print(dfB)

                 bowler bowling_team  adj_conceded_runs  extras  n_bowls
0           H Ssenyondo       Uganda        3615.800538      42     1124
1              MR Adair      Ireland        2127.586761     134     1277
2            Haris Rauf     Pakistan        2105.100956     114     1483
3          Bilal Hassun       Uganda        2085.062664     115      691
4              IS Sodhi  New Zealand        2067.358027      86     1502
..                  ...          ...                ...     ...      ...
845  Mahmudul Hasan Joy   Bangladesh           5.000000       0        2
846         KNA Bandara    Sri Lanka           2.171429       0        6
847             SS Iyer        India           2.136546       0        2
848        Sandeep Goud      Bahamas           2.030534       0        1
849         JEA Doctora  Philippines           1.860140       1        3

[850 rows x 5 columns]


In [155]:
"""
dfB['runs_conceded_per_bowl'] = ((dfB['runs_off_bat'] + dfB['extras']) / dfB['n_bowls'])
dfB = dfB.sort_values(by = 'runs_conceded_per_bowl', ascending = False)
"""
dfB['adj_runs_conceded_per_bowl'] = ((dfB['adj_conceded_runs'] + dfB['extras']) / dfB['n_bowls'])
dfB = dfB.sort_values(by = 'adj_runs_conceded_per_bowl', ascending = False)
dfB.head(10)

Unnamed: 0,bowler,bowling_team,adj_conceded_runs,extras,n_bowls,adj_runs_conceded_per_bowl
762,KK Tillett,Belize,24.366412,5,3,9.788804
652,NR Kirton,Canada,47.233645,0,6,7.872274
512,Gurdeep Singh,Kenya,82.621212,0,12,6.885101
686,Yusuf Ebrahim,Panama,39.9,0,6,6.65
524,NS Dhaliwal,Canada,79.8,2,13,6.292308
305,TO Carmichael,United States of America,223.44,1,37,6.065946
687,B Terbish,Mongolia,39.52,1,7,5.788571
711,Umair Tariq,Austria,33.6,0,6,5.6
716,Sami Sohail,Malawi,32.242424,1,6,5.540404
557,H Patel,Canada,69.16,1,13,5.396923


In [156]:
players = pd.read_csv('players.csv')

# Add runs per bowl values to players.csv 
# Only RPB values for hitters and all_rounders will be used

# players = players.merge(dfA[['striker', 'runs_per_bowl']], how='left', left_on='name',right_on='striker')
players = players.merge(dfA[['striker', 'adj_runs_per_bowl']], how='left', left_on='name',right_on='striker')
players = players.drop(columns = ['striker'])

# Add runs conceded per bowl values to players.csv 
# Only RCPB values for bowlers and all_rounders will be used

# players = players.merge(dfB[['bowler', 'runs_conceded_per_bowl']], how='left', left_on='name',right_on='bowler')
players = players.merge(dfB[['bowler', 'adj_runs_conceded_per_bowl']], how='left', left_on='name',right_on='bowler')
players = players.drop(columns = ['bowler'])

players.head(7)

Unnamed: 0,name,position,country,adj_runs_per_bowl,adj_runs_conceded_per_bowl
0,Pargat Singh,hitter,Canada,0.381827,3.295069
1,NR Kirton,hitter,Canada,0.435024,7.872274
2,NS Dhaliwal,hitter,Canada,0.378526,6.292308
3,Saad Bin Zafar,all_rounder,Canada,0.571145,2.771515
4,N Dutta,bowler,Canada,0.6944,2.299036
5,Kaleem Sana,bowler,Canada,0.132946,2.123171
6,JOA Gordon,bowler,Canada,0.201128,2.912078


In [157]:
%%capture
# Potential issue for players listed under the same name:
# runs_off_bat (+ extras) grouped together for all players,
# but separated by country, resulting in faulty numbers

"""
khan = df2.loc[df2['striker']=='Shoaib Khan']
print(khan)
goud = df2.loc[df2['striker']=='Sandeep Goud']
print(goud)
goud = df2B.loc[df2B['bowler']=='Sandeep Goud']
print(goud)
"""

### Simulate a Match

In [158]:
# The total number of players in each dataframe should be 80
bowlers = players[((players['position'] == 'bowler') | (players['position'] == 'all_rounder'))]
hitters = players[((players['position'] == 'hitter') | (players['position'] == 'all_rounder'))]

"""
bowlers = bowlers.dropna(subset=['runs_conceded_per_bowl'])
hitters = hitters.dropna(subset=['runs_per_bowl'])
"""

# Remove players for which there is no data
bowlers = bowlers.dropna(subset=['adj_runs_conceded_per_bowl'])
hitters = hitters.dropna(subset=['adj_runs_per_bowl'])

# TODO: fix issue with bowlers. Rashid Khan from Afghanistan is in dataframe four times
# Likely due to issue described in previous cell
# Also there is one US player with no data currently
print(bowlers.shape)
print(hitters.shape)

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# bowlers

(79, 5)
(80, 5)


In [159]:
# How we determine which score to use determines our results! 
# The distribution of RPB and RPCB could be decided on a per-match basis 
# if there is enough data for a player

def get_runs(rpb, rcpb):
    return rand.uniform(rpb,rcpb)

In [160]:
def run_match(country1, country2, print_score=False, update_nrr = True):
    global cup

    # make a vector for each team's batters/bowlers 
    bowlers1 = bowlers.loc[bowlers['country'] == country1]
    bowlers1 = bowlers1['name'].to_numpy()

    bowlers2 = bowlers.loc[bowlers['country'] == country2]
    bowlers2 = bowlers2['name'].to_numpy()

    hitters1 = hitters.loc[hitters['country'] == country1]
    hitters1 = hitters1['name'].to_numpy()

    hitters2 = hitters.loc[hitters['country'] == country2]
    hitters2 = hitters2['name'].to_numpy()

    arr_list = [bowlers1, bowlers2, hitters1, hitters2]

    # shuffle order of bowlers and hitters for added variability
    for arr in arr_list:
        rand.shuffle(arr)

    # base scores
    country1_score = 0.0
    country2_score = 0.0 
    
    # 2 x 4 hitters x 4 bowlers x 6 bowls
    # 192 total bowls (except for US)

    #first team score:
    for hitter in hitters1:
        for bowler in bowlers2:
            for x in range(6):
                #rpb = (players.loc[players['name'] == hitter, 'runs_per_bowl'].values)[0]
                #rcpb = (players.loc[players['name'] == bowler, 'runs_conceded_per_bowl'].values)[0]
                rpb = (players.loc[players['name'] == hitter, 'adj_runs_per_bowl'].values)[0]
                rcpb = (players.loc[players['name'] == bowler, 'adj_runs_conceded_per_bowl'].values)[0]
                country1_score += get_runs(rpb, rcpb)
              
    
    #second team score:
    for hitter in hitters2:
        for bowler in bowlers1:
            for x in range(6):
                #rpb = (players.loc[players['name'] == hitter, 'runs_per_bowl'].values)[0]
                #rcpb = (players.loc[players['name'] == bowler, 'runs_conceded_per_bowl'].values)[0]
                rpb = (players.loc[players['name'] == hitter, 'adj_runs_per_bowl'].values)[0]
                rcpb = (players.loc[players['name'] == bowler, 'adj_runs_conceded_per_bowl'].values)[0]
                country2_score += get_runs(rpb, rcpb)

    # Adjust scores in case of uneven rosters
    # Ideally, this will not be needed, and will be removed when we have values for all players
     
    country2_score = country2_score*(hitters2.shape[0]/bowlers1.shape[0])
    country1_score = country1_score*(hitters1.shape[0]/bowlers2.shape[0])
    
    if(print_score):
        print(country1, "score:", country1_score)
        print(country2, "score:", country2_score)

    # In reality, all scores will be whole numbers, and the number of overs will vary.
    # For the purposes of calculating NRR in this simulation, we will not round scores
    # and treat all teams as bowling 16 overs in each match. In that way, for every team:
    # Total Overs Faced = Total Overs Bowled = 16 * Number of Matches Played 
    # (Since the number of overs will always be 16, this speeds up the calculation!)
        
    if(update_nrr):
        # NRR = (Total Runs Scored ÷ Total Overs Faced) – (Total Runs Conceded ÷ Total Overs Bowled)
        # For country1
        cup.loc[cup['Country'] == country1, 'total_runs_scored'] += country1_score
        cup.loc[cup['Country'] == country1, 'total_runs_conceded'] += country2_score
        cup.loc[cup['Country'] == country1, 'num_matches_played'] += 1
        cup.loc[cup['Country'] == country1, 'NRR'] = ((cup.loc[cup['Country'] == country1, 'total_runs_scored'].values[0] -
                                                                         cup.loc[cup['Country'] == country1, 'total_runs_conceded'].values[0]) / 
                                                                         (16*cup.loc[cup['Country'] == country1, 'num_matches_played'].values[0]))
        # For country2
        cup.loc[cup['Country'] == country2, 'total_runs_scored'] += country2_score
        cup.loc[cup['Country'] == country2, 'total_runs_conceded'] += country1_score
        cup.loc[cup['Country'] == country2, 'num_matches_played'] += 1
        cup.loc[cup['Country'] == country2, 'NRR'] = ((cup.loc[cup['Country'] == country2, 'total_runs_scored'].values[0] -
                                                                         cup.loc[cup['Country'] == country2, 'total_runs_conceded'].values[0]) / 
                                                                         (16*cup.loc[cup['Country'] == country2, 'num_matches_played'].values[0]))

    
    if country1_score > country2_score:
        return country1
    elif country2_score > country1_score:
        return country2
    else:
        return 'tie'

In [161]:
# Change these to try a different matchup!
teamA = 'India'
teamB = 'England'

# One match simulated with each function call
# Add 'True' to run_match to print scores
winner = run_match(teamA, teamB, True, False)
print("Winner:", winner)

India score: 131.38495887116883
England score: 127.91692555724211
Winner: India


In [162]:
# Change num_iterations and run this cell to simulate 
# many matches for the pair given in the previous cell

num_iterations = 100

A_victories = 0
for i in range(num_iterations):
    if(run_match(teamA, teamB, False, False) == teamA):
        A_victories+=1
print("Proportion that", teamA, "won:",A_victories/num_iterations)

Proportion that India won: 1.0


### Simulate the World Cup

In [163]:
# placements = ["Group stage", "Super 8 stage", "Semifinalist", "Finalist", "Champion"]

cup = pd.DataFrame({'Country': teams_list, 'Group': 5*['A']+5*['B']+5*['C']+5*['D']})
cup['id'] = cup.index

A_matches= tuple(combinations(cup['Country'].loc[cup['Group'] == 'A'], 2))
B_matches = tuple(combinations(cup['Country'].loc[cup['Group'] == 'B'], 2))
C_matches = tuple(combinations(cup['Country'].loc[cup['Group'] == 'C'], 2))
D_matches = tuple(combinations(cup['Country'].loc[cup['Group'] == 'D'], 2))

In [164]:
def round_robin(group_list, match_record):
    global cup
    for group in group_list:
        for pairing in group:
            team1 = pairing[0]
            team2 = pairing[1]
            winner = run_match(team1,team2)

            # 2 points for win, 1 for tie
            if (winner == team1):
                cup.loc[cup['Country'] == team1, 'Points'] += 2
                match_record.append([team1, team2, 'w'])
            elif (winner == team2):
                cup.loc[cup['Country'] == team2, 'Points'] += 2
                match_record.append([team2, team1, 'w'])
            else: # if tie, but this is very rare in practice
                cup.loc[cup['Country'] == team1, 'Points'] += 1
                cup.loc[cup['Country'] == team2, 'Points'] += 1
                match_record.append([team1, team2, 't'])
    return match_record

In [165]:
"""
In the Super Eight round, Teams seeded first and second in their groups in the 
first round will retain that seeding in the Super Eight, provided they qualify.
(https://www.icc-cricket.com/news/fixtures-revealed-for-historic-icc-men-s-t20-world-cup-2024-in-west-indies-and-the-usa)

For simplicity, all teams' Super 8 placements will be determined 
by their pre-tournament seeds in the cup simulation.
"""

def world_cup(match_record=[]):
    global cup

    # Reset dataframe values
    cup = cup.assign(Points=0)
    cup = cup.assign(Result="")

    cup = cup.assign(total_runs_scored=0.0)
    cup = cup.assign(total_runs_conceded=0.0)
    cup = cup.assign(num_matches_played=0)
    cup = cup.assign(NRR=0.0)

    for i in range(0,5):
        cup.loc[[i], 'Group'] = 'A'
    for i in range(5,10):
        cup.loc[[i], 'Group'] = 'B'
    for i in range(10,15):
        cup.loc[[i], 'Group'] = 'C'
    for i in range(15,20):
        cup.loc[[i], 'Group'] = 'D'


    """ GROUP STAGE """
    # Group stage is round robin
    round_robin([A_matches, B_matches, C_matches, D_matches], match_record)
               
    A_top = (cup['Points'].loc[cup['Group'] == 'A'].nlargest(2, keep='all').sort_index())
    B_top = (cup['Points'].loc[cup['Group'] == 'B'].nlargest(2, keep='all').sort_index())
    C_top = (cup['Points'].loc[cup['Group'] == 'C'].nlargest(2, keep='all').sort_index())
    D_top = (cup['Points'].loc[cup['Group'] == 'D'].nlargest(2, keep='all').sort_index())

    A1 = cup['Country'].loc[[A_top.index[0]]].values[0]
    B1 = cup['Country'].loc[[B_top.index[0]]].values[0]
    C1 = cup['Country'].loc[[C_top.index[0]]].values[0]
    D1 = cup['Country'].loc[[D_top.index[0]]].values[0]
    A2 = cup['Country'].loc[[A_top.index[1]]].values[0]
    B2 = cup['Country'].loc[[B_top.index[1]]].values[0]
    C2 = cup['Country'].loc[[C_top.index[1]]].values[0]
    D2 = cup['Country'].loc[[D_top.index[1]]].values[0]

    S8_G1 = [A1,B2,C1,D2]
    S8_G2 = [A2,B1,C2,D1]

    for team in S8_G1:
        cup.loc[cup['Country'] == team, 'Group'] = 'S8_G1'
    for team in S8_G2:
        cup.loc[cup['Country'] == team, 'Group'] = 'S8_G2'

    s8_in = set(np.concatenate((A_top.index,B_top.index,C_top.index,D_top.index)))
    all = set(range(0, 20))
    out = all.symmetric_difference(s8_in)

    for i in out:
        cup.loc[[i], 'Result'] = "Group stage"

    """ SUPER 8 STAGE """
    G1_matches = tuple(combinations(S8_G1, 2))
    G2_matches = tuple(combinations(S8_G2, 2))
    round_robin([G1_matches, G2_matches], match_record)

    # Sorted by points here (not index)
    G1_largest = (cup['Points'].loc[cup['Group'] == 'S8_G1'].nlargest(2, keep='all'))
    G2_largest = (cup['Points'].loc[cup['Group'] == 'S8_G2'].nlargest(2, keep='all'))

    # NRR comes into effect here to break ties
    # Could also use (and keep track) of head-to-head result
    # The rules are unclear whether or not head-to-head would be used over NRR 
    # when there is a tie between two teams.
    # For now, we will use NRR, as it could apply to ties of any size.

    G1_top = cup.loc[cup['id'].isin(G1_largest.index.values), ['Country', 'Points', 'NRR']]
    G2_top = cup.loc[cup['id'].isin(G2_largest.index.values), ['Country', 'Points', 'NRR']]
    G1_top = G1_top.sort_values(by=['Points', 'NRR'], ascending=[False, False], kind='stable')
    G2_top = G2_top.sort_values(by=['Points', 'NRR'], ascending=[False, False], kind='stable')

    g1winner = cup['Country'].loc[[G1_top.index[0]]].values[0]
    g1runner = cup['Country'].loc[[G1_top.index[1]]].values[0]
    g2winner = cup['Country'].loc[[G2_top.index[0]]].values[0]
    g2runner = cup['Country'].loc[[G2_top.index[1]]].values[0]

    SF1 = [g1winner, g2runner]
    SF2 = [g2winner, g1runner]

    sf_in = set(np.concatenate((G1_top.index.values[0:2],G2_top.index.values[0:2])))
    out = s8_in.symmetric_difference(sf_in)

    for i in out:
        cup.loc[[i], 'Result'] = "Super 8 stage"

    """ KNOCKOUT STAGE """
    # No ties - this should not be an issue
    # For reference, points are not being awarded at this stage
    SF1_winner = run_match(SF1[0], SF1[1])
    if (SF1_winner == SF1[0]):
        cup.loc[cup['Country'] == SF1[1], 'Result'] = "Semifinalist"
        match_record.append([cup.loc[cup['Country'] == SF1[0], 'Country'].values[0], 
                                      cup.loc[cup['Country'] == SF1[1], 'Country'].values[0], 'w'])
    else:
        cup.loc[cup['Country'] == SF1[0], 'Result'] = "Semifinalist"
        match_record.append([cup.loc[cup['Country'] == SF1[1], 'Country'].values[0], 
                                      cup.loc[cup['Country'] == SF1[0], 'Country'].values[0], 'w'])

    SF2_winner = run_match(SF2[0], SF2[1])
    if (SF2_winner == SF2[0]):
        cup.loc[cup['Country'] == SF2[1], 'Result'] = "Semifinalist"
        match_record.append([cup.loc[cup['Country'] == SF2[0], 'Country'].values[0], 
                                      cup.loc[cup['Country'] == SF2[1], 'Country'].values[0], 'w'])
    else:
        cup.loc[cup['Country'] == SF2[0], 'Result'] = "Semifinalist"
        match_record.append([cup.loc[cup['Country'] == SF2[1], 'Country'].values[0], 
                                      cup.loc[cup['Country'] == SF2[0], 'Country'].values[0], 'w'])

    cup_winner = run_match(SF1_winner, SF2_winner)
    if (cup_winner == SF1_winner):
        cup.loc[cup['Country'] == SF1_winner, 'Result'] = "Champion"
        cup.loc[cup['Country'] == SF2_winner, 'Result'] = "Finalist"
        match_record.append([cup.loc[cup['Country'] == SF1_winner, 'Country'].values[0], 
                                      cup.loc[cup['Country'] == SF2_winner, 'Country'].values[0], 'w'])
    else:
        cup.loc[cup['Country'] == SF2_winner, 'Result'] = "Champion"
        cup.loc[cup['Country'] == SF1_winner, 'Result'] = "Finalist"
        match_record.append([cup.loc[cup['Country'] == SF2_winner, 'Country'].values[0], 
                                      cup.loc[cup['Country'] == SF1_winner, 'Country'].values[0], 'w'])

    return cup, match_record

In [166]:
# Run to print the results of a single simulation
# One simulation should take about 3 seconds
world_cup()[0]

Unnamed: 0,Country,Group,id,Points,Result,total_runs_scored,total_runs_conceded,num_matches_played,NRR
0,Canada,A,0,0,Group stage,409.140759,674.044393,4,-4.139119
1,United States of America,A,1,2,Group stage,439.03471,607.828303,4,-2.6374
2,Ireland,A,2,4,Group stage,533.413249,479.369471,4,0.844434
3,India,S8_G1,3,14,Champion,1274.387463,1025.636888,9,1.727435
4,Pakistan,S8_G2,4,8,Super 8 stage,954.612286,799.331247,7,1.386438
5,Australia,S8_G2,5,12,Finalist,1243.24983,1064.997157,9,1.237866
6,England,S8_G1,6,10,Super 8 stage,990.272967,834.424515,7,1.391504
7,Namibia,B,7,4,Group stage,471.077922,535.617934,4,-1.008438
8,Oman,B,8,0,Group stage,428.36567,593.552586,4,-2.581046
9,Scotland,B,9,2,Group stage,473.816165,547.644416,4,-1.153566


In [None]:
# Two-dimensional numpy array of simulation results 
# with each row as a simulation, each column as country, entry as placement 
# team index is same as in teams_list, starting with India as 0

# TODO: With the adjusted numbers, it is impossible for India to lose. 
# Having tiers (10+) may help address this.

n_simulations = 100  # Change this!
match_records = pd.DataFrame()  # Will be reset every time this cell is run
simulation_results = np.empty((n_simulations, 20), dtype=object)
for i in range(0, n_simulations):
    match_records = pd.concat([match_records, pd.DataFrame(world_cup([])[1])], axis=0)
    placements = pd.Series(cup['Result'])
    simulation_results[i] = placements.to_numpy()

### Analyze Results

In [None]:
# Ex: Print results for all countries
for i in range(0,20):
    print(cup['Country'].loc[[i]].values[0], "results:", Counter(simulation_results[:,i]))

In [None]:
# Display all match results (win/tie) from simulation set.
# This way, we can easily calculate the proportion of matches 
# won by one team versus another across the whole simulation set.
# the outcome column is included only in the rare case a tie occurs

match_records = match_records.rename({0:'winner',1:'loser',2:'outcome'},axis=1)
match_records

In [None]:
# TODO: Visualize results based on simulation_results array and match_records dataframe