### Setup and Preprocessing

In [None]:
import pandas as pd
import numpy as np
import random as rand
from itertools import combinations
from collections import Counter

In [None]:
# Load the merged match data from a single CSV file
merge = pd.read_csv('merged_files.csv')

# Quick look at what was loaded: the available columns and the batting sides present
for summary in (merge.columns, merge['batting_team'].unique()):
    print(summary)
merge.head()

In [None]:
# We will use only data from the last 4 years (2020-2024).
# The year is the first '-'-separated field of start_date. It is parsed into a
# standalone Series so that no scratch columns are added to `merge` or carried
# along into `recent` (and every frame derived from it).
start_year = merge['start_date'].str.split('-').str[0].astype(int)

# .copy() makes `recent` an independent frame rather than a view-like slice of `merge`
recent = merge.loc[start_year >= 2020].copy()
print(recent.shape)
recent.head()

In [None]:
# Create dataframe of matches only from teams participating in the 2024 World Cup
# teams are organized in their groups for the first round, ordered by pre-tournament seed within groups
# Ex: [A1, A2, ..., D4, D5]
with open('teams.txt') as file:
    # Strip surrounding whitespace and skip blank lines so a trailing newline
    # or stray space in the file cannot produce a bogus team name.
    teams_list = [line.strip() for line in file if line.strip()]

# A delivery is kept when either side of the match is a World Cup team
involves_wc_team = (
    recent['bowling_team'].isin(teams_list)
    | recent['batting_team'].isin(teams_list)
)

# .copy() gives wc20 its own data: new columns are assigned to it further down,
# which on a plain slice of `recent` raises pandas' SettingWithCopyWarning.
wc20 = recent.loc[involves_wc_team].copy()
print(wc20.shape)
wc20.head()

In [None]:
# List every team that appears in wc20 (World Cup teams and their opponents).
# Each name is printed as `"Team" : ,`, mirroring the layout of the entries
# in the team_rating dictionary.
team_columns = wc20[['batting_team', 'bowling_team']]
unique_teams = pd.unique(team_columns.values.ravel('K'))
for team in unique_teams:
    print(f'"{team}" : ,')

In [None]:
# ICC T20 rating for all teams who have played against or are a World Cup team.
# The ratings are looked up by the values of the bowling_team column, so the keys
# must be spelled exactly as the team names appear in the match data.
# The highest rating in this table is 266 (India); several teams are rated 0.
team_rating = {
    "South Africa" : 249,
    "England" : 256,
    "Australia" : 255,
    "New Zealand" : 254,
    "India" : 266,
    "Pakistan" : 249,
    "Sri Lanka" : 234,
    "Ireland" : 194,
    "West Indies" : 245,
    "Bangladesh" : 227,
    "Zimbabwe" : 193,
    "Maldives" : 3,
    "Oman" : 154,
    "Afghanistan" : 218,
    "Hong Kong" : 139,
    "Nepal" : 175,
    "Thailand" : 0,
    "Uganda" : 132,
    "Namibia" : 196,
    "Netherlands" : 183,
    "Malaysia" : 125,
    "Scotland" : 190,
    "Papua New Guinea" : 143,
    "Nigeria" : 75,
    "Kenya" : 107,
    "United Arab Emirates" : 174,
    "Malawi" : 36,
    "Rwanda" : 32,
    "Lesotho" : 9,
    "Swaziland" : 0,
    "Ghana" : 31,
    "Seychelles" : 0,
    "Belize" : 22,
    "United States of America" : 131,
    "Canada" : 140,
    "Bermuda" : 107,
    "Argentina" : 39,
    "Bahamas" : 24,
    "Panama" : 19,
    "Tanzania" : 93,
    "Philippines" : 41,
    "Bahrain" : 115,
    "Germany" : 92,
    "Jersey" : 128,
    "Singapore" : 76,
    "Botswana" : 53,
    "Mozambique" : 36,
    "Saudi Arabia" : 104,
    "Italy" : 101,
    "Denmark" : 71,
    "Austria" : 58,
    "Vanuatu" : 60,
    "Japan" : 49,
    "Kuwait" : 118,
    "Mongolia" : 0, 
    "Cayman Islands" : 72,
}

In [None]:
%%capture
# Store level of each team in dictionary
# Based on ICC T20 rankings
# Not used for now
"""
tier_list = {
    "India" : 5,
    "England" : 5,
    "Australia" : 5,
    "New Zealand" : 5,
    "Pakistan" : 4,
    "South Africa" : 4,
    "West Indies" : 3,
    "Sri Lanka" : 3,
    "Bangladesh" : 3,
    "Afghanistan" : 3,
    "Namibia" : 2,
    "Ireland" : 2,
    "Scotland" : 2,
    "Netherlands" : 2,
    "Nepal" : 1,
    "Oman" : 1,
    "Papua New Guinea" : 1,
    "Canada" : 0,
    "Uganda" : 0,
    "United States of America" : 0
}
"""

### Calculate Stats

In [None]:
# Weight every delivery by the ICC rating of the bowling side.
#   adj_runs_off_bat          = runs * (rating / top rating)
#   adj_conceded_runs_off_bat = runs * (top rating / floored rating)
# NOTE(review): both adjustments use the rating of the *bowling* team; confirm
# that this is the intended reference for the conceded-runs adjustment.

# Rating of the best-rated side in the table (266 at the time of writing)
TOP_RATING = max(team_rating.values())
# Ratings below this floor are raised to it before being used as a divisor;
# several teams are rated 0, which would otherwise produce infinite conceded runs.
MIN_RATING = 50

# Teams missing from team_rating map to NaN and stay NaN through every step below
team_ratings_mapped = wc20['bowling_team'].map(team_rating)
floored_ratings = team_ratings_mapped.clip(lower=MIN_RATING)

# assign() builds a new frame, so this cell is safe to re-run and never writes
# into a slice of another DataFrame.
wc20 = wc20.assign(
    adj_runs_off_bat=wc20['runs_off_bat'] * (team_ratings_mapped / TOP_RATING),
    adjusted_team_ratings2=floored_ratings,
    adj_conceded_runs_off_bat=wc20['runs_off_bat'] * (TOP_RATING / floored_ratings),
)

In [None]:
#checks out
"""
df = (wc20
       .loc[:, ['striker', 'runs_off_bat', 'batting_team', 'extras', 'wides', 'noballs','byes','legbyes']]
       .groupby(['striker', 'batting_team'], as_index = False)
       .sum())
df_sorted = df.sort_values(by='runs_off_bat', ascending=False)
"""
# Total adjusted runs off the bat for each (striker, batting_team) pair,
# listed from the highest total down
batting_cols = ['striker', 'adj_runs_off_bat', 'batting_team']
df = wc20[batting_cols].groupby(['striker', 'batting_team'], as_index=False).sum()
df_sorted = df.sort_values('adj_runs_off_bat', ascending=False)
print(df_sorted.head())
 

In [None]:
# Number of deliveries recorded for each (striker, batting_team) pair.
# count() tallies the non-null bowling_team entries per group; that tally is
# renamed to n_bowls.
num_bowls = (
    wc20[['striker', 'bowling_team', 'batting_team']]
    .groupby(['striker', 'batting_team'], as_index=False)
    .count()
    .rename(columns={'bowling_team': 'n_bowls'})
)

# Put the run totals and the delivery counts side by side, highest total first
dfA = (
    df_sorted
    .merge(num_bowls, on=['striker', 'batting_team'])
    .sort_values(by='adj_runs_off_bat', ascending=False)
)
print(dfA)

In [None]:
# Spot-check one player: show V Kohli's rows before and after the merge
kohli_name = 'V Kohli'
for frame in (df_sorted, dfA):
    print(frame[frame['striker'] == kohli_name])

In [None]:
# Calculate runs per bowl for strikers (extras not included)
# probably better to include extras in conceded runs for bowlers
"""
dfA['runs_per_bowl'] = (dfA['runs_off_bat'] / dfA['n_bowls'])
dfA = dfA.sort_values(by = 'runs_per_bowl', ascending = False)
"""
# Adjusted runs per bowl for every striker, best rate first
dfA = (
    dfA
    .assign(adj_runs_per_bowl=dfA['adj_runs_off_bat'] / dfA['n_bowls'])
    .sort_values(by='adj_runs_per_bowl', ascending=False)
)
dfA.head(10)

In [None]:
# Calculate runs conceded per bowl for bowlers (extras included)
"""
df = (wc20
       .loc[:, ['bowler', 'bowling_team', 'runs_off_bat', 'extras']]
       .groupby(['bowler', 'bowling_team'], as_index = False)
       .sum())
df_sorted = df.sort_values(by='runs_off_bat', ascending=False)
"""
# Total adjusted runs conceded and total extras for each (bowler, bowling_team)
# pair, listed from the highest adjusted total down
bowling_cols = ['bowler', 'bowling_team', 'adj_conceded_runs_off_bat', 'extras']
df = wc20[bowling_cols].groupby(['bowler', 'bowling_team'], as_index=False).sum()
df_sorted = df.sort_values('adj_conceded_runs_off_bat', ascending=False)
df_sorted.head()

In [None]:
# Number of deliveries recorded for each (bowler, bowling_team) pair.
# count() tallies the non-null batting_team entries per group; that tally is
# renamed to n_bowls.
num_bowls = (
    wc20[['bowler', 'bowling_team', 'batting_team']]
    .groupby(['bowler', 'bowling_team'], as_index=False)
    .count()
    .rename(columns={'batting_team': 'n_bowls'})
)

# Put the conceded totals and the delivery counts side by side, highest total first
dfB = (
    df_sorted
    .merge(num_bowls, on=['bowler', 'bowling_team'])
    .sort_values(by='adj_conceded_runs_off_bat', ascending=False)
)
print(dfB)

In [None]:
"""
dfB['runs_conceded_per_bowl'] = ((dfB['runs_off_bat'] + dfB['extras']) / dfB['n_bowls'])
dfB = dfB.sort_values(by = 'runs_conceded_per_bowl', ascending = False)
"""
# Adjusted runs conceded per bowl: (adjusted runs off the bat + extras) / deliveries.
# Note that extras are added as recorded; only the runs off the bat are rating-adjusted.
dfB['adj_runs_conceded_per_bowl'] = (
    (dfB['adj_conceded_runs_off_bat'] + dfB['extras']) / dfB['n_bowls']
)

# Rank by the per-bowl rate that was just computed (not by the run total), in
# line with how the strikers' table is ranked by adj_runs_per_bowl.
dfB = dfB.sort_values(by='adj_runs_conceded_per_bowl', ascending=False)
dfB.head(10)

In [None]:
players = pd.read_csv('players.csv')
n_players = len(players)

# Attach two rates to every player in players.csv:
#   adj_runs_per_bowl          - only used for hitters and all_rounders
#   adj_runs_conceded_per_bowl - only used for bowlers and all_rounders
#
# dfA and dfB hold one row per (player name, team), so the join uses the name
# AND the team. Joining on the name alone matches namesakes from other teams
# and silently duplicates the player's row.
# NOTE(review): assumes the 'country' values in players.csv are spelled exactly
# like the batting_team / bowling_team values in the match data - confirm.
players = (
    players
    .merge(dfA[['striker', 'batting_team', 'adj_runs_per_bowl']],
           how='left',
           left_on=['name', 'country'],
           right_on=['striker', 'batting_team'])
    .drop(columns=['striker', 'batting_team'])
    .merge(dfB[['bowler', 'bowling_team', 'adj_runs_conceded_per_bowl']],
           how='left',
           left_on=['name', 'country'],
           right_on=['bowler', 'bowling_team'])
    .drop(columns=['bowler', 'bowling_team'])
)

# A left join against unique keys must keep exactly one row per player
assert len(players) == n_players, "merging the stats changed the number of player rows"

players.head(7)

In [None]:
%%capture
# Potential issue for players listed under the same name:
# runs_off_bat (+ extras) grouped together for all players,
# but separated by country, resulting in faulty numbers
# should not affect any of our selected players

"""
khan = df2.loc[df2['striker']=='Shoaib Khan']
print(khan)
goud = df2.loc[df2['striker']=='Sandeep Goud']
print(goud)
goud = df2B.loc[df2B['bowler']=='Sandeep Goud']
print(goud)
"""

### Simulating a Match

In [None]:
# Split the player pool by role; all_rounders belong to both pools
bowlers = players[players['position'].isin(['bowler', 'all_rounder'])]
hitters = players[players['position'].isin(['hitter', 'all_rounder'])]

# Remove players for which there is no data
"""
bowlers = bowlers.dropna(subset=['runs_conceded_per_bowl'])
hitters = hitters.dropna(subset=['runs_per_bowl'])
"""

# TODO: check that we are not unnecessarily dropping players
# Keep only the players that have a rate for their role
bowlers = bowlers[bowlers['adj_runs_conceded_per_bowl'].notna()]
hitters = hitters[hitters['adj_runs_per_bowl'].notna()]

for pool in (bowlers, hitters):
    print(pool.shape)

In [None]:
# How we determine which score to use determines our results! 
# The distribution of RPB and RCPB could be decided on a per-match basis 
# if there is enough data for a player

def get_runs(rpb, rcpb):
    """Draw the runs scored off a single bowl.

    The value is sampled uniformly between the hitter's runs per bowl (rpb)
    and the bowler's runs conceded per bowl (rcpb); either bound may be the
    larger of the two.
    """
    hitter_rate, bowler_rate = rpb, rcpb
    return rand.uniform(hitter_rate, bowler_rate)

In [None]:
# Size of the nominal innings the model is built around:
# 4 hitters x 4 bowlers x 6 bowls = 96 bowls per side (192 per match).
_NOMINAL_HITTERS = 4
_NOMINAL_BOWLERS = 4
_BOWLS_PER_PAIRING = 6


def _team_rates(pool, country, rate_column):
    """Return the `rate_column` values of `country`'s players in `pool` as a list."""
    return pool.loc[pool['country'] == country, rate_column].tolist()


def _innings_score(batting_side, hitter_rates, bowler_rates):
    """Simulate one innings and return the score scaled to the nominal innings length."""
    if not hitter_rates or not bowler_rates:
        raise ValueError(
            f"Cannot simulate the innings of {batting_side!r}: "
            f"{len(hitter_rates)} hitter(s) and {len(bowler_rates)} opposing bowler(s) have data"
        )

    total = 0.0
    for rpb in hitter_rates:
        for rcpb in bowler_rates:
            for _ in range(_BOWLS_PER_PAIRING):
                total += get_runs(rpb, rcpb)

    # Scale to the nominal number of bowls so that a side with a larger or
    # smaller roster is neither rewarded nor punished for facing more or fewer bowls.
    bowls_simulated = len(hitter_rates) * len(bowler_rates) * _BOWLS_PER_PAIRING
    nominal_bowls = _NOMINAL_HITTERS * _NOMINAL_BOWLERS * _BOWLS_PER_PAIRING
    return total * (nominal_bowls / bowls_simulated)


def run_match(country1, country2, print_score=False):
    """Simulate one match and return the winner.

    Every hitter of a side faces every bowler of the other side for 6 bowls;
    each bowl is worth get_runs(hitter's rate, bowler's rate). The innings
    total is then scaled to the nominal 96-bowl innings. For sides fielding
    4 hitters this is the same factor as the previous hitters/bowlers scaling;
    for any other number of hitters the previous factor inflated or deflated
    the score.

    The rates are read once per player from the `hitters` / `bowlers` frames
    rather than once per bowl. The rosters are not shuffled: every hitter meets
    every bowler, so the order cannot change the total.

    Parameters:
        country1, country2: team names as they appear in the 'country' column.
        print_score: when True, print both scores.

    Returns:
        country1 or country2, whichever scored more, or 'tie' on equal scores.

    Raises:
        ValueError: if a side has no hitters, or its opponent no bowlers, with data.
    """
    hitters1 = _team_rates(hitters, country1, 'adj_runs_per_bowl')
    hitters2 = _team_rates(hitters, country2, 'adj_runs_per_bowl')
    bowlers1 = _team_rates(bowlers, country1, 'adj_runs_conceded_per_bowl')
    bowlers2 = _team_rates(bowlers, country2, 'adj_runs_conceded_per_bowl')

    # Each side bats against the other side's bowlers
    country1_score = _innings_score(country1, hitters1, bowlers2)
    country2_score = _innings_score(country2, hitters2, bowlers1)

    if print_score:
        print(country1, "score:", country1_score)
        print(country2, "score:", country2_score)

    if country1_score > country2_score:
        return country1
    elif country1_score < country2_score:
        return country2
    else:
        return 'tie'

In [None]:
# Change these to try a different matchup!
teamA = 'India'
teamB = 'Australia'

# One match simulated with each function call
# Add 'True' to run_match to print scores
winner = run_match(teamA, teamB, True)
print("Winner:", winner)

In [None]:
# Simulate the matchup chosen in the previous cell many times;
# change num_iterations to control how many matches are played

num_iterations = 100

# Each match won by teamA contributes 1 to the tally
A_victories = sum(run_match(teamA, teamB) == teamA for _ in range(num_iterations))
print("Proportion that", teamA, "won:", A_victories/num_iterations)

### Simulating the World Cup

In [None]:
# placements = ["Group stage", "Super 8 stage", "Semifinalist", "Finalist", "Champion"]

# teams_list is ordered by group, five teams per group, so the group labels
# are simply A x5, B x5, C x5, D x5
cup = pd.DataFrame({
    'Country': teams_list,
    'Group': [label for label in 'ABCD' for _ in range(5)],
})

# Round-robin fixtures: every pair of teams within each group
A_matches, B_matches, C_matches, D_matches = (
    tuple(combinations(cup.loc[cup['Group'] == label, 'Country'], 2))
    for label in 'ABCD'
)

In [None]:
def round_robin(group_list):
    """Play every pairing of every group and add the points to `cup`.

    group_list is an iterable of groups; each group is an iterable of
    (team1, team2) pairings. A win is worth 2 points; in a tie (very rare in
    practice) both sides get 1 point. `cup` must already have a 'Points' column.
    """
    global cup
    for fixtures in group_list:
        for pairing in fixtures:
            team1, team2 = pairing[0], pairing[1]
            winner = run_match(team1, team2)

            # Work out who earns what from this match, then book it
            if winner == team1 or winner == team2:
                awards = {winner: 2}
            else:
                awards = {team1: 1, team2: 1}

            for team, points in awards.items():
                cup.loc[cup['Country'] == team, 'Points'] += points

In [None]:
"""
In the Super Eight round, Teams seeded first and second in their groups in the 
first round will retain that seeding in the Super Eight, provided they qualify.
(https://www.icc-cricket.com/news/fixtures-revealed-for-historic-icc-men-s-t20-world-cup-2024-in-west-indies-and-the-usa)

For simplicity, all teams' Super 8 placements will be determined 
by their pre-tournament seeds in the cup simulation.
"""

# Net Run Rate = (Total Runs Scored ÷ Total Overs Faced) – (Total Runs Conceded ÷ Total Overs Bowled)
# This breaks ties in case two teams have same number of points

# TODO: Add, calculate and use NRR to break ties

def _qualifiers(points, group):
    """Return the two best rows of `group` as a Series of points, best first.

    `points` is a Series aligned with the rows of `cup`. keep='first' breaks
    ties on points by row order (the pre-tournament seed) and guarantees that
    exactly two rows come back.
    """
    return points.loc[cup['Group'] == group].nlargest(2, keep='first')


def _play_knockout(team1, team2, max_replays=100):
    """Play a knockout match, replaying ties, and return (winner, loser)."""
    for _ in range(max_replays + 1):
        winner = run_match(team1, team2)
        if winner == team1:
            return team1, team2
        if winner == team2:
            return team2, team1
    raise RuntimeError(
        f"No winner between {team1!r} and {team2!r} after {max_replays} replays"
    )


def world_cup():
    """Simulate one World Cup and return `cup` with a 'Result' for every team.

    Stages: round-robin group stage (groups A-D), two Super 8 round-robin
    groups, two semifinals and a final. 'Result' is one of "Group stage",
    "Super 8 stage", "Semifinalist", "Finalist" or "Champion". 'Points' holds
    all points a team earned in the group and Super 8 stages combined.

    Tie-breaking (until NRR is implemented): teams level on points are
    separated by pre-tournament seed, i.e. their row order in `cup`.
    """
    global cup

    # Reset dataframe values so that every simulation starts from scratch
    cup = cup.assign(
        Points=0,
        Result="",
        Group=[label for label in 'ABCD' for _ in range(5)],
    )

    # ---- GROUP STAGE ----
    # Group stage is round robin
    round_robin([A_matches, B_matches, C_matches, D_matches])

    # Exactly two teams advance per group: the top two on points, ties broken
    # by seed. They are then ordered by seed (sort_index), because Super 8
    # placement follows the pre-tournament seeds rather than the group ranking.
    s8_in = set()
    advancing = {}
    for label in 'ABCD':
        rows = _qualifiers(cup['Points'], label).sort_index().index
        s8_in.update(rows)
        advancing[label] = cup.loc[rows, 'Country'].tolist()

    A1, A2 = advancing['A']
    B1, B2 = advancing['B']
    C1, C2 = advancing['C']
    D1, D2 = advancing['D']

    S8_G1 = [A1, B2, C1, D2]
    S8_G2 = [A2, B1, C2, D1]

    cup.loc[cup['Country'].isin(S8_G1), 'Group'] = 'S8_G1'
    cup.loc[cup['Country'].isin(S8_G2), 'Group'] = 'S8_G2'

    # Every team that did not advance went out in the group stage
    group_stage_out = set(cup.index) - s8_in
    cup.loc[sorted(group_stage_out), 'Result'] = "Group stage"

    # ---- SUPER 8 STAGE ----
    # Super 8 standings start from zero, so rank on the points earned in this
    # stage only; cup['Points'] itself keeps accumulating.
    points_before_s8 = cup['Points'].copy()

    G1_matches = tuple(combinations(S8_G1, 2))
    G2_matches = tuple(combinations(S8_G2, 2))
    round_robin([G1_matches, G2_matches])

    s8_points = cup['Points'] - points_before_s8

    # sorted by points here (not index)
    # TODO: use NRR to break ties (currently, pre-tournament seeding is used)
    G1_top = _qualifiers(s8_points, 'S8_G1')
    G2_top = _qualifiers(s8_points, 'S8_G2')

    g1winner, g1runner = cup.loc[G1_top.index, 'Country'].tolist()
    g2winner, g2runner = cup.loc[G2_top.index, 'Country'].tolist()

    sf_in = set(G1_top.index) | set(G2_top.index)
    super8_out = s8_in - sf_in
    cup.loc[sorted(super8_out), 'Result'] = "Super 8 stage"

    # ---- KNOCKOUT STAGE ----
    # Each group winner meets the runner-up of the other group.
    # A knockout match needs a winner, so ties are replayed.
    SF1_winner, SF1_loser = _play_knockout(g1winner, g2runner)
    SF2_winner, SF2_loser = _play_knockout(g2winner, g1runner)
    cup.loc[cup['Country'].isin([SF1_loser, SF2_loser]), 'Result'] = "Semifinalist"

    champion, finalist = _play_knockout(SF1_winner, SF2_winner)
    cup.loc[cup['Country'] == champion, 'Result'] = "Champion"
    cup.loc[cup['Country'] == finalist, 'Result'] = "Finalist"

    return cup

In [None]:
# Run to print the results of a single simulation.
# world_cup() returns the `cup` DataFrame (one row per team, with its Country,
# Group, Points and Result), which is shown as this cell's output.
# One simulation should take about 3 seconds
world_cup()

In [None]:
# Two-dimensional numpy array of simulation results:
# one row per simulation, one column per country, each entry the placement reached.
# The team index is the same as in teams_list, starting with India as 0

# Change n_simulations at will
n_simulations = 5
simulation_results = np.empty((n_simulations, 20), dtype=object)
for sim_index in range(n_simulations):
    world_cup()
    # world_cup() leaves the placements of this run in cup['Result']
    placements = pd.Series(cup['Result'])
    simulation_results[sim_index] = placements.to_numpy()

In [None]:
# Ex: Print results for all countries:
# for each team, how often every placement occurred across the simulations
for team_idx in range(20):
    country = cup.at[team_idx, 'Country']
    tally = Counter(simulation_results[:, team_idx])
    print(country, "results:", tally)

In [None]:
# TODO: Can visualize results for each country for presentation if desired