### Setup and Preprocessing

In [None]:
import pandas as pd
import numpy as np
import random as rand
from itertools import combinations
from collections import Counter

In [None]:
# Read in merged CSV file
# index_col=0 makes the file's first column the row index of `merge`.
merge = pd.read_csv('merged_files.csv', index_col=0)
# Last expression of the cell: displays the first 6 rows.
merge.head(6)

In [None]:
# Keep only matches whose start_date year is 2020 or later (2020-2024).
# NOTE(review): the three pieces of start_date are labelled year, day, month
# in that order - confirm against the actual date format. Only the year piece
# is used for the filter.
date_part_cols = ['start-year', 'start-day', 'start-month']
merge[date_part_cols] = merge['start_date'].str.split('-', expand=True)

started_2020_or_later = merge['start-year'].astype(int) >= 2020
recent = merge.loc[started_2020_or_later]

# The helper columns are removed from `merge` only; `recent` was taken
# before this drop and therefore still carries them.
merge = merge.drop(columns=date_part_cols)

print(recent.shape)
recent.head(6)

In [None]:
# Create dataframe of matches only from teams participating in the 2024 World Cup
# teams are organized in their groups for the first round, ordered by pre-tournament seed within groups
# Ex: [A1, A2, ..., D4, D5]
with open('teams.txt') as file:
    teams_list = file.read().splitlines()

# A match is kept when either side (batting or bowling) is a World Cup team.
involves_cup_team = (
    recent['bowling_team'].isin(teams_list)
    | recent['batting_team'].isin(teams_list)
)
# Take an explicit copy: later cells add columns to wc20, and assigning to a
# filtered selection of `recent` would raise SettingWithCopyWarning.
wc20 = recent.loc[involves_cup_team].copy()
print(wc20.shape)
wc20.head()

In [None]:
%%capture
# Collect every distinct team name that appears on either side of a wc20
# match (World Cup teams and their opponents).
both_sides = wc20[['batting_team', 'bowling_team']].values.ravel('K')
unique_teams = pd.unique(both_sides)

# Print each name as a quoted stub of the form `"Team" : ,`.
for team_name in unique_teams:
    print(f'"{team_name}"', ":", ",")

In [None]:
# ICC T20 rating for every team that is a World Cup team or has played
# against one. Keys are team names; values are integer ratings, listed from
# highest to lowest.
team_rating = {
    "India": 266,
    "England": 256,
    "Australia": 255,
    "New Zealand": 254,
    "South Africa": 249,
    "Pakistan": 249,
    "West Indies": 245,
    "Sri Lanka": 234,
    "Bangladesh": 227,
    "Afghanistan": 218,
    "Namibia": 196,
    "Ireland": 194,
    "Zimbabwe": 193,
    "Scotland": 190,
    "Netherlands": 183,
    "Nepal": 175,
    "United Arab Emirates": 174,
    "Oman": 154,
    "Papua New Guinea": 143,
    "Canada": 140,
    "Hong Kong": 139,
    "Uganda": 132,
    "United States of America": 131,
    "Jersey": 128,
    "Malaysia": 125,
    "Kuwait": 118,
    "Bahrain": 115,
    "Kenya": 107,
    "Bermuda": 107,
    "Saudi Arabia": 104,
    "Italy": 101,
    "Tanzania": 93,
    "Germany": 92,
    "Singapore": 76,
    "Nigeria": 75,
    "Cayman Islands": 72,
    "Denmark": 71,
    "Vanuatu": 60,
    "Austria": 58,
    "Botswana": 53,
    "Japan": 49,
    "Philippines": 41,
    "Argentina": 39,
    "Malawi": 36,
    "Mozambique": 36,
    "Rwanda": 32,
    "Ghana": 31,
    "Bahamas": 24,
    "Belize": 22,
    "Panama": 19,
    "Lesotho": 9,
    "Maldives": 3,
    "Thailand": 0,
    "Swaziland": 0,
    "Seychelles": 0,
    "Mongolia": 0,
}

In [None]:
# TODO: MAKE THIS - replace the placeholder values with tiered ratings.
#
# Until the tiers are designed, this holds exactly the same team -> rating
# pairs as team_rating. It is built as an independent copy (not a second
# hand-typed literal) so the two cannot drift apart by accident, and so that
# editing this dict later does not modify team_rating.
tiered_team_rating = dict(team_rating)

### Calculate Stats

In [None]:
%%capture
# Ratings are expressed relative to the top rating; ratings used as a divisor
# are floored so very low-rated batting sides do not blow up the ratio.
TOP_RATING = 266
RATING_FLOOR = 50

# Adjusted runs scored: runs off the bat scaled by the bowling side's rating
# as a fraction of the top rating.
team_ratings_mapped = wc20['bowling_team'].map(team_rating)
wc20['adjusted_team_ratings'] = team_ratings_mapped
wc20['adj_runs_scored'] = wc20['runs_off_bat'] * (wc20['adjusted_team_ratings'] / TOP_RATING)

# Adjusted runs conceded: runs off the bat scaled by the top rating over the
# batting side's (floored) rating.
team_ratings_mapped2 = wc20['batting_team'].map(team_rating)
wc20['adjusted_team_ratings2'] = team_ratings_mapped2.clip(lower=RATING_FLOOR)
wc20['adj_conceded_runs'] = wc20['runs_off_bat'] * (TOP_RATING / wc20['adjusted_team_ratings2'])

In [None]:
# Striker totals: sum of adjusted runs per (striker, batting_team).
# Extras are not part of a striker's figure; they are counted against
# bowlers in the conceded-runs calculation.
striker_groups = wc20.groupby(['striker', 'batting_team'], as_index=False)
df = striker_groups[['adj_runs_scored']].sum()

df_sorted = df.sort_values('adj_runs_scored', ascending=False)
df_sorted.head(10)
 

In [None]:
# Balls faced per (striker, batting_team): one wc20 row per ball, counted via
# the non-null bowling_team entries.
num_bowls = (
    wc20
    .groupby(['striker', 'batting_team'], as_index=False)[['bowling_team']]
    .count()
    .rename(columns={'bowling_team': 'n_bowls'})
)

# Attach the ball counts to the run totals, highest adjusted total first.
dfA = (
    df_sorted
    .merge(num_bowls, on=['striker', 'batting_team'])
    .sort_values('adj_runs_scored', ascending=False)
)

In [None]:
# Adjusted runs per ball faced, best strikers first.
dfA['adj_runs_per_bowl'] = dfA['adj_runs_scored'].div(dfA['n_bowls'])
dfA = dfA.sort_values('adj_runs_per_bowl', ascending=False)
dfA.head(10)

In [None]:
# Bowler totals: adjusted runs conceded and extras, summed per
# (bowler, bowling_team). Extras are added on when the per-ball rate is
# computed.
bowler_groups = wc20.groupby(['bowler', 'bowling_team'], as_index=False)
df = bowler_groups[['adj_conceded_runs', 'extras']].sum()

df_sorted = df.sort_values('adj_conceded_runs', ascending=False)
df_sorted.head(10)

In [None]:
# Balls bowled per (bowler, bowling_team): one wc20 row per ball, counted via
# the non-null batting_team entries.
num_bowls = (
    wc20
    .groupby(['bowler', 'bowling_team'], as_index=False)[['batting_team']]
    .count()
    .rename(columns={'batting_team': 'n_bowls'})
)

# Attach the ball counts to the conceded totals, highest total first.
dfB = (
    df_sorted
    .merge(num_bowls, on=['bowler', 'bowling_team'])
    .sort_values('adj_conceded_runs', ascending=False)
)

In [None]:
# Adjusted runs conceded per ball bowled (extras included), most expensive
# bowlers first.
total_conceded = dfB['adj_conceded_runs'].add(dfB['extras'])
dfB['adj_runs_conceded_per_bowl'] = total_conceded.div(dfB['n_bowls'])
dfB = dfB.sort_values('adj_runs_conceded_per_bowl', ascending=False)
dfB.head(10)

In [None]:
# Load the squad list and attach both per-ball rates to it, matching
# players.csv `name` against the striker / bowler names. Left joins keep every
# player; a player with no ball data gets NaN for that rate.
#   - adj_runs_per_bowl is only used for hitters and all_rounders
#   - adj_runs_conceded_per_bowl is only used for bowlers and all_rounders
# NOTE(review): the match is on name alone. dfA / dfB hold one row per
# (player, team), so a name that appears under more than one team would
# produce more than one row for that player here - confirm this cannot happen.
players = (
    pd.read_csv('players.csv')
    .merge(dfA[['striker', 'adj_runs_per_bowl']],
           how='left', left_on='name', right_on='striker')
    .drop(columns='striker')
    .merge(dfB[['bowler', 'adj_runs_conceded_per_bowl']],
           how='left', left_on='name', right_on='bowler')
    .drop(columns='bowler')
)

players.head(7)

### Simulate a Match

In [None]:
# All-rounders belong to both pools. Players with no value for the rate their
# role needs are removed.
bowlers = (
    players[players['position'].isin(['bowler', 'all_rounder'])]
    .dropna(subset=['adj_runs_conceded_per_bowl'])
)
hitters = (
    players[players['position'].isin(['hitter', 'all_rounder'])]
    .dropna(subset=['adj_runs_per_bowl'])
)

# Remove invalid rows from the bowlers frame.
# NOTE(review): these are hard-coded row labels of players.csv after the
# merges; they must be re-checked whenever the input data changes.
bowlers = bowlers.drop(index=[74, 76, 77])

# We have 80 bowlers and 80 hitters,
# with all-rounders listed in both dataframes.
print(bowlers.shape)
print(hitters.shape)

bowlers.head(8)

In [None]:
# The rule used to turn the two rates into a score drives every result!
# The distribution between RPB and RCPB could instead be chosen on a
# per-match basis if there is enough data for a player.

def get_runs(rpb, rcpb):
    """Draw the runs scored off one ball.

    rpb  -- the hitter's adjusted runs per bowl
    rcpb -- the bowler's adjusted runs conceded per bowl

    The value is drawn uniformly between the two rates; they may be passed
    in either order.
    """
    bounds = (rpb, rcpb)
    return rand.uniform(*bounds)

In [None]:
# Every hitter/bowler pairing plays this many balls.
_BALLS_PER_PAIRING = 6
# Overs per innings assumed by the simplified NRR calculation below.
_OVERS_PER_INNINGS = 16


def _innings_score(batting_names, bowling_names):
    """Return the simulated innings total for one batting side.

    Each hitter faces _BALLS_PER_PAIRING balls from each bowler. Player rates
    are read from `players` once per player rather than once per ball; balls
    are drawn hitter by hitter, then bowler by bowler.
    """
    bowler_rates = [
        players.loc[players['name'] == bowler, 'adj_runs_conceded_per_bowl'].values[0]
        for bowler in bowling_names
    ]
    total = 0.0
    for hitter in batting_names:
        rpb = players.loc[players['name'] == hitter, 'adj_runs_per_bowl'].values[0]
        for rcpb in bowler_rates:
            for _ in range(_BALLS_PER_PAIRING):
                total += get_runs(rpb, rcpb)
    return total


def _update_nrr(country, runs_scored, runs_conceded):
    """Add one match to `country`'s running totals in `cup` and refresh its NRR.

    NRR = (Total Runs Scored / Total Overs Faced)
          - (Total Runs Conceded / Total Overs Bowled)
    With every innings treated as _OVERS_PER_INNINGS overs, both divisors equal
    _OVERS_PER_INNINGS * matches played, so the difference of the run totals
    can be divided once.
    """
    row = cup['Country'] == country
    cup.loc[row, 'total_runs_scored'] += runs_scored
    cup.loc[row, 'total_runs_conceded'] += runs_conceded
    cup.loc[row, 'num_matches_played'] += 1

    scored = cup.loc[row, 'total_runs_scored'].values[0]
    conceded = cup.loc[row, 'total_runs_conceded'].values[0]
    played = cup.loc[row, 'num_matches_played'].values[0]
    cup.loc[row, 'NRR'] = (scored - conceded) / (_OVERS_PER_INNINGS * played)


def run_match(country1, country2, print_score=False, update_nrr = True):
    """Simulate one match and return the winner's name, or 'tie'.

    country1, country2 -- team names as they appear in the `country` column of
                          `bowlers` / `hitters` (and in `cup['Country']` when
                          update_nrr is true)
    print_score        -- print both simulated totals
    update_nrr         -- add this match to both teams' run totals, match
                          count and NRR in the global `cup` frame

    Scores are unrounded floats. In reality scores are whole numbers and the
    number of overs varies; for NRR purposes every team is treated as batting
    and bowling 16 overs per match.
    """
    # Names of each side's bowlers and hitters. Plain lists are used so the
    # in-place shuffle below works on data owned by this call (arrays returned
    # by to_numpy() can be read-only views under pandas Copy-on-Write).
    bowlers1 = bowlers.loc[bowlers['country'] == country1, 'name'].tolist()
    bowlers2 = bowlers.loc[bowlers['country'] == country2, 'name'].tolist()
    hitters1 = hitters.loc[hitters['country'] == country1, 'name'].tolist()
    hitters2 = hitters.loc[hitters['country'] == country2, 'name'].tolist()

    # shuffle order of bowlers and hitters for added variability
    for names in (bowlers1, bowlers2, hitters1, hitters2):
        rand.shuffle(names)

    # With 4 hitters and 4 bowlers per side this is
    # 2 innings x 4 hitters x 4 bowlers x 6 bowls = 192 bowls per match.
    country1_score = _innings_score(hitters1, bowlers2)
    country2_score = _innings_score(hitters2, bowlers1)

    if print_score:
        print(country1, "score:", country1_score)
        print(country2, "score:", country2_score)

    if update_nrr:
        _update_nrr(country1, country1_score, country2_score)
        _update_nrr(country2, country2_score, country1_score)

    if country1_score > country2_score:
        return country1
    if country2_score > country1_score:
        return country2
    return 'tie'

In [None]:
# Change these to try a different matchup!
teamA, teamB = 'Australia', 'New Zealand'

# Each call simulates one match. Here the scores are printed and the
# tournament NRR table is left untouched.
winner = run_match(teamA, teamB, print_score=True, update_nrr=False)
print("Winner:", winner)

In [None]:
# Change num_iterations and run this cell to simulate
# many matches for the pair given in the previous cell

num_iterations = 100

# Count the simulated matches won by teamA (scores not printed, NRR not updated).
A_victories = sum(
    run_match(teamA, teamB, False, False) == teamA
    for _ in range(num_iterations)
)
print("Proportion that", teamA, "won:", A_victories/num_iterations)

### Simulate the World Cup

In [None]:
# teams_list is ordered by group, five teams each: A, B, C, D.
group_labels = [label for label in 'ABCD' for _ in range(5)]
cup = pd.DataFrame({'Country': teams_list, 'Group': group_labels})
# Keep the row label as a regular column too.
cup['id'] = cup.index

# Round-robin fixtures: every pair of teams within each first-round group.
A_matches, B_matches, C_matches, D_matches = (
    tuple(combinations(cup.loc[cup['Group'] == label, 'Country'], 2))
    for label in 'ABCD'
)

In [None]:
def round_robin(group_list, match_record):
    """Play every fixture of every group and award points in `cup`.

    group_list   -- iterable of groups, each an iterable of (team1, team2)
                    pairings
    match_record -- list that receives one [winner, loser, 'w'] entry per
                    decided match, or [team1, team2, 't'] for a tie

    A win is worth 2 points; a tie gives 1 point to each side.
    Returns the same match_record list.
    """
    for fixtures in group_list:
        for pairing in fixtures:
            team1, team2 = pairing[0], pairing[1]
            victor = run_match(team1, team2)

            if victor == team1:
                beaten = team2
            elif victor == team2:
                beaten = team1
            else:
                # Tie - very rare in practice.
                tied_rows = cup['Country'].isin([team1, team2])
                cup.loc[tied_rows, 'Points'] += 1
                match_record.append([team1, team2, 't'])
                continue

            cup.loc[cup['Country'] == victor, 'Points'] += 2
            match_record.append([victor, beaten, 'w'])
    return match_record

In [None]:
"""
In the Super Eight round, teams seeded first and second in their groups in the
first round will retain that seeding in the Super Eight, provided they qualify.
(https://www.icc-cricket.com/news/fixtures-revealed-for-historic-icc-men-s-t20-world-cup-2024-in-west-indies-and-the-usa)

For simplicity, all teams' Super 8 placements will be determined 
by their pre-tournament seeds in the cup simulation.
"""

def _play_knockout(team_a, team_b, match_record):
    """Play one knockout match, log it, and return (winner, loser).

    run_match may return 'tie'. In that case team_b advances, so the string
    'tie' is never handed to the next round as if it were a team name.
    """
    result = run_match(team_a, team_b)
    if result == team_a:
        winner, loser = team_a, team_b
    else:
        winner, loser = team_b, team_a
    match_record.append([winner, loser, 'w'])
    return winner, loser


def world_cup(match_record=None):
    """Simulate one complete tournament.

    match_record -- optional list to which every match is appended as
                    [winner, loser, 'w'] (or [team1, team2, 't'] for a tie);
                    a new list is created when omitted

    Resets the per-simulation columns of the global `cup` frame, plays the
    group stage, the Super 8 stage and the knockout stage, and writes each
    team's final stage into cup['Result'].

    Returns (cup, match_record).
    """
    global cup

    # A fresh list per call: a `[]` default would be shared between calls and
    # keep accumulating the matches of earlier simulations.
    if match_record is None:
        match_record = []

    # Reset dataframe values
    cup = cup.assign(
        Points=0,
        Result="",
        total_runs_scored=0.0,
        total_runs_conceded=0.0,
        num_matches_played=0,
        NRR=0.0,
    )

    # Restore the first-round groups: consecutive blocks of five rows.
    group_labels = 'ABCD'
    group_size = 5
    for position, label in enumerate(group_labels):
        first = position * group_size
        cup.loc[first:first + group_size - 1, 'Group'] = label  # .loc slices are inclusive

    # ---- GROUP STAGE ----
    # Group stage is round robin
    round_robin([A_matches, B_matches, C_matches, D_matches], match_record)

    # Exactly two teams qualify per group: the two highest point totals, with
    # ties going to the better pre-tournament seed (earlier row). keep='first'
    # guarantees the outright leader is never displaced by lower-indexed teams
    # that merely tie for second. The pair is then ordered by seed, because
    # Super 8 slots follow pre-tournament seeding rather than finishing order.
    qualifiers = {}
    for label in group_labels:
        group_points = cup.loc[cup['Group'] == label, 'Points']
        qualifiers[label] = group_points.nlargest(2, keep='first').sort_index().index

    A1, A2 = cup.loc[qualifiers['A'], 'Country']
    B1, B2 = cup.loc[qualifiers['B'], 'Country']
    C1, C2 = cup.loc[qualifiers['C'], 'Country']
    D1, D2 = cup.loc[qualifiers['D'], 'Country']

    S8_G1 = [A1, B2, C1, D2]
    S8_G2 = [A2, B1, C2, D1]
    cup.loc[cup['Country'].isin(S8_G1), 'Group'] = 'S8_G1'
    cup.loc[cup['Country'].isin(S8_G2), 'Group'] = 'S8_G2'

    # Everyone who did not qualify finishes at the group stage.
    s8_in = set()
    for group_index in qualifiers.values():
        s8_in.update(group_index)
    eliminated = set(cup.index) - s8_in
    cup.loc[sorted(eliminated), 'Result'] = "Group stage"

    # ---- SUPER 8 STAGE ----
    # NOTE(review): Points are not reset here, so group-stage points count
    # toward the Super 8 standings - confirm this is intended.
    G1_matches = tuple(combinations(S8_G1, 2))
    G2_matches = tuple(combinations(S8_G2, 2))
    round_robin([G1_matches, G2_matches], match_record)

    # NRR comes into effect here to break ties.
    # Could also use (and keep track of) the head-to-head result; the rules are
    # unclear whether head-to-head would be used over NRR when two teams tie.
    # For now NRR is used, as it applies to ties of any size.
    semifinalists = {}
    for label in ('S8_G1', 'S8_G2'):
        standings = cup.loc[cup['Group'] == label, ['Points', 'NRR']].sort_values(
            by=['Points', 'NRR'], ascending=[False, False], kind='stable')
        semifinalists[label] = standings.index[:2]

    g1winner, g1runner = cup.loc[semifinalists['S8_G1'], 'Country']
    g2winner, g2runner = cup.loc[semifinalists['S8_G2'], 'Country']

    SF1 = [g1winner, g2runner]
    SF2 = [g2winner, g1runner]

    sf_in = set(semifinalists['S8_G1']) | set(semifinalists['S8_G2'])
    cup.loc[sorted(s8_in - sf_in), 'Result'] = "Super 8 stage"

    # ---- KNOCKOUT STAGE ----
    # For reference, points are not being awarded at this stage
    sf1_winner, sf1_loser = _play_knockout(SF1[0], SF1[1], match_record)
    cup.loc[cup['Country'] == sf1_loser, 'Result'] = "Semifinalist"

    sf2_winner, sf2_loser = _play_knockout(SF2[0], SF2[1], match_record)
    cup.loc[cup['Country'] == sf2_loser, 'Result'] = "Semifinalist"

    champion, finalist = _play_knockout(sf1_winner, sf2_winner, match_record)
    cup.loc[cup['Country'] == champion, 'Result'] = "Champion"
    cup.loc[cup['Country'] == finalist, 'Result'] = "Finalist"

    return cup, match_record

In [None]:
# Run to display the results of a single simulation
# One simulation should take about 3 seconds
stage_order = ["Group stage", "Super 8 stage", "Semifinalist", "Finalist", "Champion"]

sim_result = world_cup()[0].drop(columns='id')
# Categorical with the stages listed from earliest exit to champion, so that
# sorting follows tournament progress rather than alphabetical order.
sim_result['Result'] = pd.Categorical(sim_result['Result'], categories=stage_order)
sim_result.sort_values(by=['Result', 'NRR'], ascending=[False, False])

In [None]:
# Two-dimensional numpy array of simulation results
# with each row as a simulation, each column as country, entry as placement
# team index is same as in teams_list, starting with India as 0

# TODO: test with tiers

n_simulations = 10  # Change this!

# One frame of match results per simulation. They are concatenated once after
# the loop: calling pd.concat inside the loop would re-copy every previously
# accumulated row on each iteration.
per_simulation_records = []
simulation_results = np.empty((n_simulations, len(teams_list)), dtype=object)
for i in range(n_simulations):
    # Pass a fresh list so each simulation's matches are recorded separately.
    per_simulation_records.append(pd.DataFrame(world_cup([])[1]))
    # world_cup() rebinds the global `cup`; read this run's placements from it.
    placements = cup['Result']
    simulation_results[i] = placements.to_numpy()

# Rebuilt from scratch every time this cell is run.
if per_simulation_records:
    match_records = pd.concat(per_simulation_records, axis=0)
else:
    match_records = pd.DataFrame()

### Analyze Results

In [None]:
# Ex: for every country, print how often each final placement occurred
# across the simulation set.
for team_idx in range(20):
    outcome_counts = Counter(simulation_results[:, team_idx])
    print(teams_list[team_idx], "results:", dict(outcome_counts))

In [None]:
# match_records holds every match result from the simulation set, which makes
# it easy to compute the proportion of matches one team won against another
# across all simulations.
# The outcome column only matters in the rare case of a tie.
# A simulation has 55 matches (40 group, 12 Super 8, 3 knockout), so the
# first 55 rows shown below are the matches of the first simulation.

column_names = {0: 'winner', 1: 'loser', 2: 'outcome'}
match_records = match_records.rename(columns=column_names)
match_records.head(55)

In [None]:
# TODO: Visualize results based on simulation_results array and match_records dataframe