### Setup and Preprocessing

In [None]:
import pandas as pd
import numpy as np
import random as rand
import math

In [None]:
# Load the merged CSV, then preview its columns, its batting teams and its first rows
merge = pd.read_csv('merged_files.csv')
print(merge.columns, merge['batting_team'].unique(), sep='\n')
merge.head()

In [None]:
# Keep only rows whose start_date year is 2020 or later.
# start_date is split on '-' into three helper columns; they are attached to
# `recent` only, so `merge` itself is left without them.
date_parts = merge['start_date'].str.split('-', expand=True)
date_parts.columns = ['start-year', 'start-day', 'start-month']
with_date_parts = merge.join(date_parts)
recent = with_date_parts.loc[with_date_parts['start-year'].astype(int) >= 2020]
print(recent.shape)
recent.head()

In [None]:
# Restrict to rows where at least one side is a team participating in the
# 2024 World Cup (team names are read from teams.txt, one per line)
with open('teams.txt') as teams_file:
    teams_list = teams_file.read().splitlines()
bowling_is_wc_team = recent['bowling_team'].isin(teams_list)
batting_is_wc_team = recent['batting_team'].isin(teams_list)
wc20 = recent.loc[bowling_is_wc_team | batting_is_wc_team]
print(wc20.shape)
wc20.head()

In [None]:
# Which unique teams appear in wc20 (World Cup teams plus the teams they have played)?
# team_list = pd.unique(wc20[['batting_team', 'bowling_team']].values.ravel('K'))
# for team in team_list:
#    print(f"\"{team}\"",":",",")

In [None]:
# ICC T20 rating of every team that is a World Cup team or has played against one,
# keyed by the team name
team_rating = {
    "South Africa": 249,
    "England": 256,
    "Australia": 255,
    "New Zealand": 254,
    "India": 266,
    "Pakistan": 249,
    "Sri Lanka": 234,
    "Ireland": 194,
    "West Indies": 245,
    "Bangladesh": 227,
    "Zimbabwe": 193,
    "Maldives": 3,
    "Oman": 154,
    "Afghanistan": 218,
    "Hong Kong": 139,
    "Nepal": 175,
    "Thailand": 0,
    "Uganda": 132,
    "Namibia": 196,
    "Netherlands": 183,
    "Malaysia": 125,
    "Scotland": 190,
    "Papua New Guinea": 143,
    "Nigeria": 75,
    "Kenya": 107,
    "United Arab Emirates": 174,
    "Malawi": 36,
    "Rwanda": 32,
    "Lesotho": 9,
    "Swaziland": 0,
    "Ghana": 31,
    "Seychelles": 0,
    "Belize": 22,
    "United States of America": 131,
    "Canada": 140,
    "Bermuda": 107,
    "Argentina": 39,
    "Bahamas": 24,
    "Panama": 19,
    "Tanzania": 93,
    "Philippines": 41,
    "Bahrain": 115,
    "Germany": 92,
    "Jersey": 128,
    "Singapore": 76,
    "Botswana": 53,
    "Mozambique": 36,
    "Saudi Arabia": 104,
    "Italy": 101,
    "Denmark": 71,
    "Austria": 58,
    "Vanuatu": 60,
    "Japan": 49,
    "Kuwait": 118,
    "Mongolia": 0,
    "Cayman Islands": 72,
}

In [None]:
# Split the first 20 entries of teams_list into four consecutive groups of five
group_A, group_B, group_C, group_D = (
    teams_list[start:start + 5] for start in range(0, 20, 5)
)

In [None]:
%%capture
# Tier (0-5) of each team, based on ICC T20 rankings.
# Not used for now: the dictionary below is wrapped in a string literal,
# so running this cell does not define tier_list.
"""
tier_list = {
    "India" : 5,
    "England" : 5,
    "Australia" : 5,
    "New Zealand" : 5,
    "Pakistan" : 4,
    "South Africa" : 4,
    "West Indies" : 3,
    "Sri Lanka" : 3,
    "Bangladesh" : 3,
    "Afghanistan" : 3,
    "Namibia" : 2,
    "Ireland" : 2,
    "Scotland" : 2,
    "Netherlands" : 2,
    "Nepal" : 1,
    "Oman" : 1,
    "Papua New Guinea" : 1,
    "Canada" : 0,
    "Uganda" : 0,
    "United States of America" : 0
}
"""

### Calculate Stats

In [None]:
# Total runs off the bat and extras for each (striker, batting team) pair,
# shown with the highest run totals first
batting_columns = ['striker', 'runs_off_bat', 'batting_team', 'extras', 'wides', 'noballs', 'byes', 'legbyes']
striker_totals = wc20[batting_columns].groupby(['striker', 'batting_team'], as_index=False)
df = striker_totals.sum()
df_sorted = df.sort_values('runs_off_bat', ascending=False)
print(df_sorted.head())
 

In [None]:
# n_bowls: number of rows (non-null bowling_team entries) recorded for each
# (striker, batting team) pair, attached to the run totals
striker_keys = ['striker', 'batting_team']
num_bowls = (
    wc20[['striker', 'bowling_team', 'batting_team']]
    .groupby(striker_keys, as_index=False)
    .count()
    .rename(columns={'bowling_team': 'n_bowls'})
)
dfA = (
    df_sorted
    .merge(num_bowls, on=striker_keys)
    .sort_values(by='runs_off_bat', ascending=False)
)
print(dfA)

In [None]:
# Sanity check: V Kohli's rows before and after the ball counts were attached
for stats in (df_sorted, dfA):
    print(stats.loc[stats['striker'] == 'V Kohli'])

In [None]:
# Batting rate: runs off the bat per bowl for each striker (extras not included).
# For bowlers it is probably better to include extras in the conceded runs.
dfA = (
    dfA
    .assign(runs_per_bowl=dfA['runs_off_bat'] / dfA['n_bowls'])
    .sort_values(by='runs_per_bowl', ascending=False)
)
dfA.head(10)

In [None]:
# First step towards runs conceded per bowl (extras included):
# total runs off the bat and extras for each (bowler, bowling team) pair
bowler_totals = wc20[['bowler', 'bowling_team', 'runs_off_bat', 'extras']].groupby(
    ['bowler', 'bowling_team'], as_index=False
)
df = bowler_totals.sum()
df_sorted = df.sort_values('runs_off_bat', ascending=False)
df_sorted.head()

In [None]:
# n_bowls: number of rows (non-null batting_team entries) recorded for each
# (bowler, bowling team) pair, attached to the run totals
bowler_keys = ['bowler', 'bowling_team']
num_bowls = (
    wc20[['bowler', 'bowling_team', 'batting_team']]
    .groupby(bowler_keys, as_index=False)
    .count()
    .rename(columns={'batting_team': 'n_bowls'})
)
dfB = (
    df_sorted
    .merge(num_bowls, on=bowler_keys)
    .sort_values(by='runs_off_bat', ascending=False)
)
print(dfB)

In [None]:
# Bowling rate: (runs off the bat + extras) conceded per bowl for each bowler
dfB = (
    dfB
    .assign(runs_conceded_per_bowl=(dfB['runs_off_bat'] + dfB['extras']) / dfB['n_bowls'])
    .sort_values(by='runs_conceded_per_bowl', ascending=False)
)
dfB.head(10)

In [None]:
players = pd.read_csv('players.csv')

# dfA and dfB hold one row per (player name, team), so the joins below match on
# the team as well as the name. Joining on the name alone would duplicate a
# player's row, and attach another country's numbers to it, whenever two teams
# have a player listed under the same name.
# NOTE(review): assumes players['country'] uses the same team spellings as
# batting_team / bowling_team in the match data -- confirm against players.csv.

# Add runs per bowl values to players.csv
# Only RPB values for hitters and all_rounders will be used
batting_rates = dfA[['striker', 'batting_team', 'runs_per_bowl']].rename(
    columns={'striker': 'name', 'batting_team': 'country'}
)
# validate: each (name, country) key may appear at most once on the right side
players = players.merge(batting_rates, how='left', on=['name', 'country'], validate='many_to_one')

# Add runs conceded per bowl values to players.csv
# Only RCPB values for bowlers and all_rounders will be used
bowling_rates = dfB[['bowler', 'bowling_team', 'runs_conceded_per_bowl']].rename(
    columns={'bowler': 'name', 'bowling_team': 'country'}
)
players = players.merge(bowling_rates, how='left', on=['name', 'country'], validate='many_to_one')

players.head(7)

In [None]:
%%capture
# Potential issue for different players listed under the same name:
# their runs_off_bat (+ extras) are summed together as if they were one player
# (separately for each country), which gives faulty numbers.
# This should not affect any of our selected players.

"""
khan = df2.loc[df2['striker']=='Shoaib Khan']
print(khan)
goud = df2.loc[df2['striker']=='Sandeep Goud']
print(goud)
goud = df2B.loc[df2B['bowler']=='Sandeep Goud']
print(goud)
"""

### Run Simulations

In [None]:
# Build the bowling and batting pools; all_rounders belong to both
bowling_positions = ['bowler', 'all_rounder']
batting_positions = ['hitter', 'all_rounder']

# Players with no value for the relevant rate are removed (one US bowler)
bowlers = (
    players[players['position'].isin(bowling_positions)]
    .dropna(subset=['runs_conceded_per_bowl'])
)
hitters = (
    players[players['position'].isin(batting_positions)]
    .dropna(subset=['runs_per_bowl'])
)

print(bowlers.shape, hitters.shape, sep='\n')

In [None]:
# How we determine which score to use determines our results!
# The distribution of RPB (runs per bowl) and RCPB (runs conceded per bowl)
# could be decided on a per-match basis if there is enough data for a player

def get_runs(rpb, rcpb):
    """Draw the runs scored off a single bowl.

    The value is sampled with ``random.uniform`` between ``rpb`` (the hitter's
    runs per bowl) and ``rcpb`` (the bowler's runs conceded per bowl); the two
    bounds may be given in either order.
    """
    sampled_runs = rand.uniform(rpb, rcpb)
    return sampled_runs

In [None]:
def run_match(country1, country2, print_score=False):
    """Simulate one match between two countries and return the winner.

    A side's score is the sum of ``get_runs`` draws over every pairing of one
    of its hitters with one of the opponent's bowlers, six bowls per pairing
    (2 x 4 hitters x 4 bowlers x 6 bowls = 192 bowls for full squads).

    Parameters
    ----------
    country1, country2 : str
        Values matched against the 'country' column of the module-level
        ``hitters`` and ``bowlers`` frames.
    print_score : bool, default False
        When True, print both simulated scores.

    Returns
    -------
    str
        ``country1`` or ``country2``, whichever scored more, or ``'tie'`` when
        the scores are equal.
    """
    bowls_per_pairing = 6

    def _squad_rates(pool, country, rate_column):
        # Read the rates from the country's own rows. Looking players up by
        # name alone can pick up a same-named player from another country.
        return pool.loc[pool['country'] == country, rate_column].to_numpy()

    def _innings_score(batting_country, bowling_country):
        # Rates are fetched once per innings instead of once per bowl.
        batting_rates = _squad_rates(hitters, batting_country, 'runs_per_bowl')
        bowling_rates = _squad_rates(bowlers, bowling_country, 'runs_conceded_per_bowl')
        total = 0.0
        for rpb in batting_rates:
            for rcpb in bowling_rates:
                for _ in range(bowls_per_pairing):
                    total += get_runs(rpb, rcpb)
        return total

    # country1 bats first so the random draws happen in a fixed order
    country1_score = _innings_score(country1, country2)
    country2_score = _innings_score(country2, country1)

    # Adjust scores in case of US: its score is scaled by 3/4.
    # NOTE(review): presumably compensates for the US pool having one bowler
    # fewer (its opponents face fewer bowls) -- confirm the intended factor.
    if country1 == "United States of America":
        country1_score = country1_score * (3 / 4)
    elif country2 == "United States of America":
        country2_score = country2_score * (3 / 4)

    if print_score:
        print(country1, "score:", country1_score)
        print(country2, "score:", country2_score)

    if country1_score > country2_score:
        return country1
    if country1_score < country2_score:
        return country2
    return 'tie'

In [None]:
# The two sides used by the simulations below -- edit to try a different matchup!
teamA, teamB = 'Canada', 'Australia'

# Every run_match call simulates one match;
# pass True as the third argument to also print the scores
winner = run_match(teamA, teamB)
print("Winner:", winner)

In [None]:
# Run this to simulate many matches for a given pair and estimate how often
# teamA wins (ties and teamB wins both count as "not won").
# Per-match scores are not printed here: with print_score=True the loop
# emitted two lines per match (400 lines for 200 matches) and buried the result.
num_iterations = 200
A = sum(run_match(teamA, teamB) == teamA for _ in range(num_iterations))
print("Proportion that", teamA, "won:", A/num_iterations)

In [None]:
# TODO: Loop through teams list to simulate matches