### Setup and Preprocessing

In [1]:
import pandas as pd
import numpy as np
import random as rand
import math

In [2]:
# Read in merged CSV file
merge = pd.read_csv('merged_files.csv')
print(merge.columns)
print(merge['batting_team'].unique())
merge.head()

Index(['Unnamed: 0', 'match_id', 'season', 'start_date', 'venue', 'innings',
       'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker',
       'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes',
       'legbyes', 'penalty', 'wicket_type', 'player_dismissed'],
      dtype='object')
['England' 'Australia' 'New Zealand' 'South Africa' 'Pakistan' 'Sri Lanka'
 'West Indies' 'India' 'Kenya' 'Scotland' 'Zimbabwe' 'Bangladesh'
 'Bermuda' 'Netherlands' 'Ireland' 'Afghanistan' 'Canada' 'Nepal'
 'Hong Kong' 'United Arab Emirates' 'Papua New Guinea' 'Oman'
 'ICC World XI' 'Philippines' 'Vanuatu' 'United States of America'
 'Germany' 'Italy' 'Ghana' 'Namibia' 'Uganda' 'Botswana' 'Nigeria'
 'Guernsey' 'Denmark' 'Norway' 'Jersey' 'Thailand' 'Malaysia' 'Maldives'
 'Singapore' 'Qatar' 'Kuwait' 'Cayman Islands' 'Portugal' 'Spain'
 'Gibraltar' 'Bhutan' 'Saudi Arabia' 'Bahrain' 'Iran' 'Belgium'
 'Luxembourg' 'Czech Republic' 'Isle of Man' 'Bulgaria' 'Romania'
 'Austria' 'Greece' 

Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,...,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed
0,0,211028,2005,2005-06-13,The Rose Bowl,1,0.1,England,Australia,ME Trescothick,...,B Lee,0,0,,,,,,,
1,1,211028,2005,2005-06-13,The Rose Bowl,1,0.2,England,Australia,ME Trescothick,...,B Lee,1,0,,,,,,,
2,2,211028,2005,2005-06-13,The Rose Bowl,1,0.3,England,Australia,GO Jones,...,B Lee,0,0,,,,,,,
3,3,211028,2005,2005-06-13,The Rose Bowl,1,0.4,England,Australia,GO Jones,...,B Lee,0,0,,,,,,,
4,4,211028,2005,2005-06-13,The Rose Bowl,1,0.5,England,Australia,GO Jones,...,B Lee,0,0,,,,,,,


In [3]:
# We will use only data from the last 4 years (2020-2024)
merge[['start-year', 'start-day', 'start-month']] = merge['start_date'].str.split('-', expand=True)
recent = merge.loc[merge['start-year'].astype(int) >= 2020]
merge = merge.drop(['start-year', 'start-day', 'start-month'], axis=1)
print(recent.shape)
recent.head()

(285767, 24)


Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,...,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,start-year,start-day,start-month
169922,0,1185313,2019/20,2020-02-12,Buffalo Park,1,0.1,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169923,1,1185313,2019/20,2020-02-12,Buffalo Park,1,0.2,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169924,2,1185313,2019/20,2020-02-12,Buffalo Park,1,0.3,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169925,3,1185313,2019/20,2020-02-12,Buffalo Park,1,0.4,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169926,4,1185313,2019/20,2020-02-12,Buffalo Park,1,0.5,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12


In [4]:
# Create dataframe of matches only from teams participating in the 2024 World Cup
with open('teams.txt') as file:
    teams_list = file.read().splitlines()
wc20=recent.loc[(recent['bowling_team'].isin(teams_list)) | (recent['batting_team'].isin(teams_list))]
print(wc20.shape)
wc20.head()

(147566, 24)


Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,...,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,start-year,start-day,start-month
169922,0,1185313,2019/20,2020-02-12,Buffalo Park,1,0.1,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169923,1,1185313,2019/20,2020-02-12,Buffalo Park,1,0.2,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169924,2,1185313,2019/20,2020-02-12,Buffalo Park,1,0.3,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169925,3,1185313,2019/20,2020-02-12,Buffalo Park,1,0.4,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169926,4,1185313,2019/20,2020-02-12,Buffalo Park,1,0.5,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12


In [5]:
# How many unique teams are there in wc20 competition with World Cup teams?
# team_list = pd.unique(wc20[['batting_team', 'bowling_team']].values.ravel('K'))
# for team in team_list:
#    print(f"\"{team}\"",":",",")

In [6]:
#ICC T20 rating for all teams who have played against or are a World Cup team
team_rating = {
    "South Africa" : 249,
    "England" : 256,
    "Australia" : 255,
    "New Zealand" : 254,
    "India" : 266,
    "Pakistan" : 249,
    "Sri Lanka" : 234,
    "Ireland" : 194,
    "West Indies" : 245,
    "Bangladesh" : 227,
    "Zimbabwe" : 193,
    "Maldives" : 3,
    "Oman" : 154,
    "Afghanistan" : 218,
    "Hong Kong" : 139,
    "Nepal" : 175,
    "Thailand" : 0,
    "Uganda" : 132,
    "Namibia" : 196,
    "Netherlands" : 183,
    "Malaysia" : 125,
    "Scotland" : 190,
    "Papua New Guinea" : 143,
    "Nigeria" : 75,
    "Kenya" : 107,
    "United Arab Emirates" : 174,
    "Malawi" : 36,
    "Rwanda" : 32,
    "Lesotho" : 9,
    "Swaziland" : 0,
    "Ghana" : 31,
    "Seychelles" : 0,
    "Belize" : 22,
    "United States of America" : 131,
    "Canada" : 140,
    "Bermuda" : 107,
    "Argentina" : 39,
    "Bahamas" : 24,
    "Panama" : 19,
    "Tanzania" : 93,
    "Philippines" : 41,
    "Bahrain" : 115,
    "Germany" : 92,
    "Jersey" : 128,
    "Singapore" : 76,
    "Botswana" : 53,
    "Mozambique" : 36,
    "Saudi Arabia" : 104,
    "Italy" : 101,
    "Denmark" : 71,
    "Austria" : 58,
    "Vanuatu" : 60,
    "Japan" : 49,
    "Kuwait" : 118,
    "Mongolia" : 0, 
    "Cayman Islands" : 72,
}

In [7]:
group_A = teams_list[0:5]
group_B = teams_list[5:10]
group_C = teams_list[10:15]
group_D = teams_list[15:20]

In [8]:
%%capture
# Store level of each team in dictionary
# Based on ICC T20 rankings
# Not used for now
"""
tier_list = {
    "India" : 5,
    "England" : 5,
    "Australia" : 5,
    "New Zealand" : 5,
    "Pakistan" : 4,
    "South Africa" : 4,
    "West Indies" : 3,
    "Sri Lanka" : 3,
    "Bangladesh" : 3,
    "Afghanistan" : 3,
    "Namibia" : 2,
    "Ireland" : 2,
    "Scotland" : 2,
    "Netherlands" : 2,
    "Nepal" : 1,
    "Oman" : 1,
    "Papua New Guinea" : 1,
    "Canada" : 0,
    "Uganda" : 0,
    "United States of America" : 0
}
"""

### Calculate Stats

In [56]:
team_ratings_mapped = wc20['bowling_team'].map(team_rating)
wc20['adj_runs_off_bat'] = wc20['runs_off_bat'] * (team_ratings_mapped / 266)

wc20['adjusted_team_ratings2'] = team_ratings_mapped2
wc20.loc[wc20['adjusted_team_ratings2'] < 50, 'adjusted_team_ratings'] = 50
wc20['adj_conceded_runs_off_bat'] = wc20['runs_off_bat'] * ( 266 / team_ratings_mapped2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wc20['adj_runs_off_bat'] = wc20['runs_off_bat'] * (team_ratings_mapped / 266)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wc20['adjusted_team_ratings2'] = team_ratings_mapped2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wc20.loc[wc20['adjusted_team_ratings2'] < 50, 'adjusted_team_ratings'] = 

In [46]:
#checks out
df = (wc20
       .loc[:, ['striker', 'adj_runs_off_bat', 'batting_team']]
       .groupby(['striker', 'batting_team'], as_index = False)
       .sum())
df_sorted = df.sort_values(by='adj_runs_off_bat', ascending=False)
print(df_sorted.head())
 

             striker batting_team  adj_runs_off_bat
730  Mohammad Rizwan     Pakistan       2522.421053
186       Babar Azam     Pakistan       2069.703008
979         SA Yadav        India       1935.890977
491       JC Buttler      England       1561.943609
365      GD Phillips  New Zealand       1505.657895


In [47]:
num_bowls = (wc20
       .loc[:, ['striker', 'bowling_team', 'batting_team']]
       .groupby(['striker', 'batting_team'], as_index = False)
       .count()
       .rename(columns = {'bowling_team' : 'n_bowls'})
        )
dfA = df_sorted.merge(num_bowls, on = ['striker', 'batting_team'])
dfA = dfA.sort_values(by = 'adj_runs_off_bat', ascending = False)
print(dfA)

              striker batting_team  adj_runs_off_bat  n_bowls
0     Mohammad Rizwan     Pakistan       2522.421053     2220
1          Babar Azam     Pakistan       2069.703008     1808
2            SA Yadav        India       1935.890977     1296
3          JC Buttler      England       1561.943609     1143
4         GD Phillips  New Zealand       1505.657895     1155
...               ...          ...               ...      ...
1151    Rizwan Haider     Malaysia          0.000000        1
1150    Garret Banner       Belize          0.000000        3
1149    Muhammad Khan    Hong Kong          0.000000       12
1148        H Fennell    Argentina          0.000000        1
1218         R Mondol   Bangladesh          0.000000        7

[1219 rows x 4 columns]


In [48]:
# check numbers for V Kohli
print(df_sorted[df_sorted['striker'] == 'V Kohli'])
print(dfA[dfA['striker'] == 'V Kohli'])

      striker batting_team  adj_runs_off_bat
1154  V Kohli        India       1261.804511
   striker batting_team  adj_runs_off_bat  n_bowls
7  V Kohli        India       1261.804511     1050


In [50]:
# Calculate runs per bowl for strikers (extras not included)
# probably better to include extras in conceded runs for bowlers
dfA['adj_runs_per_bowl'] = (dfA['adj_runs_off_bat'] / dfA['n_bowls'])
dfA = dfA.sort_values(by = 'adj_runs_per_bowl', ascending = False)
dfA.head(10)


Unnamed: 0,striker,batting_team,adj_runs_off_bat,n_bowls,adj_runs_per_bowl
642,Shahnawaz Dhani,Pakistan,16.0,6,2.666667
872,Umran Malik,India,4.699248,2,2.349624
878,Mukesh Kumar,India,4.605263,2,2.302632
623,Rakibul Hasan,Bangladesh,17.744361,8,2.218045
770,AM Fernando,Sri Lanka,8.533835,4,2.133459
984,Aziz Sualley,Ghana,1.984962,1,1.984962
992,CB Sole,Scotland,1.909774,1,1.909774
791,Aminul Islam Biplob,Bangladesh,7.488722,4,1.87218
691,SC Kuggeleijn,New Zealand,13.105263,7,1.87218
909,W Barresi,Netherlands,3.684211,2,1.842105


In [59]:
# Calculate runs conceded per bowl for bowlers (extras included)
df = (wc20
       .loc[:, ['bowler', 'bowling_team', 'adj_conceded_runs_off_bat', 'extras']]
       .groupby(['bowler', 'bowling_team'], as_index = False)
       .sum())
df_sorted = df.sort_values(by='adj_conceded_runs_off_bat', ascending=False)
df_sorted.head()

Unnamed: 0,bowler,bowling_team,adj_conceded_runs_off_bat,extras
849,Zeeshan Maqsood,Oman,2706.274753,12
124,Bilal Khan,Oman,2503.244489,38
40,AR Ramjani,Uganda,2244.036409,27
385,K Waiswa,Uganda,2141.823521,49
481,MR Adair,Ireland,2127.586761,134


In [60]:
num_bowls = (wc20
       .loc[:, ['bowler', 'bowling_team', 'batting_team']]
       .groupby(['bowler', 'bowling_team'], as_index = False)
       .count()
       .rename(columns = {'batting_team' : 'n_bowls'})
        )
dfB = df_sorted.merge(num_bowls, on = ['bowler', 'bowling_team'])
dfB = dfB.sort_values(by = 'adj_conceded_runs_off_bat', ascending = False)
print(dfB)

              bowler bowling_team  adj_conceded_runs_off_bat  extras  n_bowls
0    Zeeshan Maqsood         Oman                2706.274753      12      424
1         Bilal Khan         Oman                2503.244489      38      648
2         AR Ramjani       Uganda                2244.036409      27      601
3           K Waiswa       Uganda                2141.823521      49      540
4           MR Adair      Ireland                2127.586761     134     1277
..               ...          ...                        ...     ...      ...
845         P Khadka        Nepal                        NaN       1       25
846       R Agamiire       Uganda                        NaN       7       37
847  Riazat Ali Shah       Uganda                        NaN      72      661
848     S Lamichhane        Nepal                        NaN      58      738
849      Sompal Kami        Nepal                        NaN      60      589

[850 rows x 5 columns]


In [61]:
dfB['adj_runs_conceded_per_bowl'] = ((dfB['adj_conceded_runs_off_bat'] + dfB['extras']) / dfB['n_bowls'])
dfB = dfB.sort_values(by = 'adj_conceded_runs_off_bat', ascending = False)
dfB.head(10)

Unnamed: 0,bowler,bowling_team,adj_conceded_runs_off_bat,extras,n_bowls,adj_runs_conceded_per_bowl
0,Zeeshan Maqsood,Oman,2706.274753,12,424,6.411025
1,Bilal Khan,Oman,2503.244489,38,648,3.921674
2,AR Ramjani,Uganda,2244.036409,27,601,3.778763
3,K Waiswa,Uganda,2141.823521,49,540,4.057081
4,MR Adair,Ireland,2127.586761,134,1277,1.771015
5,Haris Rauf,Pakistan,2105.100956,114,1483,1.496359
6,IS Sodhi,New Zealand,2067.358027,86,1502,1.43366
7,S Nsubuga,Uganda,1914.364606,7,257,7.476127
8,J Little,Ireland,1878.203676,110,1197,1.660989
9,Mohammad Nadeem,Oman,1834.257202,25,249,7.466896


In [62]:
players = pd.read_csv('players.csv')

# Add runs per bowl values to players.csv 
# Only RPB values for hitters and all_rounders will be used
players = players.merge(dfA[['striker', 'adj_runs_per_bowl']], how='left', left_on='name',right_on='striker')
players = players.drop(columns = ['striker'])

# Add runs conceded per bowl values to players.csv 
# Only RCPB values for bowlers and all_rounders will be used
players = players.merge(dfB[['bowler', 'adj_runs_conceded_per_bowl']], how='left', left_on='name',right_on='bowler')
players = players.drop(columns = ['bowler'])

players.head(7)

Unnamed: 0,name,position,country,adj_runs_per_bowl,adj_runs_conceded_per_bowl
0,Pargat Singh,hitter,Canada,0.381827,3.295069
1,NR Kirton,hitter,Canada,0.435024,7.872274
2,NS Dhaliwal,hitter,Canada,0.378526,14.104895
3,Saad Bin Zafar,all_rounder,Canada,0.571145,3.523441
4,N Dutta,bowler,Canada,0.6944,2.522747
5,Kaleem Sana,bowler,Canada,0.132946,2.55306
6,JOA Gordon,bowler,Canada,0.201128,3.262785


In [63]:
%%capture
# Potential issue for players listed under the same name:
# runs_off_bat (+ extras) grouped together for all players,
# but separated by country, resulting in faulty numbers
# should not affect any of our selected players

"""
khan = df2.loc[df2['striker']=='Shoaib Khan']
print(khan)
goud = df2.loc[df2['striker']=='Sandeep Goud']
print(goud)
goud = df2B.loc[df2B['bowler']=='Sandeep Goud']
print(goud)
"""

### Run Simulations

In [64]:
bowlers = players[((players['position'] == 'bowler') | (players['position'] == 'all_rounder'))]
hitters = players[((players['position'] == 'hitter') | (players['position'] == 'all_rounder'))]

# Remove players for which there is no data (one US bowler)
bowlers = bowlers.dropna(subset=['adj_runs_conceded_per_bowl'])
hitters = hitters.dropna(subset=['adj_runs_per_bowl'])

print(bowlers.shape)
print(hitters.shape)

(73, 5)
(80, 5)


In [65]:
# How we determine which score to use determines our results! 
# The distribution of RPB and RPCB could be decided on a per-match basis 
# if there is enough data for a player

def get_runs(rpb, rcpb):
    return rand.uniform(rpb,rcpb)

In [66]:
def run_match(country1, country2, print_score=False):
    
    #make a vector for each team's batters/bowlers 
    bowlers1 = bowlers.loc[bowlers['country'] == country1]
    bowlers1 = bowlers1['name'].to_numpy()

    bowlers2 = bowlers.loc[bowlers['country'] == country2]
    bowlers2 = bowlers2['name'].to_numpy()

    hitters1 = hitters.loc[hitters['country'] == country1]
    hitters1 = hitters1['name'].to_numpy()

    hitters2 = hitters.loc[hitters['country'] == country2]
    hitters2 = hitters2['name'].to_numpy()

    #base scores
    country1_score = 0.0
    country2_score = 0.0 
    
    # 2 x 4 hitters x 4 bowlers x 6 bowls
    # 192 total bowls (except for US)

    #first team score:
    for hitter in hitters1:
        for bowler in bowlers2:
            for x in range(6):
                rpb = (players.loc[players['name'] == hitter, 'adj_runs_per_bowl'].values)[0]
                rcpb = (players.loc[players['name'] == bowler, 'adj_runs_conceded_per_bowl'].values)[0]
                country1_score += get_runs(rpb, rcpb)
              
    
    #second team score:
    for hitter in hitters2:
        for bowler in bowlers1:
            for x in range(6):
                rpb = (players.loc[players['name'] == hitter, 'adj_runs_per_bowl'].values)[0]
                rcpb = (players.loc[players['name'] == bowler, 'adj_runs_conceded_per_bowl'].values)[0]
                country2_score += get_runs(rpb, rcpb)

# Adjust scores in case of US
    if (country1 == "United States of America"):
        country1_score = country1_score*(3/4)
    elif (country2 == "United States of America"):
        country2_score = country2_score*(3/4)
    
    if(print_score):
        print(country1, "score:", country1_score)
        print(country2, "score:", country2_score)

    if country1_score > country2_score:
        return country1
    elif country1_score < country2_score:
        return country2
    else:
        return 'tie'

In [71]:
# Change these to try a different matchup!
teamA = 'India'
teamB = 'Australia'

# One match simulated with each function call
# Add 'True' to run_match to print scores
winner = run_match(teamA, teamB)
print("Winner:", winner)

Winner: India


In [72]:
# Run this to simulate many matches for a given pair
num_iterations = 200
A = 0
for i in range(num_iterations):
    if(run_match(teamA,teamB,True)==teamA):
        A+=1
print("Proportion that", teamA, "won:",A/num_iterations)

India score: 128.72196987291247
Australia score: 125.98582155270411
India score: 129.44847026942782
Australia score: 125.1526296938788
India score: 129.2465478014586
Australia score: 126.52853245483136
India score: 128.52212251663735
Australia score: 125.21343524844734
India score: 128.94417332189676
Australia score: 125.9920839754067
India score: 128.9113147006774
Australia score: 126.16345270647743
India score: 128.88018420954805
Australia score: 125.07863553619836
India score: 129.09486651351602
Australia score: 125.44959770142864
India score: 130.33519686184533
Australia score: 126.52294238265017
India score: 128.57675169571502
Australia score: 125.81700576192705
India score: 129.49177911659692
Australia score: 125.70503738215368
India score: 129.2156722416607
Australia score: 126.22988738112622
India score: 129.35871540960858
Australia score: 126.51365847920488
India score: 128.4117408879636
Australia score: 125.94159826333423
India score: 129.21065503393453
Australia score: 126.4

India score: 128.68353088277559
Australia score: 125.39215080428625
India score: 129.66015406775006
Australia score: 125.32379738452018
India score: 128.98966431402968
Australia score: 125.69665651117386
India score: 129.2452353757187
Australia score: 125.97504261288543
India score: 128.12180150462
Australia score: 125.47443980265379
India score: 128.50094647983258
Australia score: 126.13709917636595
India score: 129.27823426438144
Australia score: 125.92640866792209
India score: 128.66760748931011
Australia score: 124.99245316663931
India score: 129.56119899428614
Australia score: 125.74190759096429
India score: 129.24835233391462
Australia score: 125.71186079194791
India score: 129.2351018856858
Australia score: 126.70814752061227
India score: 129.1259250438212
Australia score: 126.01867198722375
India score: 129.73878214337904
Australia score: 126.22627842372846
India score: 129.3474797519251
Australia score: 125.93415853337635
India score: 129.39577340403207
Australia score: 126.10

In [None]:
# TODO: Loop through teams list to simulate matches