In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read in merged CSV file
merge = pd.read_csv('merged_files.csv')
print(merge.columns)
print(merge['batting_team'].unique())
merge.head()

Index(['Unnamed: 0', 'match_id', 'season', 'start_date', 'venue', 'innings',
       'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker',
       'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes',
       'legbyes', 'penalty', 'wicket_type', 'player_dismissed'],
      dtype='object')
['England' 'Australia' 'New Zealand' 'South Africa' 'Pakistan' 'Sri Lanka'
 'West Indies' 'India' 'Kenya' 'Scotland' 'Zimbabwe' 'Bangladesh'
 'Bermuda' 'Netherlands' 'Ireland' 'Afghanistan' 'Canada' 'Nepal'
 'Hong Kong' 'United Arab Emirates' 'Papua New Guinea' 'Oman'
 'ICC World XI' 'Philippines' 'Vanuatu' 'United States of America'
 'Germany' 'Italy' 'Ghana' 'Namibia' 'Uganda' 'Botswana' 'Nigeria'
 'Guernsey' 'Denmark' 'Norway' 'Jersey' 'Thailand' 'Malaysia' 'Maldives'
 'Singapore' 'Qatar' 'Kuwait' 'Cayman Islands' 'Portugal' 'Spain'
 'Gibraltar' 'Bhutan' 'Saudi Arabia' 'Bahrain' 'Iran' 'Belgium'
 'Luxembourg' 'Czech Republic' 'Isle of Man' 'Bulgaria' 'Romania'
 'Austria' 'Greece' 

Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,...,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed
0,0,211028,2005,2005-06-13,The Rose Bowl,1,0.1,England,Australia,ME Trescothick,...,B Lee,0,0,,,,,,,
1,1,211028,2005,2005-06-13,The Rose Bowl,1,0.2,England,Australia,ME Trescothick,...,B Lee,1,0,,,,,,,
2,2,211028,2005,2005-06-13,The Rose Bowl,1,0.3,England,Australia,GO Jones,...,B Lee,0,0,,,,,,,
3,3,211028,2005,2005-06-13,The Rose Bowl,1,0.4,England,Australia,GO Jones,...,B Lee,0,0,,,,,,,
4,4,211028,2005,2005-06-13,The Rose Bowl,1,0.5,England,Australia,GO Jones,...,B Lee,0,0,,,,,,,


In [4]:
# We will use only data from the last 4 years (2020-2024)
merge[['start-year', 'start-day', 'start-month']] = merge['start_date'].str.split('-', expand=True)
df = merge.loc[merge['start-year'].astype(int) >= 2020]
merge = merge.drop(['start-year', 'start-day', 'start-month'], axis=1)
print(df.shape)
df.head()

(285767, 24)


Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,...,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,start-year,start-day,start-month
169922,0,1185313,2019/20,2020-02-12,Buffalo Park,1,0.1,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169923,1,1185313,2019/20,2020-02-12,Buffalo Park,1,0.2,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169924,2,1185313,2019/20,2020-02-12,Buffalo Park,1,0.3,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169925,3,1185313,2019/20,2020-02-12,Buffalo Park,1,0.4,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169926,4,1185313,2019/20,2020-02-12,Buffalo Park,1,0.5,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12


In [5]:
# Create dataframe of matches only from teams participating in the 2024 World Cup
with open('teams.txt') as file:
    teams_list = file.read().splitlines()
wc20=df.loc[(df['bowling_team'].isin(teams_list)) | (df['batting_team'].isin(teams_list))]
print(wc20.shape)
wc20.head()

(147566, 24)


Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,...,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,start-year,start-day,start-month
169922,0,1185313,2019/20,2020-02-12,Buffalo Park,1,0.1,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169923,1,1185313,2019/20,2020-02-12,Buffalo Park,1,0.2,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169924,2,1185313,2019/20,2020-02-12,Buffalo Park,1,0.3,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169925,3,1185313,2019/20,2020-02-12,Buffalo Park,1,0.4,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169926,4,1185313,2019/20,2020-02-12,Buffalo Park,1,0.5,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12


In [6]:
group_A = teams_list[0:5]
group_B = teams_list[5:10]
group_C = teams_list[10:15]
group_D = teams_list[15:20]

### Calculate Stats

In [36]:
_df = (wc20
       .loc[:, ['striker', 'runs_off_bat', 'batting_team', 'extras', 'wides', 'noballs','byes','legbyes']]
       .groupby(['striker', 'batting_team'], as_index = False)
       .sum())
_df_sorted = _df.sort_values(by='runs_off_bat', ascending=False)
print(_df_sorted.head())
print(_df_sorted[_df_sorted['striker'] == 'V Kohli'])
#checks out 

             striker batting_team  runs_off_bat  extras  wides  noballs  byes  \
730  Mohammad Rizwan     Pakistan          2801     128   73.0      7.0  12.0   
186       Babar Azam     Pakistan          2293      86   51.0      7.0   5.0   
979         SA Yadav        India          2141      67   50.0      4.0   6.0   
365      GD Phillips  New Zealand          1677      66   38.0      8.0   5.0   
961         S Sesazi       Uganda          1670     125   78.0     11.0   2.0   

     legbyes  
730     36.0  
186     23.0  
979      7.0  
365     15.0  
961     34.0  
      striker batting_team  runs_off_bat  extras  wides  noballs  byes  \
1154  V Kohli        India          1410      67   43.0      5.0   8.0   

      legbyes  
1154     11.0  


In [32]:
num_bowls = (wc20
       .loc[:, ['striker', 'bowling_team', 'batting_team']]
       .groupby(['striker', 'batting_team'], as_index = False)
       .count()
       .rename(columns = {'bowling_team' : 'n_bowls'})
        )
df2 = _df_sorted.merge(num_bowls, on = ['striker', 'batting_team'])
df2 = df2.sort_values(by = 'runs_off_bat', ascending = False)
print(df2)

              striker batting_team  runs_off_bat  extras  wides  noballs  \
0     Mohammad Rizwan     Pakistan          2801     128   73.0      7.0   
1          Babar Azam     Pakistan          2293      86   51.0      7.0   
2            SA Yadav        India          2141      67   50.0      4.0   
3         GD Phillips  New Zealand          1677      66   38.0      8.0   
4            S Sesazi       Uganda          1670     125   78.0     11.0   
...               ...          ...           ...     ...    ...      ...   
1150      Surya Anand      Denmark             0       0    0.0      0.0   
1149    Shahbaz Badar      Bahrain             0       1    1.0      0.0   
1148       E Munkhbat     Mongolia             0       1    1.0      0.0   
1147        AAA Patel      Lesotho             0       0    0.0      0.0   
1218         DE Budge     Scotland             0       0    0.0      0.0   

      byes  legbyes  n_bowls  
0     12.0     36.0     2220  
1      5.0     23.0     1

In [None]:
# check numbers for V Kohli
print(df2[df2['striker'] == 'V Kohli'])

In [31]:
# Calculate runs per bowl for strikers
# getting rid of extras as part of batters run, probably better to include in conceded runs for bowlers
df2['runs_per_bowl'] = (df2['runs_off_bat'] / df2['n_bowls'])
df2 = df2.sort_values(by = 'runs_per_bowl', ascending = False)
print(df2.head(10))

              striker batting_team  runs_off_bat  extras  wides  noballs  \
968      Aziz Sualley        Ghana             4       0    0.0      0.0   
737   Shahnawaz Dhani     Pakistan            16       0    0.0      0.0   
926       Umran Malik        India             5       0    0.0      0.0   
939      Mukesh Kumar        India             5       0    0.0      0.0   
944         W Barresi  Netherlands             5       0    0.0      0.0   
812       AM Fernando    Sri Lanka            10       2    1.0      1.0   
707     Rakibul Hasan   Bangladesh            18       1    1.0      0.0   
824          A Kapoor       Canada             9       0    0.0      0.0   
664     S Shrivastava         Oman            22       0    0.0      0.0   
1046          CB Sole     Scotland             2       0    0.0      0.0   

      byes  legbyes  n_bowls  runs_per_bowl  
968    0.0      0.0        1       4.000000  
737    0.0      0.0        6       2.666667  
926    0.0      0.0      

In [11]:
us = df2[df2['batting_team'] == 'United States of America']
#us.sort_values(by = 'runs_per_bowl', ascending = False)

In [17]:
#including extras 
_dfB = (wc20
       .loc[:, ['bowler', 'bowling_team', 'runs_off_bat', 'extras']]
       .groupby(['bowler', 'bowling_team'], as_index = False)
       .sum())
_df_sortedB = _dfB.sort_values(by='runs_off_bat', ascending=False)
print(_df_sortedB.head())

         bowler bowling_team  runs_off_bat  extras
289  Haris Rauf     Pakistan          1897     114
306    IS Sodhi  New Zealand          1860      86
42    AU Rashid      England          1667      61
772  TG Southee  New Zealand          1658     105
481    MR Adair      Ireland          1509     134


In [18]:
num_bowlsB = (wc20
       .loc[:, ['bowler', 'bowling_team', 'batting_team']]
       .groupby(['bowler', 'bowling_team'], as_index = False)
       .count()
       .rename(columns = {'batting_team' : 'n_bowls'})
        )
df2B = _df_sortedB.merge(num_bowlsB, on = ['bowler', 'bowling_team'])
df2B = df2B.sort_values(by = 'runs_off_bat', ascending = False)
print(df2B)

           bowler              bowling_team  runs_off_bat  extras  n_bowls
0      Haris Rauf                  Pakistan          1897     114     1483
1        IS Sodhi               New Zealand          1860      86     1502
2       AU Rashid                   England          1667      61     1413
3      TG Southee               New Zealand          1658     105     1350
4        MR Adair                   Ireland          1509     134     1277
..            ...                       ...           ...     ...      ...
845       SS Iyer                     India             2       0        2
846     SJ Modani  United States of America             2       0        6
847   KNA Bandara                 Sri Lanka             2       0        6
848   JEA Doctora               Philippines             1       1        3
849  Sandeep Goud                   Bahamas             1       0        1

[850 rows x 5 columns]


In [20]:
df2B['runs_conceded_per_bowl'] = ((df2B['runs_off_bat'] + df2B['extras']) / df2B['n_bowls'])
df2B = df2B.sort_values(by = 'runs_conceded_per_bowl', ascending = False)
print(df2B.head(10))
#same problem with Sandeep Goud

            bowler  bowling_team  runs_off_bat  extras  n_bowls  \
786     KK Tillett        Belize            12       5        3   
688    Umair Tariq       Austria            24       0        6   
669      B Terbish      Mongolia            26       1        7   
705   RR Hendricks  South Africa            21       0        6   
710  Yusuf Ebrahim        Panama            21       0        6   
572  Gurdeep Singh         Kenya            41       0       12   
526  M Altankhuyag      Mongolia            50       5       17   
726      NR Kirton        Canada            19       0        6   
733    RS Fernando     Sri Lanka            17       1        6   
731     KD Karthik         India            18       0        6   

     runs_conceded_per_bowl  
786                5.666667  
688                4.000000  
669                3.857143  
705                3.500000  
710                3.500000  
572                3.416667  
526                3.235294  
726                3.1

In [21]:
usB = df2B[df2B['bowling_team'] == 'United States of America']
usB.sort_values(by = 'runs_conceded_per_bowl', ascending = False)

Unnamed: 0,bowler,bowling_team,runs_off_bat,extras,n_bowls,runs_conceded_per_bowl
516,MO Kain,United States of America,53,2,34,1.617647
402,Aaron Jones,United States of America,89,1,57,1.578947
389,CAH Stevenson,United States of America,96,5,74,1.364865
424,Yasir Mohammad,United States of America,79,9,68,1.294118
324,Ali Khan,United States of America,139,16,122,1.270492
246,J Theron,United States of America,212,16,184,1.23913
598,K Gore,United States of America,36,0,30,1.2
568,TO Carmichael,United States of America,42,1,37,1.162162
228,SN Netravalkar,United States of America,230,8,231,1.030303
230,NK Patel,United States of America,228,7,254,0.925197


In [22]:
players = pd.read_csv('players.csv')

# Add runs per bowl values to players.csv (for hitters & all-rounders)
# Ignore rpb values for bowlers for now
players = players.merge(df2[['striker', 'runs_per_bowl']], how='left', left_on='name',right_on='striker')
players = players.drop(columns = ['striker'])

players = players.merge(df2B[['bowler', 'runs_conceded_per_bowl']], how='left', left_on='name',right_on='bowler')
players = players.drop(columns = ['bowler'])

In [None]:
players.head(7)
#pd.set_option('display.max_rows', None)
#players

In [None]:
# Potential issue for players listed under the same name:
# runs_off_bat (+ extras) grouped together for all players,
# but separated by country, resulting in faulty numbers

"""
pd.set_option('display.max_rows', None)
khan = df2.loc[df2['striker']=='Shoaib Khan']
print(khan)
goud = df2.loc[df2['striker']=='Sandeep Goud']
print(goud)
goud = df2B.loc[df2B['bowler']=='Sandeep Goud']
print(goud)
"""

### Run Simulations

In [67]:
bowlers = players[((players['position'] == 'bowler') | (players['position'] == 'all_rounder'))]
hitters = players[((players['position'] == 'hitter') | (players['position'] == 'all_rounder'))]

# Change these to try a different matchup!
teamA = 'Australia'
teamB = 'Oman'

In [69]:
bowlersA = bowlers.loc[bowlers['country'] == teamA]
bowlersA = bowlersA['name'].to_numpy()

bowlersB = bowlers.loc[bowlers['country'] == teamB]
bowlersB = bowlersB['name'].to_numpy()

hittersA = hitters.loc[hitters['country'] == teamA]
hittersA = hittersA['name'].to_numpy()

hittersB = hitters.loc[hitters['country'] == teamB]
hittersB = hittersB['name'].to_numpy()

['KH Prajapati' 'PS Athavale' 'Aqib Ilyas' 'Zeeshan Maqsood']
