In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read in merged CSV file
merge = pd.read_csv('merged_files.csv')
print(merge.columns)
print(merge['batting_team'].unique())
merge.head()

Index(['Unnamed: 0', 'match_id', 'season', 'start_date', 'venue', 'innings',
       'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker',
       'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes',
       'legbyes', 'penalty', 'wicket_type', 'player_dismissed'],
      dtype='object')
['England' 'Australia' 'New Zealand' 'South Africa' 'Pakistan' 'Sri Lanka'
 'West Indies' 'India' 'Kenya' 'Scotland' 'Zimbabwe' 'Bangladesh'
 'Bermuda' 'Netherlands' 'Ireland' 'Afghanistan' 'Canada' 'Nepal'
 'Hong Kong' 'United Arab Emirates' 'Papua New Guinea' 'Oman'
 'ICC World XI' 'Philippines' 'Vanuatu' 'United States of America'
 'Germany' 'Italy' 'Ghana' 'Namibia' 'Uganda' 'Botswana' 'Nigeria'
 'Guernsey' 'Denmark' 'Norway' 'Jersey' 'Thailand' 'Malaysia' 'Maldives'
 'Singapore' 'Qatar' 'Kuwait' 'Cayman Islands' 'Portugal' 'Spain'
 'Gibraltar' 'Bhutan' 'Saudi Arabia' 'Bahrain' 'Iran' 'Belgium'
 'Luxembourg' 'Czech Republic' 'Isle of Man' 'Bulgaria' 'Romania'
 'Austria' 'Greece' 

Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,...,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed
0,0,211028,2005,2005-06-13,The Rose Bowl,1,0.1,England,Australia,ME Trescothick,...,B Lee,0,0,,,,,,,
1,1,211028,2005,2005-06-13,The Rose Bowl,1,0.2,England,Australia,ME Trescothick,...,B Lee,1,0,,,,,,,
2,2,211028,2005,2005-06-13,The Rose Bowl,1,0.3,England,Australia,GO Jones,...,B Lee,0,0,,,,,,,
3,3,211028,2005,2005-06-13,The Rose Bowl,1,0.4,England,Australia,GO Jones,...,B Lee,0,0,,,,,,,
4,4,211028,2005,2005-06-13,The Rose Bowl,1,0.5,England,Australia,GO Jones,...,B Lee,0,0,,,,,,,


In [3]:
# We will use only data from the last 4 years (2020-2024)
merge[['start-year', 'start-day', 'start-month']] = merge['start_date'].str.split('-', expand=True)
df = merge.loc[merge['start-year'].astype(int) >= 2020]
merge = merge.drop(['start-year', 'start-day', 'start-month'], axis=1)
print(df.shape)
df.head()

(285767, 24)


Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,...,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,start-year,start-day,start-month
169922,0,1185313,2019/20,2020-02-12,Buffalo Park,1,0.1,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169923,1,1185313,2019/20,2020-02-12,Buffalo Park,1,0.2,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169924,2,1185313,2019/20,2020-02-12,Buffalo Park,1,0.3,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169925,3,1185313,2019/20,2020-02-12,Buffalo Park,1,0.4,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169926,4,1185313,2019/20,2020-02-12,Buffalo Park,1,0.5,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12


In [4]:
# Create dataframe of matches only from teams participating in the 2024 World Cup
with open('teams.txt') as file:
    teams_list = file.read().splitlines()
wc20=df.loc[(df['bowling_team'].isin(teams_list)) | (df['batting_team'].isin(teams_list))]
print(wc20.shape)
wc20.head()

(147566, 24)


Unnamed: 0.1,Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,...,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,start-year,start-day,start-month
169922,0,1185313,2019/20,2020-02-12,Buffalo Park,1,0.1,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169923,1,1185313,2019/20,2020-02-12,Buffalo Park,1,0.2,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169924,2,1185313,2019/20,2020-02-12,Buffalo Park,1,0.3,South Africa,England,T Bavuma,...,,,,,,,,2020,2,12
169925,3,1185313,2019/20,2020-02-12,Buffalo Park,1,0.4,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12
169926,4,1185313,2019/20,2020-02-12,Buffalo Park,1,0.5,South Africa,England,Q de Kock,...,,,,,,,,2020,2,12


In [5]:
group_A = teams_list[0:5]
group_B = teams_list[5:10]
group_C = teams_list[10:15]
group_D = teams_list[15:20]

### Calculate Stats

In [52]:
_df = (wc20
       .loc[:, ['striker', 'runs_off_bat']]
       .groupby(['striker'], as_index = False)
       .sum())
_df_sorted = _df.sort_values(by='runs_off_bat', ascending=False)
print(_df_sorted.head())
print(_df_sorted[_df_sorted['striker'] == 'V Kohli'])
#checks out 

             striker  runs_off_bat
729  Mohammad Rizwan          2801
186       Babar Azam          2293
978         SA Yadav          2141
365      GD Phillips          1677
960         S Sesazi          1670
      striker  runs_off_bat
1151  V Kohli          1410


In [55]:
num_bowls = (wc20
       .loc[:, ['striker', 'bowling_team', 'batting_team']]
       .groupby(['striker', 'batting_team'], as_index = False)
       .count()
       .rename(columns = {'bowling_team' : 'n_bowls'})
        )
df2 = _df_sorted.merge(num_bowls, on = ['striker'])
df2 = df2.sort_values(by = 'runs_off_bat', ascending = False)
print(df2)

                  striker  runs_off_bat batting_team  n_bowls
0         Mohammad Rizwan          2801     Pakistan     2220
1              Babar Azam          2293     Pakistan     1808
2                SA Yadav          2141        India     1296
3             GD Phillips          1677  New Zealand     1155
4                S Sesazi          1670       Uganda     1447
...                   ...           ...          ...      ...
1150  Ammar Zuhdi Hazalan             0     Malaysia        3
1149            Anas Khan             0    Hong Kong        2
1148        N Senamontree             0     Thailand        3
1147             JC Darji             0     Tanzania        2
1218             DE Budge             0     Scotland        3

[1219 rows x 4 columns]


In [62]:
# Calculate runs per bowl for strikers
# getting rid of extras as part of batters run, probably better to include in conceded runs for bowlers
df2['runs_per_bowl'] = (df2['runs_off_bat'] / df2['n_bowls'])
df2 = df2.sort_values(by = 'runs_per_bowl', ascending = False)
print(df2.head(10))
#there's a mistake with Shoaib Khan (from Pakistan not Germany) and Sandeep Goud (from Oman not Bahamas) 
#and the numbers make no sense 
#neither are in player list but this same bug could affect bowlers?

             striker  runs_off_bat batting_team  n_bowls  runs_per_bowl
197      Shoaib Khan           258      Germany        6      43.000000
446     Sandeep Goud            61      Bahamas        6      10.166667
960     Aziz Sualley             4        Ghana        1       4.000000
631       Ihsanullah            26     Pakistan        8       3.250000
738  Shahnawaz Dhani            16     Pakistan        6       2.666667
927      Umran Malik             5        India        2       2.500000
915     Mukesh Kumar             5        India        2       2.500000
935        W Barresi             5  Netherlands        2       2.500000
809      AM Fernando            10    Sri Lanka        4       2.500000
705    Rakibul Hasan            18   Bangladesh        8       2.250000


In [64]:
us = df2[df2['batting_team'] == 'United States of America']
#us.sort_values(by = 'runs_per_bowl', ascending = False)

In [67]:
#including extras 
_dfB = (wc20
       .loc[:, ['bowler', 'runs_off_bat', 'extras']]
       .groupby(['bowler'], as_index = False)
       .sum())
_df_sortedB = _dfB.sort_values(by='runs_off_bat', ascending=False)
print(_df_sortedB.head())

         bowler  runs_off_bat  extras
289  Haris Rauf          1897     114
306    IS Sodhi          1860      86
42    AU Rashid          1667      61
771  TG Southee          1658     105
481    MR Adair          1509     134


In [72]:
num_bowlsB = (wc20
       .loc[:, ['bowler', 'bowling_team', 'batting_team']]
       .groupby(['bowler', 'bowling_team'], as_index = False)
       .count()
       .rename(columns = {'batting_team' : 'n_bowls'})
        )
df2B = _df_sortedB.merge(num_bowlsB, on = ['bowler'])
df2B = df2B.sort_values(by = 'runs_off_bat', ascending = False)
print(df2B)

          bowler  runs_off_bat  extras              bowling_team  n_bowls
0     Haris Rauf          1897     114                  Pakistan     1483
1       IS Sodhi          1860      86               New Zealand     1502
2      AU Rashid          1667      61                   England     1413
3     TG Southee          1658     105               New Zealand     1350
4       MR Adair          1509     134                   Ireland     1277
..           ...           ...     ...                       ...      ...
845     J Jagroo             3       0                   Bahamas        6
846  KNA Bandara             2       0                 Sri Lanka        6
847      SS Iyer             2       0                     India        2
848    SJ Modani             2       0  United States of America        6
849  JEA Doctora             1       1               Philippines        3

[850 rows x 5 columns]


In [75]:
df2B['runs_conceded_per_bowl'] = ((df2B['runs_off_bat'] + df2B['extras']) / df2B['n_bowls'])
df2B = df2B.sort_values(by = 'runs_conceded_per_bowl', ascending = False)
print(df2B.head(10))
#same problem with Sandeep Goud

            bowler  runs_off_bat  extras  bowling_team  n_bowls  \
497   Sandeep Goud            57       1       Bahamas        1   
790     KK Tillett            12       5        Belize        3   
687    Umair Tariq            24       0       Austria        6   
675      B Terbish            26       1      Mongolia        7   
712   RR Hendricks            21       0  South Africa        6   
710  Yusuf Ebrahim            21       0        Panama        6   
571  Gurdeep Singh            41       0         Kenya       12   
527  M Altankhuyag            50       5      Mongolia       17   
725      NR Kirton            19       0        Canada        6   
734    RS Fernando            17       1     Sri Lanka        6   

     runs_conceded_per_bowl  
497               58.000000  
790                5.666667  
687                4.000000  
675                3.857143  
712                3.500000  
710                3.500000  
571                3.416667  
527                3.2

In [77]:
usB = df2B[df2B['bowling_team'] == 'United States of America']
usB.sort_values(by = 'runs_conceded_per_bowl', ascending = False)

Unnamed: 0,bowler,runs_off_bat,extras,bowling_team,n_bowls,runs_conceded_per_bowl
516,MO Kain,53,2,United States of America,34,1.617647
400,Aaron Jones,89,1,United States of America,57,1.578947
388,CAH Stevenson,96,5,United States of America,74,1.364865
422,Yasir Mohammad,79,9,United States of America,68,1.294118
324,Ali Khan,139,16,United States of America,122,1.270492
246,J Theron,212,16,United States of America,184,1.23913
601,K Gore,36,0,United States of America,30,1.2
567,TO Carmichael,42,1,United States of America,37,1.162162
228,SN Netravalkar,230,8,United States of America,231,1.030303
230,NK Patel,228,7,United States of America,254,0.925197


In [80]:
players = pd.read_csv('players.csv')

# Add runs per bowl values to players.csv (for hitters & all-rounders)
# Ignore rpb values for bowlers for now
players = players.merge(df2[['striker', 'runs_per_bowl']], how='left', left_on='name',right_on='striker')
players = players.drop(columns = ['striker'])

players = players.merge(df2B[['bowler', 'runs_conceded_per_bowl']], how='left', left_on='name',right_on='bowler')
players = players.drop(columns = ['bowler'])

In [81]:
players.head(7)
#pd.set_option('display.max_rows', None)
#players

Unnamed: 0,name,position,country,runs_per_bowl,runs_conceded_per_bowl
0,Pargat Singh,hitter,Canada,1.061321,1.72093
1,NR Kirton,hitter,Canada,1.462687,3.166667
2,NS Dhaliwal,hitter,Canada,1.419825,1.307692
3,Saad Bin Zafar,all_rounder,Canada,1.349057,1.031553
4,N Dutta,bowler,Canada,1.710526,0.994845
5,Kaleem Sana,bowler,Canada,0.272727,0.924479
6,JOA Gordon,bowler,Canada,0.5,1.20202


### Run Simulations

In [None]:
# TODO: implement simulations based on the calculated player data