In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
country_map = {
    'AUS': 'Australia',
    'PAK': 'Pakistan',
    'SL': 'Sri Lanka',
    'SA': 'South Africa',
    'INDIA': 'India',
    'NZ': 'New Zealand',
    'ENG': 'England',
    'WI': 'West Indies',
    'BDESH': 'Bangladesh',
    'KENYA': 'Kenya',
    'ZIM': 'Zimbabwe'
}

## Team Aggregated Statistics

### Scraping from ESPN-Cricinfo

In [3]:
def get_batting_stats():
    def url(i):
        return 'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;page=' + str(i) + ';spanmax1=15+Apr+2021;spanmin1=15+Apr+2006;spanval1=span;template=results;type=team;view=year'
  
    i = 1

    df = None

    while True:
        new_df = pd.read_html(url(i))[2]

        if(len(new_df.index) == 1):
            break
        
        if df is None:
            df = new_df
        else:
            df = df.append(new_df, ignore_index=True)
        
        i= i + 1
        print(i)

    df.to_csv("batting-stats.csv")


    return df

In [4]:
def get_bowling_stats():
    def url(i):
        return 'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;page=' + str(i) + ';spanmax1=15+Apr+2021;spanmin1=15+Apr+2006;spanval1=span;team_view=bowl;template=results;type=team;view=year'
    i = 1

    df = None

    while True:
        new_df = pd.read_html(url(i))[2]

        if(len(new_df.index) == 1):
            break
        
        if df is None:
            df = new_df
        else:
            df = df.append(new_df, ignore_index=True)
        
        i= i + 1
        print(i)

    df.to_csv("bowling-stats.csv")

    return df

In [5]:
batting_stats = get_batting_stats()

batting_stats = batting_stats.add_prefix("Batting")

batting_stats.head()

2
3
4
5
6
7


Unnamed: 0,BattingTeam,BattingMat,BattingWon,BattingLost,BattingTied,BattingNR,BattingW/L,BattingAve,BattingRPO,BattingInns,BattingHS,BattingLS,BattingYear,BattingUnnamed: 13
0,Australia,34,24,8,0,2,3.0,46.51,5.88,33,377,148,2007,
1,Pakistan,32,24,7,0,1,3.428,31.56,4.86,32,317,124,2011,
2,Australia,39,23,14,0,2,1.642,35.4,5.14,39,350,131,2009,
3,India,34,22,10,0,2,2.2,38.21,5.47,33,383,146,2013,
4,India,34,21,10,2,1,2.1,35.85,5.51,34,418,146,2011,


In [6]:
bowling_stats = get_bowling_stats()

bowling_stats = bowling_stats.add_prefix("Bowling")

bowling_stats.head()

2
3
4
5
6
7


Unnamed: 0,BowlingTeam,BowlingMat,BowlingWon,BowlingLost,BowlingTied,BowlingNR,BowlingW/L,BowlingAve,BowlingRPO,BowlingInns,BowlingHS,BowlingLS,BowlingYear,BowlingUnnamed: 13
0,Australia,34,24,8,0,2,3.0,25.97,5.04,34,350,91,2007,
1,Pakistan,32,24,7,0,1,3.428,24.22,4.5,31,311,91,2011,
2,Australia,39,23,14,0,2,1.642,29.63,5.02,38,354,145,2009,
3,India,34,22,10,0,2,2.2,30.39,5.24,34,359,96,2013,
4,India,34,21,10,2,1,2.1,29.56,5.17,34,338,174,2011,


### Join Scraped Data

In [7]:
df = pd.concat([batting_stats, bowling_stats], axis=1, join="inner")

df.head()

Unnamed: 0,BattingTeam,BattingMat,BattingWon,BattingLost,BattingTied,BattingNR,BattingW/L,BattingAve,BattingRPO,BattingInns,BattingHS,BattingLS,BattingYear,BattingUnnamed: 13,BowlingTeam,BowlingMat,BowlingWon,BowlingLost,BowlingTied,BowlingNR,BowlingW/L,BowlingAve,BowlingRPO,BowlingInns,BowlingHS,BowlingLS,BowlingYear,BowlingUnnamed: 13
0,Australia,34,24,8,0,2,3.0,46.51,5.88,33,377,148,2007,,Australia,34,24,8,0,2,3.0,25.97,5.04,34,350,91,2007,
1,Pakistan,32,24,7,0,1,3.428,31.56,4.86,32,317,124,2011,,Pakistan,32,24,7,0,1,3.428,24.22,4.5,31,311,91,2011,
2,Australia,39,23,14,0,2,1.642,35.4,5.14,39,350,131,2009,,Australia,39,23,14,0,2,1.642,29.63,5.02,38,354,145,2009,
3,India,34,22,10,0,2,2.2,38.21,5.47,33,383,146,2013,,India,34,22,10,0,2,2.2,30.39,5.24,34,359,96,2013,
4,India,34,21,10,2,1,2.1,35.85,5.51,34,418,146,2011,,India,34,21,10,2,1,2.1,29.56,5.17,34,338,174,2011,


In [8]:
df = df.rename(columns={"BattingTeam": "Team", "BattingMat": "Mat", "BattingLost": "Lost", "BattingWon": "Won", "BattingYear": "Year", "BattingW/L": "W/L"})

df = df.drop(columns=['BattingTied', 'BattingNR', 'BowlingTeam', 'BowlingMat', 'BowlingLost', 'BowlingWon', 'BowlingYear', 'BowlingTied', 'BowlingNR', 'BowlingW/L', 'BattingUnnamed: 13', 'BowlingUnnamed: 13'])

df.head()

Unnamed: 0,Team,Mat,Won,Lost,W/L,BattingAve,BattingRPO,BattingInns,BattingHS,BattingLS,Year,BowlingAve,BowlingRPO,BowlingInns,BowlingHS,BowlingLS
0,Australia,34,24,8,3.0,46.51,5.88,33,377,148,2007,25.97,5.04,34,350,91
1,Pakistan,32,24,7,3.428,31.56,4.86,32,317,124,2011,24.22,4.5,31,311,91
2,Australia,39,23,14,1.642,35.4,5.14,39,350,131,2009,29.63,5.02,38,354,145
3,India,34,22,10,2.2,38.21,5.47,33,383,146,2013,30.39,5.24,34,359,96
4,India,34,21,10,2.1,35.85,5.51,34,418,146,2011,29.56,5.17,34,338,174


In [9]:
df["BattingLS"] = df["BattingLS"].replace("-", 0).astype(str).astype(int)

df["BowlingLS"] = df["BowlingLS"].replace("-", 0).astype(str).astype(int)

In [10]:
df = df[df['Team'].isin(country_map.values())]

In [11]:
df['Team'].unique()

array(['Australia', 'Pakistan', 'India', 'New Zealand', 'Sri Lanka',
       'South Africa', 'England', 'West Indies', 'Bangladesh', 'Kenya',
       'Zimbabwe'], dtype=object)

In [12]:
df.dtypes

Team            object
Mat              int64
Won              int64
Lost             int64
W/L             object
BattingAve     float64
BattingRPO     float64
BattingInns      int64
BattingHS        int64
BattingLS        int64
Year             int64
BowlingAve     float64
BowlingRPO     float64
BowlingInns      int64
BowlingHS        int64
BowlingLS        int64
dtype: object

### Normalize Attributes

In [13]:
df['NormalizedBattingAve'] = (df['BattingAve'] - df['BattingAve'].min())/(df['BattingAve'].max() - df['BattingAve'].min())
df['NormalizedBattingRPO'] = (df['BattingRPO'] - df['BattingRPO'].min())/(df['BattingRPO'].max() - df['BattingRPO'].min())
df['NormalizedBattingHS'] = (df['BattingHS'] - df['BattingHS'].min())/(df['BattingHS'].max() - df['BattingHS'].min())
df['NormalizedBattingLS'] = (df['BattingLS'] - df['BattingLS'].min())/(df['BattingLS'].max() - df['BattingLS'].min())



In [14]:
# Normalize and deduct from 1 for bowling figures.
df['NormalizedBowlingAve'] = 1 - (df['BowlingAve'] - df['BowlingAve'].min())/(df['BowlingAve'].max() - df['BowlingAve'].min())
df['NormalizedBowlingRPO'] = 1 - (df['BowlingRPO'] - df['BowlingRPO'].min())/(df['BowlingRPO'].max() - df['BowlingRPO'].min())
df['NormalizedBowlingHS'] = 1 - (df['BowlingHS'] - df['BowlingHS'].min())/(df['BowlingHS'].max() - df['BowlingHS'].min())
df['NormalizedBowlingLS'] = 1 - (df['BowlingLS'] - df['BowlingLS'].min())/(df['BowlingLS'].max() - df['BowlingLS'].min())


In [15]:
df

Unnamed: 0,Team,Mat,Won,Lost,W/L,BattingAve,BattingRPO,BattingInns,BattingHS,BattingLS,Year,BowlingAve,BowlingRPO,BowlingInns,BowlingHS,BowlingLS,NormalizedBattingAve,NormalizedBattingRPO,NormalizedBattingHS,NormalizedBattingLS,NormalizedBowlingAve,NormalizedBowlingRPO,NormalizedBowlingHS,NormalizedBowlingLS
0,Australia,34,24,8,3,46.51,5.88,33,377,148,2007,25.97,5.04,34,350,91,0.732934,0.693950,0.628571,0.449848,0.867838,0.599278,0.467857,0.723404
1,Pakistan,32,24,7,3.428,31.56,4.86,32,317,124,2011,24.22,4.50,31,311,91,0.343408,0.330961,0.414286,0.376900,0.906257,0.794224,0.607143,0.723404
2,Australia,39,23,14,1.642,35.40,5.14,39,350,131,2009,29.63,5.02,38,354,145,0.443460,0.430605,0.532143,0.398176,0.787486,0.606498,0.453571,0.559271
3,India,34,22,10,2.2,38.21,5.47,33,383,146,2013,30.39,5.24,34,359,96,0.516675,0.548043,0.650000,0.443769,0.770801,0.527076,0.435714,0.708207
4,India,34,21,10,2.1,35.85,5.51,34,418,146,2011,29.56,5.17,34,338,174,0.455185,0.562278,0.775000,0.443769,0.789023,0.552347,0.510714,0.471125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,South Africa,3,1,2,0.500,41.18,6.06,3,341,292,2021,39.91,6.12,3,324,0,0.594059,0.758007,0.500000,0.887538,0.561800,0.209386,0.560714,1.000000
253,Kenya,8,0,8,0,18.38,3.93,8,264,69,2011,44.89,5.66,8,324,0,0.000000,0.000000,0.225000,0.209726,0.452470,0.375451,0.560714,1.000000
255,Zimbabwe,3,0,3,0,20.00,4.14,3,231,158,2012,41.37,6.68,3,373,248,0.042209,0.074733,0.107143,0.480243,0.529748,0.007220,0.385714,0.246201
262,Zimbabwe,6,0,5,0,26.42,5.25,6,318,152,2020,45.57,6.22,6,322,0,0.209484,0.469751,0.417857,0.462006,0.437541,0.173285,0.567857,1.000000


In [16]:
df = df.round(2)

In [17]:
df

Unnamed: 0,Team,Mat,Won,Lost,W/L,BattingAve,BattingRPO,BattingInns,BattingHS,BattingLS,Year,BowlingAve,BowlingRPO,BowlingInns,BowlingHS,BowlingLS,NormalizedBattingAve,NormalizedBattingRPO,NormalizedBattingHS,NormalizedBattingLS,NormalizedBowlingAve,NormalizedBowlingRPO,NormalizedBowlingHS,NormalizedBowlingLS
0,Australia,34,24,8,3,46.51,5.88,33,377,148,2007,25.97,5.04,34,350,91,0.73,0.69,0.63,0.45,0.87,0.60,0.47,0.72
1,Pakistan,32,24,7,3.428,31.56,4.86,32,317,124,2011,24.22,4.50,31,311,91,0.34,0.33,0.41,0.38,0.91,0.79,0.61,0.72
2,Australia,39,23,14,1.642,35.40,5.14,39,350,131,2009,29.63,5.02,38,354,145,0.44,0.43,0.53,0.40,0.79,0.61,0.45,0.56
3,India,34,22,10,2.2,38.21,5.47,33,383,146,2013,30.39,5.24,34,359,96,0.52,0.55,0.65,0.44,0.77,0.53,0.44,0.71
4,India,34,21,10,2.1,35.85,5.51,34,418,146,2011,29.56,5.17,34,338,174,0.46,0.56,0.78,0.44,0.79,0.55,0.51,0.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,South Africa,3,1,2,0.500,41.18,6.06,3,341,292,2021,39.91,6.12,3,324,0,0.59,0.76,0.50,0.89,0.56,0.21,0.56,1.00
253,Kenya,8,0,8,0,18.38,3.93,8,264,69,2011,44.89,5.66,8,324,0,0.00,0.00,0.22,0.21,0.45,0.38,0.56,1.00
255,Zimbabwe,3,0,3,0,20.00,4.14,3,231,158,2012,41.37,6.68,3,373,248,0.04,0.07,0.11,0.48,0.53,0.01,0.39,0.25
262,Zimbabwe,6,0,5,0,26.42,5.25,6,318,152,2020,45.57,6.22,6,322,0,0.21,0.47,0.42,0.46,0.44,0.17,0.57,1.00


In [18]:
df.to_csv("data.csv")

In [19]:
df['W/L'] = df['W/L'].replace("-", 0).astype(str).astype(float)

In [20]:
a = df.groupby("Year")["W/L"].mean()
a

Year
2006    1.994636
2007    1.441455
2008    1.880636
2009    1.133818
2010    1.311364
2011    1.338818
2012    1.402364
2013    1.062000
2014    1.253636
2015    1.625700
2016    1.148200
2017    1.489100
2018    1.412200
2019    1.409000
2020    1.035700
2021    0.875000
Name: W/L, dtype: float64

In [21]:
df['AvgW/L'] = df['Year'].map(lambda x: a[x]).round(2)

df[['Year', 'AvgW/L']]

Unnamed: 0,Year,AvgW/L
0,2007,1.44
1,2011,1.34
2,2009,1.13
3,2013,1.06
4,2011,1.34
...,...,...
247,2021,0.88
253,2011,1.34
255,2012,1.40
262,2020,1.04


In [22]:
df.to_csv('data.csv')

In [23]:
df['WorldCup'] = False


In [24]:
df['W/L'].min()

0.0

## Batsman Statistics

### SCraping from ESPN-Cricinfo

In [25]:
def get_batsman_stats():
    
    def url(i):
        return 'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;page=' + str(i) + ';spanmax1=18+Apr+2021;spanmin1=18+Apr+2006;spanval1=span;template=results;type=batting;view=year'
    
    i = 1

    df = None

    while True:
        new_df = pd.read_html(url(i))[2]

        if(len(new_df.index) == 1):
            break
        
        if df is None:
            df = new_df
        else:
            df = df.append(new_df, ignore_index=True)
        
        i= i + 1
        print(i, end='\r')

    df.to_csv("batsman-stats.csv")


    return df

In [26]:
batsman_stats = get_batsman_stats()



In [27]:
batsman_stats = pd.read_csv("batsman-stats.csv")

batsman_stats.head(-20)

Unnamed: 0.1,Unnamed: 0,Player,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s,Year,Unnamed: 15
0,0,ML Hayden (AUS),32,30,3,1601,181*,59.29,1795,89.19,5,6,1,168,35,2007,
1,1,RG Sharma (INDIA),28,27,1,1490,159,57.3,1657,89.92,7,6,1,146,36,2019,
2,2,MJ Guptill (NZ),32,32,5,1489,237*,55.14,1542,96.56,4,8,3,162,42,2015,
3,3,V Kohli (INDIA),26,26,7,1460,131,76.84,1473,99.11,6,7,2,136,22,2017,
4,4,SR Tendulkar (INDIA),33,32,2,1425,100*,47.5,1665,85.58,1,13,3,192,12,2007,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5335,5335,Jay Odedra (OMAN),1,-,-,-,-,-,-,-,-,-,-,-,-,2020,
5336,5336,JB Little (IRE),2,-,-,-,-,-,-,-,-,-,-,-,-,2020,
5337,5337,Mohammad Naim (BDESH),1,-,-,-,-,-,-,-,-,-,-,-,-,2020,
5338,5338,Mustafizur Rahman (BDESH),2,-,-,-,-,-,-,-,-,-,-,-,-,2020,


### Set Country

In [28]:
def get_country(data):
    country_map = {
        'AUS': 'Australia',
        'PAK': 'Pakistan',
        'SL': 'Sri Lanka',
        'SA': 'South Africa',
        'INDIA': 'India',
        'NZ': 'New Zealand',
        'ENG': 'England',
        'WI': 'West Indies',
        'BDESH': 'Bangladesh',
        'KENYA': 'Kenya',
        'ZIM': 'Zimbabwe'
    }

    for key in country_map:
        if key in data:
            return country_map[key]
    return 'Other'

batsman_stats['Team'] = batsman_stats['Player'].map(lambda x: get_country(x))

In [29]:
batsman_stats = batsman_stats[batsman_stats['Team'] != 'Other']

batsman_stats.head()

Unnamed: 0.1,Unnamed: 0,Player,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s,Year,Unnamed: 15,Team
0,0,ML Hayden (AUS),32,30,3,1601,181*,59.29,1795,89.19,5,6,1,168,35,2007,,Australia
1,1,RG Sharma (INDIA),28,27,1,1490,159,57.3,1657,89.92,7,6,1,146,36,2019,,India
2,2,MJ Guptill (NZ),32,32,5,1489,237*,55.14,1542,96.56,4,8,3,162,42,2015,,New Zealand
3,3,V Kohli (INDIA),26,26,7,1460,131,76.84,1473,99.11,6,7,2,136,22,2017,,India
4,4,SR Tendulkar (INDIA),33,32,2,1425,100*,47.5,1665,85.58,1,13,3,192,12,2007,,India


In [30]:
batsman_stats = batsman_stats.drop(columns=['Unnamed: 0', 'Unnamed: 15'])

In [31]:
# Character treatment
batsman_stats['HS'] = batsman_stats['HS'].replace("-", 0)

batsman_stats['HS'] = batsman_stats['HS'].map(lambda x: int(str(x).replace("*", "")))

batsman_stats.head()

Unnamed: 0,Player,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s,Year,Team
0,ML Hayden (AUS),32,30,3,1601,181,59.29,1795,89.19,5,6,1,168,35,2007,Australia
1,RG Sharma (INDIA),28,27,1,1490,159,57.3,1657,89.92,7,6,1,146,36,2019,India
2,MJ Guptill (NZ),32,32,5,1489,237,55.14,1542,96.56,4,8,3,162,42,2015,New Zealand
3,V Kohli (INDIA),26,26,7,1460,131,76.84,1473,99.11,6,7,2,136,22,2017,India
4,SR Tendulkar (INDIA),33,32,2,1425,100,47.5,1665,85.58,1,13,3,192,12,2007,India


In [32]:
batsman_stats.to_csv('batsman-stats.csv')

## Bowler Stats

#### Scrape from ESPN-Cricinfo

In [33]:
def get_bowler_stats():
    
    def url(i):
        return 'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;page=' + str(i) +';spanmax1=19+Apr+2021;spanmin1=19+Apr+2006;spanval1=span;template=results;type=bowling;view=year'
        # return 'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;page=' + str(i) + ';spanmax1=18+Apr+2021;spanmin1=18+Apr+2006;spanval1=span;template=results;type=batting;view=year'
    
    i = 1

    df = None

    while True:
        new_df = pd.read_html(url(i))[2]

        if(len(new_df.index) == 1):
            break
        
        if df is None:
            df = new_df
        else:
            df = df.append(new_df, ignore_index=True)
        
        i= i + 1
        
    print(str(i) +  " Pages scraped")

    df.to_csv("bowler-stats.csv")


    return df

In [34]:
bowler_stats = get_bowler_stats()

109 Pages scraped


In [35]:

bowler_stats['Team'] = bowler_stats['Player'].map(lambda x: get_country(x))

bowler_stats

Unnamed: 0,Player,Mat,Inns,Overs,Mdns,Runs,Wkts,BBI,Ave,Econ,SR,4,5,Year,Unnamed: 14,Team
0,Saeed Ajmal (PAK),33,33,306.3,15,1268,62,5/24,20.45,4.13,29.6,2,1,2013,,Pakistan
1,RA Jadeja (INDIA),34,34,303.3,22,1321,52,5/36,25.4,4.35,35,1,1,2013,,India
2,Junaid Khan (PAK),28,28,228.3,17,1116,52,4/15,21.46,4.88,26.3,1,0,2013,,Pakistan
3,BAW Mendis (SL),18,17,135.3,13,486,48,6/13,10.12,3.58,16.9,3,3,2008,,Sri Lanka
4,SL Malinga (SL),24,23,192.1,12,924,48,6/38,19.25,4.8,24,0,3,2011,,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5355,LJ Tucker (IRE),5,-,-,-,-,-,-,-,-,-,-,-,2021,,Other
5356,HE van der Dussen (SA),2,-,-,-,-,-,-,-,-,-,-,-,2021,,South Africa
5357,K Verreynne (SA),1,-,-,-,-,-,-,-,-,-,-,-,2021,,South Africa
5358,WA Young (NZ),2,-,-,-,-,-,-,-,-,-,-,-,2021,,New Zealand


In [36]:
bowler_stats = bowler_stats.drop(columns=['Unnamed: 14'])

bowler_stats

Unnamed: 0,Player,Mat,Inns,Overs,Mdns,Runs,Wkts,BBI,Ave,Econ,SR,4,5,Year,Team
0,Saeed Ajmal (PAK),33,33,306.3,15,1268,62,5/24,20.45,4.13,29.6,2,1,2013,Pakistan
1,RA Jadeja (INDIA),34,34,303.3,22,1321,52,5/36,25.4,4.35,35,1,1,2013,India
2,Junaid Khan (PAK),28,28,228.3,17,1116,52,4/15,21.46,4.88,26.3,1,0,2013,Pakistan
3,BAW Mendis (SL),18,17,135.3,13,486,48,6/13,10.12,3.58,16.9,3,3,2008,Sri Lanka
4,SL Malinga (SL),24,23,192.1,12,924,48,6/38,19.25,4.8,24,0,3,2011,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5355,LJ Tucker (IRE),5,-,-,-,-,-,-,-,-,-,-,-,2021,Other
5356,HE van der Dussen (SA),2,-,-,-,-,-,-,-,-,-,-,-,2021,South Africa
5357,K Verreynne (SA),1,-,-,-,-,-,-,-,-,-,-,-,2021,South Africa
5358,WA Young (NZ),2,-,-,-,-,-,-,-,-,-,-,-,2021,New Zealand


In [37]:
bowler_stats = bowler_stats[bowler_stats['Team'] != 'Other']

bowler_stats

Unnamed: 0,Player,Mat,Inns,Overs,Mdns,Runs,Wkts,BBI,Ave,Econ,SR,4,5,Year,Team
0,Saeed Ajmal (PAK),33,33,306.3,15,1268,62,5/24,20.45,4.13,29.6,2,1,2013,Pakistan
1,RA Jadeja (INDIA),34,34,303.3,22,1321,52,5/36,25.4,4.35,35,1,1,2013,India
2,Junaid Khan (PAK),28,28,228.3,17,1116,52,4/15,21.46,4.88,26.3,1,0,2013,Pakistan
3,BAW Mendis (SL),18,17,135.3,13,486,48,6/13,10.12,3.58,16.9,3,3,2008,Sri Lanka
4,SL Malinga (SL),24,23,192.1,12,924,48,6/38,19.25,4.8,24,0,3,2011,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5353,Tamim Iqbal (BDESH),6,-,-,-,-,-,-,-,-,-,-,-,2021,Bangladesh
5354,LRPL Taylor (NZ),1,-,-,-,-,-,-,-,-,-,-,-,2021,New Zealand
5356,HE van der Dussen (SA),2,-,-,-,-,-,-,-,-,-,-,-,2021,South Africa
5357,K Verreynne (SA),1,-,-,-,-,-,-,-,-,-,-,-,2021,South Africa


In [38]:
bowler_stats = bowler_stats.drop(bowler_stats.index[bowler_stats['Econ'] == '-'])

bowler_stats.to_csv("bowler-stats.csv")

In [39]:
bowler_stats.head()

Unnamed: 0,Player,Mat,Inns,Overs,Mdns,Runs,Wkts,BBI,Ave,Econ,SR,4,5,Year,Team
0,Saeed Ajmal (PAK),33,33,306.3,15,1268,62,5/24,20.45,4.13,29.6,2,1,2013,Pakistan
1,RA Jadeja (INDIA),34,34,303.3,22,1321,52,5/36,25.4,4.35,35.0,1,1,2013,India
2,Junaid Khan (PAK),28,28,228.3,17,1116,52,4/15,21.46,4.88,26.3,1,0,2013,Pakistan
3,BAW Mendis (SL),18,17,135.3,13,486,48,6/13,10.12,3.58,16.9,3,3,2008,Sri Lanka
4,SL Malinga (SL),24,23,192.1,12,924,48,6/38,19.25,4.8,24.0,0,3,2011,Sri Lanka


In [40]:
bowler_stats['Team'].unique()

array(['Pakistan', 'India', 'Sri Lanka', 'Australia', 'Bangladesh',
       'South Africa', 'Zimbabwe', 'New Zealand', 'England',
       'West Indies', 'Kenya'], dtype=object)

In [41]:
batsman_stats['Ave'] = batsman_stats['HS'].replace("-", 0)

batsman_stats['HS'] = batsman_stats['HS'].map(lambda x: int(str(x).replace("*", "")))

batsman_stats.head()

Unnamed: 0,Player,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s,Year,Team
0,ML Hayden (AUS),32,30,3,1601,181,181,1795,89.19,5,6,1,168,35,2007,Australia
1,RG Sharma (INDIA),28,27,1,1490,159,159,1657,89.92,7,6,1,146,36,2019,India
2,MJ Guptill (NZ),32,32,5,1489,237,237,1542,96.56,4,8,3,162,42,2015,New Zealand
3,V Kohli (INDIA),26,26,7,1460,131,131,1473,99.11,6,7,2,136,22,2017,India
4,SR Tendulkar (INDIA),33,32,2,1425,100,100,1665,85.58,1,13,3,192,12,2007,India
