## Batting Data Cleaning

In [8]:
import pandas as pd

In [72]:
# Load the raw batting scorecard; each row carries one batsman's scorecard line
# as free text in 'Batsman_stats'.
# NOTE(review): absolute local path — adjust to your environment before running.
batting_df = pd.read_csv('D:/DA_projects/CWC2023/Data/batting_df.csv')
batting_df.head()

Unnamed: 0,Match_id,Batsman_stats,Team,Opposite_Team,Innings
0,1,Bairstow c Daryl Mitchell b Santner 33 35 ...,England,New Zealand,1
1,1,Malan c Latham b Matt Henry 14 24 2 0 58.33,England,New Zealand,1
2,1,Root b Glenn Phillips 77 86 4 1 89.53,England,New Zealand,1
3,1,Harry Brook c Conway b Rachin Ravindra 25 ...,England,New Zealand,1
4,1,Moeen b Glenn Phillips 11 17 1 0 64.71,England,New Zealand,1


In [115]:
def extract_scores_to_df(each_batsman_sample):
    '''
    Extract the numeric part of one batsman scorecard string.

    Input  : one scorecard string whose last five whitespace-separated tokens are
             Runs, Balls, Fours, Sixes and StrikeRate
    Output : pd.Series with those five values, kept as the original strings
             (so Series.apply over a column yields a five-column DataFrame)

    Sample Input : 'Bairstow    c Daryl Mitchell b Santner  33 35 4 1 94.29'
    '''
    # split() without a separator collapses runs of whitespace and ignores
    # leading/trailing blanks; split(' ') would emit empty tokens there and
    # shift the last-five window onto the wrong values
    stc = each_batsman_sample.split()[-5:]

    # Assign each trailing token to its scorecard attribute
    return pd.Series({
        'Runs': stc[0],
        'Balls': stc[1],
        'Fours': stc[2],
        'Sixes': stc[3],
        'StrikeRate': stc[4]
    })

In [116]:
# Parse the numeric tail of every scorecard string; because the function returns
# a Series per row, the result is a DataFrame with one column per stat.
batting_stats = batting_df['Batsman_stats'].apply(extract_scores_to_df)

In [117]:
# Preview the parsed numeric columns.
batting_stats.head()


Unnamed: 0,Runs,Balls,Fours,Sixes,StrikeRate
0,33,35,4,1,94.29
1,14,24,2,0,58.33
2,77,86,4,1,89.53
3,25,16,4,1,156.25
4,11,17,1,0,64.71


In [118]:
# Row counts must agree before the two frames are joined side by side.
batting_df.shape,batting_stats.shape

((876, 5), (876, 5))

In [119]:
# Join the two frames column-wise (axis=1, aligned on the row index) so every
# scorecard row sits next to its parsed numeric stats.
updated_batting_df = pd.concat([batting_df,batting_stats],axis=1)  
updated_batting_df

Unnamed: 0,Match_id,Batsman_stats,Team,Opposite_Team,Innings,Runs,Balls,Fours,Sixes,StrikeRate
0,1,Bairstow c Daryl Mitchell b Santner 33 35 ...,England,New Zealand,1,33,35,4,1,94.29
1,1,Malan c Latham b Matt Henry 14 24 2 0 58.33,England,New Zealand,1,14,24,2,0,58.33
2,1,Root b Glenn Phillips 77 86 4 1 89.53,England,New Zealand,1,77,86,4,1,89.53
3,1,Harry Brook c Conway b Rachin Ravindra 25 ...,England,New Zealand,1,25,16,4,1,156.25
4,1,Moeen b Glenn Phillips 11 17 1 0 64.71,England,New Zealand,1,11,17,1,0,64.71
...,...,...,...,...,...,...,...,...,...,...
871,48,Travis Head c Shubman Gill b Siraj 137 120...,Australia,India,2,137,120,15,4,114.17
872,48,Mitchell Marsh c Rahul b Bumrah 15 15 1 1 ...,Australia,India,2,15,15,1,1,100.00
873,48,Steven Smith lbw b Bumrah 4 9 1 0 44.44,Australia,India,2,4,9,1,0,44.44
874,48,Marnus Labuschagne not out 58 110 4 0 52.73,Australia,India,2,58,110,4,0,52.73


In [120]:
data_list = []  # shared accumulator: batsman_stats appends one dict per scorecard row


def _parse_dismissal(row):
    '''
    Break one scorecard string into Batsman, Dismissal Type, Fielder and Bowler.

    Input  : one scorecard string ending in the five numeric stats
    Output : dict with keys 'Batsman', 'Dismissal Type', 'Fielder', 'Bowler'
             ('-' where a field does not apply)
    Raises : ValueError when no known dismissal keyword is found
    '''
    # split() without a separator collapses any run of whitespace, so exactly the
    # five numeric tokens are dropped whether the text is single- or multi-spaced
    score = row.split()[:-5]

    def text(tokens):
        return " ".join(tokens)

    def bowler_name():
        # everything after the standalone 'b' token is the bowler
        return text(score[score.index('b') + 1:])

    fielder, bowler = '-', '-'

    # The order of the checks matters: the more specific keywords are tested
    # before the bare 'b' that they also contain.

    # Caught and bowled: the token 'and' only appears in "c and b <bowler>"
    if 'and' in score:
        keyword, dismissal = 'c', 'Caught and Bowled'
        bowler = bowler_name()

    # Caught: "<batsman> c <fielder> b <bowler>"
    elif 'c' in score and 'b' in score:
        keyword, dismissal = 'c', 'Catch'
        fielder = text(score[score.index('c') + 1:score.index('b')])
        bowler = bowler_name()

    # Stumped: "<batsman> st <keeper> b <bowler>"
    elif 'st' in score and 'b' in score:
        keyword, dismissal = 'st', 'Stump'
        fielder = text(score[score.index('st') + 1:score.index('b')])
        bowler = bowler_name()

    # lbw: "<batsman> lbw b <bowler>"
    elif 'lbw' in score and 'b' in score:
        keyword, dismissal = 'lbw', 'lbw'
        bowler = bowler_name()

    # Bowled: "<batsman> b <bowler>"
    elif 'b' in score:
        keyword, dismissal = 'b', 'Bowled'
        bowler = bowler_name()

    # Not out
    elif 'not' in score:
        keyword, dismissal = 'not', 'Not out'

    # Run out: the fielder(s) follow 'out' wrapped in parentheses, which [1:-1] removes
    elif 'run' in score:
        keyword, dismissal = 'run', 'Run out'
        fielder = text(score[score.index('out') + 1:])[1:-1]

    # Timed out
    elif 'timed' in score:
        keyword, dismissal = 'timed', 'Timed out'

    # Retired hurt
    elif 'retd' in score:
        keyword, dismissal = 'retd', 'Retired hurt'

    # Absent: exceptional case that occurred in this World Cup
    elif 'abs' in score:
        keyword, dismissal = 'abs', 'Absent out'

    else:
        # Fail with the offending row instead of an opaque "'abs' is not in list"
        raise ValueError(f"Unrecognised dismissal in scorecard row: {row!r}")

    return {
        'Batsman': text(score[:score.index(keyword)]),
        'Dismissal Type': dismissal,
        'Fielder': fielder,
        'Bowler': bowler,
    }


def batsman_stats(sample):
    '''
    Create the non-numeric scorecard attributes for every row:
    Batsman, Dismissal Type, Fielder and Bowler.

    Input  : iterable of scorecard strings (the score column)
    Output : None; one dict per row is appended to the module-level data_list
    Sample Input : 'Bairstow    c Daryl Mitchell b Santner  33 35 4 1 94.29'
    '''
    for itr in sample:
        data_list.append(_parse_dismissal(itr))

# Parse every scorecard string; the parsed rows accumulate in the module-level data_list.
batsman_stats(batting_df['Batsman_stats'])

# Build a DataFrame from the list of dicts: one row per scorecard entry,
# columns taken from the dict keys.
batsman_df = pd.DataFrame(data_list)

In [121]:
# Preview the parsed dismissal details.
batsman_df.head()

Unnamed: 0,Batsman,Dismissal Type,Fielder,Bowler
0,Bairstow,Catch,Daryl Mitchell,Santner
1,Malan,Catch,Latham,Matt Henry
2,Root,Bowled,-,Glenn Phillips
3,Harry Brook,Catch,Conway,Rachin Ravindra
4,Moeen,Bowled,-,Glenn Phillips


In [122]:
# Numeric stats again, for side-by-side comparison with the dismissal details before joining.
batting_stats.head()

Unnamed: 0,Runs,Balls,Fours,Sixes,StrikeRate
0,33,35,4,1,94.29
1,14,24,2,0,58.33
2,77,86,4,1,89.53
3,25,16,4,1,156.25
4,11,17,1,0,64.71


In [123]:
# Join raw rows, dismissal details and numeric stats column-wise (axis=1,
# aligned on the row index) to get the full batting card.
final_batting_df = pd.concat([batting_df,batsman_df,batting_stats],axis=1)
final_batting_df

Unnamed: 0,Match_id,Batsman_stats,Team,Opposite_Team,Innings,Batsman,Dismissal Type,Fielder,Bowler,Runs,Balls,Fours,Sixes,StrikeRate
0,1,Bairstow c Daryl Mitchell b Santner 33 35 ...,England,New Zealand,1,Bairstow,Catch,Daryl Mitchell,Santner,33,35,4,1,94.29
1,1,Malan c Latham b Matt Henry 14 24 2 0 58.33,England,New Zealand,1,Malan,Catch,Latham,Matt Henry,14,24,2,0,58.33
2,1,Root b Glenn Phillips 77 86 4 1 89.53,England,New Zealand,1,Root,Bowled,-,Glenn Phillips,77,86,4,1,89.53
3,1,Harry Brook c Conway b Rachin Ravindra 25 ...,England,New Zealand,1,Harry Brook,Catch,Conway,Rachin Ravindra,25,16,4,1,156.25
4,1,Moeen b Glenn Phillips 11 17 1 0 64.71,England,New Zealand,1,Moeen,Bowled,-,Glenn Phillips,11,17,1,0,64.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,48,Travis Head c Shubman Gill b Siraj 137 120...,Australia,India,2,Travis Head,Catch,Shubman Gill,Siraj,137,120,15,4,114.17
872,48,Mitchell Marsh c Rahul b Bumrah 15 15 1 1 ...,Australia,India,2,Mitchell Marsh,Catch,Rahul,Bumrah,15,15,1,1,100.00
873,48,Steven Smith lbw b Bumrah 4 9 1 0 44.44,Australia,India,2,Steven Smith,lbw,-,Bumrah,4,9,1,0,44.44
874,48,Marnus Labuschagne not out 58 110 4 0 52.73,Australia,India,2,Marnus Labuschagne,Not out,-,-,58,110,4,0,52.73


In [124]:
# The raw 'Batsman_stats' text has been split into separate columns and is no
# longer needed. Reassigning with errors='ignore' (instead of inplace=True)
# keeps this cell safe to re-run: a second run no longer raises KeyError.
final_batting_df = final_batting_df.drop(columns='Batsman_stats', errors='ignore')

In [125]:
# Preview the batting card after dropping the raw text column.
final_batting_df.head()

Unnamed: 0,Match_id,Team,Opposite_Team,Innings,Batsman,Dismissal Type,Fielder,Bowler,Runs,Balls,Fours,Sixes,StrikeRate
0,1,England,New Zealand,1,Bairstow,Catch,Daryl Mitchell,Santner,33,35,4,1,94.29
1,1,England,New Zealand,1,Malan,Catch,Latham,Matt Henry,14,24,2,0,58.33
2,1,England,New Zealand,1,Root,Bowled,-,Glenn Phillips,77,86,4,1,89.53
3,1,England,New Zealand,1,Harry Brook,Catch,Conway,Rachin Ravindra,25,16,4,1,156.25
4,1,England,New Zealand,1,Moeen,Bowled,-,Glenn Phillips,11,17,1,0,64.71


In [126]:
# Data inspection: every intermediate frame must hold one row per scorecard
# entry; otherwise the column-wise concat would have padded rows with NaN.
assert len(batting_df) == len(batsman_df) == len(batting_stats) == len(final_batting_df)
batting_df.shape,batting_stats.shape

((876, 5), (876, 5))

## Bowling Data Cleaning

In [127]:
# Load the raw bowling scorecard; each row carries one bowler's figures as free
# text in 'Bowler_stats'.
# NOTE(review): absolute local path — adjust to your environment before running.
bowling_df = pd.read_csv(r'D:/DA_projects/CWC2023/Data/bowling_df.csv')
bowling_df.head()

Unnamed: 0,Match_id,Bowler_stats,Team,Opposite_Team,Innings
0,1,Boult 10 1 48 1 0 1 4.80,New Zealand,England,1
1,1,Matt Henry 10 1 48 3 0 0 4.80,New Zealand,England,1
2,1,Santner 10 0 37 2 0 1 3.70,New Zealand,England,1
3,1,Neesham 7 0 56 0 0 4 8.00,New Zealand,England,1
4,1,Rachin Ravindra 10 0 76 1 0 0 7.60,New Zealand,England,1


In [128]:
def bowler_stats_extraction(samples):
    '''
    Split one bowler scorecard string into the bowler name and the figures.

    Input  : one scorecard string whose last seven whitespace-separated tokens are
             Overs, Maidens, Runs, Wickets, No_Balls, Wides and Economy
    Output : pd.Series with 'Bowler' plus those seven values, kept as strings

    Sample Input : 'Boult   10 1 48 1 0 1 4.80'
    '''
    # split() without a separator collapses runs of whitespace and ignores
    # trailing blanks, so the last-seven window always lands on the figures
    tokens = samples.split()
    bowling_stats = tokens[-7:]
    Name = " ".join(tokens[:-7])  # whatever precedes the figures is the name
    return pd.Series({
            'Bowler': Name,
            'Overs' : bowling_stats[0],
            'Maidens' : bowling_stats[1],
            'Runs' : bowling_stats[2],
            'Wickets' : bowling_stats[3],
            'No_Balls' : bowling_stats[4],
            'Wides' : bowling_stats[5],
            'Economy' : bowling_stats[6]})

In [129]:
# Parse every bowling scorecard string; the function returns a Series per row,
# so the result is a DataFrame with the bowler name and one column per figure.
bowl_stats = bowling_df['Bowler_stats'].apply(bowler_stats_extraction)
bowl_stats.head()

Unnamed: 0,Bowler,Overs,Maidens,Runs,Wickets,No_Balls,Wides,Economy
0,Boult,10,1,48,1,0,1,4.8
1,Matt Henry,10,1,48,3,0,0,4.8
2,Santner,10,0,37,2,0,1,3.7
3,Neesham,7,0,56,0,0,4,8.0
4,Rachin Ravindra,10,0,76,1,0,0,7.6


In [130]:
# Join the raw rows and the parsed figures column-wise (axis=1, aligned on the
# row index) to get the full bowling card.
final_bowling_df = pd.concat([bowling_df,bowl_stats],axis=1)
final_bowling_df.head()

Unnamed: 0,Match_id,Bowler_stats,Team,Opposite_Team,Innings,Bowler,Overs,Maidens,Runs,Wickets,No_Balls,Wides,Economy
0,1,Boult 10 1 48 1 0 1 4.80,New Zealand,England,1,Boult,10,1,48,1,0,1,4.8
1,1,Matt Henry 10 1 48 3 0 0 4.80,New Zealand,England,1,Matt Henry,10,1,48,3,0,0,4.8
2,1,Santner 10 0 37 2 0 1 3.70,New Zealand,England,1,Santner,10,0,37,2,0,1,3.7
3,1,Neesham 7 0 56 0 0 4 8.00,New Zealand,England,1,Neesham,7,0,56,0,0,4,8.0
4,1,Rachin Ravindra 10 0 76 1 0 0 7.60,New Zealand,England,1,Rachin Ravindra,10,0,76,1,0,0,7.6


In [131]:
# The raw 'Bowler_stats' text has been split into separate columns and is no
# longer needed. Reassigning with errors='ignore' (instead of inplace=True)
# keeps this cell safe to re-run: a second run no longer raises KeyError.
final_bowling_df = final_bowling_df.drop(columns='Bowler_stats', errors='ignore')

In [132]:
# Preview the bowling card after dropping the raw text column.
final_bowling_df.head()

Unnamed: 0,Match_id,Team,Opposite_Team,Innings,Bowler,Overs,Maidens,Runs,Wickets,No_Balls,Wides,Economy
0,1,New Zealand,England,1,Boult,10,1,48,1,0,1,4.8
1,1,New Zealand,England,1,Matt Henry,10,1,48,3,0,0,4.8
2,1,New Zealand,England,1,Santner,10,0,37,2,0,1,3.7
3,1,New Zealand,England,1,Neesham,7,0,56,0,0,4,8.0
4,1,New Zealand,England,1,Rachin Ravindra,10,0,76,1,0,0,7.6


### Names Normalization 

In [133]:
# England Replacement
# Only names that actually change are listed: a scorecard name that is already
# in its full form needs no entry (mapping a name to itself is a no-op).
# DataFrame.replace compares whole cell values in every column, so the
# Batsman, Fielder and Bowler columns are normalised in one pass.
final_batting_df = final_batting_df.replace({
    'Bairstow': 'Jonny Bairstow',
    'Malan': 'Dawid Malan',
    'Root': 'Joe Root',
    'Moeen': 'Moeen Ali',
    'Jos Buttler (c & wk)': 'Jos Buttler',
    'Livingstone': 'Liam Livingstone',
    'R Topley': 'Reece Topley',
    'Stokes': 'Ben Stokes',
    'Willey': 'David Willey',
})

# England bowlers in the bowling card
final_bowling_df = final_bowling_df.replace({
    'R Topley': 'Reece Topley',
    'Livingstone': 'Liam Livingstone',
    'Root': 'Joe Root',
    'Willey': 'David Willey',
    'Moeen': 'Moeen Ali',
})

In [134]:
# Pakistan Replacement
# Only names that actually change are listed; a name mapped to itself is a no-op.
final_batting_df = final_batting_df.replace({
    'Babar Azam (c)': 'Babar Azam',
    'Rizwan (wk)': 'Mohammad Rizwan',
    'Mohammad Rizwan (wk)': 'Mohammad Rizwan',
    # the short form used to be mapped to itself, leaving two spellings of the
    # same player ('Shafique' and 'Abdullah Shafique') in the output
    'Shafique': 'Abdullah Shafique',
})

# Every Pakistan bowler listed for the bowling card (Shaheen Afridi, Hasan Ali,
# Mohammad Nawaz, Haris Rauf, Shadab Khan, Iftikhar Ahmed, Usama Mir,
# Mohammad Wasim Jr, Agha Salman) was mapped to the identical name, so
# final_bowling_df needs no replacement here.

In [135]:
# Afghanistan Replacement
# Only names that actually change are listed; a name mapped to itself is a no-op.
final_batting_df = final_batting_df.replace({
    'Gurbaz (wk)': 'Rahmanullah Gurbaz',
    'Rahmat': 'Rahmat Shah',
    'Shahidi (c)': 'Hashmatullah Shahidi',
    'Najibullah': 'Najibullah Zadran',
    'Nabi': 'Mohammad Nabi',
    'Azmatullah': 'Azmatullah Omarzai',
    'Mujeeb': 'Mujeeb Ur Rahman',
    'Ikram Alikhil (wk)': 'Ikram Alikhil',
    'Hashmatullah Shahidi (c)': 'Hashmatullah Shahidi',
    'Rahmanullah Gurbaz (wk)': 'Rahmanullah Gurbaz',
})

# Afghanistan bowlers in the bowling card
final_bowling_df = final_bowling_df.replace({
    'Mujeeb': 'Mujeeb Ur Rahman',
    'Nabi': 'Mohammad Nabi',
    'Azmatullah': 'Azmatullah Omarzai',
})

In [136]:
# South Africa Replacement
# Only names that actually change are listed; a name mapped to itself is a no-op.
final_batting_df = final_batting_df.replace({
    'de Kock': 'Quinton de Kock',
    'Bavuma (c)': 'Temba Bavuma',
    'van der Dussen': 'Rassie van der Dussen',
    'Markram': 'Aiden Markram',
    'Klaasen (wk)': 'Heinrich Klaasen',
    'Miller': 'David Miller',
    'de Kock (wk)': 'Quinton de Kock',
    'Klaasen': 'Heinrich Klaasen',
    'Rabada': 'Kagiso Rabada',
    'Maharaj': 'Keshav Maharaj',
    'Markram (c)': 'Aiden Markram',
    'Shamsi': 'Tabraiz Shamsi',
    'Temba Bavuma (c)': 'Temba Bavuma',
    'Quinton de Kock (wk)': 'Quinton de Kock',
    'Phehlukwayo': 'Andile Phehlukwayo',
})

# South Africa bowlers go to the bowling card, as in every other team cell.
# NOTE(review): the bowling card keeps the short form 'Phehlukwayo' because that
# is the spelling the later Bowler_Type lookup was written against; confirm
# bowler_types covers the full name before renaming him here as well.
final_bowling_df = final_bowling_df.replace({
    'Rabada': 'Kagiso Rabada',
    'Maharaj': 'Keshav Maharaj',
    'Markram': 'Aiden Markram',
    'Shamsi': 'Tabraiz Shamsi',
})

In [137]:
# Australia Replacement
# Only names that actually change are listed; a name mapped to itself is a no-op.
final_batting_df = final_batting_df.replace({
    'Warner': 'David Warner',
    'Labuschagne': 'Marnus Labuschagne',
    'Maxwell': 'Glenn Maxwell',
    'Alex Carey (wk)': 'Alex Carey',
    'Green': 'Cameron Green',
    'Cummins (c)': 'Pat Cummins',
    'Starc': 'Mitchell Starc',
    'Zampa': 'Adam Zampa',
    'Hazlewood': 'Josh Hazlewood',
    'Stoinis': 'Marcus Stoinis',
    'Josh Inglis (wk)': 'Josh Inglis',
    'Head': 'Travis Head',
    'Pat Cummins (c)': 'Pat Cummins',
    # short form known from the bowling card; the batting card also holds
    # bowler names, so it is normalised here too
    'Abbott': 'Sean Abbott',
})

# Australia bowlers in the bowling card
final_bowling_df = final_bowling_df.replace({
    'Starc': 'Mitchell Starc',
    'Hazlewood': 'Josh Hazlewood',
    'Maxwell': 'Glenn Maxwell',
    'Cummins (c)': 'Pat Cummins',
    'Zampa': 'Adam Zampa',
    'Stoinis': 'Marcus Stoinis',
    'Head': 'Travis Head',
    'Abbott': 'Sean Abbott',
    'Pat Cummins (c)': 'Pat Cummins',
})

In [138]:
# New Zealand Replacement
# Only names that actually change are listed; a name mapped to itself is a no-op.
final_batting_df = final_batting_df.replace({
    'Conway': 'Devon Conway',
    'Latham (c & wk)': 'Tom Latham',
    'Latham (wk)': 'Tom Latham',
    'Tom Latham (c & wk)': 'Tom Latham',
    'Tom Latham (wk)': 'Tom Latham',
    # bare surname, as it appears in the Fielder column
    'Latham': 'Tom Latham',
    'Chapman': 'Mark Chapman',
    'Santner': 'Mitchell Santner',
    'Boult': 'Trent Boult',
    'Williamson (c)': 'Kane Williamson',
    'Kane Williamson (c)': 'Kane Williamson',
    # one spelling for Neesham in both cards: the bowling card uses 'Jimmy Neesham'
    'James Neesham': 'Jimmy Neesham',
    'Neesham': 'Jimmy Neesham',
    # short form known from the bowling card; the batting card also holds bowler names
    'Southee': 'Tim Southee',
})

# New Zealand bowlers in the bowling card
final_bowling_df = final_bowling_df.replace({
    'Boult': 'Trent Boult',
    'Santner': 'Mitchell Santner',
    'Neesham': 'Jimmy Neesham',
    'Southee': 'Tim Southee',
})

In [139]:
# Sri Lanka Replacement
# Only names that actually change are listed; a name mapped to itself is a no-op.
final_batting_df = final_batting_df.replace({
    'Samarawickrama (wk)': 'Sadeera Samarawickrama',
    'Samarawickrama': 'Sadeera Samarawickrama',
    'Asalanka': 'Charith Asalanka',
    'Shanaka (c)': 'Dasun Shanaka',
    'Dasun Shanaka (c)': 'Dasun Shanaka',
    'M Theekshana': 'Maheesh Theekshana',
    'Kusal Mendis (c & wk)': 'Kusal Mendis',
    'Kusal Mendis (wk)': 'Kusal Mendis',
    # 'C Karunaratne' is Chamika; only the bare surname refers to Dimuth
    'C Karunaratne': 'Chamika Karunaratne',
    'Karunaratne': 'Dimuth Karunaratne',
    'Mathews': 'Angelo Mathews',
    'Chameera': 'Dushmantha Chameera',
    'Rajitha': 'Kasun Rajitha',
})

# Sri Lanka bowlers in the bowling card
final_bowling_df = final_bowling_df.replace({
    # Rajitha must stay Rajitha; mapping him to 'Lahiru Kumara' would credit
    # his figures to another bowler
    'Rajitha': 'Kasun Rajitha',
    'Shanaka (c)': 'Dasun Shanaka',
    'Dasun Shanaka (c)': 'Dasun Shanaka',
    'C Karunaratne': 'Chamika Karunaratne',
    'M Theekshana': 'Maheesh Theekshana',
    'Mathews': 'Angelo Mathews',
    'Chameera': 'Dushmantha Chameera',
})

In [140]:
# Bangladesh Replacement
# Only names that actually change are listed; a name mapped to itself is a no-op.
final_batting_df = final_batting_df.replace({
    'Shanto': 'Najmul Hossain Shanto',
    'Shanto (c)': 'Najmul Hossain Shanto',
    'Shakib (c)': 'Shakib Al Hasan',
    'Shakib Al Hasan (c)': 'Shakib Al Hasan',
    'Mushfiqur Rahim (wk)': 'Mushfiqur Rahim',
    'Mustafizur': 'Mustafizur Rahman',
})

# Bangladesh bowlers in the bowling card
final_bowling_df = final_bowling_df.replace({
    'Mustafizur': 'Mustafizur Rahman',
    'Shakib (c)': 'Shakib Al Hasan',
    'Shakib Al Hasan (c)': 'Shakib Al Hasan',
})

In [141]:
# Netherlands Replacement
# Only names that actually change are listed; a name mapped to itself is a no-op.
final_batting_df = final_batting_df.replace({
    'Ackermann': 'Colin Ackermann',
    'Edwards (c & wk)': 'Scott Edwards',
    'Scott Edwards (c & wk)': 'Scott Edwards',
    'van Beek': 'Logan van Beek',
    'van der Merwe': 'Roelof van der Merwe',
    'van Meekeren': 'Paul van Meekeren',
    'Barresi': 'Wesley Barresi',
})

# Netherlands bowlers in the bowling card
final_bowling_df = final_bowling_df.replace({
    'van Beek': 'Logan van Beek',
    'Ackermann': 'Colin Ackermann',
    'van Meekeren': 'Paul van Meekeren',
    'van der Merwe': 'Roelof van der Merwe',
})

In [142]:
# India Replacement
# Only names that actually change are listed; a name mapped to itself is a no-op.
final_batting_df = final_batting_df.replace({
    'Rohit (c)': 'Rohit Sharma',
    'Rohit Sharma (c)': 'Rohit Sharma',
    'Kohli': 'Virat Kohli',
    'Rahul (wk)': 'KL Rahul',
    'KL Rahul (wk)': 'KL Rahul',
    # bare surname, as it appears in the Fielder column
    'Rahul': 'KL Rahul',
    'Shami': 'Mohammed Shami',
    'Bumrah': 'Jasprit Bumrah',
    'Siraj': 'Mohammed Siraj',
    'Ashwin': 'Ravichandran Ashwin',
    # short form known from the bowling card; the batting card also holds bowler names
    'Thakur': 'Shardul Thakur',
})

# India bowlers in the bowling card
final_bowling_df = final_bowling_df.replace({
    'Bumrah': 'Jasprit Bumrah',
    'Siraj': 'Mohammed Siraj',
    'Ashwin': 'Ravichandran Ashwin',
    'Thakur': 'Shardul Thakur',
    'Kohli': 'Virat Kohli',
    'Shami': 'Mohammed Shami',
    'Rohit Sharma (c)': 'Rohit Sharma',
})

In [143]:
# Bowler Names Normalization
# Final pass over the bowling card. Only the entries that change a value are
# kept: every other name in this pass was mapped to itself, which is a no-op.
# NOTE(review): 'Phehlukwayo' stays in its short form here because that is the
# spelling the Bowler_Type lookup was written against, while the batting card
# uses 'Andile Phehlukwayo'; align the two once bowler_types covers the full name.
final_bowling_df = final_bowling_df.replace({
    'Rabada': 'Kagiso Rabada',
    'Maharaj': 'Keshav Maharaj',
    'Markram': 'Aiden Markram',
    'Shamsi': 'Tabraiz Shamsi',
})

In [144]:
# Creating bowler type column
# Lookup from normalised bowler name to bowling style ("Seamer" / "Spinner").
# Short-form keys ('Rabada', 'Maharaj', 'Markram', 'Shamsi', 'Phehlukwayo') are
# kept next to the full names so the lookup succeeds for either spelling.
bowler_types = {
    'Trent Boult': "Seamer",
    'Matt Henry': "Seamer",
    'Mitchell Santner': "Spinner",
    'Jimmy Neesham': "Seamer",
    'Rachin Ravindra': "Spinner",
    'Aryan Dutt': "Seamer",  # NOTE(review): verify — he is usually listed as an off-spinner
    'Logan van Beek': "Seamer",
    'Colin Ackermann': "Spinner",
    'Paul van Meekeren': "Seamer",
    'Bas de Leede': "Seamer",
    'Roelof van der Merwe': "Spinner",
    'Vikramjit Singh': "Seamer",
    'Saqib Zulfiqar': "Spinner",
    'Ryan Klein': "Seamer",
    'Shariz Ahmad': "Spinner",
    'Taskin Ahmed': "Seamer",
    'Shoriful Islam': "Seamer",
    'Mustafizur Rahman': "Seamer",
    'Shakib Al Hasan': "Spinner",
    'Mehidy Hasan Miraz': "Spinner",
    'Mahmudullah': "Spinner",
    'Mahedi Hasan': "Spinner",
    'Hasan Mahmud': "Seamer",
    'Nasum Ahmed': "Spinner",
    'Tanzim Hasan Sakib': "Seamer",
    'Najmul Hossain Shanto': "Spinner",
    'Lahiru Kumara': "Seamer",
    'Dilshan Madushanka': "Seamer",
    'Dasun Shanaka': "Spinner",  # NOTE(review): verify — he is usually listed as a medium pacer
    'Dhananjaya de Silva': "Spinner",
    'Matheesha Pathirana': "Seamer",
    'Dunith Wellalage': "Spinner",
    'Dimuth Karunaratne': "Seamer",
    'Maheesh Theekshana': "Spinner",
    'Dushan Hemantha': "Spinner",
    'Angelo Mathews': "Seamer",
    'Dushmantha Chameera': "Seamer",
    'Charith Asalanka': "Spinner",
    'Chamika Karunaratne': "Seamer",
    'Kasun Rajitha': "Seamer",
    'Jasprit Bumrah': "Seamer",
    'Mohammed Siraj': "Seamer",
    'Hardik Pandya': "Seamer",
    'Ravichandran Ashwin': "Spinner",
    'Kuldeep Yadav': "Spinner",
    'Ravindra Jadeja': "Spinner",
    'Shardul Thakur': "Seamer",
    'Virat Kohli': "Seamer",
    'Mohammed Shami': "Seamer",
    'Shubman Gill': "Spinner",
    'Suryakumar Yadav': "Spinner",
    'Rohit Sharma': "Spinner",
    'Shaheen Afridi': "Seamer",
    'Hasan Ali': "Seamer",
    'Mohammad Nawaz': "Spinner",
    'Haris Rauf': "Seamer",
    'Shadab Khan': "Spinner",
    'Iftikhar Ahmed': "Spinner",
    'Usama Mir': "Spinner",
    'Mohammad Wasim Jr': "Seamer",
    'Agha Salman': "Spinner",
    'Mitchell Starc': "Seamer",
    'Josh Hazlewood': "Seamer",
    'Glenn Maxwell': "Spinner",
    'Pat Cummins': "Seamer",
    'Adam Zampa': "Spinner",
    'Mitchell Marsh': "Seamer",
    'Marcus Stoinis': "Seamer",
    'Travis Head': "Spinner",
    'Sean Abbott': "Seamer",
    'Cameron Green': "Seamer",
    'Chris Woakes': "Seamer",
    'Reece Topley': "Seamer",
    'Sam Curran': "Seamer",
    'Adil Rashid': "Spinner",
    'Mark Wood': "Seamer",
    'Liam Livingstone': "Spinner",
    'Joe Root': "Spinner",
    'David Willey': "Seamer",
    'Gus Atkinson': "Seamer",
    'Moeen Ali': "Spinner",
    'Lungi Ngidi': "Seamer",
    'Marco Jansen': "Seamer",
    'Rabada': "Seamer",
    'Gerald Coetzee': "Seamer",
    'Maharaj': "Spinner",
    'Markram': "Spinner",  # same player as 'Aiden Markram' below, so the same type
    'Shamsi': "Spinner",
    'Phehlukwayo': "Seamer",
    'Andile Phehlukwayo': "Seamer",  # full-name spelling of the entry above
    'Kagiso Rabada': "Seamer",
    'Keshav Maharaj': "Spinner",
    'Tabraiz Shamsi': "Spinner",
    'Lizaad Williams': "Seamer",
    'Mujeeb Ur Rahman': "Spinner",
    'Fazalhaq Farooqi': "Seamer",
    'Naveen-ul-Haq': "Seamer",
    'Mohammad Nabi': "Spinner",
    'Rashid Khan': "Spinner",
    'Azmatullah Omarzai': "Seamer",
    'Noor Ahmad': "Spinner",
    'Rahmat Shah': "Spinner",
    'Glenn Phillips': "Spinner",
    'Lockie Ferguson': "Seamer",
    'Daryl Mitchell': "Seamer",
    'Aiden Markram': "Spinner",
    'Tim Southee': "Seamer",
    'Ish Sodhi': "Spinner"
}

In [145]:
def bowler_type(bowler):
    '''
    Look up the bowling style recorded for a bowler name.

    Input  : bowler name
    Output : the matching bowler_types value, or None when the name has no entry
    '''
    if bowler in bowler_types:
        return bowler_types[bowler]
    return None

# Tag every bowling row with its bowler's style; names missing from bowler_types get None.
final_bowling_df['Bowler_Type'] = final_bowling_df['Bowler'].apply(bowler_type)

In [146]:
# Preview the bowling card with the new Bowler_Type column.
final_bowling_df.head()

Unnamed: 0,Match_id,Team,Opposite_Team,Innings,Bowler,Overs,Maidens,Runs,Wickets,No_Balls,Wides,Economy,Bowler_Type
0,1,New Zealand,England,1,Trent Boult,10,1,48,1,0,1,4.8,Seamer
1,1,New Zealand,England,1,Matt Henry,10,1,48,3,0,0,4.8,Seamer
2,1,New Zealand,England,1,Mitchell Santner,10,0,37,2,0,1,3.7,Spinner
3,1,New Zealand,England,1,Jimmy Neesham,7,0,56,0,0,4,8.0,Seamer
4,1,New Zealand,England,1,Rachin Ravindra,10,0,76,1,0,0,7.6,Spinner


In [147]:
# Persist the cleaned batting card; index=False keeps the row index out of the CSV.
# NOTE(review): absolute local path — adjust to your environment before running.
final_batting_df.to_csv(r'D:/DA_projects/CWC2023/Data/final_batting_df.csv',index=False)

In [148]:
# Persist the cleaned bowling card; index=False keeps the row index out of the CSV.
# NOTE(review): absolute local path — adjust to your environment before running.
final_bowling_df.to_csv(r'D:/DA_projects/CWC2023/Data/final_bowling_df.csv',index=False)