# Match Wise Data Scraping

### Data Inspection

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [2]:
# Show complete cell contents (e.g. the full scorecard URLs) instead of truncating them
pd.set_option('display.max_colwidth', None)

# Match schedule produced earlier; its Match_Info column holds one scorecard URL per match
schedule_df = pd.read_csv(r"F:\DA Projects\CWC2023\Data\schedule.csv")
schedule_df.head(n=3)

Unnamed: 0.1,Unnamed: 0,Match_id,Match,Team1,Team2,Venue,City,Winner,Won_By,Player_of_the_Match,Match_Info,Toss
0,0,1,ENGLAND vs NEW ZEALAND,England,New zealand,Narendra Modi Stadium,Ahmedabad,New Zealand,9 wkts,Rachin Ravindra,https://www.cricbuzz.com/live-cricket-scorecard/75413/eng-vs-nz-1st-match-icc-cricket-world-cup-2023,New Zealand
1,1,2,PAKISTAN vs NETHERLANDS,Pakistan,Netherlands,Rajiv Gandhi International Stadium,Hyderabad,Pakistan,81 runs,Saud Shakeel,https://www.cricbuzz.com/live-cricket-scorecard/75420/pak-vs-ned-2nd-match-icc-cricket-world-cup-2023,Netherlands
2,2,3,BANGLADESH vs AFGHANISTAN,Bangladesh,Afghanistan,Himachal Pradesh Cricket Association Stadium,Dharamsala,Bangladesh,6 wkts,Mehidy Hasan Miraz,https://www.cricbuzz.com/live-cricket-scorecard/75427/ban-vs-afg-3rd-match-icc-cricket-world-cup-2023,Bangladesh


### Connection Checking

In [3]:
def url_check(urls):
    '''
    Check that every match scorecard URL can be fetched.

    Each URL is requested once. A response with HTTP status 200 counts as
    reachable; for any other status the match slug (the last path segment of
    the URL) is printed.

    Input : iterable of match URLs
    Output : 'Request granted for <n> Matches !' where <n> is the number of
             URLs answered with status 200, or 'Request error:, <exception>'
             if a request raises requests.exceptions.RequestException
             (checking stops at that URL)
    '''
    try:
        cnt = 0
        for url_iterator in urls:
            # timeout so a stalled connection cannot hang the notebook forever
            response = requests.get(url_iterator, timeout=30)
            # status code checking
            if response.status_code == 200:
                cnt += 1
            else:
                # the last path segment identifies the match,
                # e.g. 'eng-vs-nz-1st-match-icc-cricket-world-cup-2023'
                match = url_iterator.rstrip('/').rsplit('/', 1)[-1]
                print(f"Unable to access for this {match} Match")

        return f'Request granted for {cnt} Matches !'

    except requests.exceptions.RequestException as e:
        return f'Request error:, {e}'

# Scorecard URL of every match, taken from the Match_Info column of the schedule
urls = schedule_df['Match_Info']
# Print the summary string returned by url_check
print(url_check(urls))

Request granted for 48 Matches !


# All Matches one by one 

### Each Match Data 

In [4]:
# Example of the cleaned scorecard text of one match (England vs New Zealand).
# The list has six parts, in this order:
#   [0] 1st innings batting   [1] 1st innings bowling   [2] 1st innings powerplay
#   [3] 2nd innings batting   [4] 2nd innings bowling   [5] 2nd innings powerplay
sample_data = ['England Innings 282-9 (50 Ov)   Batter  R B 4s 6s SR      Bairstow    c Daryl Mitchell b Santner  33 35 4 1 94.29        Malan    c Latham b Matt Henry  14 24 2 0 58.33        Root    b Glenn Phillips  77 86 4 1 89.53        Harry Brook    c Conway b Rachin Ravindra  25 16 4 1 156.25        Moeen    b Glenn Phillips  11 17 1 0 64.71        Jos Buttler (c & wk)    c Latham b Matt Henry  43 42 2 2 102.38        Livingstone    c Matt Henry b Boult  20 22 3 0 90.91        Sam Curran    c Latham b Matt Henry  14 19 0 0 73.68        Chris Woakes    c Will Young b Santner  11 12 1 0 91.67        Adil Rashid    not out  15 13 0 1 115.38        Mark Wood    not out  13 14 0 0 92.86      Extras  6  \xa0(b 0, lb 0, w 6, nb 0, p 0)   Total  282   \xa0(9 wkts, 50 Ov)',
 'Bowler O M R W NB WD ECO     Boult   10 1 48 1 0 1 4.80        Matt Henry   10 1 48 3 0 0 4.80        Santner   10 0 37 2 0 1 3.70        Neesham   7 0 56 0 0 4 8.00        Rachin Ravindra   10 0 76 1 0 0 7.60        Glenn Phillips   3 0 17 2 0 0 5.70',
 'Powerplays Overs Runs   Mandatory 0.1-10 51',
 'New Zealand Innings 283-1 (36.2 Ov)   Batter  R B 4s 6s SR      Devon Conway    not out  152 121 19 3 125.62        Will Young    c Jos Buttler b Sam Curran  0 1 0 0 0.00        Rachin Ravindra    not out  123 96 11 5 128.12      Extras  8  \xa0(b 4, lb 1, w 3, nb 0, p 0)   Total  283   \xa0(1 wkts, 36.2 Ov)     Did not Bat   Daryl Mitchell , Tom Latham (c & wk) , Glenn Phillips , Mark Chapman , James Neesham , Mitchell Santner , Matt Henry , Trent Boult',
 'Bowler O M R W NB WD ECO     Chris Woakes   6 0 45 0 0 0 7.50        Sam Curran   6 2 47 1 0 2 7.80        Mark Wood   5 0 55 0 0 1 11.00        Moeen Ali   9.2 0 60 0 0 0 6.40        Adil Rashid   7 0 47 0 0 0 6.70        Liam Livingstone   3 0 24 0 0 0 8.00',
 'Powerplays Overs Runs   Mandatory 0.1-10 81']

## First Innings Batting

In [5]:
# Accumulates one dict per 1st-innings batter across all scraped matches
bat1 = []
def first_innings_batting(first_innings_bat,match_number,opposite_team):
    '''
    Extract the 1st innings batting rows of one match and append them to bat1.
    For the expected text layout see the sample data cell above.

    Input : first_innings_bat - 1st innings batting text, e.g. 'England Innings 282-9 ...'
            match_number      - value stored in the Match_id field
            opposite_team     - batting text of the other innings; only its
                                leading team name is used
    Output : None. One dict per batter (keys Match_id, Score, Team,
             Opposite_Team, Innings) is appended to the global list bat1
    '''
    # Batter rows start right after the 'SR' column header
    start = first_innings_bat.find('SR')
    # and end just before the 'Extras' summary
    end = first_innings_bat.find('Extras')
    # The team name is everything before the 'Innings' marker. Cutting at the
    # marker (rather than at the first capital 'I') keeps names such as
    # 'West Indies' intact and still works for 'India'.
    team = first_innings_bat.partition('Innings')[0].strip()
    # Opposite team name, extracted the same way
    opp_team = opposite_team.partition('Innings')[0].strip()
    # Actual data
    actual = first_innings_bat[start+2:end].strip()
    # Consecutive batter rows are separated by exactly eight spaces
    for i in actual.split('        '):
        bat1.append({'Match_id':match_number,
                     'Score':i,
                     'Team':team,
                     'Opposite_Team':opp_team,
                     'Innings':'1'})

In [6]:
sample_data[1]

'Bowler O M R W NB WD ECO     Boult   10 1 48 1 0 1 4.80        Matt Henry   10 1 48 3 0 0 4.80        Santner   10 0 37 2 0 1 3.70        Neesham   7 0 56 0 0 4 8.00        Rachin Ravindra   10 0 76 1 0 0 7.60        Glenn Phillips   3 0 17 2 0 0 5.70'

## First Innings Bowling

In [7]:
# Accumulates one dict per bowler who bowled in the 1st innings, across all matches
bowl1 = []
def first_innings_bowling(first_innings_bowl,sample,match_number,opposite_team):
    '''
    Extract the 1st innings bowling rows of one match and append them to bowl1.
    For the expected text layout see the sample data cell above.

    Input : first_innings_bowl - 1st innings bowling text ('Bowler O M R W NB WD ECO ...')
            sample             - innings text that starts with the bowling
                                 team's name, e.g. 'New Zealand Innings ...'
            match_number       - value stored in the Match_id field
            opposite_team      - innings text that starts with the other team's name
    Output : None. One dict per bowler (keys Match_id, Bowling, Team,
             Opposite_Team, Innings) is appended to the global list bowl1
    '''
    # Bowler rows start right after the 'ECO' column header
    start = first_innings_bowl.find('ECO')
    # The team name is everything before the 'Innings' marker. Cutting at the
    # marker (rather than at the first capital 'I') keeps names such as
    # 'West Indies' intact and still works for 'India'.
    team = sample.partition('Innings')[0].strip()
    # Opposite team name, extracted the same way
    opp_team = opposite_team.partition('Innings')[0].strip()
    # Actual data
    actual = first_innings_bowl[start+3:].strip()
    # Consecutive bowler rows are separated by exactly eight spaces
    for i in actual.split('        '):
        bowl1.append({'Match_id':match_number,
                      'Bowling':i,
                      'Team':team,
                      'Opposite_Team':opp_team,
                      'Innings':'1'})

In [8]:
#first_innings_bowling
sample_data[3]

'New Zealand Innings 283-1 (36.2 Ov)   Batter  R B 4s 6s SR      Devon Conway    not out  152 121 19 3 125.62        Will Young    c Jos Buttler b Sam Curran  0 1 0 0 0.00        Rachin Ravindra    not out  123 96 11 5 128.12      Extras  8  \xa0(b 4, lb 1, w 3, nb 0, p 0)   Total  283   \xa0(1 wkts, 36.2 Ov)     Did not Bat   Daryl Mitchell , Tom Latham (c & wk) , Glenn Phillips , Mark Chapman , James Neesham , Mitchell Santner , Matt Henry , Trent Boult'

## Second Innings Batting

In [9]:
# Accumulates one dict per 2nd-innings batter across all scraped matches
bat2 = []
def second_innings_batting(second_innings_bat,match_number,opposite_team):
    '''
    Extract the 2nd innings batting rows of one match and append them to bat2.
    For the expected text layout see the sample data cell above.

    Input : second_innings_bat - 2nd innings batting text, e.g. 'New Zealand Innings 283-1 ...'
            match_number       - value stored in the Match_id field
            opposite_team      - batting text of the other innings; only its
                                 leading team name is used
    Output : None. One dict per batter (keys Match_id, Score, Team,
             Opposite_Team, Innings) is appended to the global list bat2
    '''
    # Batter rows start right after the 'SR' column header
    start = second_innings_bat.find('SR')
    # and end just before the 'Extras' summary
    end = second_innings_bat.find('Extras')
    # The team name is everything before the 'Innings' marker. Cutting at the
    # marker (rather than at the first capital 'I') keeps names such as
    # 'West Indies' intact and still works for 'India'.
    team = second_innings_bat.partition('Innings')[0].strip()
    # Opposite team name, extracted the same way
    opp_team = opposite_team.partition('Innings')[0].strip()
    # Actual data
    actual = second_innings_bat[start+2:end].strip()
    # Consecutive batter rows are separated by exactly eight spaces
    for i in actual.split('        '):
        bat2.append({'Match_id':match_number,
                     'Score':i,
                     'Team':team,
                     'Opposite_Team':opp_team,
                     'Innings':'2'})



In [10]:
sample_data[4]

'Bowler O M R W NB WD ECO     Chris Woakes   6 0 45 0 0 0 7.50        Sam Curran   6 2 47 1 0 2 7.80        Mark Wood   5 0 55 0 0 1 11.00        Moeen Ali   9.2 0 60 0 0 0 6.40        Adil Rashid   7 0 47 0 0 0 6.70        Liam Livingstone   3 0 24 0 0 0 8.00'

## Second Innings Bowling

In [11]:
# Accumulates one dict per bowler who bowled in the 2nd innings, across all matches
bowl2 = []
def second_innings_bowling(second_innings_bowl,sample,match_number,opposite_team):
    '''
    Extract the 2nd innings bowling rows of one match and append them to bowl2.
    For the expected text layout see the sample data cell above.

    Input : second_innings_bowl - 2nd innings bowling text ('Bowler O M R W NB WD ECO ...')
            sample              - innings text that starts with the bowling
                                  team's name, e.g. 'England Innings ...'
            match_number        - value stored in the Match_id field
            opposite_team       - innings text that starts with the other team's name
    Output : None. One dict per bowler (keys Match_id, Bowling, Team,
             Opposite_Team, Innings) is appended to the global list bowl2
    '''
    # Bowler rows start right after the 'ECO' column header
    start = second_innings_bowl.find('ECO')
    # The team name is everything before the 'Innings' marker. Cutting at the
    # marker (rather than at the first capital 'I') keeps names such as
    # 'West Indies' intact and still works for 'India'.
    team = sample.partition('Innings')[0].strip()
    # Opposite team name, extracted the same way
    opp_team = opposite_team.partition('Innings')[0].strip()
    # Actual data
    actual = second_innings_bowl[start+3:].strip()
    # Consecutive bowler rows are separated by exactly eight spaces
    for i in actual.split('        '):
        bowl2.append({'Match_id':match_number,
                      'Bowling':i,
                      'Team':team,
                      'Opposite_Team':opp_team,
                      'Innings':'2'})



### Note
The cleaned data for each match looks like this:

![Add Image](https:/)

    - The cleaned data for each match has 6 parts:
    - 1st Innings Batting
    - 1st Innings Bowling
    - 1st Innings Powerplay
    - 2nd Innings Batting
    - 2nd Innings Bowling
    - 2nd Innings Powerplay

In [12]:
# Toss winner of every match, in the order the match URLs are processed
toss = []
def give_me_soup(match_urls):
    '''
    Main driver: download every match scorecard and pass its parts to the
    four innings parsers, collecting the toss winner along the way.

    For each URL the page is fetched and parsed with BeautifulSoup, the
    scorecard sections are cleaned, and the text is handed to
    first_innings_batting, first_innings_bowling, second_innings_batting and
    second_innings_bowling. The toss winner is appended to the global list toss.

    The match id is the 1-based position of the URL in match_urls, so ids stay
    aligned with the schedule even if a match cannot be downloaded. When a
    request fails or returns a non-200 status, a message is printed, None is
    appended to toss as a placeholder, and processing continues with the next URL.

    Input : iterable of match scorecard URLs
    Output: None. Batting and bowling rows accumulate in the lists filled by
            the four parsers; toss winners accumulate in toss
    '''
    for match_id, match_url in enumerate(match_urls, start=1):
        try:
            # timeout so a stalled connection cannot hang the notebook forever
            response = requests.get(match_url, timeout=30)
        except requests.exceptions.RequestException as e:
            print("Request error:", e)
            toss.append(None)   # keep toss aligned with match_urls
            continue

        if response.status_code != 200: # Status Code check for each match
            # the last path segment of the URL identifies the match
            match = match_url.rstrip('/').rsplit('/', 1)[-1]
            print(f"Unable to access for this {match} Match")
            toss.append(None)   # keep toss aligned with match_urls
            continue

        # Extraction
        match_soup = BeautifulSoup(response.content,'html.parser')  # Html parsing
        match_score = match_soup.find_all('div',attrs={'class':'cb-col cb-col-100 cb-ltst-wgt-hdr'}) # scorecard class
        toss_txt = match_soup.find_all('div',attrs={'class':'cb-col cb-col-73'})[2].text # Toss Class
        cleaned = [i.text.strip() for i in match_score]  # cleaned data for each match

        # Toss text starts with the winning team followed by 'won'; parse it
        # before the parsers run so a malformed page fails before any rows
        # of this match are stored
        win_index = toss_txt.index('won')
        toss_winner = toss_txt[:win_index].strip()

        # cleaned[0] / cleaned[3] are the 1st / 2nd innings batting texts,
        # cleaned[1] / cleaned[4] the matching bowling texts
        # (indexes 2 and 5 are the powerplay parts and are not used)
        first_innings_batting(cleaned[0],match_id,cleaned[3])
        first_innings_bowling(cleaned[1],cleaned[3],match_id,cleaned[0])
        second_innings_batting(cleaned[3],match_id,cleaned[0])
        second_innings_bowling(cleaned[4],cleaned[0],match_id,cleaned[3])

        toss.append(toss_winner)
        print(f'{match_id} Match Data Uploaded!')


# Scrape every match URL in urls; one progress line is printed per scraped match
give_me_soup(urls)

1 Match Data Uploaded!
2 Match Data Uploaded!
3 Match Data Uploaded!
4 Match Data Uploaded!
5 Match Data Uploaded!
6 Match Data Uploaded!
7 Match Data Uploaded!
8 Match Data Uploaded!
9 Match Data Uploaded!
10 Match Data Uploaded!
11 Match Data Uploaded!
12 Match Data Uploaded!
13 Match Data Uploaded!
14 Match Data Uploaded!
15 Match Data Uploaded!
16 Match Data Uploaded!
17 Match Data Uploaded!
18 Match Data Uploaded!
19 Match Data Uploaded!
20 Match Data Uploaded!
21 Match Data Uploaded!
22 Match Data Uploaded!
23 Match Data Uploaded!
24 Match Data Uploaded!
25 Match Data Uploaded!
26 Match Data Uploaded!
27 Match Data Uploaded!
28 Match Data Uploaded!
29 Match Data Uploaded!
30 Match Data Uploaded!
31 Match Data Uploaded!
32 Match Data Uploaded!
33 Match Data Uploaded!
34 Match Data Uploaded!
35 Match Data Uploaded!
36 Match Data Uploaded!
37 Match Data Uploaded!
38 Match Data Uploaded!
39 Match Data Uploaded!
40 Match Data Uploaded!
41 Match Data Uploaded!
42 Match Data Uploaded!
4

In [13]:
# Store the toss winners collected by give_me_soup as the Toss column.
# toss is filled in the order the URLs were processed; assigning a list
# requires len(toss) == len(schedule_df).
schedule_df['Toss'] = toss

In [14]:
schedule_df.shape

(48, 12)

In [15]:
# Cap the displayed cell width at 13 characters so the wide schedule preview stays compact
pd.set_option('display.max_colwidth', 13)
schedule_df.head()

Unnamed: 0.1,Unnamed: 0,Match_id,Match,Team1,Team2,Venue,City,Winner,Won_By,Player_of_the_Match,Match_Info,Toss
0,0,1,ENGLAND v...,England,New zealand,Narendra ...,Ahmedabad,New Zealand,9 wkts,Rachin Ra...,https://w...,New Zealand
1,1,2,PAKISTAN ...,Pakistan,Netherlands,Rajiv Gan...,Hyderabad,Pakistan,81 runs,Saud Shakeel,https://w...,Netherlands
2,2,3,BANGLADES...,Bangladesh,Afghanistan,Himachal ...,Dharamsala,Bangladesh,6 wkts,Mehidy Ha...,https://w...,Bangladesh
3,3,4,SOUTH AFR...,South africa,Sri lanka,Arun Jait...,Delhi,South Africa,102 runs,Aiden Mar...,https://w...,Sri Lanka
4,4,5,INDIA vs ...,India,Australia,MA Chidam...,Chennai,India,6 wkts,KL Rahul,https://w...,Australia


### Dataframes Creation

In [16]:
# Turn each list of scraped row dicts into its own DataFrame
# (1st/2nd innings batting, then 1st/2nd innings bowling)
bat_1_df, bat_2_df, bowl_1_df, bowl_2_df = (
    pd.DataFrame(rows) for rows in (bat1, bat2, bowl1, bowl2)
)

In [17]:
bat_1_df.tail()

Unnamed: 0,Match_id,Score,Team,Opposite_Team,Innings
468,48,Suryakuma...,India,Australia,1
469,48,Shami ...,India,Australia,1
470,48,Bumrah ...,India,Australia,1
471,48,Kuldeep Y...,India,Australia,1
472,48,Siraj ...,India,Australia,1


In [18]:
bat_2_df[10:13]

Unnamed: 0,Match_id,Score,Team,Opposite_Team,Innings
10,2,Roelof va...,Netherlands,Pakistan,2
11,2,Logan van...,Netherlands,Pakistan,2
12,2,Aryan Dut...,Netherlands,Pakistan,2


In [19]:
bowl_2_df.head()

Unnamed: 0,Match_id,Bowling,Team,Opposite_Team,Innings
0,1,Chris Woa...,England,New Zealand,2
1,1,Sam Curra...,England,New Zealand,2
2,1,Mark Wood...,England,New Zealand,2
3,1,Moeen Ali...,England,New Zealand,2
4,1,Adil Rash...,England,New Zealand,2


In [20]:
bowl_2_df.tail()

Unnamed: 0,Match_id,Bowling,Team,Opposite_Team,Innings
271,48,Jasprit B...,India,Australia,2
272,48,Mohammed ...,India,Australia,2
273,48,Ravindra ...,India,Australia,2
274,48,Kuldeep Y...,India,Australia,2
275,48,Mohammed ...,India,Australia,2


In [21]:
bat_1_df.shape,bat_2_df.shape

((473, 5), (403, 5))

In [22]:
bowl_1_df.shape,bowl_2_df.shape

((298, 5), (276, 5))

In [23]:
# Stack 1st and 2nd innings rows into one frame per discipline.
# ignore_index=True renumbers the rows 0..n-1; without it the second frame's
# labels restart at 0, so label-based lookups such as .loc[0] would return
# two rows and the exported index column would contain duplicates.
batting_df = pd.concat([bat_1_df,bat_2_df],axis=0,ignore_index=True)
bowling_df = pd.concat([bowl_1_df,bowl_2_df],axis=0,ignore_index=True)

In [24]:
batting_df.shape,bowling_df.shape

((876, 5), (574, 5))

In [25]:
# Exporting for Data Cleaning
batting_df.to_csv('F:/DA Projects/CWC2023/Data/pre_batting_df.csv')
bowling_df.to_csv('F:/DA Projects/CWC2023/Data/pre_bowling_df.csv')
schedule_df.to_csv('F:/DA Projects/CWC2023/Data/schedule.csv')