# CWC Schedule

In [2]:
# Libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [3]:
# Page Url
url = 'https://www.cricbuzz.com/cricket-series/6732/icc-cricket-world-cup-2023/matches'

In [4]:
webpage = requests.get(url)
webpage # Response should be 200

<Response [200]>

### MainPage Soup

In [5]:
# Whole Webpage Content of main page (HTML+CSS+JS)
print(f'Webpage Content : \n{webpage.content[:400]} \n\n Type : {type(webpage.content)}')

Webpage Content : 
b'\r\n\r\n<!DOCTYPE html><html lang="en" itemscope itemtype="http://schema.org/WebPage"><head><meta charset="utf-8"><script>var is_mobile = /symbian|tizen|midp|uc(web|browser)|MSIE (5.0|6.0|7.0|8.0)|tablet/i.test(navigator.userAgent);\tif(is_mobile && window.location.hostname != "www1.cricbuzz.com") window.location.hostname = "m.cricbuzz.com";</script><style>html{scroll-behavior: smooth;}\tbody{background' 

 Type : <class 'bytes'>


### Html parsing


In [6]:
main_soup = BeautifulSoup(webpage.content,'html.parser') # to html 
main_soup.text[:400] # First 00 characters in soup 

'\n ICC Cricket World Cup 2023 schedule, live scores and results | Cricbuzz.com  ✖Live ScoresScheduleArchivesNewsAll Stories  Premium Editorials Latest NewsTopicsSpotlightOpinionsSpecialsStats & AnalysisInterviewsLive BlogsHarsha BhogleSeries  Indian Premier League 2024 ICC Mens T20 World Cup 2024 Womens Premier League 2024 New Zealand tour of Pakistan, 2024 Sri Lanka tour of Bangladesh, 2024 Bangla'

### Separate Links Extraction

In [7]:
match_links = main_soup.find_all('a',attrs={'class':'text-hvr-underline'})
venue_links = main_soup.find_all('div',attrs={'class':'text-gray'}) 
winner = main_soup.find_all('a',attrs={'class':'cb-text-complete'})

#CrossCheck
print(f'Match Details \n : {match_links[:3]}\n')
print(f'Venue Details \n : {venue_links[:3]}\n')
print(f'Winner  Details \n : {winner[:3]}\n')

Match Details 
 : [<a class="text-hvr-underline" href="/cricket-scores/75413/eng-vs-nz-1st-match-icc-cricket-world-cup-2023" title="ENGLAND vs NEW ZEALAND Live Cricket Score and ball by ball commentary"><span>ENGLAND vs NEW ZEALAND, 1st Match</span></a>, <a class="text-hvr-underline" href="/cricket-scores/75420/pak-vs-ned-2nd-match-icc-cricket-world-cup-2023" title="PAKISTAN vs NETHERLANDS Live Cricket Score and ball by ball commentary"><span>PAKISTAN vs NETHERLANDS, 2nd Match</span></a>, <a class="text-hvr-underline" href="/cricket-scores/75427/ban-vs-afg-3rd-match-icc-cricket-world-cup-2023" title="BANGLADESH vs AFGHANISTAN Live Cricket Score and ball by ball commentary"><span>BANGLADESH vs AFGHANISTAN, 3rd Match</span></a>]

Venue Details 
 : [<div class="text-gray"> <span> 48 ODIs </span> <span class="cb-nav-dt"> . </span> <span>Oct 05</span> <span> - Nov 19</span> </div>, <div class="text-gray">Narendra Modi Stadium, Ahmedabad</div>, <div class="cb-font-12 text-gray"><span>08:30 A

In [8]:
### Corrections 
match_links = match_links[:48]
venue_links = venue_links[1:] #1st line don't have venue details 👇
print(f'match_links : {len(match_links)}')
print(f'venue_links : {len(venue_links)}')

match_links : 48
venue_links : 96


### Match_Schedule Extraction 

In [9]:
match_links[9]

<a class="text-hvr-underline" href="/cricket-scores/75465/aus-vs-rsa-10th-match-icc-cricket-world-cup-2023" title="AUSTRALIA vs SOUTH AFRICA Live Cricket Score and ball by ball commentary"><span>AUSTRALIA vs SOUTH AFRICA, 10th Match</span></a>

In [10]:
match_List = []
team_1_List = []
team_2_List = []
def match_schedule(match_links):
    '''
    This Function will Extract Match Details from Each Matchlinks
    Such as Match Fixture and Team Names
    
    Input : Match_links in HTML Format
    Output : Match_Schedule in separate list 

    Sample Input : 'ENGLAND vs NEW ZEALAND, 1st Match'
    '''
    # For Easy Understanding please see above Input
    for i in match_links:
        row = i.text # Extracting only text from HTML  
        
        first = row.index('v') # First Team Name will come before v in "VS"   
        start = row.find('s') #  Second Team Name will come after s in "VS"
        coma = row.find(',')  # This is placeholder for each match fixture 
        
        only_match = row.split(',')[0] # Every Match detail 
        team_1 = row[:first].strip().capitalize() # Removing spaces & Capitalizing 
        team_2 = row[start+1:coma].strip().capitalize() 
    
        match_List.append(only_match) 
        team_1_List.append(team_1)
        team_2_List.append(team_2)
        
    # Cross Check
    print(f'Match Details : {match_List[:3]} {len(match_List)}')
    print(f'Team_1 Names : {team_1_List[:3]} {len(match_List)}')
    print(f'Team_2 Names : {team_2_List[:3]}{len(match_List)}')

match_schedule(match_links)

Match Details : ['ENGLAND vs NEW ZEALAND', 'PAKISTAN vs NETHERLANDS', 'BANGLADESH vs AFGHANISTAN'] 48
Team_1 Names : ['England', 'Pakistan', 'Bangladesh'] 48
Team_2 Names : ['New zealand', 'Netherlands', 'Afghanistan']48


### Venue & City Extraction

In [11]:
venue_links[:4],len(venue_links)

([<div class="text-gray">Narendra Modi Stadium, Ahmedabad</div>,
  <div class="cb-font-12 text-gray"><span>08:30 AM </span>GMT	/<span> 02:00 PM</span> LOCAL</div>,
  <div class="text-gray">Rajiv Gandhi International Stadium, Hyderabad</div>,
  <div class="cb-font-12 text-gray"><span>08:30 AM </span>GMT	/<span> 02:00 PM</span> LOCAL</div>],
 96)

#### Note ⚠️
- Observe Length of venue_links it is having 96 but we have only 48 matches so divide it by 2 slow slow wait🫷
- 1st element has Venue & City Name
- 2nd Element don't have any details we are going to skip consecutive 1 element by using for loop


In [12]:
Venue = []
City = []
def venue_city(venue_links):
    '''
    This Function will Extract the Venue & City Names from Venu_links 
    Input : Venue HTML tag
    Output : Venue & City in Separate list(s)
    
    
    Sample Input : 'Narendra Modi Stadium, Ahmedabad'
    '''

    # For Easy Understanding please see above Input
    for i in range(0,len(venue_links),2): # please see above note 
        venue_city = venue_links[i].text  # Venue_City Extraction
        venue = venue_city.split(',')[0].strip() # Venue Extraction
        city = venue_city.split(',')[1].strip()  # City Extraction
        
        Venue.append(venue)
        City.append(city)

    
    # Crosscheck
    print(f'Venue Details : {Venue[:3]} {len(Venue)}')
    print(f'City Details : {City[:3]} {len(City)}')

venue_city(venue_links)

Venue Details : ['Narendra Modi Stadium', 'Rajiv Gandhi International Stadium', 'Himachal Pradesh Cricket Association Stadium'] 48
City Details : ['Ahmedabad', 'Hyderabad', 'Dharamsala'] 48


### Winner & Match Info

In [13]:
def player_of_the_match_extraction(url):
    '''
    This Function will return the POTM (Player of the Match) for Each Match 
    Input : Each Match url 
    Output : POTM Name

    Sample Input : 
    '''
    
    
    potm_soup = requests.get(url)
    potm_soup_html = BeautifulSoup(potm_soup.content,'html.parser')
    potm_soup_html_obj = potm_soup_html.find_all('a',attrs={'class':'cb-link-undrln'}) #POTM Class
    player = BeautifulSoup(str(potm_soup_html_obj), 'html.parser')
    
    return player.a.get_text()

In [14]:
winner[9].text

'South Africa won by 134 runs'

In [15]:
Winner_Team = []
Info =[]
Win_Type = []
Player_of_Match = []
def winner_team(winner):
    '''
    This Function will extract Match Information from Winner tag
    Such as Winner_Team,
            Match_Info (url for Full Details of Each Match)
            Player of the Match

    Sample Input : <a class="cb-text-complete" href="/cricket-scores/75465/aus-vs-rsa-10th-match-icc-cricket-world-cup-2023">South Africa won by 134 runs</a>
    '''
    
    for winner_team in winner:
        
        wn_txt = winner_team.text # Text Extraction from HTML
        
        # urls extraction 
        url = winner_team.get('href')  # url Extraction 
        
        # POTM Match url : generc cricbuzz url + Each match url
        player_of_match_url = 'https://www.cricbuzz.com/'+url 

        # Scoreboard Url : Scoreboard generic url + Each Match url 
        match_url = 'https://www.cricbuzz.com/live-cricket-scorecard'+ url[15:]

        # Eg : 'South Africa won by 134 runs'
        # won Attributes 
        win_index = wn_txt.index('won') # Index of w for Extaction of Winner Team Name 
        team = wn_txt[:win_index].strip() 
        won_type = wn_txt[wn_txt.find('by')+2:].strip() # Index of w for Extaction of Won Type
        player = player_of_the_match_extraction(player_of_match_url) # Extraction of POTM Name 

        # Exporting
        Win_Type.append(won_type)
        Winner_Team.append(team)
        Info.append(match_url)
        Player_of_Match.append(player)


    # Crosscheck
    print(f'Win_Types  : {Win_Type[:3]} {len(Win_Type)} \n')
    print(f'Winning_Team  : {Winner_Team[:3]} {len(Winner_Team)} \n')
    print(f'Match_Info urls  : {Info[:3]} {len(Info)} \n ')
    print(f'Player_of_Matches  : {Player_of_Match[:3]} {len(Player_of_Match)}\n')
        
winner_team(winner)

Win_Types  : ['9 wkts', '81 runs', '6 wkts'] 48 

Winning_Team  : ['New Zealand', 'Pakistan', 'Bangladesh'] 48 

Match_Info urls  : ['https://www.cricbuzz.com/live-cricket-scorecard/75413/eng-vs-nz-1st-match-icc-cricket-world-cup-2023', 'https://www.cricbuzz.com/live-cricket-scorecard/75420/pak-vs-ned-2nd-match-icc-cricket-world-cup-2023', 'https://www.cricbuzz.com/live-cricket-scorecard/75427/ban-vs-afg-3rd-match-icc-cricket-world-cup-2023'] 48 
 
Player_of_Matches  : ['Rachin Ravindra', 'Saud Shakeel', 'Mehidy Hasan Miraz'] 48



### Final Df

In [16]:
df = pd.DataFrame(
        {'Match':match_List,
         'Team1':team_1_List,
         'Team2':team_2_List,
         'Venue':Venue,
         'City':City,
         'Winner':Winner_Team,
         'Won_By':Win_Type,
         'Player_of_the_Match':Player_of_Match,
         'Match_Info':Info
        })
pd.set_option('display.max_colwidth', None) #for Full View

In [18]:
match_nums = list(range(1,49))
df['Match_id'] = match_nums 

df.set_index('Match_id',inplace=True) #Setting Match_id for future purpose 
df.reset_index(inplace=True)

In [19]:
df.head()

Unnamed: 0,Match_id,Match,Team1,Team2,Venue,City,Winner,Won_By,Player_of_the_Match,Match_Info
0,1,ENGLAND vs NEW ZEALAND,England,New zealand,Narendra Modi Stadium,Ahmedabad,New Zealand,9 wkts,Rachin Ravindra,https://www.cricbuzz.com/live-cricket-scorecard/75413/eng-vs-nz-1st-match-icc-cricket-world-cup-2023
1,2,PAKISTAN vs NETHERLANDS,Pakistan,Netherlands,Rajiv Gandhi International Stadium,Hyderabad,Pakistan,81 runs,Saud Shakeel,https://www.cricbuzz.com/live-cricket-scorecard/75420/pak-vs-ned-2nd-match-icc-cricket-world-cup-2023
2,3,BANGLADESH vs AFGHANISTAN,Bangladesh,Afghanistan,Himachal Pradesh Cricket Association Stadium,Dharamsala,Bangladesh,6 wkts,Mehidy Hasan Miraz,https://www.cricbuzz.com/live-cricket-scorecard/75427/ban-vs-afg-3rd-match-icc-cricket-world-cup-2023
3,4,SOUTH AFRICA vs SRI LANKA,South africa,Sri lanka,Arun Jaitley Stadium,Delhi,South Africa,102 runs,Aiden Markram,https://www.cricbuzz.com/live-cricket-scorecard/75434/rsa-vs-sl-4th-match-icc-cricket-world-cup-2023
4,5,INDIA vs AUSTRALIA,India,Australia,MA Chidambaram Stadium,Chennai,India,6 wkts,KL Rahul,https://www.cricbuzz.com/live-cricket-scorecard/75437/ind-vs-aus-5th-match-icc-cricket-world-cup-2023


In [23]:
df.to_csv('D:/DA_projects/CWC2023/Data/schedule.csv',index=False)