In [25]:
import time
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Scrape an ESPNcricinfo full-scorecard page: report the venue and collect the
# unique player names that appear in the first four tables of the page.

# Matches the markers attached to player names: any parenthesised text such as
# "(c)" (together with the whitespace before it) and the dagger sign "†".
PLAYER_MARKER_RE = re.compile(r"\s*\([^)]*\)|†")


def fetch_page_source(url, timeout=10, settle_seconds=2):
    """Load *url* in Chrome and return the rendered HTML.

    Args:
        url: Address of the page to load.
        timeout: Maximum number of seconds to wait for the <body> element.
        settle_seconds: Pause after scrolling so dynamic content can load.

    Returns:
        The page source as reported by the driver after scrolling.

    The browser is always shut down, even when loading or waiting fails, so a
    failed run does not leave a Chrome process behind.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        # Scroll to the bottom of the page to ensure content is loaded.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(settle_seconds)  # Allow time for dynamic content to load
        return driver.page_source
    finally:
        driver.quit()


def split_ground_title(title):
    """Split a ground link title into ``(stadium_name, city)``.

    The text after the last ", " is taken as the city, so a title with more
    than one comma no longer breaks tuple unpacking. A title without any
    ", " yields an empty city.

    Returns:
        A ``(stadium_name, city)`` tuple, or ``None`` when *title* is missing
        or blank.
    """
    if not title or not title.strip():
        return None
    text = title.strip()
    name, separator, city = text.rpartition(", ")
    if not separator:
        # rpartition puts the whole string in the last slot when there is no
        # separator; treat it as a stadium name with an unknown city.
        return text, ""
    return name.strip(), city.strip()


def clean_player_name(raw: str) -> str:
    """Return *raw* without parenthesised markers, "†" and outer whitespace."""
    return PLAYER_MARKER_RE.sub("", raw).strip()


def extract_player_names(tables):
    """Collect the unique cleaned player names found in *tables*.

    For every row inside a table's <tbody>, the first <td> is inspected for a
    <span class="ds-text-tight-s">; its text is cleaned and, when non-empty,
    added to the result.

    Returns:
        A set of player name strings.
    """
    names = set()
    for table in tables:
        tbody = table.find('tbody')
        if tbody is None:
            continue
        for row in tbody.find_all('tr'):
            player_cell = row.find('td')
            if player_cell is None:
                continue
            name_span = player_cell.find('span', class_='ds-text-tight-s')
            if name_span is None:
                continue
            clean_name = clean_player_name(name_span.text)
            if clean_name:
                names.add(clean_name)
    return names


if __name__ == "__main__":
    # Fetch the rendered page; the browser is closed before parsing starts.
    url = "https://www.espncricinfo.com/series/indian-premier-league-2023-1345038/gujarat-titans-vs-chennai-super-kings-final-1370353/full-scorecard"
    html_content = fetch_page_source(url)
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract stadium name and city from the first link to a ground page.
    stadium_info = soup.find("a", href=re.compile("/cricket-grounds/"))
    ground = split_ground_title(stadium_info.get("title")) if stadium_info else None
    if ground:
        stadium_name, city = ground
        print(f"Stadium Name: {stadium_name}\nCity: {city or 'Unknown'}")
    else:
        print("Stadium information not found.")

    # Only the first four tables are processed.
    tables = soup.find_all('table', limit=4)
    player_names = extract_player_names(tables)

    print("Found tables:", len(tables))
    print("Unique Player Names:", player_names)
    print(f"Total number of unique players: {len(player_names)}")

Stadium Name: Narendra Modi Stadium
City: Ahmedabad
Found tables: 4
Unique Player Names: {'Shivam Dube', 'Shubman Gill', 'Ambati Rayudu', 'Ruturaj Gaikwad', 'Rashid Khan', 'Mohammed Shami', 'Sai Sudharsan', 'Ravindra Jadeja', 'Deepak Chahar', 'MS Dhoni', 'Wriddhiman Saha', 'Hardik Pandya', 'Noor Ahmad', 'Mohit Sharma', 'Maheesh Theekshana', 'Devon Conway', 'Ajinkya Rahane', 'Josh Little', 'Matheesha Pathirana', 'Tushar Deshpande'}
Total number of unique players: 20
