In [None]:
!pip install requests beautifulsoup4 selenium pandas
!pip install webdriver-manager


In [68]:
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [69]:
# Ball-by-ball commentary page that the cells below scrape.
url = (
    "https://www.espncricinfo.com/series/icc-champions-trophy-2024-25-1459031/"
    "india-vs-new-zealand-final-1466428/ball-by-ball-commentary"
)

In [74]:

# Scrape the commentary page with Selenium: open it, dismiss a popup if one is
# present, scroll until the first delivery ("0.1") is rendered, then hand the
# final HTML to BeautifulSoup. Produces `page_source` and `soup`.

SCROLL_PAUSE_TIME = 1  # seconds to wait after each scroll step
max_scrolls = 100  # Prevent infinite loops

# Selector for the ball-number label of each commentary entry.
BALL_NUMBER_SELECTOR = "span.ds-text-tight-s.ds-font-regular.ds-mb-1.lg\\:ds-mb-0.lg\\:ds-mr-3.ds-block.ds-text-center.ds-text-typo-mid1"

# Set up the WebDriver
driver = webdriver.Chrome()
try:
    driver.get(url)

    # Wait for initial content to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.ds-text-tight-l.ds-flex"))
    )

    # Handle potential popups (inspect the page to find actual selectors)
    try:
        # `until` returns the located element, so no second lookup is needed.
        close_button = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".ad-close-button"))  # Example selector
        )
        close_button.click()
        print("Closed popup.")
    except WebDriverException:
        # Covers the wait timing out and any failure to click; unrelated errors
        # (including KeyboardInterrupt) are no longer swallowed.
        print("No popup found or couldn’t close it.")

    # Scroll until the first ball ("0.1") is loaded
    scroll_count = 0
    while scroll_count < max_scrolls:
        # Scroll down by a fixed increment
        driver.execute_script("window.scrollBy(0, 500);")
        time.sleep(SCROLL_PAUSE_TIME)

        # Check for the presence of "0.1" in ball numbers; each `.text` access
        # is a round trip to the browser, so read it once per element.
        ball_spans = driver.find_elements(By.CSS_SELECTOR, BALL_NUMBER_SELECTOR)
        ball_texts = [text for text in (span.text.strip() for span in ball_spans) if text]
        if "0.1" in ball_texts:
            print("First ball '0.1' found. All commentaries loaded.")
            break
        scroll_count += 1
    else:
        # Runs only when the loop ended without `break`, i.e. the cap was hit.
        print("Reached maximum scrolls without finding '0.1'. Check for loading issues or adjust max_scrolls.")

    # Get the fully rendered page source
    page_source = driver.page_source
finally:
    # Always close the browser, even when a wait or script call raised.
    driver.quit()

# Parse the page source with BeautifulSoup
soup = BeautifulSoup(page_source, "html.parser")


No popup found or couldn’t close it.
First ball '0.1' found. All commentaries loaded.


In [75]:
# Locate every commentary entry in the parsed page.
ball_containers = soup.find_all(
    "div",
    class_=lambda css: css and "ds-text-tight-l" in css and "ds-flex" in css,
)
print(f"Found {len(ball_containers)} commentary containers.")


def _field_text(container, tag_name, css_classes):
    """Return the stripped text of the first matching child tag, or "N/A"."""
    node = container.find(tag_name, class_=css_classes)
    return node.get_text(strip=True) if node else "N/A"


# One list per output column, in page order.
ball_numbers = [
    _field_text(entry, "span", "ds-text-tight-s ds-font-regular ds-mb-1 lg:ds-mb-0 lg:ds-mr-3 ds-block ds-text-center ds-text-typo-mid1")
    for entry in ball_containers
]
summaries = [
    _field_text(entry, "div", "ds-leading-[16px] lg:ds-leading-none ds-mb-0.5")
    for entry in ball_containers
]
descriptions = [
    _field_text(entry, "p", "ci-html-content first-letter:ds-capitalize ds-leading-[24px]")
    for entry in ball_containers
]

# Assemble the raw table and persist it before any processing.
df = pd.DataFrame(
    {
        "Ball Number": ball_numbers,
        "Summary": summaries,
        "Description": descriptions,
    }
)
df.to_csv("cricket_commentary_raw.csv", index=False)
print("Raw dataset saved as 'cricket_commentary_raw.csv'.")

# Process the data: order deliveries chronologically and number them.

# Parse "over.ball" into two integer sort keys. `str.extract` always yields
# both columns, so this also works when no value contains a "." (where
# `str.split(..., expand=True)` would produce a single column and the
# two-column assignment would fail). Unparseable values become -1.
ball_parts = df["Ball Number"].str.extract(r"^(\d+)\.(\d+)$")
df["over"] = pd.to_numeric(ball_parts[0], errors="coerce").fillna(-1).astype(int)
df["ball"] = pd.to_numeric(ball_parts[1], errors="coerce").fillna(-1).astype(int)

# The page lists deliveries newest-first, so among rows that share a ball
# number (e.g. a wide followed by the re-bowled delivery) the row that appears
# later on the page was bowled earlier. Use the page position as a descending
# tiebreak so those rows come out in bowling order.
df["page_pos"] = range(len(df))
df = (
    df.sort_values(by=["over", "ball", "page_pos"], ascending=[True, True, False])
    .reset_index(drop=True)
)

# Add sequential Ball column
df["Ball"] = df.index + 1

# Save processed data (helper columns are dropped by the selection)
df = df[["Ball", "Ball Number", "Summary", "Description"]]
df.to_csv("cricket_commentary_processed.csv", index=False)
print("Processed dataset saved as 'cricket_commentary_processed.csv'.")

Found 300 commentary containers.
Raw dataset saved as 'cricket_commentary_raw.csv'.
Processed dataset saved as 'cricket_commentary_processed.csv'.


In [80]:


# Re-parse the scraped HTML, extract one row per commentary entry, print a
# short preview and build `df` for the processing cell that follows.

# Parse the page source with BeautifulSoup
soup = BeautifulSoup(page_source, "html.parser")

# Set to True to also write the extracted rows to disk. The export is off by
# default, and the messages below now report what actually happened instead
# of always claiming the file was saved.
SAVE_DATASET = False

# Lists to store extracted data
ball_numbers = []
summaries = []
descriptions = []

# Find all commentary containers
ball_containers = soup.find_all("div", class_=lambda x: x and "ds-text-tight-l" in x and "ds-flex" in x)

if not ball_containers:
    print("No commentary containers found. Check the class names or page structure.")
else:
    print(f"Found {len(ball_containers)} commentary containers.")
    for container in ball_containers:
        # Each field falls back to "N/A" when its tag is missing from the entry.
        # Extract ball number
        ball_span = container.find("span", class_="ds-text-tight-s ds-font-regular ds-mb-1 lg:ds-mb-0 lg:ds-mr-3 ds-block ds-text-center ds-text-typo-mid1")
        ball_text = ball_span.get_text(strip=True) if ball_span else "N/A"

        # Extract summary
        summary_div = container.find("div", class_="ds-leading-[16px] lg:ds-leading-none ds-mb-0.5")
        summary_text = summary_div.get_text(strip=True) if summary_div else "N/A"

        # Extract description
        desc_p = container.find("p", class_="ci-html-content first-letter:ds-capitalize ds-leading-[24px]")
        desc_text = desc_p.get_text(strip=True) if desc_p else "N/A"

        # Append to lists
        ball_numbers.append(ball_text)
        summaries.append(summary_text)
        descriptions.append(desc_text)

    # Print the first 5 entries to verify
    for i in range(min(5, len(ball_numbers))):
        print(f"Ball: {ball_numbers[i]}")
        print(f"Summary: {summaries[i]}")
        print(f"Description: {descriptions[i]}")
        print("-" * 50)

    # Create a DataFrame and optionally save it to CSV
    data = {
        "Ball Number": ball_numbers,
        "Summary": summaries,
        "Description": descriptions
    }
    df = pd.DataFrame(data)
    print("Preview of the dataset:")
    print(df.head())
    if SAVE_DATASET:
        df.to_csv("cricket_commentary_dataset.csv", index=False)
        print("Dataset saved as 'cricket_commentary_dataset.csv'.")
    else:
        print("Dataset not saved (SAVE_DATASET is False).")

Found 300 commentary containers.
Ball: 48.6
Summary: O'Rourke to Jadeja,FOUR runs
Description: INDIA WIN THE CHAMPIONS TROPHY!A third title for them in the competition. And this surely has to be the most comprehensive. Unbeaten all through the tournament and have rarely been challenged at any stage. It is Ravindra Jadeja, who hits the winning runs. Pulls this past deep square leg and into the fence. Raises his arms in delight, so does Rahul as the rest of the team rushes out to the middle.A second straight ICC title for India.
--------------------------------------------------
Ball: 48.5
Summary: O'Rourke to Rahul,1 run
Description: Banged in short, he nails the pull to deep square leg on the bounce
--------------------------------------------------
Ball: 48.4
Summary: O'Rourke to Jadeja,1 run
Description: Fuller in length on off, he pushes to mid-off and sets off
--------------------------------------------------
Ball: 48.3
Summary: O'Rourke to Jadeja,no run
Description: Good length b

In [81]:
import pandas as pd
import re



# Step 1: Split "Ball Number" ("over.ball") into integer "over" and "ball" keys.
# `str.extract` always returns both columns, and `to_numeric(errors="coerce")`
# maps the "N/A" placeholder used for missing ball numbers to -1 instead of
# raising, as a plain `astype(int)` would.
ball_parts = df["Ball Number"].str.extract(r"^(\d+)\.(\d+)$")
df["over"] = pd.to_numeric(ball_parts[0], errors="coerce").fillna(-1).astype(int)
df["ball"] = pd.to_numeric(ball_parts[1], errors="coerce").fillna(-1).astype(int)

# Step 2: Sort the DataFrame by "over" and "ball" in ascending order.
# The scraped rows are newest-first, so for rows sharing a ball number (extras
# that are re-bowled) the later page position is the earlier delivery; the
# descending tiebreak keeps those rows in bowling order.
df["page_pos"] = range(len(df))
df = (
    df.sort_values(by=["over", "ball", "page_pos"], ascending=[True, True, False])
    .drop(columns="page_pos")
    .reset_index(drop=True)
)

# Step 3: Add "Ball" column with sequential numbers starting from 1
df["Ball"] = df.index + 1
# Step 4: Define a function to parse the "Summary" column
def parse_summary(summary):
    """Split a commentary summary into bowler, batsman and runs.

    The expected shape is ``"<bowler> to <batsman>,<outcome>"``, for example
    ``"O'Rourke to Rahul,1 run"``.

    Args:
        summary: Summary text of one delivery.

    Returns:
        A ``(bowler, batsman, runs)`` tuple. ``runs`` is an ``int`` for the
        outcomes "no run", "FOUR runs", "SIX runs" and "<n> run(s)", and the
        string ``"N/A"`` for any other outcome. When ``summary`` is not a
        string or lacks the " to " / "," separators, all three items are
        ``"N/A"``.
    """
    unparsed = ("N/A", "N/A", "N/A")

    # Explicit guards replace the former bare `except:`, so only genuinely
    # malformed input maps to the placeholder and real errors stay visible.
    if not isinstance(summary, str):
        return unparsed

    bowler, to_sep, remainder = summary.partition(" to ")
    batsman, comma, outcome = remainder.partition(",")
    if not to_sep or not comma:
        return unparsed

    outcome = outcome.strip()
    fixed_outcomes = {"no run": 0, "FOUR runs": 4, "SIX runs": 6}
    if outcome in fixed_outcomes:
        runs = fixed_outcomes[outcome]
    else:
        # Numeric outcomes such as "1 run" or "3 runs".
        match = re.match(r"(\d+) runs?", outcome)
        runs = int(match.group(1)) if match else "N/A"  # Handle unexpected outcomes

    return bowler.strip(), batsman.strip(), runs

# Step 5: Apply the parsing function and create new columns.
# The parsed tuples are expanded in a single DataFrame construction rather
# than with `.apply(pd.Series)`, which builds one Series per row and yields no
# columns at all when the frame is empty (breaking the three-column assignment).
summary_columns = ["Bowler", "Batsman", "Runs"]
parsed = pd.DataFrame(
    df["Summary"].map(parse_summary).tolist(),
    index=df.index,  # keep row alignment with `df`
    columns=summary_columns,
)
df[summary_columns] = parsed

# Step 6: Select and reorder columns
df = df[["Ball", "Ball Number", "Bowler", "Batsman", "Runs", "Description"]]

# Display the processed DataFrame
print("Processed DataFrame:")
print(df)

# Save to CSV
df.to_csv("processed_cricket_commentary.csv", index=False)
print("\nProcessed dataset saved as 'processed_cricket_commentary.csv'.")

Processed DataFrame:
     Ball Ball Number    Bowler       Batsman Runs  \
0       1         0.1  Jamieson  Rohit Sharma    0   
1       2         0.2  Jamieson  Rohit Sharma    6   
2       3         0.3  Jamieson  Rohit Sharma    2   
3       4         0.4  Jamieson  Rohit Sharma    0   
4       5         0.5  Jamieson  Rohit Sharma    0   
..    ...         ...       ...           ...  ...   
295   296        48.2  O'Rourke         Rahul    1   
296   297        48.3  O'Rourke        Jadeja    0   
297   298        48.4  O'Rourke        Jadeja    1   
298   299        48.5  O'Rourke         Rahul    1   
299   300        48.6  O'Rourke        Jadeja    4   

                                           Description  
0    Away swing, low bounce, beats the outside edge...  
1    Pulled behind square. Old-school Rohit, standi...  
2    Touch too straight, clipped off the pads. Thro...  
3           Length on fourth, extra bounce. Good leave  
4    A bit of away movement, that was late sw