In [11]:
import re
def parse_summary(summary):
    bowler = "N/A"
    batsman = "N/A"
    runs = 0
    is_wicket = False
    bowler_score = 0  
    
    try:
        if " to " in summary:
            parts = summary.split(" to ", 1)
            bowler = parts[0].strip()
            rest = parts[1].strip()
            
            if "," in rest:
                subparts = rest.split(",", 1)
                batsman = subparts[0].strip()
                outcome = subparts[1].strip()
                
                if "OUT" in outcome or "WICKET" in outcome:
                    runs = 0
                    is_wicket = True
                    bowler_score = 5  # +5 for a wicket
                elif outcome == "no run":
                    runs = 0
                    bowler_score = 1  # +1 for no run
                elif "FOUR" in outcome or "4 run" in outcome:
                    runs = 4
                    bowler_score = -2  # -2 for a four
                elif "SIX" in outcome or "6 run" in outcome:
                    runs = 6
                    bowler_score = -3  # -3 for a six
                elif "leg bye" in outcome or "bye" in outcome:
                    if "leg bye" in outcome:
                        match = re.search(r'(\d+) leg bye', outcome)
                        runs = int(match.group(1)) if match else 1
                    else:  # regular bye
                        match = re.search(r'(\d+) bye', outcome)
                        runs = int(match.group(1)) if match else 1
                    bowler_score = -0.25  # -0.25 for leg bye or bye
                elif "wide" in outcome:
                    # Wides are always at least 1 run
                    match = re.search(r'(\d+) wide', outcome)
                    runs = int(match.group(1)) if match else 1
                    bowler_score = -1  # -1 for wide
                elif "no ball" in outcome:
                    # No balls are 1 run plus any runs scored
                    runs = 1  # Start with 1 for the no ball
                    
                    # Check if additional runs were scored
                    run_match = re.search(r'(\d+) run', outcome)
                    if run_match:
                        runs += int(run_match.group(1))
                    bowler_score = -1  # -1 for no ball (similar to wide)
                # Regular runs
                else:
                    match = re.search(r'(\d+) run', outcome)
                    if match:
                        runs = int(match.group(1))
                        # For regular runs (not 0, 4, or 6)
                        if runs > 0:
                            bowler_score = -runs / 2  # Negative score proportional to runs
        
        return bowler, batsman, runs, is_wicket, bowler_score
    except Exception as e:
        print(f"Error parsing summary: {e}")
        return "N/A", "N/A", 0, False, 0

In [4]:
def preprocess_raw_data(df):
    if df["Ball Number"].dtype == 'object':
        df[["over_str", "ball_str"]] = df["Ball Number"].str.split(".", expand=True)
        df["over"] = pd.to_numeric(df["over_str"], errors="coerce").fillna(-1).astype(int)
        df["ball"] = pd.to_numeric(df["ball_str"], errors="coerce").fillna(-1).astype(int)
        
        df["sort_order"] = df["over"] * 6 + df["ball"]
        df = df.sort_values(by=["sort_order"]).reset_index(drop=True)
        df = df.drop(columns=["over_str", "ball_str", "sort_order"])
    
    df["Ball"] = df.index + 1
    return df

In [5]:
def extract_commentary_data(page_source):
    soup = BeautifulSoup(page_source, "html.parser")
    
    ball_containers = soup.find_all("div", class_=lambda x: x and "ds-text-tight-l" in x and "ds-flex" in x)
    
    ball_numbers = []
    summaries = []
    descriptions = []
    
    for container in ball_containers:
        ball_span = container.find("span", class_="ds-text-tight-s ds-font-regular ds-mb-1 lg:ds-mb-0 lg:ds-mr-3 ds-block ds-text-center ds-text-typo-mid1")
        ball_text = ball_span.get_text(strip=True) if ball_span else "N/A"
        
        summary_div = container.find("div", class_="ds-leading-[16px] lg:ds-leading-none ds-mb-0.5")
        summary_text = summary_div.get_text(strip=True) if summary_div else "N/A"
        
        desc_p = container.find("p", class_="ci-html-content first-letter:ds-capitalize ds-leading-[24px]")
        desc_text = desc_p.get_text(strip=True) if desc_p else "N/A"
        
        ball_numbers.append(ball_text)
        summaries.append(summary_text)
        descriptions.append(desc_text)
    
    df = pd.DataFrame({
        "Ball Number": ball_numbers,
        "Summary": summaries,
        "Description": descriptions
    })
    
    return df

In [6]:
def analyze_match_data(df, match_id, target, city):
    # Apply the parse_summary function to extract data from the summary
    parsed_data = df["Summary"].apply(parse_summary)
    df['bowler'], df['batsman'], df['Runs'], df['is_wicket'], df['bowler_score'] = zip(*parsed_data)
    
    # Evaluate bowler performance for each ball
    df['bowler_good'] = df.apply(lambda row: 1 if row['bowler_score'] > 0 else 0, axis=1)
    
    # Ensure Runs column is numeric
    df['Runs'] = pd.to_numeric(df['Runs'], errors='coerce').fillna(0)
    
    # Add metadata
    df['match_id'] = match_id
    df['city'] = city
    
    # Identify extras and calculate legal deliveries
    df['is_extra'] = df['Summary'].str.contains(r'wide|no ball', case=False, regex=True)
    df['legal_delivery'] = ~df['is_extra']
    df['legal_deliveries_count'] = df['legal_delivery'].cumsum()
    
    # Calculate overs properly based on legal deliveries
    df['overs_completed'] = (df['legal_deliveries_count'] - 1) // 6
    df['balls_in_current_over'] = ((df['legal_deliveries_count'] - 1) % 6) + 1
    
    # Handle the case where legal_deliveries_count is 0 (first ball)
    df.loc[df['legal_deliveries_count'] == 0, 'overs_completed'] = 0
    df.loc[df['legal_deliveries_count'] == 0, 'balls_in_current_over'] = 0
    
    # Calculate decimal overs
    df['overs_bowled'] = df['overs_completed'] + (df['balls_in_current_over'] / 6)
    df['overs_completed'] = df['overs_completed'].clip(lower=0)
    df['overs_bowled'] = df['overs_bowled'].clip(lower=0)
    
    # Calculate run statistics
    df['total_runs'] = df['Runs'].cumsum()
    df['current_run_rate'] = df.apply(
        lambda row: row['total_runs'] / max(row['overs_bowled'], 0.1) if row['overs_bowled'] > 0 else 0,
        axis=1
    )
    
    # Add match metadata and calculate target-related stats
    total_overs = 20  # Standard T20 match
    df['total_overs'] = total_overs
    df['target'] = target
    df['runs_needed'] = np.maximum(target - df['total_runs'], 0)
    df['overs_remaining'] = np.maximum(df['total_overs'] - df['overs_bowled'], 0)
    
    # Calculate required run rate
    df['required_run_rate'] = df.apply(
        lambda row: row['runs_needed'] / np.maximum(row['overs_remaining'], 0.1) 
                    if row['overs_remaining'] > 0 else float('inf'),
        axis=1
    )
    
    # Calculate wicket statistics
    df['wickets_fallen'] = df['is_wicket'].cumsum()
    df['wickets_in_hand'] = 10 - df['wickets_fallen']
    
    # Clean up unnecessary columns
    df = df.drop(columns=['Summary', 'balls_in_current_over', 'Description', 'bowler_good'])
    
    return df

In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import numpy as np
import random

import urllib3, socket
from urllib3.connection import HTTPConnection
    
def extraction2(url, target, id, city):
    print("Starting extraction...")
    match_id = id
    HTTPConnection.default_socket_options = ( 
            HTTPConnection.default_socket_options + [
            (socket.SOL_SOCKET, socket.SO_SNDBUF, 1000000), #1MB in byte
            (socket.SOL_SOCKET, socket.SO_RCVBUF, 1000000)
        ])
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--no-sandbox")
    #chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    driver = webdriver.Chrome(options=chrome_options)
    driver.set_page_load_timeout(300)  # Increased timeout
    driver.set_script_timeout(300)     # Increased timeout
    
    try:
        driver.get(url)
        try:
            WebDriverWait(driver, 60).until(  # Increased wait time
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.ds-text-tight-l.ds-flex"))
            )
        except Exception as e:
            print(f"Error waiting for page to load: {e}")
            driver.quit()
            return None
        
        scroll_pause_time = 1
        
        while True:
            driver.execute_script("window.scrollBy({ top: 200, behavior: 'smooth' });")
            time.sleep(0.2) 
                    
            # Check for the presence of "0.1" in ball numbers
            ball_spans = driver.find_elements(By.CSS_SELECTOR, "span.ds-text-tight-s.ds-font-regular.ds-mb-1.lg\\:ds-mb-0.lg\\:ds-mr-3.ds-block.ds-text-center.ds-text-typo-mid1")
            ball_texts = [span.text.strip() for span in ball_spans if span.text.strip()]
            if "0.1" in ball_texts:
                break
        
        
        time.sleep(2)
        page_source = driver.page_source
        
    except Exception as e:
        print(f"An error occurred: {e}")
        try:
            driver.quit()
        except:
            pass
        return None
    finally:
        try:
            driver.quit()
            print("Browser closed")
        except:
            print("Error closing browser")

    
    df = extract_commentary_data(page_source)
    df.to_csv("cricket_commentary_raw.csv", index=False)
    
    try:
        df = preprocess_raw_data(df)
        df.to_csv("commentary_processed.csv", index=False)
        
        df = analyze_match_data(df, int(id), int(target), city)
        
        # Save the final processed commentary CSV
        df.to_csv("processed_cricket_commentary.csv", index=False)
        print("Final processed dataset saved as 'processed_cricket_commentary.csv'.")
        
        # Save the enhanced dataset with metadata calculations
        filename = f"{match_id}b.csv"
        df.to_csv(filename, index=False)
        print(f"Dataset saved as '{filename}'.")
        
        
    except Exception as e:
        print(f"Error processing cricket commentary: {e}")
        import traceback
        traceback.print_exc()

In [8]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import numpy as np
import random

import urllib3, socket
from urllib3.connection import HTTPConnection

def extraction1(url, target, id, city):
    print("Starting extraction...")
    match_id = id
    HTTPConnection.default_socket_options = ( 
            HTTPConnection.default_socket_options + [
            (socket.SOL_SOCKET, socket.SO_SNDBUF, 1000000), #1MB in byte
            (socket.SOL_SOCKET, socket.SO_RCVBUF, 1000000)
        ])
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--no-sandbox")
    #chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    driver = webdriver.Chrome(options=chrome_options)
    driver.set_page_load_timeout(300)  # Increased timeout
    driver.set_script_timeout(300)     # Increased timeout
    
    try:
        driver.get(url)
        try:
            WebDriverWait(driver, 60).until(  # Increased wait time
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.ds-text-tight-l.ds-flex"))
            )
        except Exception as e:
            print(f"Error waiting for page to load: {e}")
            driver.quit()
            return None
            
        # Add click functionality for innings selection
        try:
            innings_dropdown_trigger = WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "ds-popper-wrapper"))
            )
            
            dropdown = innings_dropdown_trigger[1]
            print("Got dropdown", dropdown)
            dropdown.click()
            print("Innings dropdown opened.")
            
            # Locate the innings options within the popup
            innings_options = WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located((By.XPATH, "//div[@data-tippy-root]//li[@class='ds-w-full ds-flex']"))
            )
            print(f"Found {len(innings_options)} innings options in popup.")
            
            # Click the first innings tab
            if len(innings_options) >= 1:
                first_innings_tab = innings_options[0]  # First option (index 0)
                tab_title = first_innings_tab.get_attribute("title")
                print(f"First innings tab identified: {tab_title}")
                try:
                    first_innings_tab.click()
                    print("First innings tab clicked.")
                except Exception as e:
                    print(f"Failed to click tab: {e}")
                    driver.execute_script("arguments[0].click();", first_innings_tab)  # Fallback
                    print("Clicked via JavaScript fallback.")
            else:
                print("No innings options found in popup.")
            
            # Wait for the first innings commentary to load
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "ds-leading-[16px]"))
            )
            print("First innings commentary loaded successfully.")
            
        except Exception as e:
            print(f"Error selecting innings: {e}")
            # Continue with the script even if innings selection fails
        
        # Original scroll code
        scroll_pause_time = 1
        
        while True:
            driver.execute_script("window.scrollBy({ top: 200, behavior: 'smooth' });")
            time.sleep(0.2) 
                    
            # Check for the presence of "0.1" in ball numbers
            ball_spans = driver.find_elements(By.CSS_SELECTOR, "span.ds-text-tight-s.ds-font-regular.ds-mb-1.lg\\:ds-mb-0.lg\\:ds-mr-3.ds-block.ds-text-center.ds-text-typo-mid1")
            ball_texts = [span.text.strip() for span in ball_spans if span.text.strip()]
            if "0.1" in ball_texts:
                break
        
        
        time.sleep(2)
        page_source = driver.page_source
        
    except Exception as e:
        print(f"An error occurred: {e}")
        try:
            driver.quit()
        except:
            pass
        return None
    finally:
        try:
            driver.quit()
            print("Browser closed")
        except:
            print("Error closing browser")
    
    df = extract_commentary_data(page_source)
    df.to_csv("cricket_commentary_raw.csv", index=False)
    
    try:
        df = preprocess_raw_data(df)
        df.to_csv("commentary_processed.csv", index=False)
        
        df = analyze_match_data(df, int(id), int(target), city)
        
        # Save the final processed commentary CSV
        df.to_csv("processed_cricket_commentary.csv", index=False)
        print("Final processed dataset saved as 'processed_cricket_commentary.csv'.")
        
        # Save the enhanced dataset with metadata calculations
        filename = f"{match_id}a.csv"
        df.to_csv(filename, index=False)
        print(f"Dataset saved as '{filename}'.")
        
        
    except Exception as e:
        print(f"Error processing cricket commentary: {e}")
        import traceback
        traceback.print_exc()

In [10]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import re

def get_target_and_commentary_urls(url, start_match=1):
    print("Starting script to fetch target scores and commentary URLs...")
    driver = webdriver.Chrome()
    
    try:
        driver.get(url)
        time.sleep(5)  # Allow time for page to load fully
    
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        matches = soup.find_all('div', class_='ds-p-4 ds-border-y ds-border-line')
        print(f"Found {len(matches)} match containers.")
        
        extracted_data = []
        match_id = start_match
        
        for match in matches[start_match-1:]:
            print(f"\nProcessing match {match_id}...")
            link_tag = match.find('a', href=True)
            if not link_tag:
                print("No link tag found; skipping this match.")
                match_id += 1
                continue
            original_url = 'https://www.espncricinfo.com' + link_tag['href']
            
            url_parts = original_url.split('/')
            url_parts[-1] = 'ball-by-ball-commentary'
            commentary_url = '/'.join(url_parts)
            
            # Extract the city
            city = None
            details_div = match.find('div', class_='ds-text-tight-s ds-font-regular ds-truncate ds-text-typo-mid3')
            if details_div:
                span = details_div.find('span', class_='ds-text-tight-s ds-font-medium ds-text-typo')
                if span:
                    next_text = span.next_sibling
                    if next_text and isinstance(next_text, str):
                        city_part = next_text.strip().lstrip('•').strip()
                        city = city_part.split(',')[0].strip()
            
            # Extract target as before
            target = None
            team_scores = match.find_all('div', class_='ci-team-score')
            for score_div in team_scores:
                score_text_div = score_div.find('div', class_='ds-text-compact-s ds-text-typo ds-text-right ds-whitespace-nowrap')
                if score_text_div:
                    target_span = score_text_div.find('span', class_='ds-text-compact-xs ds-mr-0.5')
                    if target_span and target_span.text.strip():
                        target_match = re.search(r'T:(\d+)', target_span.text)
                        if target_match:
                            target = int(target_match.group(1))
                            break
            
            if target is None:
                target = 0
                print("No target score found for this match.")
            
            # Append all data, including city
            extracted_data.append({
                'match_id': match_id,
                'city': city,
                'target_score': target,
                'commentary_url': commentary_url
            })
            
            extraction2(commentary_url, target, match_id, city)
            extraction1(commentary_url, target, match_id, city)
            match_id += 1
        
        print("\nData extraction complete.")
        return extracted_data
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return extracted_data  
    
    finally:
        print("Closing browser in 5 seconds...")
        time.sleep(5)
        driver.quit()
        print("Browser closed.")


if __name__ == "__main__":
    schedule_url = "https://www.espncricinfo.com/series/indian-premier-league-2023-1345038/match-schedule-fixtures-and-results"
    print("If the script stopped previously, you can resume from a specific match.")
    user_input = input("Enter the match number to start from (or press Enter to start from 1): ")        
    start_match = int(user_input) if user_input else 1
    
    print(f"Starting scraping from match number {start_match}")
    match_details = get_target_and_commentary_urls(schedule_url, start_match)
    

If the script stopped previously, you can resume from a specific match.


Enter the match number to start from (or press Enter to start from 1):  1


Starting scraping from match number 1
Starting script to fetch target scores and commentary URLs...
Found 74 match containers.

Processing match 1...
Starting extraction...
Browser closed
Final processed dataset saved as 'processed_cricket_commentary.csv'.
Dataset saved as '1b.csv'.
Starting extraction...
Got dropdown <selenium.webdriver.remote.webelement.WebElement (session="57b0c0031d56625256307cdac082d8d5", element="f.861C03155102765E6CCEC75CA5D0E805.d.86A7D846D0B592BCD7E341BEA01E8D29.e.39")>
Innings dropdown opened.
Found 2 innings options in popup.
First innings tab identified: CSK 
First innings tab clicked.
Error selecting innings: Message: invalid selector: An invalid or illegal selector was specified
  (Session info: chrome=134.0.6998.35); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalid-selector-exception
Stacktrace:
#0 0x5ca761fecffa <unknown>
#1 0x5ca761aab970 <unknown>
#2 0x5ca761ab250e <unknown>

KeyboardInterrupt: 