# Scroll function

In [10]:
####################################################################################
############################ Get Selenium driver details    ########################
####################################################################################
def get_driver_details(url):
    """Start a Chrome WebDriver session and load ``url`` in it.

    Args:
        url: Address passed to ``driver.get``.

    Returns:
        The ``webdriver.Chrome`` instance after ``get(url)`` has been called.
        The caller owns the session; nothing in this function closes it.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager

    # The value returned by ChromeDriverManager().install() is handed to Service.
    installed_driver = ChromeDriverManager().install()
    chrome_service = Service(installed_driver)

    browser = webdriver.Chrome(service=chrome_service)
    browser.get(url)
    return browser

########################################################################################
##################### Scroll function to scroll the page       #########################
########################################################################################

def scroll_page(driver, scroll_pause_time=5, initial_wait=2, max_scrolls=None):
    """Scroll the page loaded in ``driver`` downwards, one screen height at a time.

    After every scroll the function pauses, re-reads
    ``document.body.scrollHeight`` (it can change once new content has been
    loaded) and stops as soon as the next scroll target would lie beyond that
    height.

    Args:
        driver: Object exposing ``execute_script(js)``, e.g. a Selenium WebDriver.
        scroll_pause_time: Seconds to sleep after each scroll. Defaults to 5.
        initial_wait: Seconds to sleep before the first measurement so the page
            can open. Defaults to 2.
        max_scrolls: Optional upper bound on the number of scroll steps. ``None``
            (the default) keeps scrolling until the end-of-page condition is
            met, which never happens on a page that grows without limit.

    Returns:
        None.
    """
    import time

    time.sleep(initial_wait)  # allow the web page to open
    # The scroll step is the screen height reported by the browser.
    screen_height = driver.execute_script("return window.screen.height;")

    i = 1
    while max_scrolls is None or i <= max_scrolls:
        # Scroll to i screen heights from the top of the page.
        driver.execute_script(
            "window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i)
        )
        i += 1
        time.sleep(scroll_pause_time)
        # Re-read the total height after each scroll: it may have grown.
        scroll_height = driver.execute_script("return document.body.scrollHeight;")
        # Stop when the next scroll target is past the end of the page.
        if screen_height * i > scroll_height:
            break


# Function to select innings by clicking the dropdown (Only for 1st Innings)

In [14]:
########################################################################################
##################### Function to select innings by clicking the dropdown      ########
########################################################################################

def select_innings(driver, innings):
    """Click the collapsed dropdown, then click the option labelled ``innings``.

    Both elements are awaited for up to 10 seconds each. The option is matched
    on its exact text, which is ``innings`` followed by a single space.

    Args:
        driver: Selenium WebDriver with the commentary page loaded.
        innings: Label of the option to pick (the notebook passes 'AUS'/'IND').

    Returns:
        None. If either wait times out, a message is printed and the function
        returns without raising.
    """
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException

    toggle_locator = (By.XPATH, "//div[@aria-expanded='false']//span[contains(@class, 'ds-text-tight-s')]")

    try:
        # Open the dropdown as soon as it becomes clickable.
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(toggle_locator)).click()

        option_locator = (By.XPATH, f"//span[contains(@class, 'ds-text-typo') and text()='{innings} ']")
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(option_locator))

        # Look the option up again after the wait and click it.
        driver.find_element(*option_locator).click()

    except TimeoutException:
        print(f"Failed to select {innings} Innings. Element not found within the specified timeout.")


# Function to convert the raw score text into a more usable form

In [15]:
def get_items(s):
    """Translate one score token into ``{'action', 'ball_type', 'run'}``.

    Rules, checked in this order:
      * contains 'W'  -> wicket on a valid ball; remaining digits (if any) are the runs
      * contains 'w'  -> wide
      * contains 'nb' -> noball
      * contains 'lb' -> leg bye
      * contains 'b'  -> bye
      * equals '•'    -> dot ball, 0 runs
      * anything else -> runs off a valid ball, ``int(s)``

    For the four extras markers, a token with nothing left after removing the
    marker yields the blank default ``{'action': '', 'ball_type': '', 'run': 0}``.

    Raises:
        ValueError: if the remaining text cannot be parsed by ``int``.
    """
    if 'W' in s:
        remainder = s.replace('W', '').strip()
        # A bare 'W' is a wicket with no runs.
        return {'action': 'wicket', 'ball_type': 'valid', 'run': int(remainder) if remainder else 0}

    # Order matters: 'nb' and 'lb' must be tried before the bare 'b'.
    extras = (('w', 'wide'), ('nb', 'noball'), ('lb', 'leg bye'), ('b', 'bye'))
    for marker, label in extras:
        if marker not in s:
            continue
        remainder = s.replace(marker, '').strip()
        if not remainder:
            # Marker without a number: leave the default, unlabelled record.
            return {'action': '', 'ball_type': '', 'run': 0}
        return {'action': 'run', 'ball_type': label, 'run': int(remainder)}

    if s == '•':
        return {'action': 'dot', 'ball_type': 'valid', 'run': 0}

    # Plain numeric token: runs scored off a valid delivery.
    return {'action': 'run', 'ball_type': 'valid', 'run': int(s)}


# Full Match details 

In [20]:

########################################################################################
##################### Infinite page scraper : example ESPN page      ###################
########################################################################################

def _split_bowler_batsman(bbb_text):
    """Return ``(bowler, batsman)``: the first word on each side of ``" to "``.

    Only the text before the first comma is considered. The separator is the
    space-delimited word " to ", so names that merely contain the letters "to"
    (e.g. "Stoinis") are not cut in half. Either element is ``None`` when it
    cannot be found.
    """
    headline = bbb_text.split(",")[0]
    bowler_part, _, batsman_part = headline.partition(" to ")
    bowler_words = bowler_part.split()
    batsman_words = batsman_part.split()
    return (
        bowler_words[0] if bowler_words else None,
        batsman_words[0] if batsman_words else None,
    )


def get_infinite_page_info(url,innings):
    """Scrape ball-by-ball commentary for one innings into a DataFrame and a CSV.

    Steps: open ``url`` in a browser, select ``innings`` in the dropdown, scroll
    the page, parse the resulting HTML, and emit exactly one row per commentary
    block found.

    Args:
        url: Commentary page to open.
        innings: Innings label; forwarded to ``select_innings`` and used in the
            output file name ``WC_{innings}_Innings.csv``.

    Returns:
        pandas.DataFrame with columns ``over, action, ball_type, run, Score,
        bowler, batsman``. A field is ``None`` when its element is missing from
        a commentary block. The same frame is written to the CSV (no index,
        UTF-8).

    Raises:
        ValueError: propagated from ``get_items`` for a score token it cannot parse.
    """
    from bs4 import BeautifulSoup
    import pandas as pd

    # CSS class strings identifying the elements of interest on the page.
    row_class = "ds-text-tight-m ds-font-regular ds-flex ds-px-3 ds-py-2 lg:ds-px-4 lg:ds-py-[10px] ds-items-start ds-select-none lg:ds-select-auto"
    over_class = "ds-text-tight-s ds-font-regular ds-mb-1 lg:ds-mb-0 lg:ds-mr-3 ds-block ds-text-center ds-text-typo-mid1"
    bbb_class = "ds-leading-none ds-mb-0.5"
    # The score badge carries one of these class sets; they are tried in this order.
    run_badge_classes = (
        "ds-flex ds-items-center ds-justify-center ds-rounded ds-overflow-hidden ds-bg-ui-fill-default-translucent ds-text-typo",
        "ds-flex ds-items-center ds-justify-center ds-rounded ds-overflow-hidden ds-bg-raw-green-d2 ds-text-raw-white",
        "ds-flex ds-items-center ds-justify-center ds-rounded ds-overflow-hidden ds-bg-raw-purple ds-text-raw-white",
        "ds-flex ds-items-center ds-justify-center ds-rounded ds-overflow-hidden ds-bg-raw-red ds-text-raw-white",
    )
    columns = ['over', 'action', 'ball_type', 'run', 'Score', 'bowler', 'batsman']
    output_file = f"WC_{innings}_Innings.csv"

    driver = get_driver_details(url)
    try:
        select_innings(driver, innings)
        scroll_page(driver)  # Call the scroll function
        html_content = driver.page_source
    finally:
        # Always end the browser session, even if selection or scrolling fails.
        driver.quit()

    # Now using beautiful soup
    soup = BeautifulSoup(html_content, 'html.parser')
    records = []

    # Loop through cricket commentary data
    for cricket in soup.find_all("div", class_=row_class):
        over_elem = cricket.find("span", class_=over_class)
        bbb_elem = cricket.find("div", class_=bbb_class)

        over = over_elem.text if over_elem else None
        bowler, batsman = _split_bowler_batsman(bbb_elem.text) if bbb_elem else (None, None)

        # Use only the first badge that is present so every commentary block
        # contributes exactly one value to each column.
        run_elem = next(
            (
                badge
                for badge in (cricket.find("div", class_=cls) for cls in run_badge_classes)
                if badge is not None
            ),
            None,
        )
        if run_elem is not None:
            score_text = run_elem.text
            result = get_items(score_text)
        else:
            # No badge found: keep the row, but leave the score fields empty.
            score_text = None
            result = {'action': None, 'ball_type': None, 'run': None}

        records.append({
            'over': over,
            'action': result['action'],
            'ball_type': result['ball_type'],
            'run': result['run'],
            'Score': score_text,
            'bowler': bowler,
            'batsman': batsman,
        })

    # Build the frame once from the per-ball records; `columns` fixes the order
    # and keeps the header even when no rows were found.
    df = pd.DataFrame(records, columns=columns)

    # Save the DataFrame to a CSV file
    df.to_csv(output_file, index=False, encoding='utf-8')

    return df

# Ball-by-ball commentary page of the match to scrape.
url = "https://www.espncricinfo.com/series/icc-cricket-world-cup-2023-24-1367856/india-vs-australia-5th-match-1384396/ball-by-ball-commentary"
# Innings labels, in the order they are scraped below.
innings1, innings2 = 'AUS', 'IND'

# Extract and print the data for the 1st innings.
print(get_infinite_page_info(url, innings1))
# Blank line, heading, blank line.
print("", "Second Innings details here:", "", sep="\n")
# Extract and print the data for the 2nd innings.
print(get_infinite_page_info(url, innings2))


     over  action ball_type  run Score  bowler batsman
0    49.3  wicket     valid    0     W   Siraj   Starc
1    49.2     run     valid    4     4   Siraj   Starc
2    49.1     dot     valid    0     •   Siraj   Starc
3    48.6     run     valid    1     1  Hardik   Starc
4    48.5     run     valid    4     4  Hardik   Starc
..    ...     ...       ...  ...   ...     ...     ...
295   0.5     dot     valid    0     •  Bumrah   Marsh
296   0.4     dot     valid    0     •  Bumrah   Marsh
297   0.3     dot     valid    0     •  Bumrah   Marsh
298   0.2     run     valid    1     1  Bumrah  Warner
299   0.1     dot     valid    0     •  Bumrah  Warner

[300 rows x 7 columns]

Second Innings details here:

     over  action ball_type  run Score   bowler batsman
0    41.2     run     valid    6     6  Cummins   Rahul
1    41.1     dot     valid    0     •  Cummins   Rahul
2    40.6     run     valid    1     1  Maxwell   Rahul
3    40.5     dot     valid    0     •  Maxwell   Rahul
4    

In [21]:
import pandas as pd
# Read back the CSV that get_infinite_page_info writes for innings 'AUS'
# (file name pattern: WC_{innings}_Innings.csv) and display the whole frame.
# NOTE(review): the saved output under this cell has an "Unnamed: 0" column and
# lists Cummins/Maxwell/Starc as bowlers, unlike the 'AUS' frame printed by the
# scraping cell and unlike a file written with index=False. It looks like it
# came from an older file; re-run to confirm.
df=pd.read_csv("WC_AUS_Innings.csv")
df

Unnamed: 0,over,action,ball_type,run,Score,bowler,batsman
0,41.2,run,valid,6,6,Cummins,Rahul
1,41.1,dot,valid,0,•,Cummins,Rahul
2,40.6,run,valid,1,1,Maxwell,Rahul
3,40.5,dot,valid,0,•,Maxwell,Rahul
4,40.4,run,valid,1,1,Maxwell,Hardik
...,...,...,...,...,...,...,...
248,0.4,wicket,valid,0,W,Starc,Ishan
249,0.3,run,leg bye,1,1lb,Starc,Rohit
250,0.2,dot,valid,0,•,Starc,Rohit
251,0.1,dot,valid,0,•,Starc,Rohit


In [7]:
import pandas as pd
# Read the CSV written for innings 'IND' and preview its first 20 rows.
# NOTE(review): this cell's execution count (In [7]) is lower than that of the
# cells above it, so its saved output may predate the current scraper. Re-run
# after regenerating the CSV.
ind=pd.read_csv("WC_IND_Innings.csv")
ind.head(20)

Unnamed: 0,over,action,ball_type,run,Score,bowler,batsman
0,41.2,run,valid,6,6,Cummins,Rahul
1,41.1,dot,valid,0,•,Cummins,Rahul
2,40.6,run,valid,1,1,Maxwell,Rahul
3,40.5,dot,valid,0,•,Maxwell,Rahul
4,40.4,run,valid,1,1,Maxwell,Hardik
5,40.3,run,valid,1,1,Maxwell,Rahul
6,40.2,run,valid,4,4,Maxwell,Rahul
7,40.1,run,valid,6,6,Maxwell,Rahul
8,39.6,dot,valid,0,•,Hazlewood,Hardik
9,39.5,run,valid,6,6,Hazlewood,Hardik
