In [2]:
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [30]:
class StocksScraper:
    """Drive a Selenium browser to a paginated stocks table, scrape it and export it.

    Typical flow: ``access_url`` -> ``access_most_active_stocks`` ->
    ``extract_stocks_data`` -> ``clean_and_save_data``.

    Attributes:
        driver: The Selenium WebDriver instance supplied by the caller.
        wait: ``WebDriverWait`` bound to ``driver`` with the configured timeout.
        data: List of raw row dictionaries (all values are the cells' text).
    """

    # Absolute XPaths into the site's markup. They are brittle by nature: any
    # layout change on the site will require updating them.
    _MARKETS_MENU_XPATH = '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/a[1]/span[1]'
    _TRENDING_TICKERS_XPATH = '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/div[1]/ul[1]/li[4]/a[1]/div[1]'
    _MOST_ACTIVE_XPATH = '/html[1]/body[1]/div[2]/main[1]/section[1]/section[1]/section[1]/article[1]/section[1]/div[1]/nav[1]/ul[1]/li[1]/a[1]/span[1]'
    _NEXT_BUTTON_XPATH = '//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/div[3]/div[3]/button[3]'

    # Keys of the dictionaries built by extract_stocks_data, in output order.
    _RAW_COLUMNS = ("name", "symbol", "price", "change", "volume", "market_cap", "pe_ratio")

    # Magnitude suffixes understood by _parse_number, as powers of ten.
    _UNIT_EXPONENTS = {"K": 3, "M": 6, "B": 9, "T": 12}

    def __init__(self, driver, timeout=10):
        """Store the driver and create the explicit-wait helper.

        Args:
            driver: A Selenium WebDriver instance.
            timeout: Seconds each explicit wait may block before timing out.
        """
        self.driver = driver
        self.wait = WebDriverWait(self.driver, timeout=timeout)
        self.data = []

    def wait_for_the_page_to_load(self):
        """Block until ``document.readyState`` is ``"complete"`` and log the outcome.

        A timeout is reported with a message instead of being raised, so the
        caller can still attempt the next step.
        """
        try:
            self.wait.until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
        except TimeoutException:
            # Read the title only now: reading it before waiting could report
            # the title of the page we are navigating away from.
            page_title = self.driver.title
            print(f"The page \"{page_title}\" did not get fully loaded within the given duration")
        else:
            page_title = self.driver.title
            print(f"The page \"{page_title}\" is successfully loaded")

    def access_url(self, url):
        """Navigate the browser to ``url`` and wait for the page to finish loading."""
        self.driver.get(url)
        self.wait_for_the_page_to_load()

    def access_most_active_stocks(self):
        """Navigate via the markets menu to "Trending tickers", then open "Most Active".

        Raises:
            selenium.common.exceptions.TimeoutException: If any of the menu
                elements does not become available within the timeout.
        """
        # Hover over the markets menu so its drop-down entries get rendered.
        markets_menu = self.wait.until(
            EC.presence_of_element_located((By.XPATH, self._MARKETS_MENU_XPATH))
        )
        ActionChains(self.driver).move_to_element(markets_menu).perform()

        # Click "Trending tickers" once it is clickable.
        trending_tickers = self.wait.until(
            EC.element_to_be_clickable((By.XPATH, self._TRENDING_TICKERS_XPATH))
        )
        trending_tickers.click()
        self.wait_for_the_page_to_load()

        # Click the "Most Active" tab.
        most_active = self.wait.until(
            EC.element_to_be_clickable((By.XPATH, self._MOST_ACTIVE_XPATH))
        )
        most_active.click()
        self.wait_for_the_page_to_load()

    def extract_stocks_data(self):
        """Scrape every page of the stocks table into ``self.data``.

        Each table row with at least 10 cells is appended as a dictionary of
        raw cell texts. After each page the "next" button is clicked exactly
        once; pagination stops when the button does not become clickable
        within the timeout.

        Raises:
            selenium.common.exceptions.TimeoutException: If no table appears
                on the current page within the timeout.
        """
        while True:
            self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))

            rows = self.driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
            for row in rows:
                cells = row.find_elements(By.TAG_NAME, "td")
                if len(cells) >= 10:  # make sure the row has all required columns
                    self.data.append({
                        "name": cells[1].text,
                        "symbol": cells[0].text,
                        "price": cells[3].text,
                        "change": cells[4].text,
                        "volume": cells[6].text,
                        "market_cap": cells[8].text,
                        "pe_ratio": cells[9].text,
                    })

            try:
                next_button = self.wait.until(
                    EC.element_to_be_clickable((By.XPATH, self._NEXT_BUTTON_XPATH))
                )
            except TimeoutException:
                print("The \"next\" button is not clickable. We have navigated through all the pages.")
                break

            # Click exactly once per page; a second click would skip a page.
            try:
                next_button.click()
            except WebDriverException as exc:
                # Keep what was collected so far rather than losing it to a
                # click failure (e.g. an overlay intercepting the click).
                print(f"Could not click the \"next\" button ({type(exc).__name__}); stopping pagination.")
                break
            time.sleep(2)  # the table is refreshed in place; give it time to load new rows

    @classmethod
    def _parse_number(cls, text, unit=None):
        """Convert scraped text such as ``"1,234.5"``, ``"+1.25"`` or ``"2.5T"`` to a float.

        Args:
            text: The raw cell text. Non-string input yields NaN.
            unit: ``None`` to return the plain value, or one of ``"K"``,
                ``"M"``, ``"B"``, ``"T"`` to express the value in that unit.

        Returns:
            The parsed float, or ``np.nan`` when the text is not numeric
            (for example a ``"-"`` / ``"--"`` placeholder or an empty cell).
        """
        if not isinstance(text, str):
            return np.nan
        cleaned = text.strip().replace(",", "").replace("+", "")
        if not cleaned:
            return np.nan

        source_exponent = cls._UNIT_EXPONENTS.get(cleaned[-1].upper())
        if source_exponent is None:
            source_exponent = 0  # no suffix: the text is a plain number
        else:
            cleaned = cleaned[:-1]

        try:
            value = float(cleaned)
        except ValueError:
            return np.nan

        target_exponent = 0 if unit is None else cls._UNIT_EXPONENTS[unit]
        shift = source_exponent - target_exponent
        # Multiply or divide by an exact power of ten so that a value already
        # in the target unit is returned unchanged (no float round-trip noise).
        return value * 10 ** shift if shift >= 0 else value / 10 ** (-shift)

    def clean_data(self):
        """Return the scraped rows as a typed ``pandas.DataFrame``.

        Text columns are stripped; ``price_usd``, ``change`` and ``pe_ratio``
        become floats; ``volume_M`` is expressed in millions and
        ``market_cap_B`` in billions. Unparseable cells become NaN. When
        nothing was scraped the result is an empty frame with all columns.
        """
        def strip_text(value):
            return value.strip() if isinstance(value, str) else value

        def numeric(column, unit=None):
            return lambda df_: (
                df_[column]
                .map(lambda text: self._parse_number(text, unit))
                .astype(float)
            )

        return (
            pd.DataFrame(self.data, columns=list(self._RAW_COLUMNS))
            .rename(columns={
                "price": "price_usd",
                "volume": "volume_M",
                "market_cap": "market_cap_B",
            })
            .assign(
                name=lambda df_: df_["name"].map(strip_text),
                symbol=lambda df_: df_["symbol"].map(strip_text),
                price_usd=numeric("price_usd"),
                change=numeric("change"),
                volume_M=numeric("volume_M", unit="M"),
                market_cap_B=numeric("market_cap_B", unit="B"),
                pe_ratio=numeric("pe_ratio"),
            )
        )

    def clean_and_save_data(self, filename="temp"):
        """Clean the scraped rows and write them to ``<filename>.xlsx``.

        Args:
            filename: Output path without the ``.xlsx`` extension.
        """
        stocks_df = self.clean_data()
        stocks_df.to_excel(f"{filename}.xlsx", index=False, engine="openpyxl")
    
    
        
    

In [34]:
if __name__ == "__main__":
    # Launch Chrome, scrape the most-active stocks table and export it to Excel.
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()

        url = "https://finance.yahoo.com/"
        scraper = StocksScraper(driver, 5)

        scraper.access_url(url)
        scraper.access_most_active_stocks()
        scraper.extract_stocks_data()
        scraper.clean_and_save_data("Reconstructed yahoo scraper")
    finally:
        # Always close the browser, even when a scraping step raises;
        # otherwise the Chrome process is left running.
        driver.quit()

The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is successfully loaded
The page "Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance" is successfully loaded
The page "Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance" is successfully loaded
The "next" button is not clickable. We have navigated through all the pages.
