In [16]:
#!apt update
#!apt install chromium-chromedriver
!pip install selenium

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

import pandas as pd



In [42]:
def get_key_from_metric_label(label):
    """Turn a metric label into a column key by swapping each space for ``_``.

    Every single space character is replaced individually, so consecutive
    spaces yield consecutive underscores. All other characters are kept.
    """
    space_separated_parts = label.split(" ")
    return "_".join(space_separated_parts)

def add_metrics_from_site(driver, metrics, metrics_by_label):
    """Scrape one value per requested label from the currently loaded page.

    The page is expected to contain an element with the id
    ``Col1-0-KeyStatistics-Proxy`` whose table rows hold a label cell and a
    value cell.

    Args:
        driver: Selenium WebDriver that has already loaded the statistics page.
        metrics: List to extend; the caller typically pre-fills it with the
            ticker name. It is appended to in place when the page is found.
        metrics_by_label: Iterable of row labels to look up. The
            ``Metrics.TICKER`` entry is skipped because it only names the
            DataFrame column and has no table row.

    Returns:
        The list of metrics with one entry per non-ticker label. A value that
        cannot be read (missing page, missing row, unexpected row layout) is
        represented by ``None`` so that every result has the same length.
    """
    # Only these labels have a row on the page; the ticker label is a column name.
    labels_to_scrape = [label for label in metrics_by_label if label != Metrics.TICKER]

    containers = driver.find_elements(By.ID, 'Col1-0-KeyStatistics-Proxy')
    if len(containers) == 0:
        print("ERROR: no key-statistics page found! Returns empty metrics")
        # One placeholder per label that would have been scraped, independent
        # of whether the ticker label is part of metrics_by_label.
        return metrics + [None] * len(labels_to_scrape)

    metrics_body = containers[0]

    for metric_label in labels_to_scrape:
        # The leading "." keeps the search inside metrics_body; a bare "//tr"
        # would search the entire document even when called on an element.
        # CAUTION: this is a contains-search, so the first row whose text
        # contains the label wins. Labels must not contain a single quote,
        # because they are embedded in a single-quoted XPath literal.
        row_xpath = ".//tr[contains(., '" + metric_label + "')]"
        matching_rows = metrics_body.find_elements(By.XPATH, row_xpath)

        if not matching_rows:
            print("WARNING: no row found for metric '" + metric_label + "', using placeholder")
            metrics.append(None)
            continue

        cells = matching_rows[0].find_elements(By.XPATH, "./*")
        if len(cells) != 2:
            # Expected layout is exactly (label cell, value cell).
            print("WARNING: unexpected row layout for metric '" + metric_label + "', using placeholder")
            metrics.append(None)
            continue

        metrics.append(cells[1].text)

    return metrics
    

In [43]:
# supporting class with constants from the website (keys etc.)
class Metrics(object):
    """Row labels used for scraping, plus presets that group them into lists.

    The string constants are the labels as they are searched for on the
    website (by substring match). The preset methods return fresh lists of
    labels; with ``addBase=True`` the list is prefixed by ``self.base``, which
    ``__init__`` sets to ``[TICKER]``.
    """

    # Valuation Measures
    MARKET_CAP = 'Market Cap'
    MARKET_CAP_KEY = 'Market_Cap'
    ENTERPRISE_VALUE = 'Enterprise Value'
    ENTERPRISE_VALUE_KEY = 'Enterprise_Value'

    # Profitability
    PM = "Profit Margin"
    OM = "Operating Margin"

    # Management Effectiveness
    ROA = "Return on Assets"
    ROE = "Return on Equity"

    # Balance Sheet
    TC = "Total Cash (mrq)"  # CAUTION! search only by contains, not equals!
    TCPS = "Total Cash Per Share (mrq)"
    TD = "Total Debt (mrq)"  # CAUTION! search only by contains, not equals!
    TDE = "Total Debt/Equity (mrq)"
    # ...

    # Cash Flow Statement
    OCF = "Operating Cash Flow"
    LFCF = "Levered Free Cash Flow"

    # Dividends & Splits
    FADR = "Forward Annual Dividend Rate"
    FADY = "Forward Annual Dividend Yield"
    TADR = "Trailing Annual Dividend Rate"
    TADY = "Trailing Annual Dividend Yield"
    Y5ADY = "5 Year Average Dividend Yield"
    PR = "Payout Ratio"
    # ...

    # Class-level fallback; every instance replaces it in __init__.
    base = []
    TICKER = "Ticker"

    def __init__(self):
        """Set the per-instance base columns to the ticker label."""
        self.base = [self.TICKER]

    def _with_base(self, labels, addBase):
        """Return ``labels``, prefixed by ``self.base`` when ``addBase`` is truthy."""
        if addBase:
            return self.base + labels
        return labels

    def basics(self, addBase=False):
        """Return the valuation labels (market cap, enterprise value)."""
        return self._with_base([self.MARKET_CAP, self.ENTERPRISE_VALUE], addBase)

    def dividends(self, addBase=False):
        """Return all dividend-related labels."""
        return self._with_base(
            [self.FADR, self.FADY, self.TADR, self.TADY, self.Y5ADY, self.PR],
            addBase,
        )

    # Original (misspelled) method name, kept so existing callers keep working.
    diviends = dividends

    def dividends_slides(self, addBase=False):
        """Return the dividend-slide labels: yields, payout, margin, ROE, cash."""
        return self._with_base(
            [self.FADY, self.TADY, self.Y5ADY, self.PR, self.PM, self.ROE, self.TC],
            addBase,
        )

    def stability_slides(self, addBase=False):
        """Return the stability-slide labels: debt/equity and cash flows."""
        return self._with_base([self.TDE, self.OCF, self.LFCF], addBase)

    def from_slides(self):
        """Return base columns followed by the dividend and stability slide labels."""
        return self.dividends_slides(addBase=True) + self.stability_slides()
    
    

class Tickerinfo:
    """Small record holding a ticker's ``name`` and ``url`` (for additional type security)."""

    def __init__(self, name, url):
        """Store both values unchanged as instance attributes."""
        self.name, self.url = name, url
        

In [46]:
# Seconds to wait for the trending-tickers table to appear before giving up.
TABLE_WAIT_TIMEOUT_S = 10

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run the browser without a GUI
options.add_argument('--no-sandbox')  # needed in restricted environments such as Colab containers
options.add_argument('--disable-dev-shm-usage')  # colab does not have enough shared memory
driver = webdriver.Chrome(options=options)

# Everything that touches the browser lives inside the try block so that the
# Chrome process is always shut down, even if the very first page load fails.
try:
    url = "https://finance.yahoo.com/"
    driver.get(url)

    # Accept cookies if the consent dialog is shown. find_elements returns an
    # empty list instead of raising, so a missing dialog does not abort the run.
    print("accept cookies")
    consent_dialogs = driver.find_elements(By.CLASS_NAME, 'con-wizard')
    accept_buttons = (
        consent_dialogs[0].find_elements(By.CLASS_NAME, 'accept-all')
        if consent_dialogs else []
    )
    if accept_buttons:
        accept_buttons[0].click()
    else:
        print("no cookie dialog found, continuing")

    print("call trending tickers")
    driver.get("https://finance.yahoo.com/trending-tickers")

    # Wait until the table body exists; the lookup returns a (truthy) non-empty
    # list once the element is present, which ends the wait.
    table_bodies = WebDriverWait(driver, TABLE_WAIT_TIMEOUT_S).until(
        lambda d: d.find_elements(By.TAG_NAME, 'tbody')
    )
    tab = table_bodies[0]
    elements = tab.find_elements(By.TAG_NAME, 'tr')

    print("amount of tickers: ", len(elements))

    # Read name and link right away: the elements become stale as soon as the
    # driver navigates to another page.
    tickers = []
    for row in elements:
        link = row.find_element(By.TAG_NAME, 'a')
        tickers.append(Tickerinfo(link.text, link.get_attribute("href")))

    print("getting tickers finished")

    required_metrics = Metrics().from_slides()

    df_metrics = pd.DataFrame(columns=[get_key_from_metric_label(label) for label in required_metrics])

    # Call the key-statistics page of each ticker and scrape its values.
    for ticker in tickers:
        tickername = ticker.name
        print()
        print("ticker: ", tickername)
        stats_url = "https://finance.yahoo.com/quote/" + tickername + "/key-statistics?p=" + tickername
        driver.get(stats_url)

        # The first column is the ticker; the scraper appends the metric values.
        metrics = [tickername]
        print("start scraping metrics for ticker from: ", driver.current_url)
        metrics = add_metrics_from_site(driver, metrics, required_metrics)

        # Add the metrics as a new last row of the DataFrame.
        df_metrics.loc[len(df_metrics)] = metrics

finally:
    # Close the WebDriver
    driver.quit()

print()
print("finished scraping")

accept cookies
call trending tickers
amount of tickers:  30
getting tickers finished

ticker:  CRM
start scraping metrics for ticker from:  https://finance.yahoo.com/quote/CRM/key-statistics?p=CRM

ticker:  SNOW
start scraping metrics for ticker from:  https://finance.yahoo.com/quote/SNOW/key-statistics?p=SNOW

ticker:  VVOS
start scraping metrics for ticker from:  https://finance.yahoo.com/quote/VVOS/key-statistics?p=VVOS

ticker:  GM
start scraping metrics for ticker from:  https://finance.yahoo.com/quote/GM/key-statistics?p=GM

ticker:  CYTO
start scraping metrics for ticker from:  https://finance.yahoo.com/quote/CYTO/key-statistics?p=CYTO

ticker:  CI
start scraping metrics for ticker from:  https://finance.yahoo.com/quote/CI/key-statistics?p=CI

ticker:  0700.HK
start scraping metrics for ticker from:  https://finance.yahoo.com/quote/0700.HK/key-statistics?p=0700.HK

ticker:  PSTG
start scraping metrics for ticker from:  https://finance.yahoo.com/quote/PSTG/key-statistics?p=PSTG



In [47]:
df_metrics

Unnamed: 0,Ticker,Forward_Annual_Dividend_Yield,Trailing_Annual_Dividend_Yield,5_Year_Average_Dividend_Yield,Payout_Ratio,Profit_Margin,Return_on_Equity,Total_Cash_(mrq),Total_Debt/Equity_(mrq),Operating_Cash_Flow,Levered_Free_Cash_Flow
0,CRM,,0.00%,,0.00%,4.77%,2.67%,12.4B,23.93%,8.4B,10.57B
1,SNOW,,0.00%,,0.00%,-35.22%,-16.08%,3.75B,5.60%,679.23M,897.16M
2,VVOS,,0.00%,,0.00%,-106.27%,-311.34%,988k,"4,573.91%",-12.2M,-6.45M
3,GM,1.14%,1.25%,4.06,5.06%,5.83%,13.27%,30.77B,154.97%,22.9B,11B
4,CYTO,,0.00%,,0.00%,0.00%,-964.17%,49.57k,,-10.81M,-8.98M
5,CI,1.87%,1.68%,0.76,27.13%,2.79%,11.99%,9.33B,68.04%,12.44B,9.84B
6,0700.HK,0.76%,0.67%,0.37,10.84%,32.48%,23.90%,362.49B,44.88%,203.61B,124.51B
7,PSTG,,0.00%,,0.00%,-0.03%,-0.10%,1.23B,28.79%,662.59M,334.19M
8,CNXA,,0.00%,,0.00%,0.00%,-112.00%,285.86k,,-1.01M,-11.83M
9,PINS,,0.00%,,0.00%,-7.43%,-7.37%,2.33B,6.09%,412.97M,346.71M
