## Getting fundamental data from yahoo finance

In [1]:
import requests
from bs4 import BeautifulSoup

income_statement = {}

url = "https://finance.yahoo.com/quote/AAPL/financials?p=AAPL"

# It is important to send headers because the request won't be served as yahoo finance loathes when developers webscrape. 

headers = {"User-Agent" : "Chrome/96.0.4664.110"}
page = requests.get(url, headers=headers)
page_content = page.content
soup = BeautifulSoup(page_content,"html.parser")

# First we'll just grab the section in our html code which has all the sauce for the table and then work our way to the end goal.
tabl = soup.find_all("div" , {"class" : "M(0) Whs(n) BdEnd Bdc($seperatorColor) D(itb)"})

# That we've got the table in our hand, we may iterate through each of it's row to get the data and store is in a dictionary using vanilla python magic. 
for t in tabl:
    rows = t.find_all("div" , {"class": "D(tbr) fi-row Bgc($hoverBgColor):h"})
    for row in rows:
        income_statement[row.get_text(separator="|").split("|")[0]] = row.get_text(separator="|").split("|")[1] # 1st value in the list (list that we get after splitting it on "|") has the data for the current year. 
        # Although we may fetch the data for the previous years if we want to by simply getting all that we have by using [1:] instead of [1]
        


In [2]:
income_statement

{'Total Revenue': '387,537,000',
 'Cost of Revenue': '220,666,000',
 'Gross Profit': '166,871,000',
 'Operating Expense': '52,906,000',
 'Operating Income': '113,965,000',
 'Net Non Operating Interest Income Expense': '-197,000',
 'Other Income Expense': '-283,000',
 'Pretax Income': '113,485,000',
 'Tax Provision': '18,314,000',
 'Net Income Common Stockholders': '95,171,000',
 'Diluted NI Available to Com Stockholders': '95,171,000',
 'Basic EPS': '-',
 'Diluted EPS': '-',
 'Basic Average Shares': '-',
 'Diluted Average Shares': '-',
 'Total Operating Income as Reported': '113,965,000',
 'Total Expenses': '273,572,000',
 'Net Income from Continuing & Discontinued Operation': '95,171,000',
 'Normalized Income': '95,171,000',
 'Interest Income': '3,043,000',
 'Interest Expense': '3,240,000',
 'Net Interest Income': '-197,000',
 'EBIT': '116,725,000',
 'EBITDA': '128,048,000',
 'Reconciled Cost of Revenue': '220,666,000',
 'Reconciled Depreciation': '11,323,000',
 'Net Income from Conti

## Now getting data for several tickers

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

tickers = ["AAPL", "FB", "CSCO", "INFY.NS", "3988.HK"]
income_statatement_dict = {}
balance_sheet_dict = {}
cashflow_st_dict = {}

for ticker in tickers:
    # scraping income statement
    url = f"https://finance.yahoo.com/quote/{ticker}/financials?p={ticker}"

    income_statement = {}
    # used to store the headings of the tables that we'll be scraping.
    table_title = {}

    headers = {"User-Agent": "Chrome/96.0.4664.110"}
    page = requests.get(url, headers=headers)
    page_content = page.content

    soup = BeautifulSoup(page_content, "html.parser")
    tabl = soup.find_all(
        "div", {"class": "M(0) Whs(n) BdEnd Bdc($seperatorColor) D(itb)"})

    for t in tabl:
        heading = t.find_all("div", {"class": "D(tbr) C($primaryColor)"})
        for top_row in heading:
            table_title[top_row.get_text(separator="|").split(
                "|")[0]] = top_row.get_text(separator="|").split("|")[1:]
        rows = t.find_all(
            "div", {"class": "D(tbr) fi-row Bgc($hoverBgColor):h"})
        for row in rows:
            income_statement[row.get_text(separator="|").split(
                "|")[0]] = row.get_text(separator="|").split("|")[1:]

    temp = pd.DataFrame(income_statement).T
    temp.columns = table_title["Breakdown"]
    income_statatement_dict[ticker] = temp

    # scraping balance sheet statement
    url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
    balance_sheet = {}
    table_title = {}

    headers = {"User-Agent": "Chrome/96.0.4664.110"}
    page = requests.get(url, headers=headers)
    page_content = page.content
    soup = BeautifulSoup(page_content, "html.parser")
    tabl = soup.find_all(
        "div", {"class": "M(0) Whs(n) BdEnd Bdc($seperatorColor) D(itb)"})
    for t in tabl:
        heading = t.find_all("div", {"class": "D(tbr) C($primaryColor)"})
        for top_row in heading:
            table_title[top_row.get_text(separator="|").split(
                "|")[0]] = top_row.get_text(separator="|").split("|")[1:]
        rows = t.find_all(
            "div", {"class": "D(tbr) fi-row Bgc($hoverBgColor):h"})
        for row in rows:
            balance_sheet[row.get_text(separator="|").split(
                "|")[0]] = row.get_text(separator="|").split("|")[1:]

    temp = pd.DataFrame(balance_sheet).T
    temp.columns = table_title["Breakdown"]
    balance_sheet_dict[ticker] = temp

    # scraping cashflow statement
    url = f"https://finance.yahoo.com/quote/{ticker}/cash-flow?p={ticker}"
    cashflow_statement = {}
    table_title = {}

    headers = {"User-Agent": "Chrome/96.0.4664.110"}
    page = requests.get(url, headers=headers)
    page_content = page.content
    soup = BeautifulSoup(page_content, "html.parser")
    tabl = soup.find_all(
        "div", {"class": "M(0) Whs(n) BdEnd Bdc($seperatorColor) D(itb)"})
    for t in tabl:
        heading = t.find_all("div", {"class": "D(tbr) C($primaryColor)"})
        for top_row in heading:
            table_title[top_row.get_text(separator="|").split(
                "|")[0]] = top_row.get_text(separator="|").split("|")[1:]
        rows = t.find_all(
            "div", {"class": "D(tbr) fi-row Bgc($hoverBgColor):h"})
        for row in rows:
            cashflow_statement[row.get_text(separator="|").split(
                "|")[0]] = row.get_text(separator="|").split("|")[1:]

    temp = pd.DataFrame(cashflow_statement).T
    temp.columns = table_title["Breakdown"]
    cashflow_st_dict[ticker] = temp

# converting dataframe values to numeric
for ticker in tickers:
    for col in income_statatement_dict[ticker].columns:
        income_statatement_dict[ticker][col] = income_statatement_dict[ticker][col].str.replace(
            ',|- ', '')
        income_statatement_dict[ticker][col] = pd.to_numeric(
            income_statatement_dict[ticker][col], errors='coerce')
        
        cashflow_st_dict[ticker][col] = cashflow_st_dict[ticker][col].str.replace(
            ',|- ', '')
        cashflow_st_dict[ticker][col] = pd.to_numeric(
            cashflow_st_dict[ticker][col], errors='coerce')
        
        if col != "ttm":  # yahoo has ttm column for income statement and cashflow statement only
            balance_sheet_dict[ticker][col] = balance_sheet_dict[ticker][col].str.replace(
                ',|- ', '')
            balance_sheet_dict[ticker][col] = pd.to_numeric(
                balance_sheet_dict[ticker][col], errors='coerce')


  income_statatement_dict[ticker][col] = income_statatement_dict[ticker][col].str.replace(
  cashflow_st_dict[ticker][col] = cashflow_st_dict[ticker][col].str.replace(
  balance_sheet_dict[ticker][col] = balance_sheet_dict[ticker][col].str.replace(


In [4]:
balance_sheet.keys()

dict_keys(['Total Assets', 'Total Liabilities Net Minority Interest', 'Total Equity Gross Minority Interest', 'Total Capitalization', 'Preferred Stock Equity', 'Common Stock Equity', 'Net Tangible Assets', 'Invested Capital', 'Tangible Book Value', 'Total Debt', 'Share Issued', 'Ordinary Shares Number'])

## Getting key statistics 

In [5]:
import requests
from bs4 import BeautifulSoup

tickers = ["AAPL","FB","CSCO","INFY.NS","3988.HK"]
key_statistics = {}

for ticker in tickers:
    #scraping key statistics
    url = f"https://finance.yahoo.com/quote/{ticker}/key-statistics?p={ticker}"
    headers = {"User-Agent" : "Chrome/96.0.4664.110"}
    page = requests.get(url, headers=headers)
    page_content = page.content
    soup = BeautifulSoup(page_content,"html.parser")
    tabl = soup.find_all("table" , {"class" : "W(100%) Bdcl(c)"}) #remove/add the trailing space if getting error
    
    temp_stats = {}
    for t in tabl:
        rows = t.find_all("tr")
        for row in rows:
            temp_stats[row.get_text(separator="|").split("|")[0]] = row.get_text(separator="|").split("|")[-1]
    
    key_statistics[ticker] = temp_stats

In [6]:
key_statistics["AAPL"]

{'Beta (5Y Monthly)': '1.30',
 '52-Week Change': '-4.87%',
 'S&P500 52-Week Change': '-9.11%',
 '52 Week High': '176.15',
 '52 Week Low': '124.17',
 '50-Day Moving Average': '152.94',
 '200-Day Moving Average': '148.99',
 'Avg Vol (3 month)': '66.49M',
 'Avg Vol (10 day)': '54.94M',
 'Shares Outstanding': '15.82B',
 'Implied Shares Outstanding': 'N/A',
 'Float': '15.81B',
 '% Held by Insiders': '0.07%',
 '% Held by Institutions': '61.29%',
 'Shares Short (Mar 14, 2023)': '107.42M',
 'Short Ratio (Mar 14, 2023)': '1.75',
 'Short % of Float (Mar 14, 2023)': '0.68%',
 'Short % of Shares Outstanding (Mar 14, 2023)': '0.68%',
 'Shares Short (prior month Feb 14, 2023)': '104.34M',
 'Forward Annual Dividend Rate': '0.92',
 'Forward Annual Dividend Yield': '0.56%',
 'Trailing Annual Dividend Rate': '0.91',
 'Trailing Annual Dividend Yield': '0.56%',
 '5 Year Average Dividend Yield': '0.94',
 'Payout Ratio': '15.45%',
 'Dividend Date': 'Feb 15, 2023',
 'Ex-Dividend Date': 'Feb 09, 2023',
 'Last