In [4]:
# import libraries
import os
import time
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from sqlalchemy import create_engine
from webdriver_manager.chrome import ChromeDriverManager

In [5]:
# environment variables
load_dotenv()

# NOTE(review): load_dotenv() is called without override, so a variable that
# already exists in the process environment presumably wins over the .env
# file. Generic names such as 'username' may already be set by the OS
# (e.g. USERNAME on Windows) -- confirm the values picked up are the DB ones.
username = os.getenv('username')
password = os.getenv('password')
host = os.getenv('host')
port = os.getenv('port')
database = os.getenv('database')

# Fail early with a clear message instead of formatting the literal text
# "None" into the connection URL when a variable is not set.
_db_settings = {
    'username': username,
    'password': password,
    'host': host,
    'port': port,
    'database': database,
}
_missing_settings = [name for name, value in _db_settings.items() if value is None]
if _missing_settings:
    raise RuntimeError(
        f"Missing database environment variable(s): {', '.join(_missing_settings)}"
    )

# Percent-encode the credentials: characters such as '@', ':', '/' or '%' in a
# raw username/password would otherwise be read as URL delimiters/escapes.
# NOTE(review): this assumes the .env file stores the credentials in plain
# (not already URL-encoded) form -- confirm.
engine = create_engine(
    f"mysql+pymysql://{quote(username, safe='')}:{quote(password, safe='')}"
    f"@{host}:{port}/{database}"
)

In [6]:
# selenium driver
# ChromeDriverManager().install() supplies the value handed to Service;
# the Service is then used to start the Chrome WebDriver.
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service)

In [7]:
# extract data from website
def extract_table_data(table_element):
    """Convert a scraped HTML table into a DataFrame of cell text.

    The first ``<tr>`` found under ``table_element`` supplies the column names
    (the text of its ``<th>`` cells); every later ``<tr>`` contributes one
    record built from the text of its ``<td>`` cells.

    Args:
        table_element: Selenium element (or any object exposing
            ``find_elements(by, value)``) that contains the table rows.

    Returns:
        pandas.DataFrame with one row per data ``<tr>`` and the header texts
        as columns. An element without any ``<tr>`` yields an empty DataFrame
        instead of raising ``IndexError``.
    """
    rows = table_element.find_elements(By.TAG_NAME, 'tr')
    if not rows:
        # Nothing to index into: no header row and no data rows.
        return pd.DataFrame()

    headers = [header.text for header in rows[0].find_elements(By.TAG_NAME, 'th')]

    data = []
    for row in rows[1:]:
        cells = [td.text for td in row.find_elements(By.TAG_NAME, 'td')]
        # A row without any <td> carries no data. Keeping it would add an
        # all-null record, or -- when every row is like that -- make the
        # DataFrame constructor reject the header list.
        if cells:
            data.append(cells)

    return pd.DataFrame(data, columns=headers)

# change df to sql
def dataframe_to_sql(df, table_name, engine):
    """Append the rows of ``df`` to the SQL table ``table_name``.

    Args:
        df: DataFrame whose rows are written.
        table_name: Name of the destination table.
        engine: Database connectable passed to ``DataFrame.to_sql`` as ``con``.

    Rows are added to the table if it already exists, and the DataFrame index
    is not written as a column.
    """
    write_options = {
        'name': table_name,
        'con': engine,
        'if_exists': 'append',
        'index': False,
    }
    df.to_sql(**write_options)

In [8]:
# symbols of the companies to scrape; each one is substituted into the
# quote-page URLs built in the scraping loop
symbols = [
    "JPM", "GS", "C",
    "JLL", "DIS", "TPR",
    "F", "XOM", "AAPL",
    "AMZN", "PFE", "MRK",
]

In [49]:
# one accumulator list per scraped category; the scraping loop appends one
# DataFrame per symbol to each of them
(
    all_insider_transactions,
    all_trading_volumes,
    all_eps_trends,
    all_eps_revisions,
    all_growth_estimates,
) = [], [], [], [], []

In [50]:
# scrape data for each symbol

# Re-create the accumulators in this cell so that re-running it starts from a
# clean slate; otherwise every symbol's rows are appended a second time and end
# up duplicated when the lists are concatenated and uploaded.
all_insider_transactions = []
all_trading_volumes = []
all_eps_trends = []
all_eps_revisions = []
all_growth_estimates = []

# data-testid of each section on the analysis page -> list collecting that
# section's per-symbol DataFrames. It does not depend on the symbol, so it is
# built once instead of on every loop iteration.
categories = {
    'epsTrend': all_eps_trends,
    'epsRevisions': all_eps_revisions,
    'growthEstimate': all_growth_estimates
}


def _scrape_table(by, value, symbol):
    """Extract one table from the page currently loaded in ``driver``.

    Args:
        by: Selenium locator strategy (a ``By`` constant).
        value: Locator value interpreted according to ``by``.
        symbol: Value written into the ``Symbol`` column of every row.

    Returns:
        The DataFrame produced by ``extract_table_data`` for the located
        element, with a ``Symbol`` column added.

    Any exception raised by ``driver.find_element`` or ``extract_table_data``
    propagates to the caller.
    """
    element = driver.find_element(by, value)
    table_df = extract_table_data(element)
    table_df['Symbol'] = symbol
    return table_df


# NOTE(review): the 'svelte-...' class names used below look auto-generated and
# may change when the site is rebuilt -- verify the selectors if lookups fail.
for symbol in symbols:

    # insider transactions
    url = f'https://finance.yahoo.com/quote/{symbol}/insider-transactions'
    driver.get(url)
    time.sleep(10)  # fixed pause after navigation before the table is looked up
    all_insider_transactions.append(
        _scrape_table(By.CLASS_NAME, 'svelte-1s2g2l0', symbol)
    )

    # trading volume
    url = f'https://finance.yahoo.com/quote/{symbol}/history?filter=history'
    driver.get(url)
    time.sleep(6)
    all_trading_volumes.append(
        _scrape_table(By.CSS_SELECTOR, '.table.svelte-ewueuo', symbol)
    )

    # analyst ratings: one page load, then one table per section
    url = f'https://finance.yahoo.com/quote/{symbol}/analysis'
    driver.get(url)
    time.sleep(10)
    for category, container in categories.items():
        container.append(
            _scrape_table(By.XPATH, f'//section[@data-testid="{category}"]', symbol)
        )

In [59]:
# concat dfs for each category and upload to SQL
# (destination table name, list of per-symbol DataFrames), processed in order
uploads = (
    ('insider_transactions', all_insider_transactions),
    ('trading_volume', all_trading_volumes),
    ('eps_trend', all_eps_trends),
    ('eps_revisions', all_eps_revisions),
    ('growth_estimate', all_growth_estimates),
)
for table_name, frames in uploads:
    # stack the per-symbol frames into one frame with a fresh 0..n-1 index
    combined = pd.concat(frames, ignore_index=True)
    dataframe_to_sql(combined, table_name, engine)

In [60]:
# Shut down the Selenium driver now that scraping and the upload are done.
# NOTE(review): this only runs if the notebook reaches this cell; if an earlier
# cell raises, the browser session presumably stays open until quit() is called
# manually.
driver.quit()