In [1]:
import sys
sys.path.insert(0, '..')  # Add the parent directory (project root directory) to the Python path

In [2]:
import time
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from utils.drivers import setup_driver
from tqdm import tqdm

def navigate_to_page(driver, url):
    driver.get(url)
    try:
        # Try to find and click the "ETF Filter Options" menu
        filter_options_selector = '#uc_5_ctl02_filtersPanel > div.fund-select.fund-filter > div.header-fund-medium.no-print > span.filter-expand-collapse.pimcon-round-button-white.collapsed > span'
        filter_options = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, filter_options_selector)))
        filter_options.click()
        # Wait for the redirection to complete
        time.sleep(5)

        # Try to find and click the "Sector" menu
        sector_selector = '#uc_5_ctl02_sectorFilter > div.header-choosen > i.icomoon.icon-arrow-down10'
        sector_options = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, sector_selector)))
        sector_options.click()

        # Select sectors to filter by
        us_treasury_selector = '#uc_5_ctl02_sectorFilter > div.content-choosen.sectors-choosen > div > div.mutil-choosen > div:nth-child(1) > div > span'
        us_treasury_filter = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, us_treasury_selector)))
        us_treasury_filter.click()
        
        us_tips_selector = '#uc_5_ctl02_sectorFilter > div.content-choosen.sectors-choosen > div > div.mutil-choosen > div:nth-child(2) > div > span'
        us_tips_filter = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, us_tips_selector)))
        us_tips_filter.click()
        
        municipal_selector = '#uc_5_ctl02_sectorFilter > div.content-choosen.sectors-choosen > div > div.mutil-choosen > div:nth-child(3) > div > span'
        municipal_filter = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, municipal_selector)))
        municipal_filter.click()
        
        credit_selector = '#uc_5_ctl02_sectorFilter > div.content-choosen.sectors-choosen > div > div.mutil-choosen > div:nth-child(5) > div > span'
        credit_filter = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, credit_selector)))
        credit_filter.click()
        
        core_fixed_income_selector = '#uc_5_ctl02_sectorFilter > div.content-choosen.sectors-choosen > div > div.mutil-choosen > div:nth-child(6) > div > span'
        core_fixed_income_filter = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, core_fixed_income_selector)))
        core_fixed_income_filter.click()

        # Add delay and then redirect back to the target page
        time.sleep(10)
        # driver.get(url)

    except NoSuchElementException:
        # Ignore if the elements are not found
        pass
    
def get_links(driver):
    links = set()

    # Initial height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Gather the links
        link_elements = driver.find_elements(By.CSS_SELECTOR, 'tr > td.name.excel > p > a')
        for elem in link_elements:
            link = elem.get_attribute('href')
            if link is not None:
                links.add(link)

        # Scroll down by chunk
        driver.execute_script("window.scrollBy(0, 800);")  # Adjust this value according to your needs
        time.sleep(2)  # Adjust this value according to your internet speed or webpage's response time

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:  # If heights are the same it will exit the function
            break
        last_height = new_height

    return list(links)  # Return the links as a list

def extract_etf_info(driver, url):
    # Navigate to the url
    driver.get(url)

    # Scroll down about 10% of the webpage
    scroll_height = driver.execute_script("return document.body.scrollHeight")
    driver.execute_script(f"window.scrollTo(0, {scroll_height * 0.05});")
    time.sleep(2)  # delay to allow the page to load

    # Check for the pop-up
    try:
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#SplashPage1 > section > div > div.listOfRoles > ul > li:nth-child(1) > label')))
        financial_advisor_button = driver.find_element(By.CSS_SELECTOR, '#SplashPage1 > section > div > div.listOfRoles > ul > li:nth-child(1) > label')
        financial_advisor_button.click()
    except (NoSuchElementException, TimeoutException):
        # Ignore if the pop-up is not found
        pass

    try:
        # First try with the usual CSS selector
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR,
            '#pageWrapper > section:nth-child(2) > div:nth-child(19) > div.row > div > div > div')))
        yield_to_maturity = driver.find_element(By.CSS_SELECTOR,
            '#pageWrapper > section:nth-child(2) > div:nth-child(19) > div.row > div > div > div > div > div:nth-child(4) > div.datapoint').text
    except NoSuchElementException:
        # If the usual CSS selector fails, try the second CSS selector
        try:
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR,
                '#pageWrapper > section:nth-child(2) > div:nth-child(20) > div.row > div > div > div')))
            yield_to_maturity = driver.find_element(By.CSS_SELECTOR,
                '#pageWrapper > section:nth-child(2) > div:nth-child(20) > div.row > div > div > div > div > div:nth-child(4) > div.datapoint').text
        except NoSuchElementException:
            return None  # Returns None if yield_to_maturity is not present, which leads to skipping this ETF
    except TimeoutException:
        # If it takes too long to find an element
        return None

    # Extract the info
    name = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#etf-header > div.container > div.etf-title-row > div.etf-name'))).text
    print(name)
    as_of_date = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR,
                                     '#pageWrapper > section:nth-child(2) > div:nth-child(19) > div.row > div > div > div > div > div:nth-child(4) > div.as-of-date'))).text
    ticker = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR,
                                 '#etf-header > div.container > div.etf-title-row > h1'))).text

    return {"Ticker": ticker, "Name": name, "Yield to Maturity": yield_to_maturity, "As of Date": as_of_date}

def get_etf_data(driver, links):
    # Get data for all ETFs
    data = []
    for link in tqdm(links):
        etf_data = extract_etf_info(driver, link)

        # Only add the etf_data to the list if it's not None
        if etf_data is not None:
            data.append(etf_data)

    # Convert data list to a pandas DataFrame
    df = pd.DataFrame(data)

    # Set 'ticker' as index
    df.set_index('Ticker', inplace=True)

    # Convert 'yield_to_maturity' to a numeric value
    df['Yield to Maturity'] = df['Yield to Maturity'].str.rstrip('%').astype('float') / 100.0

    # Convert 'as_of_date' to datetime
    df['As of Date'] = df['As of Date'].str.replace('as of ', '', case=False)
    df['As of Date'] = pd.to_datetime(df['As of Date'], format="%m/%d/%Y")
    df['As of Date'] = df['As of Date'].dt.strftime('%m-%d-%Y')

    return df

In [3]:
'#pageWrapper > section:nth-child(2) > div:nth-child(20) > div.row > div > div > div > div > div:nth-child(4) > div.as-of-date'

'#pageWrapper > section:nth-child(2) > div:nth-child(20) > div.row > div > div > div > div > div:nth-child(4) > div.as-of-date'

In [4]:
url = 'https://www.pimco.com/en-us/investments/etf'
driver = setup_driver(headless=False)
navigate_to_page(driver, url)
links = get_links(driver)
df = get_etf_data(driver, links)

 27%|██████████████████████▋                                                            | 3/11 [02:44<07:21, 55.24s/it]

Broad U.S. TIPS Index Exchange-Traded Fund


 27%|██████████████████████▋                                                            | 3/11 [03:33<09:30, 71.26s/it]


TimeoutException: Message: 
Stacktrace:
Backtrace:
	GetHandleVerifier [0x00B8A813+48355]
	(No symbol) [0x00B1C4B1]
	(No symbol) [0x00A25358]
	(No symbol) [0x00A509A5]
	(No symbol) [0x00A50B3B]
	(No symbol) [0x00A7E232]
	(No symbol) [0x00A6A784]
	(No symbol) [0x00A7C922]
	(No symbol) [0x00A6A536]
	(No symbol) [0x00A482DC]
	(No symbol) [0x00A493DD]
	GetHandleVerifier [0x00DEAABD+2539405]
	GetHandleVerifier [0x00E2A78F+2800735]
	GetHandleVerifier [0x00E2456C+2775612]
	GetHandleVerifier [0x00C151E0+616112]
	(No symbol) [0x00B25F8C]
	(No symbol) [0x00B22328]
	(No symbol) [0x00B2240B]
	(No symbol) [0x00B14FF7]
	BaseThreadInitThunk [0x76947D59+25]
	RtlInitializeExceptionChain [0x77C1B74B+107]
	RtlClearBits [0x77C1B6CF+191]


In [5]:
links

['https://www.pimco.com/en-us/investments/etf/municipal-income-opportunities-active-exchange-traded-fund',
 'https://www.pimco.com/en-us/investments/etf/15-year-us-tips-index-exchange-traded-fund',
 'https://www.pimco.com/en-us/investments/etf/1-5-year-us-tips-index-exchange-traded-fund',
 'https://www.pimco.com/en-us/investments/etf/broad-us-tips-index-exchange-traded-fund',
 'https://www.pimco.com/en-us/investments/etf/intermediate-municipal-bond-active-exchange-traded-fund',
 'https://www.pimco.com/en-us/investments/etf/short-term-municipal-bond-active-exchange-traded-fund',
 'https://www.pimco.com/en-us/investments/etf/25-year-zero-coupon-us-treasury-index-exchange-traded-fund',
 'https://www.pimco.com/en-us/investments/etf/0-5-year-high-yield-corporate-bond-index-exchange-traded-fund',
 'https://www.pimco.com/en-us/investments/etf/enhanced-low-duration-active-exchange-traded-fund',
 'https://www.pimco.com/en-us/investments/etf/investment-grade-corporate-bond-index-exchange-traded-