In [37]:
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [38]:
# Resolve a chromedriver binary through webdriver_manager, wrap it in a
# Service, and start the Chrome session that every later cell reuses.
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service)

In [39]:
# Address of the IBOV "day" index page on B3, requested with language=pt-br.
url = "https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBOV?language=pt-br"
driver.get(url)

In [40]:
# Block for up to 10 seconds until the element with id "divContainerIframeB3"
# is present in the DOM; later cells search for the table inside it.
container_locator = (By.ID, "divContainerIframeB3")
page_wait = WebDriverWait(driver, 10)
container = page_wait.until(EC.presence_of_element_located(container_locator))
print("Found the container:", container)


Found the container: <selenium.webdriver.remote.webelement.WebElement (session="06ab66468a3350d5e2f866b09d21c423", element="f.AF9FCD3F8ED313C773C86BEF55691D73.d.F2A8F1246F1C70BF695BEB2E63626456.e.12")>


In [41]:
# Find the paragraph that states the date the portfolio is valid for.
# An XPath that starts with "//" is evaluated against the whole document even
# when it is issued from an element, so the search is done explicitly at driver
# level (the same scope the lookup always had). It is wrapped in a wait so that
# a paragraph rendered slightly after the container does not abort the cell.
date_locator = (
    By.XPATH,
    "//p[contains(text(), 'Carteira Teórica do IBovespa válida para')]",
)
date_element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(date_locator)
)

# Keep only what follows the last "para", e.g. '17/03/25'.
date_text = date_element.text.split("para")[-1].strip()

# Convert dd/mm/yy into ISO 8601 (YYYY-MM-DD); strptime raises ValueError if
# the page text does not match the expected format.
parsed_date = datetime.strptime(date_text, "%d/%m/%y")
formatted_date = parsed_date.strftime("%Y-%m-%d")

# formatted_date is reused below as a column value and in the output file names.
print(formatted_date)

2025-03-17


### SCRAPE ONE PAGE

In [42]:
# Locate the table within the container.
table = container.find_element(By.TAG_NAME, "table")

# Header texts come from the first row of <thead>.
header_row = table.find_element(By.TAG_NAME, "thead").find_element(By.TAG_NAME, "tr")
headers = [header.text for header in header_row.find_elements(By.TAG_NAME, "th")]

# Data rows are read from <tbody> only. Collecting every <tr> of the table
# would also pull the <tfoot> rows into the DataFrame as if they were stocks.
rows = table.find_element(By.TAG_NAME, "tbody").find_elements(By.TAG_NAME, "tr")
data = [
    [col.text for col in row.find_elements(By.TAG_NAME, "td")]
    for row in rows
]

# Convert to a pandas DataFrame and tag every row with the portfolio date.
df = pd.DataFrame(data, columns=headers)
df["Date"] = formatted_date

# Build the output paths with pathlib so the separator is correct on every OS,
# and create the target directory if it does not exist yet.
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a Parquet file with the date in the name
parquet_file_name = f"ibovespa_{formatted_date}.parquet"
df.to_parquet(output_dir / parquet_file_name, engine="pyarrow", index=False)

# Same base name, CSV extension.
csv_file_name = parquet_file_name.replace(".parquet", ".csv")
df.to_csv(output_dir / csv_file_name, index=False, encoding="utf-8")

print(f"DataFrame with date column saved as {parquet_file_name}")


DataFrame with date column saved as ibovespa_2025-03-17.parquet


In [43]:
# Preview the first five scraped rows.
df.head(5)

Unnamed: 0,Código,Ação,Tipo,Qtde. Teórica,Part. (%),Date
0,ALOS3,ALLOS,ON NM,476.976.044,439,2025-03-17
1,ABEV3,AMBEV S/A,ON ED,4.394.835.131,2867,2025-03-17
2,ASAI3,ASSAI,ON NM,1.345.832.968,491,2025-03-17
3,AURE3,AUREN,ON NM,323.738.747,119,2025-03-17
4,AMOB3,AUTOMOB,ON NM,533.959.816,7,2025-03-17


### TRYING TO SCRAPE ALL PAGES

In [44]:
def get_total_pages(driver):
    """Return the number of pages advertised by the ngx-pagination control.

    The value is read from the second-to-last <li> of the pagination list;
    every non-digit character of that item's text is discarded before the
    conversion to int.

    Args:
        driver: WebDriver instance handed to WebDriverWait; the page holding
            the pagination control is expected to be loaded already.

    Returns:
        int: the page number parsed from the second-to-last pagination item.

    Raises:
        selenium.common.exceptions.TimeoutException: if no <ul> carrying the
            "ngx-pagination" class appears within 10 seconds.
        ValueError: if the list has fewer than two items, or the selected item
            contains no digits.
    """
    # Match on the class token rather than on the exact class attribute, so the
    # lookup keeps working if the <ul> carries additional classes.
    pagination_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "ul.ngx-pagination"))
    )
    items = pagination_element.find_elements(By.TAG_NAME, "li")
    if len(items) < 2:
        raise ValueError(
            f"Pagination control has {len(items)} item(s); cannot determine the last page"
        )

    last_page_text = items[-2].text.strip()
    digits = "".join(ch for ch in last_page_text if ch.isdigit())
    if not digits:
        raise ValueError(
            f"No page number found in pagination item text: {last_page_text!r}"
        )
    return int(digits)

# Accumulator for the rows gathered across every page.
all_data = list()

# Read the page count from the pagination control and report it.
total_pages = get_total_pages(driver)
print(f"Total pages to scrape: {total_pages}")

Total pages to scrape: 5


In [None]:
# Substring replacements applied, in this order, to each header text so the
# resulting column names carry no accents, dots or parentheses.
header_renames = (
    ("Qtde. Teórica", "Qtde"),
    ("Part. (%)", "Part"),
    ("Código", "Codigo"),
    ("Ação", "Acao"),
)

# Start from an empty list so that re-running this cell does not append the
# same rows a second time.
all_data = []

# Loop through all pages
for page_number in range(1, total_pages + 1):
    print(f"Scraping page {page_number}...")

    # Look the table up again on every iteration instead of reusing element
    # references obtained before the page change.
    table = container.find_element(By.TAG_NAME, "table")
    tbody = table.find_element(By.TAG_NAME, "tbody")
    rows = tbody.find_elements(By.TAG_NAME, "tr")

    # Headers are identical on every page, so read them only once.
    if page_number == 1:
        thead = table.find_element(By.TAG_NAME, "thead")
        header_cells = thead.find_elements(By.TAG_NAME, "tr")[0].find_elements(By.TAG_NAME, "th")
        headers = []
        for header in header_cells:
            name = header.text
            for old, new in header_renames:
                name = name.replace(old, new)
            headers.append(name)

    # Every <tr> inside <tbody> is a data row (the header row lives in <thead>),
    # so all of them are kept; slicing from index 1 would drop the first stock
    # of each page. The <tfoot> rows are deliberately not collected.
    for row in rows:
        cols = row.find_elements(By.TAG_NAME, "td")
        all_data.append([col.text for col in cols])

    # Go to the next page if not on the last one
    if page_number < total_pages:
        first_row_text = rows[0].text if rows else ""
        next_link = driver.find_element(By.XPATH, "//li[contains(@class, 'pagination-next')]/a")
        next_link.click()
        # Wait (at most 10 s) until the first body row shows different content
        # instead of sleeping for a fixed time. Rows being replaced while they
        # are read raise StaleElementReferenceException, which is retried.
        WebDriverWait(
            driver, 10, ignored_exceptions=(StaleElementReferenceException,)
        ).until(
            lambda _driver: container.find_element(By.CSS_SELECTOR, "table tbody tr").text
            != first_row_text
        )

# Convert collected data to a Pandas DataFrame
df = pd.DataFrame(all_data, columns=headers)

# Add the extracted date to the DataFrame
df["Data"] = formatted_date

# Build the output paths with pathlib so the separator is correct on every OS,
# and create the target directory if it does not exist yet. The names are kept
# as plain strings, as before.
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a Parquet file and a CSV file
parquet_file_name = str(output_dir / f"ibovespa_{formatted_date}.parquet")
df.to_parquet(parquet_file_name, engine="pyarrow", index=False)

csv_file_name = str(Path(parquet_file_name).with_suffix(".csv"))
df.to_csv(csv_file_name, index=False, encoding="utf-8")

print(f"Data saved: Parquet: {parquet_file_name}, CSV: {csv_file_name}")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Data saved: Parquet: data\ibovespa_2025-03-17.parquet, CSV: data\ibovespa_2025-03-17.csv
