In [None]:
"""
Autor: Lucas Moreira
Projeto: ETL de dados Fenabrave (Novos e Seminovos/Usados)
Descrição: Notebook de extração dos dados da Fenabrave 
Data: 2025-06
"""

import os
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import requests
import time
from datetime import datetime
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse

# Folder that holds the ChromeDriver binary. The CHROMEDRIVER_DIR environment
# variable overrides the machine-specific default, so the notebook can run on
# another machine without editing this cell.
path = os.environ.get(
    'CHROMEDRIVER_DIR',
    '/Users/lucasmoreira/Documents/pessoal/chromedriver_mac64',
)
# Full path to the 'chromedriver' entry inside that folder.
path_data = os.path.join(path, 'chromedriver')



### Seminovos e Usados

In [None]:
# Collect the PDF links shown on the landing page (used / pre-owned vehicles).
chrome_options = webdriver.ChromeOptions()
# Chrome PDF/download preferences, passed through as experimental options.
prefs = {
    "plugins.always_open_pdf_externally": True,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True
}
chrome_options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=chrome_options)

fenabrave = 'https://www.fenabrave.org.br/Portal/Conteudo/SemiNovoseUsados'
pdf_links_usados = []

# try/finally guarantees the browser is closed even when the page load fails
# or the cell is interrupted, so no Chrome process is left behind.
try:
    driver.get(fenabrave)
    time.sleep(3)  # fixed pause before querying the page

    # Each candidate link lives in div[2] .. div[17] of the page container.
    for i in range(2, 18):
        try:
            link_element = driver.find_element(By.XPATH, f'/html/body/div[3]/div/div[{i}]/div/p[3]/a')
            pdf_url = link_element.get_attribute('href')
            # Keep only hrefs that point at a PDF file.
            if pdf_url and pdf_url.lower().endswith('.pdf'):
                pdf_links_usados.append(pdf_url)
        except Exception as exc:
            # Best effort: report which slot failed and why, then move on.
            # (A bare `except:` would also swallow KeyboardInterrupt.)
            print(f'Erro no arquivo {i}: {type(exc).__name__}')
finally:
    driver.quit()

# Rebuild every scraped link on the /portal/files/ prefix, keeping only the
# part of the original URL that follows 'files/'.
names = [url.split('files/')[-1] for url in pdf_links_usados]
pdf_links = [f'https://www.fenabrave.org.br/portal/files/{name}' for name in names]
headers = {'User-Agent': 'Mozilla/5.0'}

# Make sure the target folder exists; otherwise every open() below fails and
# each file is reported as a download error.
output_dir = 'input/usados'
os.makedirs(output_dir, exist_ok=True)
print(f"Iniciando download de {len(pdf_links)} arquivos...")

# Download
sucessos, erros, ja_existem = 0, 0, 0
for i, url in enumerate(pdf_links, 1):
    filename = os.path.basename(urlparse(url).path)
    filepath = os.path.join(output_dir, filename)

    # Status message printed for this file
    status = ""

    if os.path.exists(filepath):
        # Already downloaded on a previous run: skip the request.
        ja_existem += 1
        status = "Já existia"
    else:
        partial_path = filepath + '.part'
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            # Write under a temporary name and rename only once complete, so
            # an interrupted write is never mistaken for a finished download.
            with open(partial_path, "wb") as f:
                f.write(response.content)
            os.replace(partial_path, filepath)

            sucessos += 1
            status = "Sucesso"

        except Exception as exc:
            # Best effort: count the failure and show the reason instead of
            # discarding it.
            erros += 1
            status = f"Erro ({exc})"
            # Drop any half-written temporary file.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        # Short pause after each attempted download.
        time.sleep(0.5)

    print(f"[{i}/{len(pdf_links)}] {filename}: {status}")

# Final summary
print(f"Total Sucesso: {sucessos}")
print(f"Total Histórico: {ja_existem}")
print(f"Total Erro: {erros}")
print(f"Total Processado: {len(pdf_links)}")

In [None]:
# Collect historical PDF links (2005-2023) by stepping through the year selector.
chrome_options = webdriver.ChromeOptions()
# Chrome PDF/download preferences, passed through as experimental options.
prefs = {
    "plugins.always_open_pdf_externally": True,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True
}
chrome_options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=chrome_options)

fenabrave = 'https://www.fenabrave.org.br/Portal/Conteudo/SemiNovoseUsados'
pdf_links_usados_historico = []

# try/finally guarantees the browser is closed even when the page load fails
# or the (long, sleep-driven) loop is interrupted.
try:
    driver.get(fenabrave)
    time.sleep(3)

    # One iteration per <option> of the '#textBusca' selector (options 1..18).
    for i in range(1, 19):
        try:
            driver.find_element(By.XPATH, '//*[@id="textBusca"]').click()
            option = driver.find_element(By.XPATH, f'//*[@id="textBusca"]/option[{i}]')
            option.click()
            # Visible label of the selected option; not used further in this cell.
            ano = option.text.strip()

            time.sleep(2)
            # Click the button that triggers the search for the selected option.
            driver.find_element(By.XPATH, '/html/body/div[3]/div/div[19]/div/div[2]/button').click()

            time.sleep(3)
            # Walk divs 13 down to 2, pairing each with a month counter 1..12
            # (per the original author's note: div 13 = Jan, div 2 = Dec).
            for b, mes in zip(range(13, 1, -1), range(1, 13)):
                try:
                    link_element = driver.find_element(By.XPATH, f'//*[@id="maisEmplacamentos"]/div[{b}]/div/p[3]/a')
                    pdf_url = link_element.get_attribute('href')
                    # Keep only hrefs that point at a PDF file.
                    if pdf_url and pdf_url.lower().endswith('.pdf'):
                        pdf_links_usados_historico.append(pdf_url)
                except Exception as e:
                    print(f"Mês {mes} (div {b}) não encontrado: {e}")

            print(f"Links obtidos até o momento: {len(pdf_links_usados_historico)}")

        except Exception as e:
            # Best effort: a failing option is reported and the loop continues.
            print(f"Erro ao processar {i}: {e}")
finally:
    driver.quit()


# Rebuild every scraped link on the /portal/files/ prefix, keeping only the
# part of the original URL that follows 'files/'.
names = [url.split('files/')[-1] for url in pdf_links_usados_historico]
pdf_links = [f'https://www.fenabrave.org.br/portal/files/{name}' for name in names]
headers = {'User-Agent': 'Mozilla/5.0'}

# Make sure the target folder exists; otherwise every open() below fails and
# each file is reported as a download error.
output_dir = 'input/usados'
os.makedirs(output_dir, exist_ok=True)
print(f"Iniciando download de {len(pdf_links)} arquivos...")

# Download
sucessos, erros, ja_existem = 0, 0, 0
for i, url in enumerate(pdf_links, 1):
    filename = os.path.basename(urlparse(url).path)
    filepath = os.path.join(output_dir, filename)

    # Status message printed for this file
    status = ""

    if os.path.exists(filepath):
        # Already downloaded on a previous run: skip the request.
        ja_existem += 1
        status = "Já existia"
    else:
        partial_path = filepath + '.part'
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            # Write under a temporary name and rename only once complete, so
            # an interrupted write is never mistaken for a finished download.
            with open(partial_path, "wb") as f:
                f.write(response.content)
            os.replace(partial_path, filepath)

            sucessos += 1
            status = "Sucesso"

        except Exception as exc:
            # Best effort: count the failure and show the reason instead of
            # discarding it.
            erros += 1
            status = f"Erro ({exc})"
            # Drop any half-written temporary file.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        # Short pause after each attempted download.
        time.sleep(0.5)

    print(f"[{i}/{len(pdf_links)}] {filename}: {status}")

# Final report
print(f"Total Sucesso: {sucessos}")
print(f"Total Histórico: {ja_existem}")
print(f"Total Erro: {erros}")
print(f"Total Processado: {len(pdf_links)}")

### Novos

In [None]:
# Collect the PDF links shown on the landing page (new vehicles).
url = "https://www.fenabrave.org.br/portalv2/Conteudo/Emplacamentos"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were data.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Take the href of every anchor carrying the button classes; anchors without
# an href are skipped instead of raising KeyError.
pdf_links_novos = [
    link['href']
    for link in soup.find_all('a', class_="btn btn-info btn-block")
    if link.get('href')
]
    
# Rebuild every scraped link on the /portal/files/ prefix, keeping only the
# part of the original URL that follows 'files/'.
names = [url.split('files/')[-1] for url in pdf_links_novos]
pdf_links = [f'https://www.fenabrave.org.br/portal/files/{name}' for name in names]
headers = {'User-Agent': 'Mozilla/5.0'}

# Make sure the target folder exists; otherwise every open() below fails and
# each file is reported as a download error.
output_dir = 'input/novos'
os.makedirs(output_dir, exist_ok=True)
print(f"Iniciando download de {len(pdf_links)} arquivos...")

# Download
sucessos, erros, ja_existem = 0, 0, 0
for i, url in enumerate(pdf_links, 1):
    filename = os.path.basename(urlparse(url).path)
    filepath = os.path.join(output_dir, filename)

    # Status message printed for this file
    status = ""

    if os.path.exists(filepath):
        # Already downloaded on a previous run: skip the request.
        ja_existem += 1
        status = "Já existia"
    else:
        partial_path = filepath + '.part'
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            # Write under a temporary name and rename only once complete, so
            # an interrupted write is never mistaken for a finished download.
            with open(partial_path, "wb") as f:
                f.write(response.content)
            os.replace(partial_path, filepath)

            sucessos += 1
            status = "Sucesso"

        except Exception as exc:
            # Best effort: count the failure and show the reason instead of
            # discarding it.
            erros += 1
            status = f"Erro ({exc})"
            # Drop any half-written temporary file.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        # Short pause after each attempted download.
        time.sleep(0.5)

    print(f"[{i}/{len(pdf_links)}] {filename}: {status}")

# Status
print(f"Total Sucesso: {sucessos}")
print(f"Total Histórico: {ja_existem}")
print(f"Total Erro: {erros}")
print(f"Total Processado: {len(pdf_links)}")

In [None]:
# Collect historical PDF links (2003-2023) by stepping through the year selector.
chrome_options = webdriver.ChromeOptions()
# Chrome PDF/download preferences, passed through as experimental options.
prefs = {
    "plugins.always_open_pdf_externally": True,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True
}
chrome_options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=chrome_options)
fenabrave = 'https://www.fenabrave.org.br/portalv2/Conteudo/Emplacamentos'

pdf_links_novos_historico = []

# try/finally guarantees the browser is closed even when the page load fails
# or the (long, sleep-driven) loop is interrupted.
try:
    driver.get(fenabrave)
    time.sleep(5)

    # One iteration per <option> of the year selector (options 2..22).
    for i in range(2, 23):
        try:
            driver.find_element(By.XPATH, '/html/body/div[1]/section/div/div[7]/div/div[1]/select').click()
            option = driver.find_element(By.XPATH, f'/html/body/div[1]/section/div/div[7]/div/div[1]/select/option[{i}]')
            option.click()
            # Visible label of the selected option; not used further in this cell.
            ano = option.text.strip()

            time.sleep(2)
            # Click the button that triggers the search for the selected option.
            driver.find_element(By.XPATH, '/html/body/div[1]/section/div/div[7]/div/div[3]/button').click()

            time.sleep(3)
            # Walk divs 1 through 12; the month number shown in the message is
            # simply the div index (assumes div 1 is January - TODO confirm).
            for mes in range(1, 13):
                b = mes
                try:
                    link_element = driver.find_element(By.XPATH, f'//*[@id="appendBuscarMaisAnos2"]/div[{b}]/div/a[2]')
                    pdf_url = link_element.get_attribute('href')
                    # Keep only hrefs that point at a PDF file.
                    if pdf_url and pdf_url.lower().endswith('.pdf'):
                        pdf_links_novos_historico.append(pdf_url)
                except Exception as e:
                    print(f"Mês {mes} (div {b}) não encontrado: {e}")

            print(f"Links obtidos até o momento: {len(pdf_links_novos_historico)}")

        except Exception as e:
            # Best effort: a failing option is reported and the loop continues.
            print(f"Erro ao processar {i}: {e}")
finally:
    driver.quit()

# Rebuild every scraped link on the /portal/files/ prefix, keeping only the
# part of the original URL that follows 'files/'.
names = [url.split('files/')[-1] for url in pdf_links_novos_historico]
pdf_links = [f'https://www.fenabrave.org.br/portal/files/{name}' for name in names]
headers = {'User-Agent': 'Mozilla/5.0'}

# Make sure the target folder exists; otherwise every open() below fails and
# each file is reported as a download error.
output_dir = 'input/novos'
os.makedirs(output_dir, exist_ok=True)
print(f"Iniciando download de {len(pdf_links)} arquivos...")

# Download
sucessos, erros, ja_existem = 0, 0, 0
for i, url in enumerate(pdf_links, 1):
    filename = os.path.basename(urlparse(url).path)
    filepath = os.path.join(output_dir, filename)

    # Status message printed for this file
    status = ""

    if os.path.exists(filepath):
        # Already downloaded on a previous run: skip the request.
        ja_existem += 1
        status = "Já existia"
    else:
        partial_path = filepath + '.part'
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            # Write under a temporary name and rename only once complete, so
            # an interrupted write is never mistaken for a finished download.
            with open(partial_path, "wb") as f:
                f.write(response.content)
            os.replace(partial_path, filepath)

            sucessos += 1
            status = "Sucesso"

        except Exception as exc:
            # Best effort: count the failure and show the reason instead of
            # discarding it.
            erros += 1
            status = f"Erro ({exc})"
            # Drop any half-written temporary file.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        # Short pause after each attempted download.
        time.sleep(0.5)

    print(f"[{i}/{len(pdf_links)}] {filename}: {status}")

# Status
print(f"Total Sucesso: {sucessos}")
print(f"Total Histórico: {ja_existem}")
print(f"Total Erro: {erros}")
print(f"Total Processado: {len(pdf_links)}")