In [1]:
pip install selenium webdriver-manager


Collecting selenium
  Downloading selenium-4.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.28.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting websocket-client~=1.8 (from selenium)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-25.1.0-py3-none-any.whl.metadata (10 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadat


[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: C:\Users\F16216\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [16]:
import pandas as pd
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import os

csv_filename = r"ruta"

# Esto por si el proceso de Scrappeo se ve interrumpido, para volver a retomar desde el último punto.
if os.path.exists(csv_filename):
    df_existing = pd.read_csv(csv_filename)
    processed_names = set(df_existing["Nombre"])  # Evita duplicados
    start_page = (len(df_existing) // 20) + 1  
    data = df_existing.values.tolist()
    print(f"📂 Datos previos cargados. Retomamos desde la página {start_page}.")
else:
    processed_names = set()
    start_page = 0
    data = []

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-blink-features=AutomationControlled")  
options.add_argument("--headless=new")  # Opcional: Modo sin interfaz
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

login_url = "https://capacitacionelectoral.cne.gob.ec/login/index.php"
driver.get(login_url)
time.sleep(3)

# Ingresar credenciales
username_input = driver.find_element(By.ID, "username")
password_input = driver.find_element(By.ID, "password")
login_button = driver.find_element(By.ID, "loginbtn")

username_input.send_keys("usuario")
password_input.send_keys("contraseña")
login_button.click()
time.sleep(3)

# Verificar login
if "login" in driver.current_url:
    print("❌ Error: No se pudo iniciar sesión.")
    driver.quit()
    exit()
else:
    print("✅ Login exitoso!")

# URL de la info a scrappear
participantes_url = "https://capacitacionelectoral.cne.gob.ec/user/index.php?id=15"
driver.get(participantes_url)
time.sleep(3)

# Obtener total de páginas
try:
    pagination_buttons = driver.find_elements(By.CSS_SELECTOR, "ul.pagination li.page-item a.page-link")
    all_page_numbers = [btn.text.strip() for btn in pagination_buttons if btn.text.strip().isdigit()]
    total_pages = max(map(int, all_page_numbers))
except:
    total_pages = 1

print(f"📄 Total de páginas encontradas: {total_pages}")

# 🔄 Recorrer todas las páginas desde la última guardada
for current_page in range(start_page, total_pages):
    print(f"➡️ Extrayendo datos de la página {current_page + 1}/{total_pages}")

    current_page_url = f"https://capacitacionelectoral.cne.gob.ec/user/index.php?id=15&page={current_page}"
    driver.get(current_page_url)
    time.sleep(random.uniform(3, 5))  # esto es para evitar bloqueos
    participantes = driver.find_elements(By.CSS_SELECTOR, "a.d-inline-block.aabtn")
    links = [p.get_attribute("href") for p in participantes]

    print(f"📌 Encontrados {len(links)} participantes en la página {current_page + 1}")

    
    for link in links:
        retries = 0
        while retries < 3:  
            try:
                driver.get(link)
                time.sleep(random.uniform(3, 6))  
                break  # Si se carga bien la página, salimos del bucle
            except Exception as e:
                print(f"⚠️ Error de conexión con {link}. Reintentando... ({retries + 1}/3)")
                time.sleep(5)  
                retries += 1

        if retries == 3:
            print(f"🚫 Falló la carga de {link}. Se omite.")
            continue  

        # Extraer nombre
        try:
            name = driver.find_element(By.CSS_SELECTOR, "h2.rui-main-content-title--h2").text
        except:
            name = "N/A"

        
        if name in processed_names:
            print(f"⚠️ {name} ya está en el CSV. Se omite.")
            continue
        processed_names.add(name)

        # Extraer email
        try:
            email = driver.find_element(By.CSS_SELECTOR, "a[href^='mailto:']").text
        except:
            email = "N/A"

        # Extraer ciudad
        try:
            content_nodes = driver.find_elements(By.CSS_SELECTOR, "li.contentnode")
            all_texts = [node.text for node in content_nodes]
            city = next((text.split("\n")[1] for text in all_texts if text.startswith("Ciudad")), "N/A")
        except:
            city = "N/A"

        print(f"📌 {name} | {email} | {city}")
        data.append([name, email, city])

    
    if current_page % 5 == 0:
        df = pd.DataFrame(data, columns=["Nombre", "Email", "Ciudad"])
        df.to_csv(csv_filename, index=False, encoding="utf-8")
        print(f"✅ Backup guardado en {csv_filename}")

# Guardar datos finales en CSV
df = pd.DataFrame(data, columns=["Nombre", "Email", "Ciudad"])
df.to_csv(csv_filename, index=False, encoding="utf-8")

print(f"✅ Datos guardados correctamente en {csv_filename}")

# Cerrar el navegador
driver.quit()


📂 Datos previos cargados. Retomamos desde la página 17.
✅ Login exitoso!
📄 Total de páginas encontradas: 52
➡️ Extrayendo datos de la página 18/52
📌 Encontrados 20 participantes en la página 18
📌 AMPARITO DEL ROSARIO FRANCO SALAZAR | amparitofrancosalazar@hotmail.com | IMBABURA
📌 LICETT CAROLINA FREIRE PAREDES | lc.freire@uta.edu.ec | TUNGURAHUA
📌 AMARELIS ILIANA FREIRE VASCO | amarelisfreire_1985@hotmail.com | PASTAZA
📌 YAMILETH DANIELA FUELTALA ESCOBAR | alejaguzman19921@gmail.com | CARCHI
📌 LJUBICA MARCELA FUENTES ORTIZ | ljubifuentes@gmail.com | PICHINCHA
📌 DOMENICA BELEN FUERTES MAIGUA | dfuertesm@estud.usfq.edu.ec | PICHINCHA
📌 DILAN IVAN GAHON ORDOEZ | 1752145357@email.com | PICHINCHA
📌 JOSE EMILIO GAIBOR BARBA | jgaibor24@yahoo.com | BOLIVAR
📌 FREDDY FERNANDO GAIBOR GAIBOR | fgg_1967@hotmail.com | SUCUMBIOS
📌 KARINA JUDITH GAIBOR SANCHEZ | karina.arturito@hotmail.com | GUAYAS
📌 LEOPOLDO ANDRES GALARZA MOSCOSO | galarzamoscosol@gmail.com | GUAYAS
📌 CRISTHIAN GERALDO GALARZA RIAS