In [None]:
pip install selenium webdriver-manager


In [None]:
import pandas as pd
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import os

csv_filename = r"ruta"

# Rows expected per results page. This is the divisor the resume arithmetic
# has always used; presumably the site lists 20 participants per page
# (TODO confirm against the site's pagination settings).
PAGE_SIZE = 20


def load_previous_progress(path, page_size=PAGE_SIZE):
    """Load rows saved by an earlier run and compute where to resume.

    Args:
        path: Location of the CSV written by a previous run.
        page_size: Number of rows that correspond to one results page.

    Returns:
        A ``(rows, names, start_page)`` tuple: the saved rows as a list of
        lists, the set of values found in the "Nombre" column (used to
        avoid duplicates), and the 0-based index of the page to resume from.
        When ``path`` is missing or completely empty the result is
        ``([], set(), 0)``.
    """
    if not os.path.exists(path):
        return [], set(), 0

    try:
        # keep_default_na=False: the scraper stores the literal text "N/A"
        # for missing fields; pandas' default NA handling would turn it
        # into NaN when the file is reloaded.
        previous = pd.read_csv(path, keep_default_na=False)
    except pd.errors.EmptyDataError:
        # A zero-byte file holds no progress at all.
        return [], set(), 0

    rows = previous.values.tolist()
    names = set(previous["Nombre"])
    # Page indexes are 0-based (the fresh-start value is 0), so the number of
    # complete pages already saved is itself the index of the next page.
    # Starting one page early is harmless because known names are skipped.
    resume_page = len(previous) // page_size
    print(f"üìÇ Datos previos cargados. Retomamos desde la p√°gina {resume_page + 1}.")
    return rows, names, resume_page


# If a previous scraping run was interrupted, pick up from the last saved point.
data, processed_names, start_page = load_previous_progress(csv_filename)

# Command-line switches handed to Chrome, applied in this order.
chrome_flags = (
    "--start-maximized",
    "--disable-blink-features=AutomationControlled",
    "--headless=new",  # Optional: run without a visible browser window
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
)

options = webdriver.ChromeOptions()
for chrome_flag in chrome_flags:
    options.add_argument(chrome_flag)

# webdriver-manager installs a chromedriver and returns the path to it;
# that path is what the Service wraps.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

login_url = "https://capacitacionelectoral.cne.gob.ec/login/index.php"
driver.get(login_url)

# Locate the login form with explicit waits rather than a fixed sleep, so a
# slow page load does not make find_element fail. If the form never shows
# up, close the browser before propagating the error so it is not leaked.
try:
    form_wait = WebDriverWait(driver, 15)
    username_input = form_wait.until(
        EC.presence_of_element_located((By.ID, "username"))
    )
    password_input = form_wait.until(
        EC.presence_of_element_located((By.ID, "password"))
    )
    login_button = form_wait.until(
        EC.element_to_be_clickable((By.ID, "loginbtn"))
    )
except Exception:
    driver.quit()
    raise

# Enter the credentials and submit the form.
username_input.send_keys("usuario")
password_input.send_keys("contrase√±a")
login_button.click()
time.sleep(3)  # give the post-login redirect time to finish

# Check the login: the script treats a URL that still contains "login" as a
# failed sign-in.
if "login" in driver.current_url:
    print("‚ùå Error: No se pudo iniciar sesi√≥n.")
    driver.quit()
    # SystemExit(1) reports the failure through the exit status and does not
    # depend on the interactive exit() helper being available.
    raise SystemExit(1)

print("‚úÖ Login exitoso!")

# URL of the participant listing to scrape.
participantes_url = "https://capacitacionelectoral.cne.gob.ec/user/index.php?id=15"
driver.get(participantes_url)
time.sleep(3)

# Work out the total number of pages from the numeric pagination links.
try:
    pagination_buttons = driver.find_elements(By.CSS_SELECTOR, "ul.pagination li.page-item a.page-link")
    all_page_numbers = [btn.text.strip() for btn in pagination_buttons if btn.text.strip().isdigit()]
    # default=1: with no numeric pagination links the listing is treated as
    # a single page, without relying on an exception from max().
    total_pages = max(map(int, all_page_numbers), default=1)
except Exception as exc:
    # Best-effort fallback kept from the original script, but limited to
    # ordinary errors so Ctrl+C / SystemExit are no longer swallowed, and the
    # reason for the fallback is reported.
    print(f"Aviso: no se pudo leer la paginacion ({exc}). Se asume 1 pagina.")
    total_pages = 1

print(f"üìÑ Total de p√°ginas encontradas: {total_pages}")

# Walk every results page, resuming from the last page already saved.

# Column layout shared by the periodic backup and the final save.
CSV_COLUMNS = ["Nombre", "Email", "Ciudad"]


def open_profile(link, attempts=3):
    """Load ``link`` in the browser, retrying when the load fails.

    Returns True as soon as the page loads, or False after ``attempts``
    consecutive failures.
    """
    for attempt in range(1, attempts + 1):
        try:
            driver.get(link)
            time.sleep(random.uniform(3, 6))  # randomized pause after each profile load
            return True
        except Exception:
            print(f"‚ö†Ô∏è Error de conexi√≥n con {link}. Reintentando... ({attempt}/{attempts})")
            time.sleep(5)
    return False


def text_or_na(css_selector):
    """Return the text of the first element matching ``css_selector`` on the
    page currently loaded, or "N/A" when it cannot be read."""
    try:
        return driver.find_element(By.CSS_SELECTOR, css_selector).text
    except Exception:
        return "N/A"


def city_or_na():
    """Return the line following the "Ciudad" label among the
    ``li.contentnode`` items of the current page, or "N/A" if none has one."""
    try:
        content_nodes = driver.find_elements(By.CSS_SELECTOR, "li.contentnode")
        for text in [node.text for node in content_nodes]:
            label, _, remainder = text.partition("\n")
            # A node holding only the label (no second line) is skipped
            # instead of aborting the search.
            if label.startswith("Ciudad") and remainder:
                return remainder.split("\n")[0]
    except Exception:
        return "N/A"
    return "N/A"


try:
    for current_page in range(start_page, total_pages):
        print(f"‚û°Ô∏è Extrayendo datos de la p√°gina {current_page + 1}/{total_pages}")

        current_page_url = f"https://capacitacionelectoral.cne.gob.ec/user/index.php?id=15&page={current_page}"
        driver.get(current_page_url)
        time.sleep(random.uniform(3, 5))  # randomized pause, meant to avoid being blocked
        participantes = driver.find_elements(By.CSS_SELECTOR, "a.d-inline-block.aabtn")
        # Anchors without an href cannot be opened; drop them up front rather
        # than spending three retries on each.
        hrefs = [p.get_attribute("href") for p in participantes]
        links = [href for href in hrefs if href]

        print(f"üìå Encontrados {len(links)} participantes en la p√°gina {current_page + 1}")

        for link in links:
            if not open_profile(link):
                print(f"üö´ Fall√≥ la carga de {link}. Se omite.")
                continue

            name = text_or_na("h2.rui-main-content-title--h2")

            # Skip people already collected. The "N/A" placeholder is not a
            # real name, so it is never used for de-duplication; otherwise
            # every profile after the first unreadable one would be dropped.
            # NOTE(review): after a resume, unnamed profiles on a re-visited
            # page can therefore be recorded twice.
            if name != "N/A":
                if name in processed_names:
                    print(f"‚ö†Ô∏è {name} ya est√° en el CSV. Se omite.")
                    continue
                processed_names.add(name)

            email = text_or_na("a[href^='mailto:']")
            city = city_or_na()

            print(f"üìå {name} | {email} | {city}")
            data.append([name, email, city])

        # Periodic backup whenever the 0-based page index is a multiple of 5.
        if current_page % 5 == 0:
            df = pd.DataFrame(data, columns=CSV_COLUMNS)
            df.to_csv(csv_filename, index=False, encoding="utf-8")
            print(f"‚úÖ Backup guardado en {csv_filename}")
finally:
    # Runs on normal completion and on any interruption (errors, Ctrl+C), so
    # rows gathered since the last backup are not lost and the browser is
    # always closed.
    try:
        # Save the final data to CSV.
        df = pd.DataFrame(data, columns=CSV_COLUMNS)
        df.to_csv(csv_filename, index=False, encoding="utf-8")
        print(f"‚úÖ Datos guardados correctamente en {csv_filename}")
    finally:
        # Close the browser.
        driver.quit()
