In [1]:
import os
import time
from selenium import webdriver
from selenium.common import exceptions
# from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.webdriver.firefox.options import Options
from webdriver_manager.firefox import GeckoDriverManager
from tqdm import tqdm
from IPython.display import clear_output
from bs4 import BeautifulSoup
from io import StringIO
import pandas as pd
import argparse

In [2]:
def fill_textbox(
    browser, selector, text,  timeout=10, residual_timeout=1
):
    """Type ``text`` into the element located by the XPath ``selector``.

    The element is waited for, cleared and filled. On failure the attempt is
    repeated, up to three attempts in total, sleeping ``timeout`` seconds
    after each failed attempt.

    Args:
        browser: Selenium WebDriver instance.
        selector: XPath expression locating the input element.
        text: Text sent to the element with ``send_keys``.
        timeout: Seconds to wait for the element, and to sleep after a
            failed attempt.
        residual_timeout: Seconds to sleep after the element is found and
            before typing.

    Returns:
        None. If every attempt fails, the last error is printed and the
        function returns without raising (best-effort behaviour is kept).
    """
    last_error = None
    for _ in range(3):
        try:
            box = WebDriverWait(browser, timeout).until(
                EC.presence_of_element_located((By.XPATH, selector))
            )
            time.sleep(residual_timeout)
            box.clear()
            box.send_keys(text)
        except Exception as error:  # not a bare except: Ctrl-C must still interrupt
            last_error = error
            time.sleep(timeout)
            continue
        # Success: stop here. Without this the loop would clear and retype
        # the same text on every remaining iteration.
        return
    print(f"Error al ingresar texto en el elemento {selector}: {last_error}")
        
        
def click_by_xpath(browser, selector, timeout=10, residual_timeout=1):
    """Click the element located by the XPath ``selector`` once it is clickable.

    Waits up to ``timeout`` seconds for the element to become clickable,
    clicks it, then sleeps ``residual_timeout`` seconds.
    """
    clickable = EC.element_to_be_clickable((By.XPATH, selector))
    target = WebDriverWait(browser, timeout).until(clickable)
    target.click()
    time.sleep(residual_timeout)

def login_siu(browser, siu_user, siu_pass):
    """Fill the user and password fields of the login form and submit it.

    Args:
        browser: Selenium WebDriver instance showing the login form.
        siu_user: Text typed into the user field.
        siu_pass: Text typed into the password field.
    """
    credential_fields = (
        ('//*[@id="ef_form_5000221_datosusuario"]', siu_user),
        ('//*[@id="ef_form_5000221_datosclave"]', siu_pass),
    )
    for field_xpath, value in credential_fields:
        fill_textbox(browser, field_xpath, value)
    click_by_xpath(browser, '//*[@id="form_5000221_datos_ingresar"]')

In [3]:
def write_in_xpath(browser, xpath, text, timeout=10, residual_timeout=1):
    """Clear the element located by ``xpath`` and type ``text`` into it.

    Waits up to ``timeout`` seconds for the element to be present and sleeps
    ``residual_timeout`` seconds before typing. Any exception is caught and
    reported with ``print``; nothing is raised to the caller.
    """
    locator = (By.XPATH, xpath)
    try:
        waiter = WebDriverWait(browser, timeout)
        field = waiter.until(EC.presence_of_element_located(locator))
        time.sleep(residual_timeout)
        field.clear()
        field.send_keys(text)
    except Exception as err:
        print(f"Error al ingresar texto en el elemento {xpath}: {err}")

In [4]:
def select_option_by_value(browser, selector, option_value, timeout=10, residual_timeout=1):
    """Choose the dropdown option whose ``value`` attribute is ``option_value``.

    The dropdown is located by the XPath ``selector`` and must become
    clickable within ``timeout`` seconds. After selecting, the function
    sleeps ``residual_timeout`` seconds. Any exception raised while selecting
    is reported with ``print`` as "not found" and is not re-raised.
    """
    locator = (By.XPATH, selector)
    element = WebDriverWait(browser, timeout).until(
        EC.element_to_be_clickable(locator)
    )
    chooser = Select(element)

    try:
        chooser.select_by_value(option_value)
        time.sleep(residual_timeout)
    except Exception:
        print(f"Option with value '{option_value}' not found.")

def select_option_by_text(browser, selector, option_text, timeout=10, residual_timeout=1):
    """Choose the dropdown option whose visible text is ``option_text``.

    The dropdown is located by the XPath ``selector`` and must become
    clickable within ``timeout`` seconds. After selecting, the function
    sleeps ``residual_timeout`` seconds. Any exception raised while selecting
    is reported with ``print`` as "not found" and is not re-raised.
    """
    locator = (By.XPATH, selector)
    element = WebDriverWait(browser, timeout).until(
        EC.element_to_be_clickable(locator)
    )
    chooser = Select(element)

    try:
        chooser.select_by_visible_text(option_text)
        time.sleep(residual_timeout)
    except Exception:
        print(f"Option with text '{option_text}' not found.")


In [5]:

def get_dropdown_options(browser, selector, timeout=10):
    """
    Return a list of (value, text) tuples, one per option of the dropdown
    located by the XPath ``selector``.

    Waits up to ``timeout`` seconds for the dropdown to be present.
    """
    locator = (By.XPATH, selector)
    element = WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located(locator)
    )
    pairs = []
    for entry in Select(element).options:
        pairs.append((entry.get_attribute("value"), entry.text))
    return pairs

def _ask_option_index(options):
    """Print the options and prompt with ``input`` until a valid index is typed.

    With more than 10 options only the first 10 are listed and entry ``10``
    means "show all options"; after the full list is shown any index of
    ``options`` is accepted.
    """
    prompt = "Ingrese el número de la opción que desea seleccionar: "
    preview = 10
    truncated = len(options) > preview

    if truncated:
        print("Opciones disponibles (primeras 10):")
        listed = options[:preview]
    else:
        print("Opciones disponibles:")
        listed = options
    for idx, (_, label) in enumerate(listed):
        print(f"{idx}: {label}")
    if truncated:
        print("10: Mostrar todas las opciones")

    # While the list is truncated, index 10 is reserved for "show all".
    upper = preview if truncated else len(options)
    while True:
        try:
            chosen = int(input(prompt))
            if truncated and chosen == preview:
                # Show every option and ask again.
                clear_output()
                print("Opciones disponibles:")
                for idx, (_, label) in enumerate(options):
                    print(f"{idx}: {label}")
                chosen = int(input(prompt))
                if 0 <= chosen < len(options):
                    return chosen
            elif 0 <= chosen < upper:
                return chosen
            print("Índice fuera de rango.")
        except ValueError:
            print("Entrada no válida. Por favor, ingrese un número entero.")


def select_option_by_input(browser, selector, text=False, timeout=10, residual_timeout=1, input_text=None):
    """Select an option of the dropdown at XPath ``selector``.

    When ``input_text`` is given it is used directly. Otherwise the available
    options are printed and the user is asked to pick one by index.

    Args:
        browser: Selenium WebDriver instance.
        selector: XPath expression locating the dropdown.
        text: If true, select by visible text; otherwise select by the
            option's ``value`` attribute.
        timeout: Seconds to wait for the dropdown.
        residual_timeout: Seconds to sleep after selecting.
        input_text: Value (or visible text when ``text`` is true) to select
            without prompting. ``None`` triggers the interactive prompt.

    Raises:
        ValueError: If the interactive prompt is needed but the dropdown has
            no options (the prompt could otherwise never be satisfied).
    """
    options = get_dropdown_options(browser, selector, timeout)
    if input_text is None:
        if not options:
            raise ValueError(f"No hay opciones disponibles en el elemento {selector}")
        option_value, option_label = options[_ask_option_index(options)]
        # Hand over what the chosen selection mode expects: the visible label
        # for text mode, the value attribute otherwise.
        value_to_select = option_label if text else option_value
    else:
        value_to_select = input_text
    clear_output()
    # ``text`` is the caller's flag here; the listing code uses its own
    # loop variable names so the flag is never overwritten.
    if text:
        select_option_by_text(browser, selector, value_to_select, timeout, residual_timeout)
    else:
        select_option_by_value(browser, selector, value_to_select, timeout, residual_timeout)

In [6]:
def filtrar_año(browser, input_text=None, timeout=10, residual_timeout=1):
    """Choose an option in the ``filtroanio_academico`` dropdown.

    Delegates to ``select_option_by_input`` with its default selection mode;
    ``input_text=None`` makes that helper prompt for the option.
    """
    year_dropdown = '//*[@id="ef_ei_38000482_filtroanio_academico"]'
    select_option_by_input(
        browser,
        year_dropdown,
        timeout=timeout,
        residual_timeout=residual_timeout,
        input_text=input_text,
    )


def filtrar_llamado(browser, input_text=None, text=True, timeout=10, residual_timeout=1):
    """Choose an option in the ``filtroturno_examen`` dropdown.

    Delegates to ``select_option_by_input``; by default (``text=True``) the
    option is matched by its visible text. ``input_text=None`` makes that
    helper prompt for the option.
    """
    exam_call_dropdown = '//*[@id="ef_ei_38000482_filtroturno_examen"]'
    select_option_by_input(
        browser,
        exam_call_dropdown,
        text=text,
        timeout=timeout,
        residual_timeout=residual_timeout,
        input_text=input_text,
    )
    
def ejecutar_filtro(browser, timeout=10, residual_timeout=1):
    """Click the filter form's ``filtrar`` button to apply the chosen filters."""
    filter_button = '//*[@id="ei_38000482_filtro_filtrar"]'
    click_by_xpath(
        browser,
        filter_button,
        timeout=timeout,
        residual_timeout=residual_timeout,
    )

In [7]:
def acta_generator(browser, acta_obj, timeout=15, residual_timeout=1):
    """Open one acta, scrape all of its student pages and go back to the list.

    Args:
        browser: Selenium WebDriver instance positioned on the acta list.
        acta_obj: Clickable element that opens the acta.
        timeout: Seconds to wait for the acta page (its cancel button).
        residual_timeout: Seconds to sleep after going back to the list.

    Returns:
        pandas.DataFrame with one row per student, renumbered in ``Nº``,
        followed by one constant column per entry of ``general_info``.
    """
    cancel_xpath = '//*[@id="ci_38000483_cancelar"]'
    next_xpath = '//*[@src="/toba_2.6/img/nucleo/paginacion/siguiente.gif?av=3.3.26"]'
    page_count_xpath = '//*[@id="cuerpo_js_cuadro_38000500_alumnos"]/tbody/tr[4]/td/div/strong[2]'

    def _wait_and_parse():
        # NOTE(review): the cancel button presumably exists on every page of
        # the acta, so after clicking "next" this wait may return before the
        # new page has loaded -- confirm against the live site.
        WebDriverWait(browser, timeout).until(
            EC.presence_of_element_located((By.XPATH, cancel_xpath))
        )
        return BeautifulSoup(browser.page_source, 'html.parser')

    acta_obj.click()
    soup = _wait_and_parse()
    info = general_info(soup, timeout, residual_timeout)
    tabs = [tab_alumnos(soup, timeout, residual_timeout)]

    # No pagination element is taken to mean the acta has a single page.
    try:
        n_pages = int(browser.find_element(By.XPATH, page_count_xpath).text)
    except exceptions.NoSuchElementException:
        n_pages = 1
    for _ in range(1, n_pages):
        browser.find_element(By.XPATH, next_xpath).click()
        tabs.append(tab_alumnos(_wait_and_parse(), timeout, residual_timeout))

    tab = pd.concat(tabs, ignore_index=True)
    # Renumber students across pages.
    tab["Nº"] = tab.index + 1
    ordered_cols = tab.columns.to_list()

    for col, val in info.items():
        tab[col] = val
        # A key that matches an existing column must not be listed twice,
        # otherwise the selection below would return duplicated columns.
        if col not in ordered_cols:
            ordered_cols.append(col)
    tab = tab[ordered_cols]
    browser.find_element(By.XPATH, cancel_xpath).click()
    time.sleep(residual_timeout)
    return tab
    
def general_info(soup, timeout=15, residual_timeout=1):
    """Extract the acta's general information (shared by all of its pages).

    The second table of the page is read and rows 1-4 are interpreted as up
    to three (label, value) column pairs: columns (0, 1), (2, 3) and (4, 5).
    Pairs whose value column is missing are skipped, so tables with fewer
    than six columns no longer raise ``IndexError``.

    Args:
        soup: Parsed page; only ``soup.prettify()`` is used.
        timeout: Unused; kept for signature compatibility.
        residual_timeout: Unused; kept for signature compatibility.

    Returns:
        dict mapping each label to its value, without the keys "0" to "9".
    """
    tab = pd.read_html(StringIO(soup.prettify()))[1]
    info = {}
    tab_info = tab.iloc[1:5]
    # Only complete (label, value) pairs, and at most the first three.
    n_pairs = min(tab_info.shape[1] // 2, 3)
    for _, row in tab_info.iterrows():
        for pair in range(n_pairs):
            info[row.iloc[2 * pair]] = row.iloc[2 * pair + 1]
    # Drop single-digit string keys, as before.
    for i in range(10):
        info.pop(str(i), None)
    return info

def tab_alumnos(soup, timeout=15, residual_timeout=1):
    """Extract the student table of a single page.

    Reads the eighth table of the page, uses its first row as the column
    header and returns the remaining rows. ``timeout`` and
    ``residual_timeout`` are accepted but not used.
    """
    parsed_tables = pd.read_html(StringIO(soup.prettify()))
    raw = parsed_tables[7]
    header_row = raw.iloc[0, :]
    students = raw.drop(0)
    students.columns = header_row
    return students
    
def next_page(browser, residual_timeout=1, timeout=10):
    """Click the "next page" pagination image and wait for the row buttons.

    Args:
        browser: Selenium WebDriver instance.
        residual_timeout: Seconds to sleep after the row buttons are present.
        timeout: Seconds to wait for an ``ei-boton-fila`` element to be
            present after clicking.
    """
    # NOTE(review): the XPath pins the image's "?av=3.3.26" query string, so
    # it presumably stops matching if that version string changes.
    obj = browser.find_element(
        By.XPATH,
        '//*[@src="/toba_2.6/img/nucleo/paginacion/siguiente.gif?av=3.3.26"]',
    )
    obj.click()
    WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located(
            (By.XPATH, '//*[@class="ei-boton-fila"]')
        )
    )
    time.sleep(residual_timeout)
    
def prev_page(browser, residual_timeout=1, timeout=10):
    """Click the "previous page" pagination image and wait for the row buttons.

    Args:
        browser: Selenium WebDriver instance.
        residual_timeout: Seconds to sleep after the row buttons are present.
        timeout: Seconds to wait for an ``ei-boton-fila`` element to be
            present after clicking.
    """
    # NOTE(review): the XPath pins the image's "?av=3.3.26" query string, so
    # it presumably stops matching if that version string changes.
    obj = browser.find_element(
        By.XPATH,
        '//*[@src="/toba_2.6/img/nucleo/paginacion/anterior.gif?av=3.3.26"]',
    )
    obj.click()
    WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located(
            (By.XPATH, '//*[@class="ei-boton-fila"]')
        )
    )
    time.sleep(residual_timeout)

In [8]:
import funcs as fx
def main(siu_credentials, año=None, llamado=None, residual_timeout=1):
    """Log in, filter the acta list and scrape every acta into one DataFrame.

    Args:
        siu_credentials: Path to a text file with exactly two lines: the
            user on the first and the password on the second.
        año: Value for the year filter. ``None`` is passed through so the
            filter helper can prompt for it.
        llamado: Visible text for the exam-call filter, or ``None`` to let
            the filter helper prompt for it.
        residual_timeout: Base pause, in seconds, used between page actions.

    Returns:
        pandas.DataFrame with the rows of every scraped acta, or an empty
        DataFrame when no acta was found.
    """
    # Close the credentials file deterministically.
    with open(siu_credentials, "r") as credentials_file:
        siu_user, siu_pass = credentials_file.readlines()
    siu_user = siu_user.strip()
    # Drop only the line terminator so it is not typed into the form.
    siu_pass = siu_pass.rstrip("\r\n")
    url = "https://guarani3-gerencial.guarani.cespi.unlp.edu.ar/guarani/3.11/aplicacion.php"

    # Clear first, then print; the other order wipes the message at once.
    clear_output()
    print("Se va a abrir un navegador. Ingresar las credenciales del proxy")

    row_button_xpath = '//*[@class="ei-boton-fila"]'
    current_page_xpath = '//*[@id="cuadro_38000496_cuadro_actas__pagina_actual"]'

    browser = webdriver.Firefox()
    try:
        browser.get(url)
        fx.login_siu(browser, siu_user, siu_pass)
        browser.switch_to.window(browser.window_handles[1])
        fx.click_by_xpath(browser, '//*[@id="menu_img"]')
        fx.write_in_xpath(browser, '//*[@id="buscar_text"]', "Imprimir acta")
        fx.click_by_xpath(browser, '//*[@id="elemento_buscar_menu_38000085"]')
        # str(None) would be the literal "None", which is not a valid year.
        fx.filtrar_año(browser, None if año is None else str(año))
        fx.filtrar_llamado(browser, llamado)
        fx.ejecutar_filtro(browser)

        dfs = []
        # No pagination element is taken to mean a single page of actas.
        try:
            pags = int(
                browser.find_element(
                    By.XPATH,
                    '//*[@id="cuerpo_js_cuadro_38000496_cuadro_actas"]/tbody/tr[4]/td/div/strong[2]',
                ).text
            )
        except exceptions.NoSuchElementException:
            pags = 1
        for i in tqdm(range(pags), desc="Páginas", position=0, leave=True):
            if i != 0:
                fx.next_page(browser, residual_timeout)

            # Count the actas on this page to know how many to loop over.
            actas = browser.find_elements(By.XPATH, row_button_xpath)
            if i != 0:
                fx.prev_page(browser, residual_timeout)

            pbar = tqdm(range(len(actas)), desc="Actas", leave=False, position=1)
            for j in pbar:

                curr_pag = browser.find_element(
                    By.XPATH, current_page_xpath
                ).get_property("value")

                # Advance until the list shows page i + 1 again.
                while curr_pag != str(i + 1):
                    fx.next_page(browser, 2)
                    curr_pag = browser.find_element(
                        By.XPATH, current_page_xpath
                    ).get_property("value")

                # Row elements are looked up again before every acta; one
                # retry after a longer pause is attempted on any failure.
                try:
                    actas = browser.find_elements(By.XPATH, row_button_xpath)
                    df = fx.acta_generator(browser, actas[j])
                except Exception:  # not a bare except: Ctrl-C must still interrupt
                    time.sleep(5 * residual_timeout)
                    actas = browser.find_elements(By.XPATH, row_button_xpath)
                    df = fx.acta_generator(browser, actas[j])

                pbar.display(f"{df.Actividad.iloc[0]}")
                dfs.append(df)
    finally:
        # The browser is local to this function, so nobody else can close it.
        browser.quit()

    # pd.concat raises on an empty list; report "nothing found" instead.
    if not dfs:
        return pd.DataFrame()
    return pd.concat(dfs, ignore_index=True)

In [12]:
# Run the scraper with the credentials file "siu_pass.txt", year "2024" and
# exam call "09 - Agosto LP"; the result is kept in df_llamado.
df_llamado = main("siu_pass.txt", "2024", "09 - Agosto LP")

Páginas:   0%|          | 0/5 [00:04<?, ?it/s]


In [9]:
import funcs as fx


In [10]:
# Read user (line 1) and password (line 2) from the local credentials file,
# closing the file as soon as it has been read.
with open("siu_pass.txt", "r") as credentials_file:
    siu_user, siu_pass = credentials_file.readlines()
siu_user = siu_user.strip()
# Drop only the line terminator so it is not typed into the login form.
siu_pass = siu_pass.rstrip("\r\n")
url = "https://guarani3-gerencial.guarani.cespi.unlp.edu.ar/guarani/3.11/aplicacion.php"


In [22]:
# Manual walkthrough of the navigation steps, using the helpers defined in
# this notebook: open Firefox, log in, move to the second window handle,
# open the menu, search for "Imprimir acta", then filter by year and exam
# call and apply the filter.
browser = webdriver.Firefox()

browser.get(url)
login_siu(browser, siu_user, siu_pass)
browser.switch_to.window(browser.window_handles[1])
click_by_xpath(browser, '//*[@id="menu_img"]')
write_in_xpath(browser, '//*[@id="buscar_text"]', "Imprimir acta")
click_by_xpath(browser, '//*[@id="elemento_buscar_menu_38000085"]')
filtrar_año(browser, "2024")
filtrar_llamado(browser, "09 - Diciembre LP")
ejecutar_filtro(browser)

In [34]:
# Reload the funcs module so that edits made to it are picked up without
# restarting the kernel.
import importlib
importlib.reload(fx)

<module 'funcs' from 'c:\\Lucas\\Laburo\\Consultoria\\scraping-unlp\\funcs.py'>

In [35]:
# Collect every element with class "ei-boton-fila" on the current page.
actas = browser.find_elements(By.XPATH, '//*[@class="ei-boton-fila"]')


In [36]:
# Scrape the acta behind the tenth collected element with the funcs module.
# NOTE(review): the recorded output is a (DataFrame, str) tuple, while the
# acta_generator defined in this notebook returns only the DataFrame --
# presumably funcs.py has diverged; confirm which version is current.
fx.acta_generator(browser, actas[9])

(0  Nº         Apellido y Nombre Identificación    Legajo Instancia  \
 0   1  Pelufo, Sabina Maria Sol   DNI 45812119  103778/0   Regular   
 
 0       Fecha Nota Letras Resultado    Acta Año Académico              Turno  \
 0  13/08/2024    7  Siete  Aprobado  103844          2024  09 - Agosto Sedes   
 
 0       Mesa                    Llamado  \
 0  Saladillo  Llamado 09 - Agosto Sedes   
 
 0                                          Actividad    Tipo   Estado  \
 0  (7243) Administración II (Técnicas Administrat...  Normal  Cerrada   
 
 0  Ubicación Código de Verificación  
 0  Saladillo                      1  ,
 '(7243) Administración II (Técnicas Administrativas y Gestión Organizacional)')

In [37]:
# Parse the general-information table of the page currently open in the
# browser. The recorded run of this cell ended in
# "IndexError: index 5 is out of bounds for axis 0 with size 5".
fx.general_info(BeautifulSoup(browser.page_source, 'html.parser'))


  info[row[1][0]] = row[1][1]
  info[row[1][2]] = row[1][3]
  info[row[1][4]] = row[1][5]


IndexError: index 5 is out of bounds for axis 0 with size 5

In [15]:
# Parse the student table of the page currently open in the browser.
fx.tab_alumnos(BeautifulSoup(browser.page_source, 'html.parser'))

Unnamed: 0,Nº,Apellido y Nombre,Identificación,Legajo,Instancia,Fecha,Nota,Letras,Resultado
1,1,"Balenzano, Agustina",DNI 42177234,97651/2,Regular,22/08/2024,7,Siete,Aprobado
2,2,"Fosco Ardiles, Salvador Matias",DNI 37549347,97676/1,Regular,22/08/2024,6,Seis,Aprobado
3,3,"Menay, Tiziana",DNI 43471375,100820/0,Regular,22/08/2024,4,Cuatro,Aprobado
4,4,"Mengoni, Florencia",DNI 42835019,103773/4,Libre,22/08/2024,7,Siete,Aprobado
5,5,"Monclus, Cinthia Belen",DNI 41923080,97788/0,Regular,22/08/2024,2,Dos,Reprobado
6,6,"Pougy, Aldana Yamila Jacqueline",DNI 42323439,97713/9,Regular,22/08/2024,7,Siete,Aprobado
7,7,"Vaccarini, Daira",DNI 43186868,103795/1,Libre,22/08/2024,7,Siete,Aprobado
