In [1]:
import hashlib
import json
import os
import time
from dataclasses import asdict, dataclass, field
from typing import Any, Dict

import fitz  # PyMuPDF
import requests

from ladle.ladle import Ladle
# Google Cloud Storage
from google.oauth2 import service_account
from google.api_core.exceptions import NotFound, Conflict, Forbidden, BadRequest
from google.cloud import storage
# Build the Google Cloud Storage client from the service-account JSON stored in
# the GCP_SERVICE_ACCOUNT_CREDENTIALS environment variable.
gcp_credentials_info = os.getenv("GCP_SERVICE_ACCOUNT_CREDENTIALS")
if not gcp_credentials_info:
    # Without this guard json.loads(None) fails with an opaque TypeError.
    raise RuntimeError(
        "Environment variable GCP_SERVICE_ACCOUNT_CREDENTIALS is not set; "
        "it must contain the service-account key as a JSON string."
    )
gcp_credentials_info = json.loads(gcp_credentials_info)
gcp_service_account_credentials = service_account.Credentials.from_service_account_info(gcp_credentials_info)
storage_client = storage.Client(credentials=gcp_service_account_credentials)

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException

In [2]:
def extract_pdf_content(doc_url, timeout=60):
    """
    Download a PDF and extract its text with PyMuPDF (fitz).

    Args:
        doc_url: URL of the PDF to download.
        timeout: Seconds to wait for the HTTP request before giving up
            (previously the request could block indefinitely).

    Returns:
        dict with two keys:
            'pdf_text'  - text of all non-empty pages joined by blank lines,
                          or None when no text was found or parsing failed.
            'pdf_bytes' - the raw bytes of the HTTP response body.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts);
        parsing errors are caught, logged with ``print`` and reported as
        ``pdf_text=None``.
    """
    pdf_resp = requests.get(doc_url, timeout=timeout)
    pdf_bytes = pdf_resp.content

    pdf_text = None
    try:
        # Open PDF from bytes
        pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            # Keep only pages that contain something other than whitespace
            text_pages = [
                page_text
                for page_text in (
                    pdf_doc[page_num].get_text()
                    for page_num in range(pdf_doc.page_count)
                )
                if page_text.strip()
            ]
        finally:
            # Release the document even when a page fails to parse
            pdf_doc.close()

        pdf_text = "\n\n".join(text_pages) if text_pages else None

    except Exception as e:
        print(f"Failed to extract text from PDF: {e}")
        pdf_text = None

    return {'pdf_text': pdf_text, 'pdf_bytes': pdf_bytes}

In [31]:
# Lookup table from a lowercase file extension (without the dot) to the MIME
# type used as the Content-Type of an uploaded object.
CONTENT_TYPE_MAP = dict((
    # Documents
    ("pdf", "application/pdf"),
    ("doc", "application/msword"),
    ("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
    ("xls", "application/vnd.ms-excel"),
    ("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
    ("ppt", "application/vnd.ms-powerpoint"),
    ("pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
    ("txt", "text/plain"),
    ("rtf", "application/rtf"),
    ("csv", "text/csv"),

    # Data / Markup
    ("xml", "application/xml"),
    ("xsd", "application/xml"),
    ("json", "application/json"),
    ("yaml", "application/x-yaml"),
    ("yml", "application/x-yaml"),

    # Archives / Compression
    ("zip", "application/zip"),
    ("gz", "application/gzip"),
    ("tar", "application/x-tar"),
    ("rar", "application/vnd.rar"),
    ("7z", "application/x-7z-compressed"),

    # Web
    ("html", "text/html"),
    ("htm", "text/html"),
    ("css", "text/css"),
    ("js", "application/javascript"),
    ("jsonld", "application/ld+json"),

    # Images
    ("png", "image/png"),
    ("jpg", "image/jpeg"),
    ("jpeg", "image/jpeg"),
    ("gif", "image/gif"),
    ("bmp", "image/bmp"),
    ("svg", "image/svg+xml"),
    ("webp", "image/webp"),
    ("ico", "image/x-icon"),

    # Audio / Video
    ("mp3", "audio/mpeg"),
    ("wav", "audio/wav"),
    ("ogg", "audio/ogg"),
    ("mp4", "video/mp4"),
    ("mov", "video/quicktime"),
    ("avi", "video/x-msvideo"),
    ("webm", "video/webm"),
))


In [3]:
def get_bucket(storage_client, bucket_name):
    """
    Return the GCS bucket named ``bucket_name``, creating it when missing.

    Args:
        storage_client: client exposing ``get_bucket`` and ``create_bucket``.
        bucket_name: name of the bucket to fetch or create.

    Returns:
        The bucket object returned by the client.

    Raises:
        PermissionError: when the bucket does not exist and the client is not
            allowed to create it (the original ``Forbidden`` is chained).
    """
    # Try to get the bucket — this avoids an explicit .exists() check
    try:
        return storage_client.get_bucket(bucket_name)
    except NotFound:
        print(f"Bucket '{bucket_name}' not found. Creating it...")

    try:
        bucket = storage_client.create_bucket(bucket_name)
    except Conflict:
        # Another process might have created it simultaneously
        return storage_client.get_bucket(bucket_name)
    except Forbidden as err:
        # One single message string: passing two positional arguments to
        # PermissionError makes Python treat them as (errno, strerror).
        raise PermissionError(
            f"🚫 Cannot create bucket '{bucket_name}'. "
            "Your service account needs 'roles/storage.admin' permission."
        ) from err

    print(f"✅ Created bucket '{bucket_name}'")
    return bucket

In [None]:
def _guess_content_type(pdf_obj, default="application/pdf"):
    """Pick a Content-Type: PDF signature first, then a known extension, else ``default``."""
    payload = pdf_obj.get("bytes")
    if isinstance(payload, (bytes, bytearray)) and payload[:5] == b"%PDF-":
        return "application/pdf"
    for candidate in (pdf_obj.get("url"), pdf_obj.get("name")):
        if not candidate:
            continue
        # Drop query string / fragment, then look at the last path segment only
        last_segment = candidate.split("?", 1)[0].split("#", 1)[0].rstrip("/").rsplit("/", 1)[-1]
        if "." in last_segment:
            extension = last_segment.rsplit(".", 1)[1].strip().lower()
            if extension in CONTENT_TYPE_MAP:
                return CONTENT_TYPE_MAP[extension]
    return default


def _shorten_blob_name(blob_name, normalized_folder, max_blob_bytes=1024):
    """Truncate ``blob_name`` so folder + name fit in ``max_blob_bytes`` UTF-8 bytes, keeping it unique via a SHA-1 suffix."""
    suffix = hashlib.sha1(blob_name.encode("utf-8")).hexdigest()
    folder_overhead = len(normalized_folder.encode("utf-8")) + 1 if normalized_folder else 0
    base_length = max_blob_bytes - len(suffix) - 3 - folder_overhead
    if base_length <= 0:
        raise ValueError("Folder path too long to create object name.")
    # Cut on bytes (the limit is assumed to be in UTF-8 bytes, not characters)
    # and drop any multi-byte character that the cut split in half.
    head = blob_name.encode("utf-8")[:base_length].decode("utf-8", errors="ignore")
    return f"{head}___{suffix}"


def _upload_if_absent(bucket, bucket_name, blob_path, payload, content_type):
    """Upload ``payload`` to ``blob_path`` unless an object with that name already exists."""
    if bucket.get_blob(blob_path) is not None:
        print(f"ℹ️ Skipping upload; '{blob_path}' already exists in gs://{bucket_name}")
        return
    bucket.blob(blob_path).upload_from_string(payload, content_type=content_type)
    print(f"✅ Uploaded '{blob_path}' to gs://{bucket_name}")


def upload_to_storage(storage_client, bucket_name, pdf_obj, folder=None):
    """
    Upload the PDF object to Google Cloud Storage.

    If the bucket doesn't exist, try to create it (without triggering a 403
    from bucket.exists()). Skip the upload when the document is already
    stored. Optionally upload inside a folder within the bucket.

    Args:
        storage_client: GCS client passed on to ``get_bucket``.
        bucket_name: destination bucket.
        pdf_obj: dict with at least ``"name"`` (used as object name, with
            slashes replaced by underscores) and ``"bytes"`` (the payload);
            ``"url"`` is used, when present, to guess the content type.
        folder: optional folder prefix inside the bucket.

    Returns:
        None.

    Raises:
        ValueError: when the folder prefix leaves no room for an object name.
        BadRequest: re-raised when the failure is not a name-length problem.
    """
    bucket = get_bucket(storage_client, bucket_name)
    # The original statement here was an unterminated ``CONTENT_TYPE_MAP.get(``
    # call (a SyntaxError). Default to PDF because the visible callers only
    # pass links they identified as PDFs.
    content_type = _guess_content_type(pdf_obj)

    blob_name = pdf_obj["name"].replace("/", "_").replace("\\", "_")
    normalized_folder = None
    if folder:
        # Convert backslashes first so that leading/trailing ones are stripped too
        normalized_folder = folder.replace("\\", "/").strip("/") or None

    blob_path = f"{normalized_folder}/{blob_name}" if normalized_folder else blob_name
    try:
        _upload_if_absent(bucket, bucket_name, blob_path, pdf_obj["bytes"], content_type)
    except BadRequest as err:
        err_text = str(err)
        name_too_long = (
            "maximum object length" in err_text
            or "The bucket name and object name together must be at most 1087 characters" in err_text
        )
        if not name_too_long:
            raise
        truncated_name = _shorten_blob_name(blob_name, normalized_folder)
        truncated_path = (
            f"{normalized_folder}/{truncated_name}" if normalized_folder else truncated_name
        )
        print(f"⚠️ Blob name too long; retrying as '{truncated_path}'")
        _upload_if_absent(bucket, bucket_name, truncated_path, pdf_obj["bytes"], content_type)

In [16]:
"""
1. Norme e leggi
https://www.parlamento.it/Parlamento/519
https://www.giustizia-amministrativa.it/il-codice-del-processo-amministrativo1,
https://www.giustizia-amministrativa.it/web/guest/codice-dei-contratti-pubblici-approvato-con-d.lgs.-31-marzo-2023-n.-36,
# Plus: link diretti alle fonti delle leggi individuate

2. Risoluzioni, Circolari e Provvedimenti del Direttore dell'Agenzia delle Entrate
https://www.agenziaentrate.gov.it/portale/normativa-e-prassi/risoluzioni/archivio-risoluzioni
https://www.agenziaentrate.gov.it/portale/web/guest/normativa-e-prassi/circolari/archivio-circolari
https://www.agenziaentrate.gov.it/portale/web/guest/archivio/normativa-prassi-archivio-documentazione/provvedimenti/provvedimenti-soggetti
https://www.agenziaentrate.gov.it/portale/archivio/normativa-prassi-archivio-documentazione/provvedimenti/altri-provvedimenti-non-soggetti

3. Commenti
# Piattaforma Valore24 o Eutekne o others

4. Interpelli
https://www.agenziaentrate.gov.it/portale/normativa-e-prassi/risposte-agli-interpelli/interpelli/archivio-interpelli
https://www.agenziaentrate.gov.it/portale/web/guest/archivio-istanze-di-interpello-sui-nuovi-investimenti
https://www.agenziaentrate.gov.it/portale/normativa-e-prassi/risposte-agli-interpelli/principi-di-diritto/archivio-principi-di-diritto
https://www.agenziaentrate.gov.it/portale/normativa-e-prassi/risposte-agli-interpelli/risposte-alle-istanze-di-consulenza-giuridica/archivio-risposte-alle-istanze-di-consulenza-giuridica

5. Sentenze
https://www.italgiure.giustizia.it/sncass/ (corte cassaz.)

6. Altre fonti
https://www.odcec.mi.it/aree-tematiche/formazione/quaderni
https://www.odcec.mi.it/lordine/centro-studi-odcec-milano/2
"""

"""
Fonti in ordine di importanza

1 Norme e leggi (sito del parlamento, parte fiscale codice civile, testo unico)
2 risoluzioni (risposte ufficiali di agenzia delle entrate - non sono leggi, sono anche in s24h)
3 commenti (libri, pubblicazioni sole24ore con testi di pubblicisti, Eutekne)
4 interpelli (agenzia risponde ad una domanda per futuri altri richiedenti) pubblica
5 sentenze (cassazione, livello provinciale o regionale) alla pari degli interpelli, siti a pagamento. Forse agenzia entrate ha obbligo di pubblicare tutte le sentenze (di tipo commissione tributaria)
"""

'\nFonti in ordine di importanza\n\n1 Norme e leggi (sito del parlamento, parte fiscale codice civile, testo unico)\n2 risoluzioni (risposte ufficiali di agenzia delle entrate - non sono leggi, sono anche in s24h)\n3 commenti (libri, pubblicazioni sole24ore con testi di pubblicisti, eutechne)\n4 interpelli (agenzia risponde ad una domanda per futuri altri richiedenti) pubblica\n5 sentenze (cassazione, livello provinciale o regionale) alla pari degli interpelli, siti a pagamento. Forse agenzia entrate ha obbligo di pubblicare tutte le sentenze (di tipo commissione tributaria)\n'

In [8]:
# Browser-automation session; its .driver, .clicks and .elements helpers are
# used by the scraping cells below.
# NOTE(review): headless=False presumably opens a visible browser window — confirm.
ladle = Ladle(headless=False)

# Agenzia entrate

In [29]:
# Destination for the documents scraped below: `main_folder` is passed as the
# `folder` argument of upload_to_storage, `bucket_name` as the target bucket.
main_folder = 'provvedimenti_con_pubblicità'
bucket_name = "loomy-public-documents"

In [30]:
# Walk the archive page year -> month -> document. Links whose text contains
# "pdf" are downloaded and uploaded to GCS; any other link is treated as a
# folder page whose own links are visited the same way. Navigation back to a
# list page is done by reloading its saved URL.
ladle.driver.get("https://www.agenziaentrate.gov.it/portale/web/guest/archivio/normativa-prassi-archivio-documentazione/provvedimenti/provvedimenti-soggetti")
try:
    ladle.clicks.click('//*[@id="closePopup"]', timeout=2)
except Exception as e:
    print(f"Site opened with no popup showing")

year_elements = ladle.elements.elements('/html/body/div[1]/div/div/div/div[3]/div/div/div[3]/main/div[*]/div/div/section/div/div[2]/div/div/div/div/p/a')
for i in range(len(year_elements)): # years
    year = year_elements[i].text
    print("Year: " + year)
    # Save current page URL to return after visiting subsection. Years page.
    last_page_url = ladle.driver.current_url
    ladle.clicks.click(f'/html/body/div[1]/div/div/div/div[3]/div/div/div[3]/main/div[{2+i}]/div/div/section/div/div[2]/div/div/div/div/p/a')

    # Re-capture elements on the new page (avoid stale references)
    month_elements = ladle.elements.elements('/html/body/div[4]/div/div/div/div[3]/div/div/div[3]/main/div[2]/div/div/section/div/div[2]/div/div[*]/p/a')

    for j in range(len(month_elements)): # months
        # Save current inner page URL to return after visiting detail link. Months page.
        inner_last_page_url = ladle.driver.current_url
        
        # The top-level body div index changes dynamically between 4 and 1
        possible_month_xpaths = [
            f"/html/body/div[4]/div/div/div/div[3]/div/div/div[3]/main/div[2]/div/div/section/div/div[2]/div/div[{1+j}]/p/a",
            f"/html/body/div[1]/div/div/div/div[3]/div/div/div[3]/main/div[2]/div/div/section/div/div[2]/div/div[{1+j}]/p/a"
        ]
        # NOTE(review): if every candidate fails, `month` is unbound (or stale
        # from the previous iteration) and no month page has been opened.
        for month_xp in possible_month_xpaths:
            try:
                month = ladle.elements.element(month_xp, timeout=3, text=True)
                ladle.clicks.click(month_xp, timeout=3)
                break  # success
            except Exception:
                continue
        
        possible_doc_list_xpaths = [
            '/html/body/div[4]/div/div/div/div[3]/div/div/div/div[2]/div/div[2]/div/div/section/div/div[2]/div/div[*]/div/p/a',
            '/html/body/div[4]/div/div/div/div[3]/div/div/div[3]/main/div[2]/div/div/section/div/div[2]/div/div[*]/div/p/a'
        ]
        # Collect the documents listed for the month
        # NOTE(review): `elements` stays unbound/stale if both candidates raise.
        for doc_list_xp in possible_doc_list_xpaths:
            try:
                elements = ladle.elements.elements(doc_list_xp)
                break  # success
            except Exception:
                continue

        print("Month: " + month)
        for h in range(len(elements)): # documents in the month
            
            possible_doc_xpaths = [
                f'/html/body/div[4]/div/div/div/div[3]/div/div/div/div[2]/div/div[2]/div/div/section/div/div[2]/div/div[{1+h}]/div/p/a',
                f'/html/body/div[1]/div/div/div/div[3]/div/div/div/div[2]/div/div[2]/div/div/section/div/div[2]/div/div[{1+h}]/div/p/a',
                f'/html/body/div[4]/div/div/div/div[3]/div/div/div[3]/main/div[2]/div/div/section/div/div[2]/div/div[{1+h}]/div/p/a',
                f'/html/body/div[1]/div/div/div/div[3]/div/div/div[3]/main/div[2]/div/div/section/div/div[2]/div/div[{1+h}]/div/p/a'
            ]
            # NOTE(review): when no candidate matches, `doc_xpath` keeps the
            # value from the previous document (or is unbound on the first one).
            for xp in possible_doc_xpaths:
                try:
                    doc_elem = ladle.elements.element(xp, timeout=2)
                    doc_xpath = xp  # Save the successful xpath for later use
                    break  # success
                except Exception:
                    continue

            time.sleep(0.5)  # Small delay to ensure text is loaded otherwise it opens the pdf
            doc_elem = ladle.elements.element(doc_xpath, timeout=2) # Re-fetch to avoid stale reference
            doc_elem_text = doc_elem.text
            if 'pdf' in doc_elem_text.lower(): # can't be .endswith('pdf') because some pdfs have additional info at the end
                doc_url = doc_elem.get_attribute('href')
                pdf_resp = requests.get(doc_url)
                pdf_bytes = pdf_resp.content

                # assemble python object
                pdf_obj = {
                    "url": doc_url,
                    "name": doc_elem_text,
                    "bytes": pdf_bytes
                }
                # Load the doc to gcp storage
                upload_to_storage(storage_client=storage_client,
                                  bucket_name=bucket_name,
                                  pdf_obj=pdf_obj,
                                  folder=f"{main_folder}")
            else:
                # Save current doc list page URL to return to after visiting the folder
                doc_list_page_url = ladle.driver.current_url

                ladle.clicks.click(doc_xpath, retries=50)

                # Explore folder contents
                # ! There can be more than one ul[*] section, but for simplicity we only explore the first one, which usually contains the main docs
                possible_doc_list_inside_folder_xpaths = [
                    '/html/body/div[4]/div/div/div/div[3]/div/div/div/div[2]/div/div/div/div/section/div/div[2]/div/div/div[2]/div/div/ul[1]/li[*]/a',
                    '/html/body/div[4]/div/div/div/div[3]/div/div/div[3]/main/div/div/div/section/div/div[2]/div/div/div[2]/div/div/ul/li[*]/a',
                    '/html/body/div[1]/div/div/div/div[3]/div/div/div/div[2]/div/div/div/div/section/div/div[2]/div/div/div[2]/div/div/ul[1]/li[*]/a',
                    '/html/body/div[1]/div/div/div/div[3]/div/div/div[3]/main/div/div/div/section/div/div[2]/div/div/div[2]/div/div/ul/li[*]/a'
                ]
                for doc_list_inside_folder_xp in possible_doc_list_inside_folder_xpaths:
                    try:
                        doc_list_inside_folder = ladle.elements.elements(doc_list_inside_folder_xp)
                        break  # success
                    except Exception:
                        continue

                for k in range(len(doc_list_inside_folder)): # documents in the folder
                    # Save current folder_doc list page URL to return to after visiting the docs into the clicked folder
                    folder_doc_list_page_url = ladle.driver.current_url

                    possible_doc_inside_folder_xpaths = [
                        f'/html/body/div[4]/div/div/div/div[3]/div/div/div/div[2]/div/div/div/div/section/div/div[2]/div/div/div[2]/div/div/ul[1]/li[{1+k}]/a',
                        f'/html/body/div[4]/div/div/div/div[3]/div/div/div[3]/main/div/div/div/section/div/div[2]/div/div/div[2]/div/div/ul/li[{1+k}]/a',
                        f'/html/body/div[1]/div/div/div/div[3]/div/div/div/div[2]/div/div/div/div/section/div/div[2]/div/div/div[2]/div/div/ul[1]/li[{1+k}]/a',
                        f'/html/body/div[1]/div/div/div/div[3]/div/div/div[3]/main/div/div/div/section/div/div[2]/div/div/div[2]/div/div/ul/li[{1+k}]/a',
                        f'/html/body/div[1]/div/div/div/div[3]/div/div/div[3]/main/div[2]/div/div/section/div/div[2]/div/div[{1+k}]/div/p/a'
                    ]
                    # The top-level body div index changes dynamically between 4 and 1

                    for doc_inside_folder_xp in possible_doc_inside_folder_xpaths:
                        try:
                            folder_doc_elem = ladle.elements.element(doc_inside_folder_xp, timeout=2)
                            folder_doc_xpath = doc_inside_folder_xp  # Save the successful xpath for later use
                            break  # success
                        except Exception:
                            continue
                    
                    time.sleep(0.5)  # Small delay to ensure text is loaded otherwise it opens the pdf
                    folder_doc_elem = ladle.elements.element(folder_doc_xpath, timeout=2) # Re-fetch to avoid stale reference
                    folder_doc_elem_text = folder_doc_elem.text
                    print("\t\t" +folder_doc_elem_text)
                    if 'pdf' in folder_doc_elem.text.lower(): # can't be .endswith('pdf') because some pdfs have additional info at the end
                        folder_doc_url = folder_doc_elem.get_attribute('href')
                        folder_pdf_resp = requests.get(folder_doc_url)
                        folder_pdf_bytes = folder_pdf_resp.content

                        # assemble python object; the name combines the parent link text with the attachment text
                        folder_pdf_obj = {
                            "url": folder_doc_url,
                            "name": f"{doc_elem_text} - {folder_doc_elem_text}",
                            "bytes": folder_pdf_bytes
                        }
                        # Load the doc to gcp storage
                        upload_to_storage(storage_client=storage_client,
                                          bucket_name=bucket_name,
                                          pdf_obj=folder_pdf_obj,
                                          folder=f"{main_folder}")
                    else:
                        print(f"{folder_doc_elem.text} is not a PDF, skipping...")
                        ladle.clicks.click(folder_doc_xpath)
                        # Nothing is done with the opened page (for now); reload the folder's document list.
                        ladle.driver.get(folder_doc_list_page_url)
                
                # Return to doc list page
                ladle.driver.get(doc_list_page_url)

        # Return to inner list using URL instead of back
        ladle.driver.get(inner_last_page_url)

    # Return to outer list using URL instead of back
    ladle.driver.get(last_page_url)

Site opened with no popup showing
Year: 2025 - Provvedimenti del Direttore soggetti a pubblicità legale
Month: Ottobre 2025
		Allegati - pdf
Bucket 'loomy-public-documents' not found. Creating it...
✅ Created bucket 'loomy-public-documents'
✅ Uploaded 'provvedimenti_con_pubblicità/Modificazioni al modello “Comunicazioni per il regime di tassazione delle SIIQ e delle SIINQ”, approvato con Provvedimento del Direttore dell’Agenzia delle entrate prot. n. 161900 del 18 dicembre 2015 - Allegati - pdf' to gs://loomy-public-documents
		Allegato A - pdf
✅ Uploaded 'provvedimenti_con_pubblicità/Attuazione delle disposizioni in materia di rafforzamento del contenuto conoscitivo del cassetto fiscale, ai sensi dell’articolo 23 del decreto legislativo 8 gennaio 2024, n. 1. Modalità di richiesta e acquisizione, anche massiva, dei dati delle Certificazioni Uniche (CU) - Allegato A - pdf' to gs://loomy-public-documents
		Allegato B - pdf
✅ Uploaded 'provvedimenti_con_pubblicità/Attuazione delle disposi

KeyboardInterrupt: 

In [None]:
@dataclass
class DocumentRecord:
    """
    Flat record describing one collected document.

    All fields except ``meta`` are required strings; ``meta`` is a free-form
    dict that defaults to a fresh empty dict per instance.
    NOTE(review): the expected format of ``collected_at`` (e.g. ISO timestamp)
    and what ``text_path`` points to are not shown in this file — confirm.
    """
    source_type: str
    doc_id: str
    title: str
    url: str
    collected_at: str
    text_path: str
    meta: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dict (recursively, via ``dataclasses.asdict``)."""
        return asdict(self)

# Sentenze cassazione

In [11]:
# Rebinds `ladle` to a new Ladle session for the Cassazione scraping cells;
# the session created earlier in the notebook is no longer referenced by this name.
ladle = Ladle(headless=False)

In [12]:
# Open the Corte di Cassazione search page; the XPaths in the following cells
# are absolute paths into this page.
ladle.driver.get("https://www.italgiure.giustizia.it/sncass/")

In [13]:
from datetime import datetime
anno_corrente = datetime.now().year


def _filter_option_xpath(group_row, option_row):
    """Absolute XPath of option number `option_row` inside filter group number `group_row`."""
    return (
        "/html/body/div[1]/div[3]/form/table/tbody[2]/tr/td[1]/table/tbody/"
        f"tr[{group_row}]/td/div/div/div/div/div/table/tbody/tr[{option_row}]/td[2]"
    )


def _filter_group(group_row, labels):
    """Map each label, in order, to the XPath of the option at its 1-based position."""
    return {
        label: _filter_option_xpath(group_row, position)
        for position, label in enumerate(labels, start=1)
    }


# Filter name -> {option label -> XPath of the clickable cell}. The groups sit
# in table rows 2..5 of the search form, in the order listed here.
button_mapping = {
    "archivio": _filter_group(2, ["CIVILE", "PENALE"]),
    "tipo": _filter_group(3, ["Decreto", "Ordinanza", "Ordinanza Interlocutoria", "Sentenza"]),
    "sezione": _filter_group(4, [
        "PRIMA", "SECONDA", "TERZA", "QUARTA", "QUINTA",
        "SESTA", "SETTIMA", "FERIALE", "LAVORO", "UNITE",
    ]),
    # At most the 5 previous years plus the current one
    "anno": _filter_group(5, [f"{anno_corrente - years_back}" for years_back in range(6)]),
}

In [14]:
# Tick the wanted sections one at a time, pausing 3 seconds after each click.
for sezione in ("PRIMA", "TERZA", "QUINTA", "LAVORO"):
    ladle.clicks.click(button_mapping['sezione'][sezione])
    time.sleep(3)
# The archive must be clicked last: selecting CIVILE removes some sections and
# therefore renumbers the XPaths, e.g. td[5] -> td[4]
ladle.clicks.click(button_mapping['archivio']['CIVILE'])

prova sotto

In [7]:
# Scrape every page of the current search results: read each PDF link on the
# page, download/parse it, then move to the next page while the "next page"
# control is displayed.
max_retries = 10  # NOTE(review): not used by this cell; kept in case other cells read it
new_page_button_xpath = "/html/body/div[1]/div[3]/form/table/tbody[2]/tr/td[2]/div/div[2]/div[11]/span"
pdf_link_xpath = '/html/body/div[1]/div[3]/form/table/tbody[2]/tr/td[2]/div/div[2]/div[{position}]/div/h3/a'

while True:
    # Scrape the current page first, so that a single-page result and the
    # last page are both processed.
    pdf_list = ladle.elements.elements(pdf_link_xpath.format(position="*"))
    for n in range(len(pdf_list)):
        # Links are re-located by position instead of reusing pdf_list items
        doc_url = ladle.elements.element(pdf_link_xpath.format(position=n + 1)).get_attribute('href')
        if not doc_url:
            # Anchor without a target: nothing to download
            continue
        pdf_content = extract_pdf_content(doc_url)

        pdf_obj = {
            "url": doc_url,
            "bytes": pdf_content['pdf_bytes'],
            "text": pdf_content['pdf_text']
        }
        print(pdf_obj['url'])

    # Only click "next" while it is displayed; previously the click was issued
    # unconditionally, including on the last page.
    # NOTE(review): assumes element_displayed returns a falsy value (rather
    # than raising) when the control is hidden or absent — confirm in Ladle.
    if not ladle.elements.element_displayed(xpath=new_page_button_xpath):
        break
    ladle.clicks.click(new_page_button_xpath)

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=141.0.7390.108)
Stacktrace:
	GetHandleVerifier [0x0x6afe83+66515]
	GetHandleVerifier [0x0x6afec4+66580]
	(No symbol) [0x0x49dc48]
	(No symbol) [0x0x47c18d]
	(No symbol) [0x0x511a4e]
	(No symbol) [0x0x52c4d9]
	(No symbol) [0x0x50afc6]
	(No symbol) [0x0x4dc2ca]
	(No symbol) [0x0x4dd154]
	GetHandleVerifier [0x0x9073b3+2521347]
	GetHandleVerifier [0x0x902333+2500739]
	GetHandleVerifier [0x0x6d7cd4+229924]
	GetHandleVerifier [0x0x6c8238+165768]
	GetHandleVerifier [0x0x6ceced+193085]
	GetHandleVerifier [0x0x6b8198+100072]
	GetHandleVerifier [0x0x6b8330+100480]
	GetHandleVerifier [0x0x6a25ea+11066]
	BaseThreadInitThunk [0x0x76eb5d49+25]
	RtlInitializeExceptionChain [0x0x7724d6db+107]
	RtlGetAppContainerNamedObjectPath [0x0x7724d661+561]
