In [25]:
import json
import os
import time
from dataclasses import asdict, dataclass, field
from typing import Any, Dict

import fitz  # PyMuPDF
import requests

from ladle.ladle import Ladle
# Google Cloud Storage
from google.oauth2 import service_account
from google.api_core.exceptions import NotFound, Conflict, Forbidden, BadRequest
from google.cloud import storage
# Build the GCS client from the service-account key stored (as a JSON string)
# in the GCP_SERVICE_ACCOUNT_CREDENTIALS environment variable.
_gcp_credentials_json = os.getenv("GCP_SERVICE_ACCOUNT_CREDENTIALS")
if not _gcp_credentials_json:
    # json.loads(None) would only raise an opaque TypeError; fail with an
    # actionable message instead.
    raise RuntimeError(
        "Environment variable GCP_SERVICE_ACCOUNT_CREDENTIALS is not set; "
        "it must contain the service-account key as a JSON string."
    )
gcp_credentials_info = json.loads(_gcp_credentials_json)
gcp_service_account_credentials = service_account.Credentials.from_service_account_info(gcp_credentials_info)
storage_client = storage.Client(credentials=gcp_service_account_credentials)

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException

In [26]:
# Lookup table from a lowercase file extension (without the leading dot) to the
# MIME type passed as `content_type` when a file is uploaded by
# upload_to_storage. Extensions missing from this table fall back to the
# default chosen by the caller of `.get(...)`.
CONTENT_TYPE_MAP = {
    # Documents
    "pdf": "application/pdf",
    "doc": "application/msword",
    "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "xls": "application/vnd.ms-excel",
    "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "ppt": "application/vnd.ms-powerpoint",
    "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "txt": "text/plain",
    "rtf": "application/rtf",
    "csv": "text/csv",

    # Data / Markup
    "xml": "application/xml",
    "xsd": "application/xml",
    "json": "application/json",
    "yaml": "application/x-yaml",
    "yml": "application/x-yaml",

    # Archives / Compression
    "zip": "application/zip",
    "gz": "application/gzip",
    "tar": "application/x-tar",
    "rar": "application/vnd.rar",
    "7z": "application/x-7z-compressed",

    # Web
    "html": "text/html",
    "htm": "text/html",
    "css": "text/css",
    "js": "application/javascript",
    "jsonld": "application/ld+json",

    # Images
    "png": "image/png",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "gif": "image/gif",
    "bmp": "image/bmp",
    "svg": "image/svg+xml",
    "webp": "image/webp",
    "ico": "image/x-icon",

    # Audio / Video
    "mp3": "audio/mpeg",
    "wav": "audio/wav",
    "ogg": "audio/ogg",
    "mp4": "video/mp4",
    "mov": "video/quicktime",
    "avi": "video/x-msvideo",
    "webm": "video/webm",
}


In [27]:
def _get_bucket(storage_client, bucket_name):
    """
    Get GCS bucket, create if it doesn't exist
    """
    # Try to get the bucket — this avoids an explicit .exists() check
    try:
        bucket = storage_client.get_bucket(bucket_name)
    except NotFound:
        print(f"Bucket '{bucket_name}' not found. Creating it...")
        try:
            bucket = storage_client.create_bucket(bucket_name)
            print(f"✅ Created bucket '{bucket_name}'")
        except Conflict:
            # Another process might have created it simultaneously
            bucket = storage_client.get_bucket(bucket_name)
        except Forbidden:
            raise PermissionError(
                f"🚫 Cannot create bucket '{bucket_name}'. ",
                f"Your service account needs 'roles/storage.admin' permission."
            )
    return bucket

def _normalize_folder(folder):
    """
    Normalize folder path for GCS upload
    """
    normalized_folder = None
    if folder:
        candidate = folder.strip("/").replace("\\", "/")
        normalized_folder = candidate or None
    return normalized_folder

def _truncate_name(blob_name, normalized_folder, separator="_____"):
    """
    Truncate the blob name to fit within GCS limits (1024 BYTES).
    """
    max_bytes = 1024
    # Calculate folder overhead in BYTES
    folder_overhead = len(normalized_folder.encode('utf-8')) + 1 if normalized_folder else 0
    available_bytes = max_bytes - folder_overhead
    separator_bytes = len(separator.encode('utf-8'))
    if available_bytes <= separator_bytes:
        raise ValueError("Folder path too long to create object name.")
    blob_name_bytes = blob_name.encode('utf-8')
    # If it already fits, return as-is
    if len(blob_name_bytes) <= available_bytes:
        return blob_name
    # Calculate suffix length in bytes
    desired_suffix_bytes = min(100, len(blob_name_bytes))
    max_suffix_bytes = available_bytes - separator_bytes - 1
    if max_suffix_bytes <= 0:
        raise ValueError("Folder path too long to create object name.")
    suffix_bytes = min(desired_suffix_bytes, max_suffix_bytes)
    base_bytes = available_bytes - separator_bytes - suffix_bytes
    if base_bytes <= 0:
        raise ValueError("Folder path too long to create object name.")
    # Truncate at byte boundaries and decode safely
    base_part = blob_name_bytes[:base_bytes].decode('utf-8', errors='ignore')
    suffix_part = blob_name_bytes[-suffix_bytes:].decode('utf-8', errors='ignore')
    truncated_name = f"{base_part}{separator}{suffix_part}"
    return truncated_name

In [28]:
def _upload_if_absent(bucket, blob_path, payload, content_type):
    """Upload `payload` to `blob_path` unless an object already exists there.

    Returns True when an upload was performed, False when it was skipped.
    """
    if bucket.get_blob(blob_path) is not None:
        return False
    bucket.blob(blob_path).upload_from_string(payload, content_type=content_type)
    return True


def upload_to_storage(storage_client, bucket_name, pdf_obj, folder=None):
    """
    Upload a downloaded document to Google Cloud Storage.

    The bucket is obtained through `_get_bucket` (which creates it when missing).
    The upload is skipped when an object with the same path is already stored.
    When the service rejects the object name as too long, the name is shortened
    with `_truncate_name` and the upload is retried once under the shorter name.

    Args:
        storage_client: GCS client handed to `_get_bucket`.
        bucket_name: destination bucket name.
        pdf_obj: dict with the keys "name" (object name; slashes and backslashes
            are replaced by "_"), "bytes" (payload) and "extension" (looked up
            in CONTENT_TYPE_MAP).
        folder: optional folder prefix inside the bucket.

    Raises:
        BadRequest: re-raised when the rejection is not about the name length.
    """
    bucket = _get_bucket(storage_client, bucket_name)
    # Normalise the extension so ".PDF" and "pdf" resolve to the same entry.
    extension = str(pdf_obj["extension"]).lower().lstrip(".")
    # application/octet-stream is the registered type for arbitrary binary
    # data; "application/file" is not a registered MIME type.
    content_type = CONTENT_TYPE_MAP.get(extension, "application/octet-stream")
    blob_name = pdf_obj["name"].replace("/", "_").replace("\\", "_")
    normalized_folder = _normalize_folder(folder)

    def _path_for(name):
        return f"{normalized_folder}/{name}" if normalized_folder else name

    try:
        _upload_if_absent(bucket, _path_for(blob_name), pdf_obj["bytes"], content_type)
    except BadRequest as err:
        err_text = str(err)
        name_too_long = (
            "maximum object length" in err_text
            or "The bucket name and object name together must be at most 1087 characters" in err_text
        )
        if not name_too_long:
            raise
        truncated_name = _truncate_name(blob_name, normalized_folder, separator="_____")
        _upload_if_absent(bucket, _path_for(truncated_name), pdf_obj["bytes"], content_type)

In [6]:
"""
1. Norme e leggi
https://www.parlamento.it/Parlamento/519
https://www.giustizia-amministrativa.it/il-codice-del-processo-amministrativo1
https://www.giustizia-amministrativa.it/web/guest/codice-dei-contratti-pubblici-approvato-con-d.lgs.-31-marzo-2023-n.-36
# Plus: link diretti alle fonti delle leggi individuate

2. Risoluzioni, Circolari e Provvedimenti del Direttore dell'Agenzia delle Entrate
https://www.agenziaentrate.gov.it/portale/normativa-e-prassi/risoluzioni/archivio-risoluzioni
https://www.agenziaentrate.gov.it/portale/web/guest/normativa-e-prassi/circolari/archivio-circolari
https://www.agenziaentrate.gov.it/portale/web/guest/archivio/normativa-prassi-archivio-documentazione/provvedimenti/provvedimenti-soggetti
https://www.agenziaentrate.gov.it/portale/archivio/normativa-prassi-archivio-documentazione/provvedimenti/altri-provvedimenti-non-soggetti

3. Commenti
# Piattaforma Valore24 o Eutekne o others

4. Interpelli
https://www.agenziaentrate.gov.it/portale/normativa-e-prassi/risposte-agli-interpelli/interpelli/archivio-interpelli
https://www.agenziaentrate.gov.it/portale/web/guest/archivio-istanze-di-interpello-sui-nuovi-investimenti
https://www.agenziaentrate.gov.it/portale/normativa-e-prassi/risposte-agli-interpelli/principi-di-diritto/archivio-principi-di-diritto
https://www.agenziaentrate.gov.it/portale/normativa-e-prassi/risposte-agli-interpelli/risposte-alle-istanze-di-consulenza-giuridica/archivio-risposte-alle-istanze-di-consulenza-giuridica

5. Sentenze
https://www.italgiure.giustizia.it/sncass/ (corte cassaz.)

6. Altre fonti
https://www.odcec.mi.it/aree-tematiche/formazione/quaderni
https://www.odcec.mi.it/lordine/centro-studi-odcec-milano/2
"""

"""
Fonti in ordine di importanza

1 Norme e leggi (sito del parlamento, parte fiscale codice civile, testo unico)
2 risoluzioni (risposte ufficiali di agenzia delle entrate - non sono leggi, sono anche in s24h)
3 commenti (libri, pubblicazioni sole24ore con testi di pubblicisti, eutechne)
4 interpelli (agenzia risponde ad una domanda per futuri altri richiedenti) pubblica
5 sentenze (cassazione, livello provinciale o regionale) alla pari degli interpelli, siti a pagamento. Forse agenzia entrate ha obbligo di pubblicare tutte le sentenze (di tipo commissione tributaria)
"""

'\nFonti in ordine di importanza\n\n1 Norme e leggi (sito del parlamento, parte fiscale codice civile, testo unico)\n2 risoluzioni (risposte ufficiali di agenzia delle entrate - non sono leggi, sono anche in s24h)\n3 commenti (libri, pubblicazioni sole24ore con testi di pubblicisti, eutechne)\n4 interpelli (agenzia risponde ad una domanda per futuri altri richiedenti) pubblica\n5 sentenze (cassazione, livello provinciale o regionale) alla pari degli interpelli, siti a pagamento. Forse agenzia entrate ha obbligo di pubblicare tutte le sentenze (di tipo commissione tributaria)\n'

In [11]:
# Create the Ladle session used by the crawl cells; headless=False is passed,
# presumably so the browser window stays visible while scraping (confirm
# against the Ladle documentation).
ladle = Ladle(headless=False)

# Agenzia entrate

In [12]:
# Destination for the Agenzia delle Entrate crawl: `main_folder` is the
# top-level prefix inside the bucket, `bucket_name` the GCS bucket itself.
# NOTE(review): both names are reassigned by the Cassazione configuration cell
# further down, so re-run this cell before re-running the crawl.
main_folder = 'provvedimenti_non_soggetti_a_pubblicita'
bucket_name = "loomy-public-documents"

In [None]:
# Crawl the Agenzia delle Entrate archive page by page
# (years -> months -> documents -> optional sub-folder) and copy every file
# found into the GCS bucket under "<main_folder>/<year>/<month>".

def _download(url, timeout=60):
    """Fetch `url`; return the response, or None when the request fails.

    Failed and non-2xx downloads are reported and skipped so that an HTML
    error page is never stored as if it were the document.
    """
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException as err:
        print(f"⚠️ Download failed for {url}: {err}")
        return None
    return resp


def _extension_from_url(url):
    """Return the lowercase text after the last '.' in `url`, cut at the first '?' or '/'."""
    return url.split('.')[-1].split('?')[0].split('/')[0].lower()


def _first_resolving_xpath(xpaths, timeout):
    """Return the first xpath that `ladle.elements.element` can resolve, or None."""
    for candidate in xpaths:
        try:
            ladle.elements.element(candidate, timeout=timeout)
        except Exception:
            continue
        return candidate
    return None


# ladle.driver.get("https://www.agenziaentrate.gov.it/portale/web/guest/archivio/normativa-prassi-archivio-documentazione/provvedimenti/provvedimenti-soggetti")
ladle.driver.get("https://www.agenziaentrate.gov.it/portale/archivio/normativa-prassi-archivio-documentazione/provvedimenti/altri-provvedimenti-non-soggetti")
try:
    ladle.clicks.click('//*[@id="closePopup"]', timeout=2)
except Exception:
    print("Site opened with no popup showing")

# The page is rendered under either body/div[1] or body/div[4]; every xpath
# list below carries both variants. "*" marks the position that is later
# replaced by a concrete index.
possible_year_list_xpaths = [
    '/html/body/div[1]/div/div/div/div[3]/div/div/div[3]/main/div[*]/div/div/section/div/div[2]/div/div/div/div/p/a',
    '/html/body/div[4]/div/div/div/div[3]/div/div/div[3]/main/div[*]/div/div/section/div/div[2]/div/div/div/div/p/a'
]
possible_month_list_xpaths = [
    "/html/body/div[4]/div/div/div/div[3]/div/div/div[3]/main/div[2]/div/div/section/div/div[2]/div/div[*]/p/a",
    "/html/body/div[1]/div/div/div/div[3]/div/div/div[3]/main/div[2]/div/div/section/div/div[2]/div/div[*]/p/a",
    "/html/body/div[4]/div/div/div/div[3]/div/div/div[3]/main/div[*]/div/div/section/div/div[2]/div/div/div/div/p/a",
    "/html/body/div[1]/div/div/div/div[3]/div/div/div[3]/main/div[*]/div/div/section/div/div[2]/div/div/div/div/p/a"
]
possible_doc_list_xpaths = [
    '/html/body/div[4]/div/div/div/div[3]/div/div/div/div[2]/div/div[2]/div/div/section/div/div[2]/div/div[*]/div/p/a',
    '/html/body/div[4]/div/div/div/div[3]/div/div/div[3]/main/div[3]/div/div/section/div/div[2]/div/div[*]/div/p/a',
    '/html/body/div[1]/div/div/div/div[3]/div/div/div/div[2]/div/div[2]/div/div/section/div/div[2]/div/div[*]/div/p/a',
    '/html/body/div[1]/div/div/div/div[3]/div/div/div[3]/main/div[2]/div/div/section/div/div[2]/div/div[*]/div/p/a',
    '/html/body/div[1]/div/div/div/div[3]/div/div/div[3]/main/div[3]/div/div/section/div/div[2]/div/div[*]/div/p/a',
    '/html/body/div[4]/div/div/div/div[3]/div/div/div[3]/main/div[2]/div/div/section/div/div[2]/div/div[*]/div/p/a'
]
# ! There can be more than one ul[*] section; for simplicity only the first
#   one, which usually holds the main documents, is explored.
possible_doc_list_inside_folder_xpaths = [
    '/html/body/div[4]/div/div/div/div[3]/div/div/div/div[2]/div/div/div/div/section/div/div[2]/div/div/div[2]/div/div/ul[1]/li[*]/a',
    '/html/body/div[4]/div/div/div/div[3]/div/div/div[3]/main/div/div/div/section/div/div[2]/div/div/div[2]/div/div/ul/li[*]/a',
    '/html/body/div[4]/div/div/div/div[3]/div/div/div[3]/main/div[2]/div/div/section/div/div[2]/div/div[*]/div/p/a',
    '/html/body/div[1]/div/div/div/div[3]/div/div/div/div[2]/div/div/div/div/section/div/div[2]/div/div/div[2]/div/div/ul[1]/li[*]/a',
    '/html/body/div[1]/div/div/div/div[3]/div/div/div[3]/main/div/div/div/section/div/div[2]/div/div/div[2]/div/div/ul/li[*]/a',
    '/html/body/div[1]/div/div/div/div[3]/div/div/div[3]/main/div[2]/div/div/section/div/div[2]/div/div[*]/div/p/a'
]

# COLLECT THE LIST OF YEARS
# Start from an empty result so a failed lookup can neither raise NameError
# nor silently reuse a list left over from a previous run of this cell.
year_elements = []
year_list_xp = None
for candidate_xp in possible_year_list_xpaths:
    try:
        found = ladle.elements.elements(candidate_xp)
    except Exception:
        continue
    if found:
        year_elements = found
        year_list_xp = candidate_xp
        break

if year_list_xp is None:
    print("⚠️ No year links found with any known layout; nothing to crawl")

for i in range(len(year_elements)):  # YEARS
    # Address the year link through the layout that actually matched instead
    # of always assuming body/div[1]; the year links start at main/div[2].
    year_xp = year_list_xp.replace("*", str(2 + i))
    year = ladle.elements.element(year_xp, text=True)
    print(f"Year: {year}")
    # Save current page URL to return after visiting subsection. Years page.
    last_page_url = ladle.driver.current_url
    ladle.clicks.click(year_xp)

    # COLLECT THE LIST OF MONTHS (re-captured on the new page to avoid stale references)
    month_elements = []
    for month_list_xp in possible_month_list_xpaths:
        try:
            month_elements = ladle.elements.elements(month_list_xp)
            break  # success
        except Exception:
            continue

    for j in range(len(month_elements)):  # MONTHS
        # Save current inner page URL to return after visiting detail link. Months page.
        inner_last_page_url = ladle.driver.current_url

        month = None
        month_opened = False
        # Depending on the layout the month links are numbered from 1 or from 2:
        # try index 1+j on every layout first, then 2+j.
        for offset in (1, 2):
            for month_xp in (xp.replace("*", str(offset + j)) for xp in possible_month_list_xpaths):
                try:
                    label = ladle.elements.element(month_xp, timeout=3, text=True)
                    ladle.clicks.click(month_xp, timeout=3)
                except Exception:
                    continue
                # Only record the month once the click really succeeded.
                month = label
                month_opened = True
                break
            if month_opened:
                break

        if not month_opened:
            print(f"⚠️ Could not open month #{j + 1} of '{year}'; skipping")
            ladle.driver.get(inner_last_page_url)
            continue

        print(f"Month: {month}")

        # COLLECT THE DOCUMENTS OF THE MONTH
        doc_links = []
        for doc_list_xp in possible_doc_list_xpaths:
            try:
                found = ladle.elements.elements(doc_list_xp)
            except Exception:
                continue
            doc_links = found
            # A single hit is just the first entry (the law article that
            # mandates publication of the documents): keep looking for a
            # layout that yields the real list.
            if len(found) > 1:
                break

        for h in range(len(doc_links)):  # DOCUMENTS OF THE MONTH
            doc_xpath = _first_resolving_xpath(
                [xp.replace("*", str(1 + h)) for xp in possible_doc_list_xpaths], timeout=2
            )
            if doc_xpath is None:
                print(f"⚠️ Document #{h + 1} of '{month}' not found; skipping")
                continue

            time.sleep(0.5)  # Small delay to ensure text is loaded otherwise it opens the pdf
            try:
                doc_elem = ladle.elements.element(doc_xpath, timeout=2)  # Re-fetch to avoid stale reference
                doc_elem_text = doc_elem.text
            except Exception as err:
                print(f"⚠️ Document #{h + 1} of '{month}' could not be read ({err}); skipping")
                continue

            # If not pdf it's a folder
            if 'pdf' in doc_elem_text.lower():  # can't be .endswith('pdf') because some pdfs have additional info at the end
                doc_url = doc_elem.get_attribute('href')
                pdf_resp = _download(doc_url)
                if pdf_resp is None:
                    continue

                # assemble python object
                pdf_obj = {
                    "url": doc_url,
                    "name": doc_elem_text,
                    "bytes": pdf_resp.content,
                    "extension": _extension_from_url(pdf_resp.url)
                }
                # Load the doc to gcp storage
                upload_to_storage(storage_client=storage_client,
                                  bucket_name=bucket_name,
                                  pdf_obj=pdf_obj,
                                  folder=f"{main_folder}/{year}/{month}")
            else:
                # Save current doc list page URL to return to after visiting the folder
                doc_list_page_url = ladle.driver.current_url

                ladle.clicks.click(doc_xpath, retries=5)

                # Explore folder contents
                doc_list_inside_folder = []
                for doc_list_inside_folder_xp in possible_doc_list_inside_folder_xpaths:
                    try:
                        doc_list_inside_folder = ladle.elements.elements(doc_list_inside_folder_xp)
                        break  # success
                    except Exception:
                        continue

                for k in range(len(doc_list_inside_folder)):  # DOCUMENTS INSIDE THE FOLDER
                    folder_doc_xpath = _first_resolving_xpath(
                        [xp.replace("*", str(1 + k)) for xp in possible_doc_list_inside_folder_xpaths], timeout=2
                    )
                    if folder_doc_xpath is None:
                        print(f"⚠️ Entry #{k + 1} of folder '{doc_elem_text}' not found; skipping")
                        continue

                    time.sleep(0.5)  # Small delay to ensure text is loaded otherwise it opens the pdf
                    try:
                        folder_doc_elem = ladle.elements.element(folder_doc_xpath, timeout=2)  # Re-fetch to avoid stale reference
                        folder_doc_elem_text = folder_doc_elem.text
                        folder_doc_url = folder_doc_elem.get_attribute('href')
                    except Exception as err:
                        print(f"⚠️ Entry #{k + 1} of folder '{doc_elem_text}' could not be read ({err}); skipping")
                        continue

                    folder_pdf_resp = _download(folder_doc_url)
                    if folder_pdf_resp is None:
                        continue

                    # assemble python object
                    folder_pdf_obj = {
                        "url": folder_doc_url,
                        "name": f"{doc_elem_text} - {folder_doc_elem_text}",
                        "bytes": folder_pdf_resp.content,
                        "extension": _extension_from_url(folder_pdf_resp.url)
                    }
                    # Load the doc to gcp storage
                    upload_to_storage(storage_client=storage_client,
                                      bucket_name=bucket_name,
                                      pdf_obj=folder_pdf_obj,
                                      folder=f"{main_folder}/{year}/{month}")

                # Return to doc list page
                ladle.driver.get(doc_list_page_url)

        # Return to inner list using URL instead of back
        ladle.driver.get(inner_last_page_url)

    # Return to outer list using URL instead of back
    ladle.driver.get(last_page_url)

Site opened with no popup showing
Year: 2025 - Provvedimenti del Direttore non soggetti a pubblicità legale
Month: Gennaio 2025 - Provvedimenti del Direttore non soggetti a pubblicità legale


TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x0x73fe43+66515]
	GetHandleVerifier [0x0x73fe84+66580]
	(No symbol) [0x0x52dc48]
	(No symbol) [0x0x578704]
	(No symbol) [0x0x578aab]
	(No symbol) [0x0x5bf482]
	(No symbol) [0x0x59b214]
	(No symbol) [0x0x5bcba7]
	(No symbol) [0x0x59afc6]
	(No symbol) [0x0x56c2ca]
	(No symbol) [0x0x56d154]
	GetHandleVerifier [0x0x997353+2521315]
	GetHandleVerifier [0x0x9922d3+2500707]
	GetHandleVerifier [0x0x767c94+229924]
	GetHandleVerifier [0x0x7581f8+165768]
	GetHandleVerifier [0x0x75ecad+193085]
	GetHandleVerifier [0x0x748158+100072]
	GetHandleVerifier [0x0x7482f0+100480]
	GetHandleVerifier [0x0x7325aa+11066]
	BaseThreadInitThunk [0x0x76d75d49+25]
	RtlInitializeExceptionChain [0x0x7778d6db+107]
	RtlGetAppContainerNamedObjectPath [0x0x7778d661+561]


In [70]:
# Scratch check on the last document name built by the crawl cell.
# NOTE(review): encoding and immediately decoding with the same codec gives
# the string back, so this counts characters; _truncate_name budgets in UTF-8
# BYTES, which is a larger number whenever the name has accented characters.
len(pdf_obj['name'].encode('utf-8').decode('utf-8', errors='ignore'))

1006

In [15]:
# Scratch check: length in characters of a sample "<main_folder>/<year>/<month>"
# prefix. The literal contains accented characters, so its UTF-8 byte length is
# larger than the value shown here.
len("provvedimenti_con_pubblicità/2025 - Provvedimenti del Direttore soggetti a pubblicità legale/Febbraio 2025")

106

In [None]:
@dataclass
class DocumentRecord:
    """Record describing one collected document.

    All fields except `meta` are required strings; `meta` is a free-form
    dictionary and every instance gets its own empty dict by default.
    NOTE(review): nothing in this notebook constructs the record yet, so the
    exact meaning and format of each field (e.g. the timestamp format of
    `collected_at`) still has to be confirmed.
    """
    source_type: str
    doc_id: str
    title: str
    url: str
    collected_at: str
    text_path: str
    meta: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the record's fields as a plain dictionary (via dataclasses.asdict)."""
        return asdict(self)

# Sentenze cassazione

In [16]:
# Create a fresh Ladle session for the Cassazione scraping below.
# NOTE(review): this rebinds `ladle`; nothing visible here closes the session
# created for the Agenzia delle Entrate crawl, so confirm it is shut down.
ladle = Ladle(headless=False)

In [17]:
# Open the search page listed under "Sentenze" (Corte di Cassazione) in the
# source notes at the top of the notebook.
ladle.driver.get("https://www.italgiure.giustizia.it/sncass/")

In [18]:
from datetime import datetime

anno_corrente = datetime.now().year

# Every filter option in the left-hand panel has the same xpath shape: the
# filter group is a row of the outer table, the option a row of the table
# nested inside it.
_FILTER_OPTION_XPATH = (
    "/html/body/div[1]/div[3]/form/table/tbody[2]/tr/td[1]/table/tbody/tr[{group}]"
    "/td/div/div/div/div/div/table/tbody/tr[{option}]/td[2]"
)


def _filter_xpaths(group_row, labels):
    """Map each label to the xpath of its option; options are numbered from 1 in list order."""
    return {
        label: _FILTER_OPTION_XPATH.format(group=group_row, option=position)
        for position, label in enumerate(labels, start=1)
    }


# Filter name -> option label -> xpath of the clickable cell.
button_mapping = {
    "archivio": _filter_xpaths(2, ["CIVILE", "PENALE"]),
    "tipo": _filter_xpaths(3, ["Decreto", "Ordinanza", "Ordinanza Interlocutoria", "Sentenza"]),
    "sezione": _filter_xpaths(
        4,
        ["PRIMA", "SECONDA", "TERZA", "QUARTA", "QUINTA", "SESTA", "SETTIMA", "FERIALE", "LAVORO", "UNITE"],
    ),
    # At most the current year plus the five before it
    "anno": _filter_xpaths(5, [f"{anno_corrente - years_back}" for years_back in range(6)]),
}

In [19]:
# Tick the wanted sections one at a time, pausing 3 seconds after each click.
for sezione in ("PRIMA", "TERZA", "QUINTA", "LAVORO"):
    ladle.clicks.click(button_mapping['sezione'][sezione])
    time.sleep(3)
# The archive must be clicked last: choosing CIVILE removes some sections and
# therefore shifts the xpaths of the remaining ones (e.g. td[5] -> td[4]).
ladle.clicks.click(button_mapping['archivio']['CIVILE'])

prova sotto

In [None]:
# --- Cassazione scraping: configuration -------------------------------------
import urllib.parse
import urllib3

# Certificate verification is switched off on the session below, so silence
# the warning urllib3 would otherwise emit for every request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Upload destination
bucket_name = "loomy-public-documents"
main_folder = 'cassazione_sentenze'

# Host only (no /sncass): the data-arg attribute already holds the full path
base_url = "https://www.italgiure.giustizia.it"

# Crawl state
max_pages = None  # Set to a number to limit pages for testing, or None for all pages
page_counter = 0
all_pdf_objs = []

# One shared session so connections are reused across downloads
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
# Verification disabled for this site (Italian government sites sometimes have cert issues)
session.verify = False

# XPath of the "next page" control in the pagination bar
new_page_button_xpath = "/html/body/div[1]/div[3]/form/table/tbody[2]/tr/td[2]/div/div[2]/div[11]/span"

# Function to check if pagination button exists and is clickable
def has_next_page():
    """Report whether the pagination button can be used to move to another page.

    Returns True only when the button at `new_page_button_xpath` is found, is
    displayed, and its class attribute does not contain "disabled"
    (case-insensitive). Any exception during the lookup is printed and
    treated as "no next page".
    """
    try:
        next_button = ladle.elements.element(new_page_button_xpath, timeout=2)
        if not ladle.elements.element_displayed(element=next_button):
            return False
        # A missing class attribute counts as "no classes".
        css_classes = (next_button.get_attribute('class') or '').lower()
        return css_classes.find('disabled') == -1
    except Exception as e:
        print(f"No next page button found: {e}")
        return False

# Function to extract PDF URL from link element
def extract_pdf_url(link_element):
    """Build the document URL from the `data-arg` attribute of the link's <img> child.

    The attribute value is URL-decoded and appended to `base_url`. Returns
    None when the attribute is missing or empty, or when anything raises while
    reading it (the error is printed).
    """
    try:
        encoded_path = link_element.find_element(By.TAG_NAME, 'img').get_attribute('data-arg')
        if not encoded_path:
            return None
        return f"{base_url}{urllib.parse.unquote(encoded_path)}"
    except Exception as e:
        print(f"    Error extracting PDF URL: {e}")
        return None

# Function to scrape current page
def scrape_current_page():
    """Download every PDF listed on the results page currently shown and upload it.

    Each result link is re-located by position, its PDF URL is resolved with
    ``extract_pdf_url``, and the document is fetched through the shared
    ``session``. Bodies that are empty, or that do not carry a PDF signature,
    are skipped. Accepted documents are appended to ``all_pdf_objs`` and then
    passed to ``upload_to_storage``.

    Returns:
        int: number of documents downloaded and uploaded from this page.
        Exceptions are caught and printed rather than raised to the caller.
    """
    # "{}" is filled with "*" to list all results, or with a 1-based position
    result_link_xpath = '/html/body/div[1]/div[3]/form/table/tbody[2]/tr/td[2]/div/div[2]/div[{}]/div/h3/a'
    scraped_count = 0
    try:
        pdf_list = ladle.elements.elements(result_link_xpath.format('*'))
        total = len(pdf_list)
        print(f"Found {total} documents on this page")

        for position in range(1, total + 1):
            try:
                # Re-fetch element to avoid stale reference
                doc_link = ladle.elements.element(result_link_xpath.format(position), timeout=2)
                doc_text = doc_link.text

                print(f"  [{position}/{total}] Processing: {doc_text[:80]}...")

                # The real PDF location lives in the data-arg attribute, not in href
                doc_url = extract_pdf_url(doc_link)
                if not doc_url:
                    print(f"    ⚠️ Could not extract PDF URL")
                    continue

                # Download PDF content using session (with SSL verification disabled)
                try:
                    response = session.get(doc_url, timeout=60, verify=False)
                    response.raise_for_status()
                    pdf_bytes = response.content

                    if not pdf_bytes:
                        print(f"    ⚠️ No content downloaded")
                        continue

                    # The endpoint can answer 200 with an HTML page (a reCAPTCHA
                    # form was observed) instead of the document; never store
                    # or upload such a body as a ".pdf".
                    if not _looks_like_pdf(pdf_bytes):
                        content_type = response.headers.get('Content-Type', 'unknown')
                        print(f"    ⚠️ Response is not a PDF (Content-Type: {content_type}); skipped")
                        continue

                    pdf_obj = {
                        "url": doc_url,
                        "name": doc_text,
                        "bytes": pdf_bytes,
                        "extension": "pdf"
                    }

                    # Store in list
                    all_pdf_objs.append(pdf_obj)

                    # Upload to Google Cloud Storage
                    upload_to_storage(
                        storage_client=storage_client,
                        bucket_name=bucket_name,
                        pdf_obj=pdf_obj,
                        folder=main_folder
                    )
                    scraped_count += 1
                    print(f"    ✅ Downloaded and uploaded ({len(pdf_bytes)/1024:.1f} KB)")

                except requests.exceptions.Timeout:
                    print(f"    ⚠️ Timeout while downloading PDF")
                except requests.exceptions.RequestException as e:
                    print(f"    ⚠️ Request error: {e}")
                except Exception as e:
                    print(f"    ⚠️ Unexpected error: {e}")

            except Exception as e:
                print(f"  ⚠️ Error processing document {position}: {e}")
                continue

    except Exception as e:
        print(f"❌ Error scraping page: {e}")

    return scraped_count


def _looks_like_pdf(data):
    """Return True when ``data`` carries the ``%PDF-`` signature near its start."""
    # Scan the first 1 KiB instead of only offset 0 so a little leading junk is tolerated
    return b"%PDF-" in data[:1024]

# Main scraping loop
print("🚀 Starting Cassazione scraping...")
print("="*80)

try:
    # Scrape the page currently shown, then keep clicking the pagination
    # button until one of the stop conditions below breaks out of the loop.
    while True:
        page_counter += 1
        print(f"\n📄 PAGE {page_counter}")
        print("-"*80)
        
        # Scrape current page
        scraped = scrape_current_page()
        print(f"✅ Scraped {scraped} PDFs from page {page_counter}")
        print(f"📊 Total PDFs collected so far: {len(all_pdf_objs)}")
        
        # Check if we should stop (max pages limit).
        # A falsy max_pages (None or 0) disables the limit.
        if max_pages and page_counter >= max_pages:
            print(f"\n⏹️ Reached maximum page limit ({max_pages})")
            break
        
        # Check if there's a next page
        if not has_next_page():
            print(f"\n✅ No more pages available. Scraping complete!")
            break
        
        # Click next page button
        try:
            print(f"➡️ Navigating to page {page_counter + 1}...")
            ladle.clicks.click(new_page_button_xpath)
            # Fixed pause; nothing here checks that the next page has rendered
            time.sleep(2)  # Wait for page to load
        except Exception as e:
            # A failed click ends the run instead of re-scraping the same page
            print(f"❌ Failed to navigate to next page: {e}")
            break

finally:
    # Always close the session, including when an exception escapes the loop
    session.close()
    print("\n🔒 Session closed")

# Final summary: printed after any of the break conditions above, including a
# failed navigation. It is skipped only if an exception propagated out of the
# try block.
print("="*80)
print(f"🎉 SCRAPING COMPLETE!")
print(f"📊 Total pages processed: {page_counter}")
print(f"📚 Total PDFs collected: {len(all_pdf_objs)}")
print(f"☁️ All PDFs uploaded to gs://{bucket_name}/{main_folder}")


In [15]:
# Probe the first result link to find out where the PDF location is stored.
first_link = ladle.elements.element(
    '/html/body/div[1]/div[3]/form/table/tbody[2]/tr/td[2]/div/div[2]/div[1]/div/h3/a',
    timeout=5,
)

# Dump every attribute that could plausibly carry the PDF URL
print("🔍 Investigating link structure:")
for attr_name in ('href', 'onclick', 'data-url', 'data-href', 'data-link', 'data-pdf', 'title'):
    print(f"{attr_name}: {first_link.get_attribute(attr_name)}")

# First 500 characters of the anchor's own markup
print("\n📄 Full element HTML:")
link_html = first_link.get_attribute('outerHTML')
print(link_html[:500])

# First 500 characters of the enclosing element's markup
parent = first_link.find_element(By.XPATH, '..')
print("\n👆 Parent element HTML:")
parent_html = parent.get_attribute('outerHTML')
print(parent_html[:500])

🔍 Investigating link structure:
href: javascript:void(0)
onclick: None
data-url: None
data-href: None
data-link: None
data-pdf: None
title: 

📄 Full element HTML:
<a href="javascript:void(0)" tabindex="0" style="text-decoration:none;color:#440;" class="text2pdf"> <span data-role="content" data-arg="filename" title="pdf della sentenza"><span><img class="toDocument pdf rowIcon" data-arg="/xway/application/nif/clean/hc.dll%3Fverbo%3Dattach%26db%3Dsnciv%26id%3D./20251014/snciv@s50@a2025@n27401@tD.clean.pdf" alt="formato pdf" src="pix/pdf.png"></span></span>&nbsp; <span class="chkcontent"><span class="label">Sez.</span>&nbsp;<span class="risultato" data-role="

👆 Parent element HTML:
<h3 class="doctitle" onclick="toTargetDoc(event, this)" style="margin:.1em; font-weight:bold; color:#440; font-size:90%;"> <a href="javascript:void(0)" tabindex="0" style="text-decoration:none;color:#440;" class="text2pdf"> <span data-role="content" data-arg="filename" title="pdf della sentenza"><span><img clas

In [23]:
# Check how the download URL is assembled from the first result's data-arg
first_link = ladle.elements.element(
    '/html/body/div[1]/div[3]/form/table/tbody[2]/tr/td[2]/div/div[2]/div[1]/div/h3/a',
    timeout=5,
)
img = first_link.find_element(By.TAG_NAME, 'img')
data_arg = img.get_attribute('data-arg')

print("Raw data-arg:", data_arg)
# Decode once and reuse the result for every print below
decoded_path = urllib.parse.unquote(data_arg)
print("\nURL decoded:", decoded_path)

# Fixed base URL (without /sncass)
fixed_base_url = "https://www.italgiure.giustizia.it"
test_url = fixed_base_url + decoded_path
print("\nFull URL (FIXED method):", test_url)

# Request the URL directly and report what comes back
print(f"\nTesting URL: {test_url}")
try:
    resp = requests.get(test_url, verify=False, timeout=10)
    status = resp.status_code
    print(f"Status: {status}")
    print(f"Content-Type: {resp.headers.get('Content-Type')}")
    print(f"Size: {len(resp.content)} bytes")
    if status == 200:
        print("✅ SUCCESS! PDF downloaded correctly")
except Exception as e:
    print(f"Error: {e}")


Raw data-arg: /xway/application/nif/clean/hc.dll%3Fverbo%3Dattach%26db%3Dsnciv%26id%3D./20251016/snciv@s30@a2025@n27659@tD.clean.pdf

URL decoded: /xway/application/nif/clean/hc.dll?verbo=attach&db=snciv&id=./20251016/snciv@s30@a2025@n27659@tD.clean.pdf

Full URL (FIXED method): https://www.italgiure.giustizia.it/xway/application/nif/clean/hc.dll?verbo=attach&db=snciv&id=./20251016/snciv@s30@a2025@n27659@tD.clean.pdf

Testing URL: https://www.italgiure.giustizia.it/xway/application/nif/clean/hc.dll?verbo=attach&db=snciv&id=./20251016/snciv@s30@a2025@n27659@tD.clean.pdf
Status: 200
Content-Type: application/pdf
Size: 84865 bytes
✅ SUCCESS! PDF downloaded correctly
Status: 200
Content-Type: application/pdf
Size: 84865 bytes
✅ SUCCESS! PDF downloaded correctly


In [None]:
# Display summary statistics for the run and preview the first few documents
print("📊 SCRAPING SUMMARY")
print("="*80)
print(f"Total PDFs collected: {len(all_pdf_objs)}")
print(f"\nSample of collected PDFs:")
for i, pdf in enumerate(all_pdf_objs[:5], start=1):  # Show first 5
    print(f"\n{i}. {pdf['name'][:100]}")
    print(f"   URL: {pdf['url']}")
    print(f"   Size: {len(pdf['bytes'])/1024:.2f} KB")
    # The scraper builds these dicts without a 'text' key, so read it
    # defensively instead of raising KeyError on the first document.
    text = pdf.get('text')
    if text:
        print(f"   Text preview: {text[:150].strip()}...")


In [None]:
# Optional: Save metadata to JSON file (without the large byte data)
import json
from datetime import datetime

# One record per document. 'text' is optional: the scraper builds these dicts
# without it, so it is read with .get() to avoid a KeyError.
metadata = []
for pdf in all_pdf_objs:
    text = pdf.get('text')
    metadata.append({
        'name': pdf['name'],
        'url': pdf['url'],
        'size_kb': len(pdf['bytes']) / 1024,
        'has_text': text is not None,
        'text_length': len(text) if text else 0
    })

# Take a single timestamp so the file name and the 'scraped_at' field agree
scraped_at = datetime.now()

# Save to file
output_file = f"cassazione_scraping_{scraped_at.strftime('%Y%m%d_%H%M%S')}.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump({
        'scraped_at': scraped_at.isoformat(),
        'total_pdfs': len(all_pdf_objs),
        'pages_processed': page_counter,
        'documents': metadata
    }, f, indent=2, ensure_ascii=False)

print(f"✅ Metadata saved to: {output_file}")


Prova API sentenze

In [33]:
# Silence urllib3's InsecureRequestWarning, since verification is turned off below
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Fresh shared session so connections are reused across requests
session = requests.Session()
# Certificate verification is switched off for this site (Italian government
# sites sometimes have cert issues)
session.verify = False
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

In [38]:
# Request one document attachment directly, outside the browser
url = (
    "https://www.italgiure.giustizia.it/xway/application/nif/clean/hc.dll"
    "?verbo=attach&db=snciv&id=./20251016/snciv@s30@a2025@n27609@tD.clean.pdf"
)
resp = session.get(url, timeout=30, verify=False)

In [39]:
# Display the raw response body to see what the server actually sent back
resp.content

b'<!DOCTYPE html>\n<html lang="it">\n<head>\n <meta http-equiv="X-UA-Compatible" content="IE=edge" />\n <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"/>\n <meta name="viewport" content="width=device-width, initial-scale=1"/>\n <script src=\'https://www.google.com/recaptcha/api.js\'></script>\n <title>richiesta documento</title>\n</head>\n<body>\n <form action="" method="post" style="width:308px;margin:auto;margin-top:10%">\n  <input type="hidden" name="id"    value="./20251016/snciv@s30@a2025@n27609@tD.clean.pdf"/>\n  <input type="hidden" name="verbo" value="attach"/>\n  <input type="hidden" name="db"    value="snciv"/>\n  <div class="g-recaptcha" data-sitekey="6Les-QkUAAAAAOMVgL93z-reIKDKtT6son58ltD3"></div>\n  <input value="richiedi il documento" type="submit" style="margin-top: 1.2em; color: #fff; background-color: #09f; padding: 8px 24px; height: 20px;border: none; border-radius: 3px; font: 700 14px/20px Helvetica Neue,Arial,Helvetica,Geneva,sans-serif; cur