In [23]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin,urlparse
import pdfplumber
import requests
import os
import re
from pprint import pprint
import numpy as np
from pathlib import Path
import csv
import hashlib

In [2]:
def extract_grouped_links(main_url):
    """Scrape ``main_url`` and group its links under the nearest preceding ``<h5>``.

    Walks the direct children of ``<section class="entry-content">``. Every
    ``<h5>`` opens a group named after its text, and every ``<a href>`` found
    in a following ``<p>`` is filed under the most recent heading. Links that
    appear before the first ``<h5>`` are filed under ``"Uncategorized"``.

    Each link is classified with a HEAD request: a ``Content-Type`` containing
    ``application/pdf`` goes to ``"pdf_links"``, everything else to
    ``"html_links"``. ``javascript:`` and in-page ``#`` links are ignored, and
    links whose HEAD request fails are reported and skipped.

    Args:
        main_url: URL of the page to scrape; also the base for relative hrefs.

    Returns:
        dict: ``{heading: {"pdf_links": [url, ...], "html_links": [url, ...]}}``.
        Empty when the ``entry-content`` section is not found.
    """
    grouped_links = {}

    def group_for(heading):
        # setdefault keeps links already collected if a heading text repeats,
        # and lazily creates the "Uncategorized" group for links that come
        # before the first <h5> (previously those were lost to a KeyError).
        return grouped_links.setdefault(heading, {"pdf_links": [], "html_links": []})

    # A timeout prevents the cell from hanging forever on a stalled server.
    response = requests.get(main_url, timeout=20)
    soup = BeautifulSoup(response.content, 'html.parser')
    section = soup.find('section', class_='entry-content')

    if not section:
        print("<section class='entry-content'> not found.")
        return grouped_links

    current_heading = "Uncategorized"
    for tag in section.children:
        if tag.name == 'h5':
            current_heading = tag.get_text(strip=True)
            group_for(current_heading)

        elif tag.name == 'p':
            for link in tag.find_all('a', href=True):
                href = link['href']

                if "javascript:" in href or href.startswith("#"):
                    continue

                full_url = href
                try:
                    full_url = urljoin(main_url, href)
                    # Classify via HEAD so the document body is not downloaded here.
                    head = requests.head(full_url, allow_redirects=True, timeout=10)
                except Exception as e:
                    print(f"Skipping {full_url}: {e}")
                    continue

                content_type = head.headers.get("Content-Type", "").lower()
                bucket = "pdf_links" if "application/pdf" in content_type else "html_links"
                group_for(current_heading)[bucket].append(full_url)

    return grouped_links


In [3]:
url = "https://www.ministeroturismo.gov.it/del-settore-turistico/"
links_by_heading = extract_grouped_links(url)

# Preview: for every heading, list the PDF links first and then the HTML links.
for heading, group in links_by_heading.items():
    print(f"\n{heading}")
    for label, key in (("  PDF Links:", "pdf_links"), ("  HTML Links:", "html_links")):
        print(label)
        for link in group[key]:
            print("    -", link)




Norme generali del comparto turistico per i viaggiatori
  PDF Links:
    - https://www.ministeroturismo.gov.it/wp-content/uploads/2021/07/Codice-del-turismo.pdf
  HTML Links:
    - https://www.gazzettaufficiale.it/gunewsletter/dettaglio.jsp?service=1&datagu=2011-06-06&task=dettaglio&numgu=129&redaz=011G0123&tmstp=1307520490277

Norme generali del comparto turistico per i professionisti
  PDF Links:
    - https://eur-lex.europa.eu/legal-content/IT/TXT/PDF/?uri=CELEX:02005L0036-20140117&from=ES
    - https://eur-lex.europa.eu/legal-content/IT/TXT/PDF/?uri=CELEX:32013L0055&from=ES
  HTML Links:
    - https://www.gazzettaufficiale.it/eli/id/2007/11/09/007G0224/sg
    - https://www.gazzettaufficiale.it/eli/id/2013/08/20/13G00138/sg

Norme generali del comparto turistico per le imprese
  PDF Links:
    - https://www.ministeroturismo.gov.it/wp-content/uploads/2021/07/SA.62392-Agenzie-di-viaggio-e-tour-operator.pdf
  HTML Links:
    - https://www.gazzettaufficiale.it/eli/id/2016/2/09/16G00021

In [4]:

def extract_two_column_pdf(pdf_path):
    """Extract text from a two-column PDF, reading each page left half first.

    Every page is split vertically down the middle of its bounding box; the
    text of the left half is emitted before the text of the right half, as if
    they were two consecutive sub-pages. Tables are not extracted.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The stripped column texts of all pages joined by newlines, with
        leading/trailing whitespace removed from the overall result.
    """
    import pdfplumber

    column_texts = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Use the page's own bounding box instead of assuming the origin
            # is (0, 0): a page whose box is offset would otherwise be cropped
            # with coordinates that fall outside the page.
            x0, top, x1, bottom = page.bbox
            middle = (x0 + x1) / 2

            left = page.within_bbox((x0, top, middle, bottom))
            right = page.within_bbox((middle, top, x1, bottom))

            # extract_text() may return None for an empty region.
            column_texts.append((left.extract_text() or "").strip())
            column_texts.append((right.extract_text() or "").strip())

    # Reading order: left column, then right column, page after page.
    return "\n".join(column_texts).strip()


def extract_pdfplumber_data(pdf_path):
    """Return the text of every page of a PDF, one page after another.

    Pages for which pdfplumber yields no text are skipped. Only text is
    returned; table extraction is not performed.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: Page texts separated by newlines, stripped of leading and
        trailing whitespace.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() for page in document.pages]

    # Drop empty/None pages, then join the rest in page order.
    return "\n".join(content for content in page_texts if content).strip()


def chunk_text(text, max_length=3, overlap=10):
    """Split text into sentence-aligned, overlapping chunks (suitable for RAG).

    Sentences (split after ``.``, ``!`` or ``?`` followed by spaces) are packed
    greedily into chunks of roughly ``max_length`` characters. A single
    sentence longer than ``max_length`` becomes a chunk of its own. When
    ``overlap`` is positive, each chunk after the first is prefixed with the
    last ``overlap`` characters of the preceding chunk.

    Args:
        text: The text to split. Empty or ``None`` yields an empty list.
        max_length: Target maximum number of characters per chunk, before the
            overlap prefix is added.
        overlap: Number of trailing characters of the previous chunk to repeat
            at the start of the next one; ``0`` disables overlapping.

    Returns:
        list[str]: The chunks in document order, never containing empty strings.
    """
    if not text:
        return []

    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current = ""
    for sentence in sentences:
        if len(current) + len(sentence) <= max_length:
            current += sentence + " "
        else:
            # Guard against emitting an empty chunk when the very first
            # sentence is already longer than max_length.
            if current.strip():
                chunks.append(current.strip())
            current = sentence + " "
    if current.strip():
        chunks.append(current.strip())

    if overlap > 0 and len(chunks) > 1:
        overlapped = [chunks[0]]
        for previous, chunk in zip(chunks, chunks[1:]):
            # Only the tail of the previous chunk is repeated, so `overlap`
            # is a character count rather than an on/off switch.
            overlapped.append((previous[-overlap:] + " " + chunk).strip())
        return overlapped
    return chunks

def download_pdf(url, dest_folder="downloads"):
    """Download a PDF into ``dest_folder`` and return the local file path.

    The file name is the last path segment of the URL when it already ends in
    ``.pdf``; otherwise it is built from the URL path and query string (or the
    host name when both are empty) with ``.pdf`` appended. A file that already
    exists at that location is reused without any network access.

    The response is only saved when it carries a ``%PDF-`` signature, and it is
    written to a temporary ``.part`` file first, so an HTML error page or an
    interrupted write never ends up cached under the final name.

    Args:
        url: Address of the PDF to download.
        dest_folder: Directory for downloaded files; created if missing.

    Returns:
        str | None: Path of the local file, or ``None`` if the download failed
        or the response was not a PDF.
    """
    os.makedirs(dest_folder, exist_ok=True)

    parsed_url = urlparse(url)
    filename = os.path.basename(parsed_url.path)

    if not filename.lower().endswith(".pdf"):
        # Build a filename from the path and query
        stem = parsed_url.path.strip("/").replace("/", "_")
        if parsed_url.query:
            stem += "_" + re.sub(r'\W+', '_', parsed_url.query)
        if not stem:
            # Bare host URL: avoid producing a hidden file named ".pdf".
            stem = re.sub(r'\W+', '_', parsed_url.netloc) or "download"
        filename = stem + ".pdf"

    filepath = os.path.join(dest_folder, filename)

    if os.path.exists(filepath):
        return filepath

    partial_path = filepath + ".part"
    try:
        print(f"Downloading: {url}")
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()

        content = response.content
        # PDF readers accept the header anywhere in the first 1024 bytes.
        if b"%PDF-" not in content[:1024]:
            raise ValueError("response is not a PDF (missing %PDF- header)")

        with open(partial_path, "wb") as f:
            f.write(content)
        # Publish the file under its final name only once it is complete.
        os.replace(partial_path, filepath)

        print(f"Saved to: {filepath}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
        return None

    return filepath

def process_grouped_pdf_links(links_by_heading, dest_folder="downloads", chunk_size=1000, overlap=200):
    """Download every grouped PDF link and extract its text with pdfplumber.

    PDFs listed in the known two-column set are read column by column; all
    others are read page by page. A PDF that cannot be downloaded or parsed is
    reported and left out of the result.

    Args:
        links_by_heading: ``{heading: {"pdf_links": [...], "html_links": [...]}}``.
        dest_folder: Directory where downloaded PDFs are stored.
        chunk_size: Accepted for backward compatibility; chunks are not part
            of the result, so no chunking is performed.
        overlap: Accepted for backward compatibility; see ``chunk_size``.

    Returns:
        dict: ``{heading: {"pdf_links": {url: {"text": str}}, "html_links": [...]}}``
        where ``html_links`` is the input list passed through unchanged.
    """
    # Known two-column PDF URLs
    two_column_pdf_urls = {
        "https://eur-lex.europa.eu/legal-content/IT/TXT/PDF/?uri=CELEX:32013L0055&from=ES"
    }

    processed = {}

    for heading, links in links_by_heading.items():
        processed[heading] = {
            "pdf_links": {},
            "html_links": links.get("html_links", [])
        }

        for url in links.get("pdf_links", []):
            try:
                print(f"Processing PDF: {url}")
                path = download_pdf(url, dest_folder=dest_folder)

                # download_pdf signals failure with None; do not hand that to
                # the extractor.
                if path is None:
                    print(f"Failed to process {url}: download failed")
                    continue

                # Use special extraction if the PDF is 2-column
                if url in two_column_pdf_urls:
                    text = extract_two_column_pdf(path)
                else:
                    text = extract_pdfplumber_data(path)

                processed[heading]["pdf_links"][url] = {"text": text}

            except Exception as e:
                print(f"Failed to process {url}: {e}")

    return processed



In [5]:
# Download every PDF listed in links_by_heading and extract its text.
# Result shape: {heading: {"pdf_links": {url: {"text": ...}}, "html_links": [...]}}
pdf_data = process_grouped_pdf_links(links_by_heading)

Processing PDF: https://www.ministeroturismo.gov.it/wp-content/uploads/2021/07/Codice-del-turismo.pdf
Processing PDF: https://eur-lex.europa.eu/legal-content/IT/TXT/PDF/?uri=CELEX:02005L0036-20140117&from=ES
Processing PDF: https://eur-lex.europa.eu/legal-content/IT/TXT/PDF/?uri=CELEX:32013L0055&from=ES
Processing PDF: https://www.ministeroturismo.gov.it/wp-content/uploads/2021/07/SA.62392-Agenzie-di-viaggio-e-tour-operator.pdf
Processing PDF: https://www.ministeroturismo.gov.it/wp-content/uploads/2024/09/Vademecum-Cammini-Aperti-16.09.24-def-3-file-da-caricare-1.pdf
Processing PDF: https://www.ministeroturismo.gov.it/wp-content/uploads/2023/01/Decreto-ministeriale-interoperabilita-tdh-2022_signed.pdf


In [6]:
# Preview the beginning of the text extracted from each PDF.
for heading, group in pdf_data.items():
    for url, content in group["pdf_links"].items():
        print(f"\nHeading: {heading}")
        print(f"URL: {url}")
        print("Extracted Text (first 1000 characters):\n")
        # Slice so the output matches the label instead of dumping whole documents.
        print(content["text"][:1000])
        print("\n\n\n\n\n\n")


Heading: Norme generali del comparto turistico per i viaggiatori
URL: https://www.ministeroturismo.gov.it/wp-content/uploads/2021/07/Codice-del-turismo.pdf
Extracted Text (first 1000 characters):

DECRETO LEGISLATIVO 23 maggio 2011, n. 79
Codice della normativa statale in tema di ordinamento e mercato del
turismo, a norma dell'articolo 14 della legge 28 novembre 2005, n.
246, nonche' attuazione della direttiva 2008/122/CE, relativa ai
contratti di multiproprieta', contratti relativi ai prodotti per le
vacanze di lungo termine, contratti di rivendita e di scambio.
(11G0123)
GU n. 129 del 6-6-2011 - Suppl. Ordinario n.139
testo in vigore dal: 21-6-2011
IL PRESIDENTE DELLA REPUBBLICA
Visti gli articoli 76 e 87 della Costituzione;
Visto l'articolo 20, commi 3 e 4, della legge 15 marzo 1997, n. 59;
Vista la legge 28 novembre 2005, n. 246, ed, in particolare,
l'articolo 14, commi 14, 15 e 18;
Visto il decreto legislativo 30 luglio 1999, n. 303, recante
ordinamento della Presidenza del Consi

In [7]:
def extract_text_from_gazzetta(url):
    """
    Fetch a Gazzetta Ufficiale page and return the legislative text it embeds.

    The page is expected to wrap the act in an ``<iframe class="wrapped-iframe2">``;
    the iframe document is fetched in turn and the text of every ``<pre>`` inside
    ``<span class="dettaglio_atto_testo">`` is joined with newlines.

    Returns the text, or None when a request fails, an expected element is
    missing, or no text is found. Progress and failures are printed.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    def load_page(address):
        # GET the address with browser-like headers and parse the HTML.
        reply = requests.get(address, headers=browser_headers, timeout=15)
        reply.raise_for_status()
        return BeautifulSoup(reply.content, 'html.parser')

    try:
        # Outer page: only used to locate the iframe that holds the act.
        outer_page = load_page(url)
        frame = outer_page.find('iframe', class_='wrapped-iframe2')
        if not frame:
            print("No iframe found on the page.")
            return None

        frame_url = urljoin(url, frame.get('src'))
        print(f"Following iframe: {frame_url}")

        # Inner document: the act text lives in <pre> blocks inside the span.
        inner_page = load_page(frame_url)
        container = inner_page.find('span', class_='dettaglio_atto_testo')
        if not container:
            print("<span class='dettaglio_atto_testo'> not found.")
            return None

        pieces = [block.get_text(strip=True) for block in container.find_all('pre')]
        text = '\n'.join(pieces)

        if not text:
            print("No visible text found in <pre> tags.")
            return None

        print(f"Extracted text length: {len(text)}")
        return text

    except Exception as e:
        print(f"An error occurred: {e}")
        return None
        
# Example usage:
# sample_url = "https://www.gazzettaufficiale.it/gunewsletter/dettaglio.jsp?service=1&datagu=2011-06-06&task=dettaglio&numgu=129&redaz=011G0123&tmstp=1307520490277"
# extracted_text = extract_text_from_gazzetta(sample_url)

# if extracted_text:
#     print("\nExtracted Text Preview (first 1000 characters):\n")
#     print(extracted_text)
# else:
#     print("No text extracted.")

In [8]:
def process_gazzetta_links(links_by_heading, chunk_size=500, overlap=200):
    """Extract the text behind every Gazzetta Ufficiale HTML link, per heading.

    Only URLs containing ``gazzettaufficiale.it`` are processed; other HTML
    links are ignored. Links for which no text could be extracted are left
    out. ``chunk_size`` and ``overlap`` are accepted but not used.

    Returns:
        dict: ``{heading: {"pdf_links": {}, "html_links": {url: {"text": str}}}}``
        with one entry per input heading.
    """
    processed = {}

    for heading, group in links_by_heading.items():
        entry = processed.setdefault(heading, {"pdf_links": {}, "html_links": {}})

        for link in group.get("html_links", []):
            if "gazzettaufficiale.it" in link:
                print(f"Processing Gazzetta link: {link}")
                body = extract_text_from_gazzetta(link)

                if body:
                    entry["html_links"][link] = {"text": body}

    return processed


In [9]:
# Fetch the legislative text behind each Gazzetta Ufficiale HTML link.
# Result shape: {heading: {"pdf_links": {}, "html_links": {url: {"text": ...}}}}
processed_links = process_gazzetta_links(links_by_heading)

Processing Gazzetta link: https://www.gazzettaufficiale.it/gunewsletter/dettaglio.jsp?service=1&datagu=2011-06-06&task=dettaglio&numgu=129&redaz=011G0123&tmstp=1307520490277
Following iframe: https://www.gazzettaufficiale.it/atto/serie_generale/caricaArticoloDefault/originario?atto.dataPubblicazioneGazzetta=2011-06-06&atto.codiceRedazionale=011G0123&atto.tipoProvvedimento=DECRETO LEGISLATIVO
Extracted text length: 22107
Processing Gazzetta link: https://www.gazzettaufficiale.it/eli/id/2007/11/09/007G0224/sg
Following iframe: https://www.gazzettaufficiale.it/atto/serie_generale/caricaArticoloDefault/originario?atto.dataPubblicazioneGazzetta=2007-11-09&atto.codiceRedazionale=007G0224&atto.tipoProvvedimento=DECRETO LEGISLATIVO
Extracted text length: 20347
Processing Gazzetta link: https://www.gazzettaufficiale.it/eli/id/2013/08/20/13G00138/sg
Following iframe: https://www.gazzettaufficiale.it/atto/serie_generale/caricaArticoloDefault/originario?atto.dataPubblicazioneGazzetta=2013-08-20&at

In [10]:
# Preview the first 500 characters extracted from each Gazzetta Ufficiale page.
for heading, group in processed_links.items():
    html_links = group.get("html_links", {})

    for url, content in html_links.items():
        # "\n" starts each entry on a fresh line; the previous "\H" was an
        # invalid escape that printed a literal backslash and raised a warning.
        print(f"\nHeading: {heading}")
        print(f"URL: {url}")

        # Safely retrieve the text
        text = content.get("text", "")
        print(f"First 500 characters of text:\n{text[:500]}")

        print("-" * 80)


\Heading: Norme generali del comparto turistico per i viaggiatori
URL: https://www.gazzettaufficiale.it/gunewsletter/dettaglio.jsp?service=1&datagu=2011-06-06&task=dettaglio&numgu=129&redaz=011G0123&tmstp=1307520490277
First 500 characters of text:
IL PRESIDENTE DELLA REPUBBLICA 
 
  Visti gli articoli 76 e 87 della Costituzione; 
  Visto l'articolo 20, commi 3 e 4, della legge 15 marzo 1997, n. 59; 
  Vista la legge 28  novembre  2005,  n.  246,  ed,  in  particolare,
l'articolo 14, commi 14, 15 e 18; 
  Visto il decreto  legislativo  30  luglio  1999,  n.  303,  recante
ordinamento della Presidenza del  Consiglio  dei  ministri,  a  norma
dell'articolo 11 della legge 15 marzo 1997, n. 59; 
  Visto il decreto legislativo 6  settembre  200
--------------------------------------------------------------------------------
\Heading: Norme generali del comparto turistico per i professionisti
URL: https://www.gazzettaufficiale.it/eli/id/2007/11/09/007G0224/sg
First 500 characters of text:
IL

  print(f"\Heading: {heading}")


In [11]:
def sanitize_html_links(structure):
    """
    Make ``html_links`` entries merge-compatible by replacing lists with ``{}``.

    Every heading whose ``"html_links"`` value is a list gets an empty dict in
    its place (the list contents are discarded); other values are untouched.
    The structure is modified in place and the same object is returned.
    """
    for entry in structure.values():
        current = entry.get("html_links")
        if isinstance(current, list):
            # A list cannot be merged with dict.update(); start from an empty dict.
            entry["html_links"] = {}
    return structure


In [12]:
def merge_processed_data(*dicts):
    """Combine several processed-data dicts into one, keyed by heading.

    For each heading the ``"pdf_links"`` and ``"html_links"`` mappings of all
    inputs are merged with ``dict.update``, so when the same URL appears more
    than once the entry from the later argument wins. Missing keys are treated
    as empty mappings.
    """
    merged = {}

    for source in dicts:
        for heading, content in source.items():
            target = merged.setdefault(heading, {"pdf_links": {}, "html_links": {}})

            # PDF links first, then HTML links.
            for link_kind in ("pdf_links", "html_links"):
                target[link_kind].update(content.get(link_kind, {}))

    return merged


In [13]:
# sanitize_html_links rewrites pdf_data in place (html_links lists become {})
# and returns it, so its result can be fed straight into the merge.
combined_data = merge_processed_data(sanitize_html_links(pdf_data), processed_links)

In [14]:
def preview_combined_data_summary(combined_data):
    """Print, for every heading, how many PDF and HTML links it holds.

    Each link is listed with the number of items under its ``"chunks"`` key
    (0 when the key is absent).
    """
    print("\nCombined Dataset Overview")
    print("=" * 50)

    sections = (("PDF links", "pdf_links"), ("HTML links", "html_links"))

    for heading, group in combined_data.items():
        print(f"\nHeading: {heading}")

        for title, key in sections:
            entries = group.get(key, {})
            print(f"  {title}: {len(entries)}")
            for url, content in entries.items():
                chunk_count = len(content.get("chunks", []))
                print(f"    - {url} → {chunk_count} chunks")

        print("-" * 50)


In [15]:
# Print per-heading link counts; chunk counts read 0 for entries without a "chunks" key.
preview_combined_data_summary(combined_data)



Combined Dataset Overview

Heading: Norme generali del comparto turistico per i viaggiatori
  PDF links: 1
    - https://www.ministeroturismo.gov.it/wp-content/uploads/2021/07/Codice-del-turismo.pdf → 0 chunks
  HTML links: 1
    - https://www.gazzettaufficiale.it/gunewsletter/dettaglio.jsp?service=1&datagu=2011-06-06&task=dettaglio&numgu=129&redaz=011G0123&tmstp=1307520490277 → 0 chunks
--------------------------------------------------

Heading: Norme generali del comparto turistico per i professionisti
  PDF links: 2
    - https://eur-lex.europa.eu/legal-content/IT/TXT/PDF/?uri=CELEX:02005L0036-20140117&from=ES → 0 chunks
    - https://eur-lex.europa.eu/legal-content/IT/TXT/PDF/?uri=CELEX:32013L0055&from=ES → 0 chunks
  HTML links: 2
    - https://www.gazzettaufficiale.it/eli/id/2007/11/09/007G0224/sg → 0 chunks
    - https://www.gazzettaufficiale.it/eli/id/2013/08/20/13G00138/sg → 0 chunks
--------------------------------------------------

Heading: Norme generali del comparto tur

In [26]:
def save_combined_data_to_csv(combined_data, output_dir="output_csvs"):
    """Write the text of every processed link to its own one-cell CSV file.

    Each file is named ``<heading>_<url basename>_<md5 prefix>.csv`` and holds
    a ``content`` header row followed by a single row with the full text.
    Links whose text is missing or blank are reported and skipped.

    Args:
        combined_data: ``{heading: {"pdf_links": {url: {"text": ...}},
            "html_links": {url: {"text": ...}}}}``.
        output_dir: Directory for the CSV files; created if missing.
    """
    def safe_name(raw):
        # Same replacements as before (space -> "_", "/" -> "-"), plus the
        # characters that are not allowed in file names on common filesystems,
        # since headings are scraped from a web page.
        cleaned = raw.replace(" ", "_").replace("/", "-")
        return re.sub(r'[<>:"\\|?*\x00-\x1f]', "_", cleaned)

    os.makedirs(output_dir, exist_ok=True)

    for group_name, group_links in combined_data.items():
        safe_group = safe_name(group_name)

        for link_type in ["pdf_links", "html_links"]:
            for url, content_dict in group_links.get(link_type, {}).items():
                # `or ""` also covers an explicit None stored under "text".
                full_text = (content_dict.get("text") or "").strip()

                if not full_text:
                    print(f"⚠️ Skipping empty content: {url}")
                    continue

                # Get base filename and append a hash to avoid duplicates
                parsed_url = urlparse(url)
                base_filename = os.path.splitext(os.path.basename(parsed_url.path))[0]
                url_hash = hashlib.md5(url.encode()).hexdigest()[:6]
                safe_filename = safe_name(base_filename)

                output_filename = f"{safe_group}_{safe_filename}_{url_hash}.csv"
                output_path = Path(output_dir) / output_filename

                print(f"Saving: {url} -> {output_filename}")

                with open(output_path, "w", newline="", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(["content"])
                    writer.writerow([full_text])

In [27]:
# Write one CSV per link with non-empty text into the default "output_csvs" folder.
save_combined_data_to_csv(combined_data)

Saving: https://www.ministeroturismo.gov.it/wp-content/uploads/2021/07/Codice-del-turismo.pdf -> Norme_generali_del_comparto_turistico_per_i_viaggiatori_Codice-del-turismo_a7985c.csv
Saving: https://www.gazzettaufficiale.it/gunewsletter/dettaglio.jsp?service=1&datagu=2011-06-06&task=dettaglio&numgu=129&redaz=011G0123&tmstp=1307520490277 -> Norme_generali_del_comparto_turistico_per_i_viaggiatori_dettaglio_a76e75.csv
Saving: https://eur-lex.europa.eu/legal-content/IT/TXT/PDF/?uri=CELEX:02005L0036-20140117&from=ES -> Norme_generali_del_comparto_turistico_per_i_professionisti__579c55.csv
Saving: https://eur-lex.europa.eu/legal-content/IT/TXT/PDF/?uri=CELEX:32013L0055&from=ES -> Norme_generali_del_comparto_turistico_per_i_professionisti__326f94.csv
Saving: https://www.gazzettaufficiale.it/eli/id/2007/11/09/007G0224/sg -> Norme_generali_del_comparto_turistico_per_i_professionisti_sg_da4485.csv
Saving: https://www.gazzettaufficiale.it/eli/id/2013/08/20/13G00138/sg -> Norme_generali_del_compar