In [1]:
# Attraction name -> 1-based group number (1, 2 or 3).
# Insertion order matters: the next cell derives `clusters` by iterating the keys.
all_defined = {
    # Group 1
    "Colosseum, Rome, Italy": 1,
    "Roman Forum, Rome, Italy": 1,
    "Palatine Hill, Rome, Italy": 1,
    "Piazza Venezia, Rome, Italy": 1,
    # Group 2
    "Trevi Fountain, Rome, Italy": 2,
    "Piazza di Spagna, Rome, Italy": 2,
    "Sant'Ignazio, Rome, Italy": 2,
    "Temple of Hadrian, Rome, Italy": 2,
    "Pantheon, Rome, Italy": 2,
    "Piazza Navona, Rome, Italy": 2,
    # Group 3
    "St. Peter's Square, Vatican City": 3,
    "St. Peter's Basilica, Vatican City": 3,
    "Vatican Museums, Vatican City": 3,
    "Castel Sant'Angelo, Rome, Italy": 3,
    "Trastevere, Rome, Italy": 3,
}

In [2]:
import numpy as np

# Attraction names, in the insertion order of `all_defined`.
organized_attraction_names = all_defined.keys()
# Shift the 1-based group numbers stored in `all_defined` to 0-based labels.
clusters = np.array(
    [all_defined.get(attraction, 1) for attraction in organized_attraction_names]
) - 1

In [3]:
# Show the 0-based label array, one entry per key of `all_defined`.
clusters

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2])

In [2]:
import subprocess
import os

def convert_to_pdf(docx_path: str):
    """
    Convert a DOCX file to PDF using the first conversion method that works.

    Tries multiple methods in order:
    1. pypandoc (cross-platform, requires Pandoc and a TeX engine installed)
    2. LibreOffice headless mode (Linux/macOS/WSL)

    The PDF is written next to the input file, using the same base name with
    a ``.pdf`` extension.

    Args:
        docx_path: Path to the DOCX file

    Returns:
        Path to the created PDF, or None if conversion failed
    """
    # Swap only the final extension. str.replace('.docx', '.pdf') would also
    # rewrite ".docx" inside directory names, and would leave the path
    # untouched for a suffix such as ".DOCX" (making the input look like the
    # freshly created output).
    pdf_path = os.path.splitext(docx_path)[0] + '.pdf'

    # LibreOffice names its output after the input file and only lets the
    # caller choose the directory, so it must be the directory of pdf_path
    # for the existence check below to find the result.
    output_dir = os.path.dirname(pdf_path) or '.'

    # Method 1: Try pypandoc (cross-platform, requires Pandoc)
    try:
        import pypandoc
        print(f"Converting to PDF using pypandoc: {docx_path}")

        # Convert docx to pdf using pandoc
        pypandoc.convert_file(
            docx_path,
            'pdf',
            outputfile=pdf_path,
            extra_args=['--pdf-engine=xelatex']  # Use xelatex for better unicode support
        )

        if os.path.exists(pdf_path):
            print(f"PDF created successfully with pypandoc: {pdf_path}")
            return pdf_path

    except ImportError:
        print("pypandoc not installed, trying alternative method")
    except Exception as e:
        # Best-effort: any pandoc/TeX failure falls through to LibreOffice
        print(f"pypandoc conversion failed: {e}, trying alternative method")

    # Method 2: Try LibreOffice headless mode (Linux/macOS/WSL)
    try:
        print("Attempting PDF conversion using LibreOffice...")

        # Candidate executables: names resolved via PATH first, then common
        # absolute install locations
        libreoffice_paths = [
            'libreoffice',
            'soffice',
            '/usr/bin/libreoffice',
            '/usr/bin/soffice',
            '/Applications/LibreOffice.app/Contents/MacOS/soffice',
        ]

        # Probe each candidate with --version; the first one that exits 0 wins
        libreoffice_cmd = None
        for path in libreoffice_paths:
            try:
                result = subprocess.run([path, '--version'], capture_output=True, timeout=5)
                if result.returncode == 0:
                    libreoffice_cmd = path
                    break
            except (FileNotFoundError, subprocess.TimeoutExpired):
                continue

        if libreoffice_cmd:
            print(f"Found LibreOffice at: {libreoffice_cmd}")

            # Convert using LibreOffice, writing next to the input file
            result = subprocess.run([
                libreoffice_cmd,
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', output_dir,
                docx_path
            ], capture_output=True, text=True, timeout=120)

            # Exit code alone is not trusted; the output file must exist too
            if result.returncode == 0 and os.path.exists(pdf_path):
                print(f"PDF created successfully with LibreOffice: {pdf_path}")
                return pdf_path
            else:
                print(f"LibreOffice conversion failed: {result.stderr}")
        else:
            print("LibreOffice not found on system")

    except subprocess.TimeoutExpired:
        print("LibreOffice conversion timed out")
    except Exception as e:
        print(f"LibreOffice conversion failed: {e}")

    print("PDF conversion not available. Install Pandoc + TeX (for pypandoc) or LibreOffice")
    return None


In [3]:
# NOTE(review): "치" / "칩" in this file name look like mis-encoded "á" / "ó"
# ("Clássica", "História") — confirm the actual name on disk before changing it.
convert_to_pdf(".results/Roma_Cl치ssica_3_Dias_Entre_Hist칩ria_e_Beleza.docx")

Converting to PDF using pypandoc: .results/Roma_Cl치ssica_3_Dias_Entre_Hist칩ria_e_Beleza.docx
pypandoc conversion failed: No pandoc was found: either install pandoc and add it
to your PATH or or call pypandoc.download_pandoc(...) or
install pypandoc wheels with included pandoc., trying alternative method
Attempting PDF conversion using LibreOffice...
LibreOffice not found on system
PDF conversion not available. Install Pandoc + TeX (for pypandoc) or LibreOffice
