In [None]:
import requests
import time
import os
import re

# Run parameters are read from GALAXY_INPUTS, a mapping that is not defined in
# this file (presumably injected by the Galaxy environment - confirm).
# The publication code is lower-cased because it becomes part of the
# collection URL built in download_newspaper_year().
publication = GALAXY_INPUTS['publication'].lower()
year = GALAXY_INPUTS['year']

def sanitize_filename(name):
    """Return *name* with filename-unsafe characters replaced by underscores.

    Each of ``<``, ``>``, ``:``, ``"``, ``/``, backslash, space, ``|``, ``?``
    and ``*`` is swapped one-for-one with ``_``; all other characters are
    kept unchanged.
    """
    unsafe_chars = re.compile(r'[<>:"/\\ |?*]')
    return unsafe_chars.sub('_', name)

def zip_and_put_to_galaxy(year, source_folder):
    """
    Zips the folder, sends it to Galaxy history, and cleans up raw files.

    The archive is written to the current working directory as
    ``downloads_<year>.zip``. The raw files in *source_folder* are deleted
    only after both the zip step and the upload step finished without
    raising, so a failed export never loses data.

    Args:
        year: Value interpolated into the archive name.
        source_folder: Directory whose contents are archived and, on
            success, removed.

    Returns:
        None. Any failure is reported with ``print`` and not re-raised
        (best-effort export).
    """
    # Imported here because the file-level imports do not include shutil;
    # without it the calls below raised NameError, which the handler at the
    # bottom swallowed, so no archive was ever produced.
    import shutil

    print("\n--- Starting Export to Galaxy ---")

    # 1. Zip the folder
    zip_base_name = f"downloads_{year}"  # shutil adds .zip automatically
    print(f"Zipping folder: {source_folder}...")

    try:
        # shutil.make_archive(base_name, format, root_dir)
        zip_path = shutil.make_archive(zip_base_name, 'zip', source_folder)
        print(f"Archive created: {zip_path}")

        # 2. Send to Galaxy
        # NOTE(review): `put` is not defined in this file; it is presumably
        # provided by the Galaxy interactive environment - confirm.
        print("Sending zip file to Galaxy History...")
        put(f"{zip_base_name}.zip")
        print("Successfully sent to History.")

        # 3. Clean up (Delete raw files to save space)
        print("Cleaning up raw files...")
        shutil.rmtree(source_folder)
        print("Raw files deleted successfully.")

    except Exception as e:
        # Deliberately broad: the export is best-effort and must not abort
        # the surrounding notebook run.
        print(f"Error during export/cleanup: {e}")

def extract_date_from_label(label):
    """
    Pull the first YYYY-MM-DD style date out of the label string.

    Returns the matched text (four digits, dash, two digits, dash, two
    digits), or the placeholder 'unknown_date' when the label contains no
    such pattern.
    """
    found = re.search(r'(\d{4}-\d{2}-\d{2})', label)
    return found.group(1) if found else "unknown_date"

def _download_image(image_url, file_path, timeout):
    """Stream one image to *file_path*; return True on success, False otherwise.

    The data is written to a ``.part`` sibling first and only renamed onto
    *file_path* after the whole transfer succeeded. An interrupted download
    therefore never leaves a truncated file that the caller's "already
    downloaded" check would mistake for a complete page.
    """
    partial_path = f"{file_path}.part"
    try:
        # The context manager closes the streamed response (and releases its
        # connection) even when the transfer fails midway.
        with requests.get(image_url, stream=True, timeout=timeout) as img_r:
            img_r.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in img_r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, file_path)
    except Exception as e:
        # Best-effort: report and let the caller move on to the next image.
        print(f"\n      Error downloading {image_url}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # no partial file was created, or it cannot be removed
        return False
    return True


def download_newspaper_year(year, timeout=30):
    """Download every page image of one publication year from the ONB IIIF API.

    Fetches the collection ``<publication>_<year>`` (``publication`` is the
    module-level variable), then each manifest (issue) listed in it, and
    saves each canvas (page) as
    ``outputs/collection/downloads_<year>/<issue label>/<date>_page_<label>.jpg``.
    Pages whose target file already exists are skipped, so the function can
    be re-run to resume an interrupted download.

    Args:
        year: Year interpolated into the collection URL and output folder.
        timeout: Timeout in seconds passed to every HTTP request, so that a
            stalled server cannot block the run indefinitely.

    Returns:
        None. Failures are printed; a failing collection request ends the
        function, a failing manifest or image is skipped.
    """
    base_folder = f"outputs/collection/downloads_{year}"

    # 1. Construct the Collection URL
    collection_url = f"https://iiif.onb.ac.at/presentation/collection/{publication}_{year}"
    print(f"Fetching Collection: {collection_url}")

    try:
        response = requests.get(collection_url, timeout=timeout)
        response.raise_for_status()
        collection_data = response.json()
    except Exception as e:
        print(f"Error fetching collection: {e}")
        return

    if "manifests" not in collection_data:
        print("No manifests found.")
        return

    manifests = collection_data["manifests"]
    print(f"Found {len(manifests)} issues. Starting download...\n")

    # 2. Iterate through Manifests (Issues)
    for i, manifest_entry in enumerate(manifests):
        manifest_url = manifest_entry.get("@id")
        # A missing, null or empty label falls back to a positional name;
        # str() keeps a non-string label from crashing the regex helpers.
        manifest_label = str(manifest_entry.get("label") or f"Issue_{i}")

        # Date taken from the label; used as the filename prefix
        issue_date = extract_date_from_label(manifest_label)

        # Create a clean folder name for this sequence/issue
        safe_folder_name = sanitize_filename(manifest_label)
        issue_path = os.path.join(base_folder, safe_folder_name)

        # Create the directory if it doesn't exist
        os.makedirs(issue_path, exist_ok=True)

        print(f"[{i+1}] Processing: {manifest_label} (Date: {issue_date})")

        try:
            m_response = requests.get(manifest_url, timeout=timeout)
            # Fail here on an HTTP error instead of parsing an error page.
            m_response.raise_for_status()
            m_data = m_response.json()

            # 3. Iterate through Sequences -> Canvases (Pages)
            for sequence in m_data.get("sequences", []):
                canvases = sequence.get("canvases", [])

                print(f"    - Found {len(canvases)} pages.")

                for canvas in canvases:
                    # Strip whitespace and left-pad with zeros so that the
                    # files sort correctly (e.g. "1" -> "0001").
                    raw_page_label = str(canvas.get("label", "0")).strip()
                    clean_page_label = raw_page_label.zfill(4)

                    # Format: 1756-01-03_page_0001.jpg
                    filename = f"{issue_date}_page_{clean_page_label}.jpg"
                    file_path = os.path.join(issue_path, filename)

                    # Skip if already downloaded
                    if os.path.exists(file_path):
                        continue

                    # Find the image URL
                    for image in canvas.get("images", []):
                        image_url = image.get("resource", {}).get("@id")
                        if not image_url:
                            continue

                        print(f"      Downloading {filename}...", end="\r")
                        downloaded = _download_image(image_url, file_path, timeout)

                        # Pause after every request to go easy on the server.
                        time.sleep(0.2)

                        # All images of a canvas share one target file, so
                        # stop at the first success instead of overwriting it.
                        if downloaded:
                            break
                print("")

        except Exception as e:
            # Best-effort: one broken manifest must not stop the other issues.
            print(f"    Error processing manifest {manifest_url}: {e}")

        # Sleep between issues
        time.sleep(0.5)

    print("\nAll downloads complete!")

# Entry point: start the download for the year read from the inputs above.
if __name__ == "__main__":
    download_newspaper_year(year=year)