In [1]:
from __future__ import annotations

"""
Generate DDB newspaper time spans.

This script:
1. Queries the DDB newspaper search endpoint for all titles with loaded issues.
2. Parses the 'progress' field to extract earliest and latest year.
3. Outputs a CSV with one row per newspaper:
   id, title, start_year, end_year, years_covered.

Intended as step 1 of the DDB pipeline for the Masterarbeit.
"""

import os
import re
import sys
from dataclasses import dataclass
from typing import List, Optional, Tuple

import pandas as pd
import requests
from pathlib import Path


# --- Config ---

# DDB API key. Empty by default; get_api_key() falls back to the
# DDB_API_KEY environment variable when this constant is empty.
API_KEY: str = ""

# Newspaper search endpoint queried by fetch_newspapers().
BASE_URL: str = (
    "https://api.deutsche-digitale-bibliothek.de/search/index/newspaper/select"
)

# Page size sent as the "rows" parameter; also the step by which the
# "start" offset advances between requests.
ROWS_PER_PAGE: int = 500

# Output file (kept under a data directory in the repo rather than the
# current working directory).
OUT_FILE = Path("data/ddb/raw/zeitungszeitrÃ¤ume_alle.csv")


@dataclass
class NewspaperPeriod:
    """Time span covered by one newspaper title.

    ``start_year`` / ``end_year`` are ``None`` when no year could be
    extracted for the title.
    """

    # Identifier of the newspaper document.
    id: str
    # Display title of the newspaper.
    title: str
    # Earliest year found, or None if unknown.
    start_year: Optional[int]
    # Latest year found, or None if unknown.
    end_year: Optional[int]

    @property
    def years_covered(self) -> Optional[int]:
        """Return the inclusive number of years between start and end.

        Returns ``None`` when either boundary is missing.
        """
        first, last = self.start_year, self.end_year
        if first is None or last is None:
            return None
        # Inclusive span: 1900..1900 counts as one year.
        return last - first + 1


def get_api_key() -> str:
    """Return the DDB API key.

    The module-level ``API_KEY`` constant takes precedence; if it is empty,
    the ``DDB_API_KEY`` environment variable is used instead.

    Raises:
        RuntimeError: If neither source provides a non-empty key.
    """
    candidate = API_KEY if API_KEY else os.environ.get("DDB_API_KEY")
    if candidate:
        return candidate
    raise RuntimeError("Kein DDB API-Key gesetzt. Bitte DDB_API_KEY in der Umgebung setzen.")


def fetch_newspapers(api_key: str, timeout: float = 30.0) -> List[dict]:
    """Fetch all newspaper documents with loaded issues from the DDB.

    Queries ``BASE_URL`` with ``q=hasLoadedIssues:true`` and pages through
    the results in steps of ``ROWS_PER_PAGE`` until the reported
    ``numFound`` is reached or a page comes back empty.

    Args:
        api_key: Key sent as the ``oauth_consumer_key`` query parameter.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The concatenated ``response.docs`` entries of all pages.

    Raises:
        RuntimeError: If a request fails at the connection level or the
            server answers with an HTTP error status. The API key is
            masked in the error text.
    """

    def _redact(text: str) -> str:
        # The key travels in the query string, so it shows up in the URL
        # and in the exception text; mask it before it reaches logs/stderr.
        return text.replace(api_key, "***") if api_key else text

    all_docs: List[dict] = []
    start = 0

    while True:
        params = {
            "q": "hasLoadedIssues:true",
            "fl": "id,title,progress",
            "wt": "json",
            "start": start,
            "rows": ROWS_PER_PAGE,
            "oauth_consumer_key": api_key,
        }

        # Connection errors and timeouts are raised by get() itself, before
        # any response object exists, so they are handled separately.
        try:
            response = requests.get(BASE_URL, params=params, timeout=timeout)
        except requests.RequestException as exc:
            raise RuntimeError(
                _redact(
                    f"Fehler beim Abruf der Newspaper-Dokumente: {exc}\n"
                    f"URL: {BASE_URL}"
                )
            ) from exc

        try:
            response.raise_for_status()
        except requests.HTTPError as exc:
            raise RuntimeError(
                _redact(
                    f"Fehler beim Abruf der Newspaper-Dokumente: {exc}\n"
                    f"URL: {response.url}\n"
                    f"Antwortcode: {response.status_code}\n"
                    f"Antworttext: {response.text[:200]}..."
                )
            ) from exc

        payload = response.json().get("response", {})
        docs = payload.get("docs", [])
        num_found = payload.get("numFound", 0)

        all_docs.extend(docs)
        start += ROWS_PER_PAGE

        # Stop once every hit has been fetched, or when the server returns
        # an empty page (no point requesting further offsets).
        if not docs or start >= num_found:
            break

    return all_docs


# Stand-alone four-digit years from 1500 through 2099. Compiled once at
# import time instead of on every parse_progress() call.
_YEAR_PATTERN = re.compile(r"\b(1[5-9]\d{2}|20\d{2})\b")


def parse_progress(progress_list: List[str]) -> Tuple[Optional[int], Optional[int]]:
    """Extract the earliest and latest year from 'progress' entries.

    Every stand-alone four-digit number in the range 1500-2099 found in any
    entry counts as a year (e.g. 1871, 1933, 2001).

    Args:
        progress_list: Strings to scan. A single bare string is treated as
            a one-element list; entries that are not strings are skipped.

    Returns:
        ``(min_year, max_year)``, or ``(None, None)`` when the input is
        empty or contains no matching year.
    """
    if not progress_list:
        return None, None

    # Iterating a bare string would scan it character by character and
    # never match a four-digit year, so wrap it first.
    if isinstance(progress_list, str):
        progress_list = [progress_list]

    years = [
        int(found)
        for entry in progress_list
        if isinstance(entry, str)
        for found in _YEAR_PATTERN.findall(entry)
    ]

    if not years:
        return None, None
    return min(years), max(years)


def compute_periods(docs: List[dict]) -> List[NewspaperPeriod]:
    """Build one NewspaperPeriod per raw DDB document.

    Reads the ``id``, ``title`` and ``progress`` keys of each document; the
    year range is derived from ``progress`` via ``parse_progress``.
    """

    def _title_of(record: dict) -> str:
        # The title may arrive as a list or as a single value.
        value = record.get("title")
        if isinstance(value, list):
            return "; ".join(part for part in value if isinstance(part, str))
        if value is None:
            return ""
        return str(value)

    result: List[NewspaperPeriod] = []
    for record in docs:
        label = _title_of(record)
        # A missing or empty progress field is treated as "no entries".
        first, last = parse_progress(record.get("progress") or [])
        entry = NewspaperPeriod(
            id=record.get("id", ""),
            title=label,
            start_year=first,
            end_year=last,
        )
        result.append(entry)
    return result


def main() -> None:
    """Fetch all titles, derive their year ranges, print and save the table.

    The table is sorted by ``years_covered`` (descending), then by
    ``start_year`` (ascending), printed in full and written to ``OUT_FILE``
    as CSV; the output directory is created if needed.
    """
    key = get_api_key()
    print("ðŸ“¥ Lade Zeitungstitel aus der DDB â€¦", flush=True)
    raw_docs = fetch_newspapers(key)
    print(f"âœ… Abgerufen: {len(raw_docs)} Titel")

    # Column order of the resulting table follows this tuple.
    columns = ("id", "title", "start_year", "end_year", "years_covered")
    records = [
        {column: getattr(period, column) for column in columns}
        for period in compute_periods(raw_docs)
    ]
    table = pd.DataFrame(records)

    # Longest coverage first; ties are broken by the earliest start year.
    ordered = table.sort_values(
        by=["years_covered", "start_year"],
        ascending=[False, True],
    )

    print(ordered.to_string(index=False))

    target_dir = OUT_FILE.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    ordered.to_csv(OUT_FILE, index=False)
    print(f"ðŸ’¾ Die komplette Tabelle wurde als '{OUT_FILE}' gespeichert.")


if __name__ == "__main__":
    # Top-level boundary: report any failure on stderr and exit non-zero
    # instead of showing a traceback.
    try:
        main()
    except Exception as failure:
        print(f"Fehler: {failure}", file=sys.stderr)
        sys.exit(1)



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



ðŸ“¥ Lade Zeitungstitel aus der DDB â€¦
âœ… Abgerufen: 2520 Titel
       id                                                                                                                                                                                                                                                                                                                                                                                                                                                                  title  start_year  end_year  years_covered
2779220-1                                                                                                                                                                                                                                                                                                                                                                                                    Lippische Landes-Zeitung; Lippische