In [1]:
from __future__ import annotations

"""
Fetch page-level fulltext for selected DDB newspapers.

For each newspaper in PAPERS, this script:

1. Uses `zp_issues` to retrieve issues in a given year range and obtain the ZDB-ID(s).
2. Uses `zp_pages(zdb_id=...)` to fetch all pages for that newspaper.
3. Keeps only pages within the given year range (START_YEAR–END_YEAR).
4. Parses `publication_date` into year, month, day.
5. Outputs:
    - one CSV per newspaper in data/ddb/processed/by_newspaper/
    - one master CSV with all pages in data/ddb/processed/ddb_pages_all.csv

Final columns in all outputs:
    title, year, month, day, pagenumber, text
"""

import os
import re
from pathlib import Path
from typing import List

import pandas as pd
from ddbapi import zp_issues, zp_pages  # ddbapi wrapper


# ========================
# Konfiguration
# ========================

# Newspapers to fetch (titles as they appear in the Zeitungsportal / the issues
# DataFrame). The titles are used verbatim in the `paper_title` query, so the
# umlauts must be real Unicode characters, not mis-decoded byte sequences.
PAPERS: List[str] = [
    "Honnefer Volkszeitung",
    "Jeversches Wochenblatt",
    "Kölnische Zeitung",
    "Neckar-Bote",
    "Oberkasseler Zeitung",
    "Schwäbischer Merkur",
]

# Inclusive year range
START_YEAR = 1871
END_YEAR = 1954

# Language code passed to the issue query (German)
LANGUAGE = "ger"

# Optional: restrict the output to a page-number range (e.g. pages 1-5, as in
# the old script). Set PAGE_MIN / PAGE_MAX to None to keep ALL pages.
PAGE_MIN = 1  # e.g. 1
PAGE_MAX = 5  # e.g. 5

# Output paths
OUT_MASTER = Path("data/ddb/processed/ddb_pages_all.csv")
OUT_PER_PAPER_DIR = Path("data/ddb/processed/by_newspaper")
# Created at import time so the per-newspaper CSVs always have a target directory.
OUT_PER_PAPER_DIR.mkdir(parents=True, exist_ok=True)


# ========================
# Hilfsfunktionen
# ========================

# Conventional ASCII spellings for German letters; applied after lower-casing.
_GERMAN_TRANSLIT = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})


def slugify(title: str) -> str:
    """Turn a newspaper title into an ASCII, filename-safe slug.

    German umlauts and ß are transliterated (``ö`` -> ``oe``), other
    diacritics are stripped (``é`` -> ``e``), and every remaining run of
    characters outside ``[a-z0-9]`` becomes a single hyphen. Pure-ASCII
    titles produce the same slug as before.

    Args:
        title: Human-readable title, e.g. ``"Kölnische Zeitung"``.

    Returns:
        The slug, e.g. ``"koelnische-zeitung"``; ``"paper"`` if nothing
        usable remains.
    """
    import unicodedata  # local import: only this helper needs it

    # NFC first so a decomposed "o" + combining diaeresis matches the table.
    s = unicodedata.normalize("NFC", title).lower().translate(_GERMAN_TRANSLIT)
    # Decompose what is left and drop only the combining marks; other
    # non-ASCII characters stay and are turned into hyphens below.
    s = "".join(
        ch for ch in unicodedata.normalize("NFKD", s) if not unicodedata.combining(ch)
    )
    s = re.sub(r"[^a-z0-9]+", "-", s).strip("-")
    return s or "paper"


def build_publication_date_range(start_year: int, end_year: int) -> str:
    """Build the bracketed ``[from TO to]`` range string for `publication_date`.

    The range starts on 1 January of ``start_year`` and ends on 31 December
    of ``end_year``, both stamped 12:00:00Z. It is used as the
    `publication_date` argument of `zp_issues`.
    """
    lower_bound = "{}-01-01T12:00:00Z".format(start_year)
    upper_bound = "{}-12-31T12:00:00Z".format(end_year)
    return "[" + lower_bound + " TO " + upper_bound + "]"


def fetch_zdb_ids_for_paper(paper_title: str, start_year: int, end_year: int, language: str) -> List[str]:
    """Return the sorted, unique ZDB-IDs of a newspaper's issues in a year range.

    Queries `zp_issues` with the title, the publication-date range built by
    `build_publication_date_range`, and the language, then collects the
    distinct non-null values of the ``zdb_id`` column.

    Args:
        paper_title: Newspaper title passed to the query.
        start_year: First year of the range (inclusive).
        end_year: Last year of the range (inclusive).
        language: Language code passed to the query.

    Returns:
        Sorted list of ZDB-IDs as strings; empty if the query returned no issues.

    Raises:
        RuntimeError: If the issues DataFrame has no ``zdb_id`` column.
    """
    pub_range = build_publication_date_range(start_year, end_year)

    issues_df = zp_issues(
        paper_title=paper_title,
        publication_date=pub_range,
        language=language,
    )

    if issues_df.empty:
        print(f"⚠️ Keine Issues für '{paper_title}' im Zeitraum {start_year}-{end_year}.")
        return []

    if "zdb_id" not in issues_df.columns:
        raise RuntimeError("Erwartete Spalte 'zdb_id' nicht in Issues-DataFrame enthalten.")

    # Coerce to str: the declared return type is List[str] and str.join below
    # would raise TypeError on any non-string ID.
    zdb_ids = sorted({str(value) for value in issues_df["zdb_id"].dropna()})
    print(f"   → Gefundene ZDB-ID(s) für '{paper_title}': {', '.join(zdb_ids)}")
    return zdb_ids


def fetch_pages_for_zdb(zdb_id: str) -> pd.DataFrame:
    """Fetch the page records for one ZDB-ID via `zp_pages`.

    Args:
        zdb_id: ZDB identifier passed to `zp_pages` as ``zdb_id``.

    Returns:
        The DataFrame returned by `zp_pages`, unchanged. A warning is printed
        when it is empty; the empty frame is still returned so the caller
        decides how to proceed.
    """
    pages_df = zp_pages(zdb_id=zdb_id)
    if pages_df.empty:
        print(f"   ⚠️ Keine Seiten für ZDB-ID {zdb_id}.")
    return pages_df


def preprocess_pages_df(df: pd.DataFrame, paper_title_fallback: str) -> pd.DataFrame:
    """Normalise a raw pages DataFrame into the final output schema.

    Steps:
      - pick/rename the relevant columns,
      - parse ``publication_date`` and derive ``year``/``month``/``day``,
      - coerce ``pagenumber`` to int, dropping rows where it or the date
        cannot be parsed,
      - apply the optional page filter (module-level PAGE_MIN / PAGE_MAX) and
        the year filter (module-level START_YEAR / END_YEAR),
      - sort chronologically and reduce to the final columns.

    Args:
        df: Raw pages frame; must contain ``pagenumber``,
            ``publication_date`` and ``plainpagefulltext``. ``paper_title``
            is used when present.
        paper_title_fallback: Title used when ``paper_title`` is absent or
            missing for a row.

    Returns:
        DataFrame with columns ``title, year, month, day, pagenumber, text``.

    Raises:
        RuntimeError: If one of the required columns is missing.
    """
    # Make sure the expected columns exist.
    needed_cols = {"pagenumber", "publication_date", "plainpagefulltext"}
    missing = needed_cols - set(df.columns)
    if missing:
        raise RuntimeError(f"Erwartete Spalten fehlen in pages_df: {missing}")

    # Title column. Every Series below shares df.index: the DataFrame
    # constructor aligns on the index, so a fallback Series with a fresh
    # RangeIndex would yield NaN titles whenever df is not 0..n-1 indexed.
    if "paper_title" in df.columns:
        raw_titles = df["paper_title"]
        # Fill missing titles before casting, otherwise they become "nan".
        title_series = raw_titles.where(raw_titles.notna(), paper_title_fallback).astype(str)
    else:
        title_series = pd.Series(paper_title_fallback, index=df.index, dtype=object)

    out = pd.DataFrame(
        {
            "title": title_series,
            "pagenumber": pd.to_numeric(df["pagenumber"], errors="coerce"),
            "publication_date": pd.to_datetime(df["publication_date"], errors="coerce"),
            "text": df["plainpagefulltext"],
        }
    )

    # Drop rows whose date or page number could not be parsed; the explicit
    # copy keeps the column assignments below off a derived frame.
    out = out.dropna(subset=["publication_date", "pagenumber"]).copy()

    dates = out["publication_date"].dt
    out["year"] = dates.year
    out["month"] = dates.month
    out["day"] = dates.day
    out["pagenumber"] = out["pagenumber"].astype(int)

    # Year range is always applied; page bounds only when configured.
    keep = out["year"].between(START_YEAR, END_YEAR)
    if PAGE_MIN is not None:
        keep &= out["pagenumber"] >= PAGE_MIN
    if PAGE_MAX is not None:
        keep &= out["pagenumber"] <= PAGE_MAX
    out = out.loc[keep]

    out = out.sort_values(["year", "month", "day", "pagenumber"])

    # Final column order.
    return out[["title", "year", "month", "day", "pagenumber", "text"]]


# ========================
# Hauptlogik
# ========================

def _collect_paper_frames(paper: str, zdb_ids: List[str]) -> List[pd.DataFrame]:
    """Fetch and preprocess the pages of each ZDB-ID of one newspaper.

    Returns one preprocessed frame per ZDB-ID that still has rows after
    preprocessing; ZDB-IDs with no pages are skipped.
    """
    frames: List[pd.DataFrame] = []
    for zdb_id in zdb_ids:
        print(f"   → Hole Seiten für ZDB-ID {zdb_id} …")
        pages_df = fetch_pages_for_zdb(zdb_id)
        if pages_df.empty:
            continue

        processed = preprocess_pages_df(pages_df, paper_title_fallback=paper)
        if processed.empty:
            print("   ⚠️ Keine Seiten im gewünschten Jahrbereich nach Preprocessing.")
            continue

        frames.append(processed)
    return frames


def main() -> None:
    """Fetch, preprocess and export the pages of every newspaper in PAPERS.

    For each newspaper: resolve its ZDB-IDs, collect the preprocessed pages,
    and write one CSV into OUT_PER_PAPER_DIR. Finally all newspapers are
    concatenated into the master CSV at OUT_MASTER. Newspapers without
    issues or pages are skipped; if nothing was collected, no master file is
    written.
    """
    # Informational only: warn when the environment variable is unset.
    if not os.environ.get("DDB_API_KEY"):
        print("⚠️ Hinweis: DDB_API_KEY ist nicht gesetzt. "
              "Stelle sicher, dass ddbapi deinen API-Key anderweitig findet.\n")

    all_pages: List[pd.DataFrame] = []

    print(f"📅 Zeitraum: {START_YEAR}–{END_YEAR}")
    print(f"🗞️ Zeitungen: {', '.join(PAPERS)}\n")

    # Do not rely solely on the import-time mkdir of the configuration section.
    OUT_PER_PAPER_DIR.mkdir(parents=True, exist_ok=True)

    for paper in PAPERS:
        print(f"➡️ Verarbeite Zeitung: {paper}")
        zdb_ids = fetch_zdb_ids_for_paper(paper, START_YEAR, END_YEAR, LANGUAGE)
        if not zdb_ids:
            continue

        paper_frames = _collect_paper_frames(paper, zdb_ids)
        if not paper_frames:
            print(f"   ❌ Keine Seiten für '{paper}' im Zeitraum {START_YEAR}–{END_YEAR}.")
            continue

        paper_all = pd.concat(paper_frames, ignore_index=True)

        # Save one CSV per newspaper.
        slug = slugify(paper)
        filename = f"{slug}_{START_YEAR}_to_{END_YEAR}.csv"
        out_paper_path = OUT_PER_PAPER_DIR / filename
        paper_all.to_csv(out_paper_path, index=False, encoding="utf-8")
        print(f"   💾 {len(paper_all)} Seiten gespeichert unter: {out_paper_path}\n")

        all_pages.append(paper_all)

    if not all_pages:
        print("❌ Keine Seiten für die angegebenen Zeitungen/Zeiträume gefunden.")
        return

    master = pd.concat(all_pages, ignore_index=True)
    OUT_MASTER.parent.mkdir(parents=True, exist_ok=True)
    master.to_csv(OUT_MASTER, index=False, encoding="utf-8")

    print(f"✅ Master-Datensatz mit {len(master)} Seiten gespeichert unter: {OUT_MASTER}")


if __name__ == "__main__":
    main()



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



⚠️ Hinweis: DDB_API_KEY ist nicht gesetzt. Stelle sicher, dass ddbapi deinen API-Key anderweitig findet.

📅 Zeitraum: 1954–1954
🗞️ Zeitungen: Honnefer Volkszeitung

➡️ Verarbeite Zeitung: Honnefer Volkszeitung
https://api.deutsche-digitale-bibliothek.de/search/index/newspaper-issues/select?rows=1000&sort=id+ASC&q=type%3Aissue+AND+paper_title%3A%22Honnefer%5C+Volkszeitung%22+AND+publication_date%3A%22%5B1954-01-01T12%3A00%3A00Z%5C+TO%5C+1954-12-31T12%3A00%3A00Z%5D%22+AND+language%3A%22ger%22&cursorMark=%2A
Got 300 items.
   → Gefundene ZDB-ID(s) für 'Honnefer Volkszeitung': 2835548-9
   → Hole Seiten für ZDB-ID 2835548-9 …
https://api.deutsche-digitale-bibliothek.de/search/index/newspaper-issues/select?rows=1000&sort=id+ASC&q=type%3Apage+AND+zdb_id%3A%222835548-9%22&cursorMark=%2A
Getting 1000 of 111050
Getting 2000 of 111050
Getting 3000 of 111050
Getting 4000 of 111050
Getting 5000 of 111050
Getting 6000 of 111050
Getting 7000 of 111050
Getting 8000 of 111050