In [5]:
import requests
from bs4 import BeautifulSoup
import re
import csv
import time

# === Config ===
BASE_URL = "https://anno.onb.ac.at"
HEADERS = {"User-Agent": "Mozilla/5.0"}
TARGET_CITIES = {"wien"}
TARGET_YEAR_RANGE = (1908, 1950)

# === Utilities ===
def get_soup(url, timeout=30):
    """Fetch *url* and return the response body parsed with BeautifulSoup.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection,
            which would freeze the whole crawl.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    time.sleep(0.5)  # polite delay to avoid blocking
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

def extract_newspapers():
    """Return the newspapers from the alphabetical index located in a target city.

    Every ``div.list-item`` on ``/alph_list.htm`` is inspected: the ``aid``
    is taken from the first link, the title from the ``h4`` heading, and the
    place of publication from the "Erscheinungsort" row of the ``zusatz``
    table. Entries that cannot be parsed are reported and skipped.

    Returns:
        A list of ``{"aid": ..., "title": ...}`` dicts for entries whose
        lower-cased place is a member of ``TARGET_CITIES``.
    """
    index_page = get_soup(f"{BASE_URL}/alph_list.htm")
    matches = []

    for entry in index_page.find_all("div", class_="list-item"):
        try:
            href = entry.find("a", href=True)["href"]
            aid = re.search(r"aid=([a-zA-Z0-9]+)", href).group(1)

            heading = entry.find("h4")
            if heading:
                title = heading.get_text(strip=True)
            else:
                title = "Unknown Title"

            details = entry.find("table", class_="zusatz")

            # Later matching rows overwrite earlier ones (last match wins).
            place = None
            for table_row in details.find_all("tr"):
                cells = table_row.find_all("td")
                if len(cells) != 2:
                    continue
                label = cells[0].get_text(strip=True).lower()
                value = cells[1].get_text(strip=True)
                if "erscheinungsort" in label:
                    place = value.lower()

            if place not in TARGET_CITIES:
                continue
            matches.append({"aid": aid, "title": title})
        except Exception as e:
            # Best effort: a malformed entry must not abort the whole listing.
            print(f"⚠️ Fehler beim Parsen eines Eintrags: {e}")

    return matches

def extract_available_years(aid):
    """Return the years within ``TARGET_YEAR_RANGE`` linked for newspaper *aid*.

    Args:
        aid: Newspaper identifier used in the ``aid=`` query parameter.

    Returns:
        A list of ints, in page order, taken from the four digits following
        ``datum=`` in each matching link and limited to the inclusive range
        given by ``TARGET_YEAR_RANGE``.
    """
    overview = get_soup(f"{BASE_URL}/cgi-content/anno?aid={aid}")

    selected = []
    for anchor in overview.select("#content.view-year a[href*='datum=']"):
        found = re.search(r"datum=(\d{4})", anchor.get("href"))
        if not found:
            continue
        year = int(found.group(1))
        if TARGET_YEAR_RANGE[0] <= year <= TARGET_YEAR_RANGE[1]:
            selected.append(year)
    return selected

def extract_issue_dates(aid, year):
    """Return the issue dates linked from the page of newspaper *aid* for *year*.

    Args:
        aid: Newspaper identifier used in the ``aid=`` query parameter.
        year: Year passed as the ``datum=`` query parameter.

    Returns:
        A list of 8-digit strings (presumably YYYYMMDD - confirm against the
        site), one per link inside a ``td.active`` cell, in page order.
    """
    calendar = get_soup(f"{BASE_URL}/cgi-content/anno?aid={aid}&datum={year}")
    date_pattern = re.compile(r"datum=(\d{8})")

    hits = (
        date_pattern.search(anchor.get("href"))
        for anchor in calendar.select("td.active a[href*='datum=']")
    )
    # Links whose href carries no 8-digit date are dropped.
    return [hit.group(1) for hit in hits if hit]

# === Main Execution ===
# Crawl every matching newspaper and collect one row per (newspaper, issue
# date). `rows` accumulates across all papers and is only written to disk
# once the crawl has finished.
rows = []
newspapers = extract_newspapers()

for paper in newspapers:
    aid = paper["aid"]
    title = paper["title"]
    print(f"🔍 Verarbeite Zeitung: {aid}")
    try:
        years = extract_available_years(aid)
        print(f"   📅 Gefundene Jahre: {years}")
        for year in years:
            # One request per year page; each yields the issue dates of that year.
            dates = extract_issue_dates(aid, year)
            for date in dates:
                rows.append({
                    "aid": aid,
                    "title": title,
                    "date": date
                })
    except Exception as e:
        # Best effort: a failure on one newspaper is reported and the crawl
        # continues with the next one. Rows already appended for this paper
        # before the failure are kept.
        print(f"⚠️ Fehler bei Zeitung {aid}: {e}")

# === Save to CSV ===
# Write all collected rows in one pass; an existing file of the same name is
# overwritten. newline="" lets the csv module control line endings itself.
csv_filename = "anno_issues_all_filtered.csv"
with open(csv_filename, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["aid", "title", "date"])
    writer.writeheader()
    writer.writerows(rows)

print(f"\n✅ Fertig. {len(rows)} Ausgaben gespeichert unter {csv_filename}")


🔍 Verarbeite Zeitung: zub
   📅 Gefundene Jahre: [1933]
🔍 Verarbeite Zeitung: vaz
   📅 Gefundene Jahre: [1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950]
🔍 Verarbeite Zeitung: abd
   📅 Gefundene Jahre: [1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934]
🔍 Verarbeite Zeitung: aub
   📅 Gefundene Jahre: [1933]
🔍 Verarbeite Zeitung: adl
   📅 Gefundene Jahre: []
🔍 Verarbeite Zeitung: abn
   📅 Gefundene Jahre: [1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950]
🔍 Verarbeite Zeitung: agp
   📅 Gefundene Jahre: [1938, 1