In [6]:
"""
phoenix_scrape_single.py   –   v3
holt Kurzbeschreibung + ALLE Tabellen aus den Technical-Data-Accordion-Tabs
"""

import json, re, time, requests
from bs4 import BeautifulSoup, Tag
from typing import Dict

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0 Safari/537.36"
    )
}
BASE_URL = "https://www.phoenixcontact.com/de/produkte/{artnr}"
TIMEOUT = 15


def fetch_soup(article: str) -> BeautifulSoup:
    url = BASE_URL.format(artnr=article)
    r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")


def get_short_description(soup: BeautifulSoup) -> str | None:
    meta = soup.find("meta", {"name": "description"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    p = soup.find(id="pr-short-description")
    return p.get_text(strip=True) if p else None


# ------------------------- helpers ------------------------------------------
def _kv_from_table(tbl: Tag) -> Dict[str, str]:
    kv = {}
    for tr in tbl.select("tr"):
        tds = tr.find_all("td")
        if len(tds) >= 2:
            k = tds[0].get_text(" ", strip=True)
            v = tds[1].get_text(" ", strip=True)
            if k and v:
                kv[k] = v
    return kv


def _kv_from_dl(dl: Tag) -> Dict[str, str]:
    kv = {}
    for dt, dd in zip(dl.find_all("dt"), dl.find_all("dd")):
        k = dt.get_text(" ", strip=True)
        v = dd.get_text(" ", strip=True)
        if k and v:
            kv[k] = v
    return kv


# ------------------ NEU: robustes Accordion-Parsing --------------------------
def parse_tech_tables(soup: BeautifulSoup) -> Dict[str, Dict[str, str]]:
    """
    • findet das Haupt-Accordion „Technische Daten“
    • durchläuft ALLE divs, deren class mit 'accordion__item' beginnt
    """
    tech_root = soup.find(
        lambda t: t.name in ("label", "div", "h2", "h3")
        and "Technische Daten" in t.get_text()
    )
    if not tech_root:
        return {}

    tech: Dict[str, Dict[str, str]] = {}

    # alles nach dem Root bis zum nächsten H2/H3, der NICHT mehr zu Tech-Daten gehört
    for node in tech_root.find_all_next(
        lambda t: t.name == "div"
        and any(cls.startswith("accordion__item") for cls in t.get("class", []))
    ):
        # wenn wir aus dem technischen Daten-Block herauslaufen → abbrechen
        headline = node.find(class_="accordion__item__headline")
        if headline and "Technische Daten" not in headline.get_text():
            break  # nächster großer Abschnitt (z. B. Downloads …)

        # Untersektion?
        sub_label = node.find(class_=re.compile(r"accordion__label--sub"))
        if not sub_label:
            continue  # Top-Item selbst, nicht die Unterpunkte

        section = sub_label.get_text(" ", strip=True)

        content = node.find(class_=re.compile(r"accordion__content--sub"))
        if not content:
            continue

        kv: Dict[str, str] = {}
        for tbl in content.find_all("table"):
            kv.update(_kv_from_table(tbl))
        for dl in content.find_all("dl"):
            kv.update(_kv_from_dl(dl))

        if kv:
            tech[section] = kv

    return tech


# ------------------------- main wrapper -------------------------------------
def scrape_article(article: str) -> dict:
    soup = fetch_soup(article)
    return {
        "article": article,
        "short_description": get_short_description(soup),
        "technical_data": parse_tech_tables(soup),
        "url": BASE_URL.format(artnr=article),
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }


if __name__ == "__main__":
    ARTNR = "3022218"          # ← andere Nummer einsetzen oder Schleife bauen
    data = scrape_article(ARTNR)
    print(json.dumps(data, ensure_ascii=False, indent=2))


{
  "article": "3022218",
  "short_description": "Schnellmontage-Endhalter, für Tragschiene NS 35/7,5 oder NS 35/15, mit Markierungsmöglichkeit, Breite: 9,5 mm, Farbe: grau",
  "technical_data": {
    "Artikeleigenschaften": {
      "Produkttyp": "Endhalter"
    },
    "Maße": {
      "Breite": "9,5 mm",
      "Höhe": "55,6 mm",
      "Tiefe auf NS 35/7,5": "35,5 mm",
      "Länge": "55,6 mm"
    },
    "Materialangaben": {
      "Farbe": "grau (RAL 7042)",
      "Material": "PA",
      "Brennbarkeitsklasse nach UL 94": "V2",
      "Statischer Isolierstoffeinsatz in Kälte": "-60 °C",
      "Temperatur Index Isolierstoff (DIN EN 60216-1 (VDE 0304-21))": "125 °C",
      "Relativer Isolierstoff Temperatur Index (Elec., UL 746 B)": "125 °C",
      "Brandschutz für Schienenfahrzeuge (DIN EN 45545-2) R22": "HL 1 - HL 2",
      "Brandschutz für Schienenfahrzeuge (DIN EN 45545-2) R23": "HL 1 - HL 2",
      "Brandschutz für Schienenfahrzeuge (DIN EN 45545-2) R24": "HL 1 - HL 2"
    },
    "Umwe

In [7]:
# ------------------------------------------------------------
# Requirements: requests, beautifulsoup4, pandas (for CSV only)
# pip install requests beautifulsoup4 pandas
#
# Usage:  python phoenix_bulk_scraper.py
# ------------------------------------------------------------

import json, re, time, csv, sys, pathlib
from typing import Dict, List

import requests
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0 Safari/537.36"
    )
}
BASE_URL = "https://www.phoenixcontact.com/de/produkte/{artnr}"
TIMEOUT = 15
DELAY   = 1          # seconds between requests (be polite)

# ---------------------------------------------------------------------------
# Helpers for one product page
# ---------------------------------------------------------------------------

def fetch_soup(article: str) -> BeautifulSoup | None:
    try:
        url  = BASE_URL.format(artnr=article)
        resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
        resp.raise_for_status()
        return BeautifulSoup(resp.text, "html.parser")
    except requests.RequestException as exc:
        print(f"[WARN] {article}: {exc}")
        return None


def short_description(soup: BeautifulSoup) -> str | None:
    meta = soup.find("meta", {"name": "description"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    p = soup.find(id="pr-short-description")
    return p.get_text(strip=True) if p else None


def kv_from_table(tbl) -> Dict[str, str]:
    kv = {}
    for tr in tbl.select("tr"):
        tds = tr.find_all("td")
        if len(tds) >= 2:
            k = tds[0].get_text(" ", strip=True)
            v = tds[1].get_text(" ", strip=True)
            if k and v:
                kv[k] = v
    return kv


def kv_from_dl(dl) -> Dict[str, str]:
    kv = {}
    for dt, dd in zip(dl.find_all("dt"), dl.find_all("dd")):
        k = dt.get_text(" ", strip=True)
        v = dd.get_text(" ", strip=True)
        if k and v:
            kv[k] = v
    return kv


def technical_data(soup: BeautifulSoup) -> Dict[str, Dict[str, str]]:
    root = soup.find(
        lambda t: t.name in ("label", "div", "h2", "h3") and "Technische Daten" in t.get_text()
    )
    if not root:
        return {}

    tech: Dict[str, Dict[str, str]] = {}

    for node in root.find_all_next(
        lambda t: t.name == "div" and any(cls.startswith("accordion__item") for cls in t.get("class", []))
    ):
        headline = node.find(class_="accordion__item__headline")
        if headline and "Technische Daten" not in headline.get_text():
            break    # next big accordion reached

        label = node.find(class_=re.compile(r"accordion__label--sub"))
        if not label:
            continue

        section = label.get_text(" ", strip=True)
        panel   = node.find(class_=re.compile(r"accordion__content--sub"))
        if not panel:
            continue

        kv: Dict[str, str] = {}
        for tbl in panel.find_all("table"):
            kv.update(kv_from_table(tbl))
        for dl in panel.find_all("dl"):
            kv.update(kv_from_dl(dl))
        if kv:
            tech[section] = kv

    return tech


def scrape_one(article: str) -> Dict:
    soup = fetch_soup(article)
    if not soup:
        return {"article": article, "error": "page not reachable"}

    return {
        "article": article,
        "short_description": short_description(soup) or "",
        "technical_data":   technical_data(soup)
    }

# ---------------------------------------------------------------------------
# Main bulk routine
# ---------------------------------------------------------------------------

def load_numbers(path: str | pathlib.Path) -> List[str]:
    text = pathlib.Path(path).read_text(encoding="utf-8", errors="ignore")
    try:
        data = json.loads(text[text.index("["): text.rindex("]")+1])
        return [item["number"] for item in data if "number" in item]
    except Exception as err:
        sys.exit(f"✖ Could not parse Product.txt: {err}")

def save_json(records: List[Dict], path="products.json"):
    pathlib.Path(path).write_text(json.dumps(records, ensure_ascii=False, indent=2), "utf-8")
    print(f"✔ JSON written to {path}")

def save_csv(records: List[Dict], path="products.csv"):
    # flatten Dict for CSV: only keep simple k/v technical data
    rows = []
    for rec in records:
        base = {"article": rec["article"], "short_description": rec["short_description"]}
        for section, kv in rec["technical_data"].items():
            for key, val in kv.items():
                rows.append({**base, "section": section, "key": key, "value": val})

    with open(path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=["article", "short_description", "section", "key", "value"])
        writer.writeheader()
        writer.writerows(rows)
    print(f"✔ CSV written to {path}")


def main():
    numbers = load_numbers("Product.txt")
    print(f"{len(numbers)} article numbers loaded.")

    results = []
    for art in numbers:
        print(f"⏳ {art} …", end="", flush=True)
        results.append(scrape_one(art))
        print(" done")
        time.sleep(DELAY)

    save_json(results)
    save_csv(results)


if __name__ == "__main__":
    main()


7937 article numbers loaded.
⏳ 0200020 … done
⏳ 0200046 … done
⏳ 0200059 … done
⏳ 0200075 … done
⏳ 0201113 … done
⏳ 0201278 … done
⏳ 0201317 … done
⏳ 0201333 … done
⏳ 0201346 … done
⏳ 0201647 … done
⏳ 0201650 … done
⏳ 0201812 … done
⏳ 0203247 … done
⏳ 0203250 … done
⏳ 0203263 … done
⏳ 0203276 … done
⏳ 0203438 … done
⏳ 0203454 … done
⏳ 0203483 … done
⏳ 0260028 … done
⏳ 0270018 … done
⏳ 0271017 … done
⏳ 0301505 … done
⏳ 0301521 … done
⏳ 0301534 … done
⏳ 0301550 … done
⏳ 0301589 … done
⏳ 0302216 … done
⏳ 0304023 … done
⏳ 0304214 … done
⏳ 0304227 … done
⏳ 0305080 … done
⏳ 0305226 … done
⏳ 0308029 … done
⏳ 0308210 … done
⏳ 0308223 … done
⏳ 0309109 … done
⏳ 0309112 … done
⏳ 0310020 … done
⏳ 0310224 … done
⏳ 0311087 … done
⏳ 0311126 … done
⏳ 0311139 … done
⏳ 0311142 … done
⏳ 0311812 … done
⏳ 0313085 … done
⏳ 0321019 … done
⏳ 0321022 … done
⏳ 0321213 … done
⏳ 0321226 … done
⏳ 0401023 … done
⏳ 0402006 … done
⏳ 0402174 … done
⏳ 0404017 … done
⏳ 0404033 … done
⏳ 0404046 … done
⏳ 0421016 … done
⏳ 

KeyError: 'short_description'