In [1]:
# Load the saved DHL notification email that the extraction helpers below parse.
# The encoding is pinned so decoding does not depend on the platform's locale
# default (the mail contains non-ASCII characters such as umlauts).
# NOTE(review): assumes dhl_test.html was saved as UTF-8 -- confirm.
with open("dhl_test.html", "r", encoding="utf-8") as f:
    html = f.read()

In [None]:
import re
from bs4 import BeautifulSoup

def find_item(html):
    """
    Extract the item name from DHL notification email HTML.

    The lookup walks the markup in four hops:
      1. the <span class="rio_15_grey"> whose text matches the word "ARTIKEL",
      2. the <tr> enclosing that span,
      3. the next <tr> sibling of that row,
      4. the <span class="rio_15_heavy_black"> inside that sibling row.

    Args:
        html: HTML source of the email as a string.

    Returns:
        The inner markup of the item span with surrounding whitespace
        stripped, or "Unknown item" if any hop finds nothing.
    """
    fallback = "Unknown item"
    document = BeautifulSoup(html, "html.parser")

    # Each hop is attempted only when the previous one produced an element,
    # so a missing element anywhere simply falls through to the fallback.
    label = document.find("span", class_="rio_15_grey", string=re.compile(r"\bARTIKEL\b"))
    header_row = label.find_parent("tr") if label else None
    value_row = header_row.find_next_sibling("tr") if header_row else None
    name_span = value_row.find("span", class_="rio_15_heavy_black") if value_row else None

    if not name_span:
        return fallback

    # formatter="html" keeps entities such as &reg; instead of emitting the
    # literal character.
    return name_span.decode_contents(formatter="html").strip()


In [3]:
# Run the item extraction on the loaded email; the cell displays the returned
# string (the item markup, or "Unknown item" when the lookup fails).
find_item(html)

'WOCVRYY Autositz Organizer...'

In [None]:
def find_adress(html):
    """
    Extract the address from DHL notification email HTML.

    Starting from the <span> whose text matches "ABHOLORT", the function goes
    to the enclosing <tr>, skips the row directly after it, and reads the
    location name from the second following row and the street address from
    the third.

    Args:
        html: HTML source of the email as a string.

    Returns:
        The formatted address like "Packstation 158, Südhöhe 38", or
        "Address not found" if any of the expected elements is missing.
    """
    not_found = "Address not found"
    soup = BeautifulSoup(html, "html.parser")

    # Find the span containing "ABHOLORT"
    abholort_span = soup.find("span", string=re.compile(r"ABHOLORT"))
    if not abholort_span:
        return not_found

    # Navigate to the tr that contains the address information
    abholort_tr = abholort_span.find_parent("tr")
    if not abholort_tr:
        return not_found

    # The location name is in the 2nd tr after the header. Each hop is checked
    # on its own: chaining the two lookups would raise AttributeError when the
    # header row has no following row instead of returning the fallback.
    skipped_tr = abholort_tr.find_next_sibling("tr")
    if not skipped_tr:
        return not_found

    location_tr = skipped_tr.find_next_sibling("tr")
    if not location_tr:
        return not_found

    # The street address is in the next tr
    address_tr = location_tr.find_next_sibling("tr")
    if not address_tr:
        return not_found

    # Extract the text and clean it
    location_text = location_tr.get_text(strip=True)
    address_text = address_tr.get_text(strip=True)

    # Format the complete address
    return f"{location_text}, {address_text}"

In [15]:
# Run the address extraction on the loaded email; the cell displays the
# returned "<location>, <street>" string (or "Address not found").
find_adress(html)

'Packstation 158, Südhöhe 38'