In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Replace with your actual session cookie from Chrome
SESSION_COOKIE = "17f98ed892c219e814ccc434836ee213"

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Cookie": f"sid={SESSION_COOKIE}"  # replace key name if it's different
}

def fetch_component_materials(component_id: int):
    url = f"https://www.bauteileditor.de/elements/#!/elements/{component_id}?tab=general&_isBaseReq=true"
    response = requests.get(url, headers=HEADERS)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch component {component_id}")
    soup = BeautifulSoup(response.text, "html.parser")

    materials = []
    for li in soup.select("#element-composite ol.sortable li .element-selector"):
        material = li.text.strip()
        if material:
            materials.append(material)

    return materials

# Collect data
component_data = []
for cid in range(558048, 558050):  # <-- Adjust range as needed
    try:
        materials = fetch_component_materials(cid)
        if materials:
            component_data.append({
                "component_id": cid,
                "materials": materials,
                "material_count": len(materials)
            })
            print(f"✔️ {cid}: {materials}")
        else:
            print(f"⚪ {cid}: No materials")
    except Exception as e:
        print(f"❌ {cid}: Error - {e}")
    time.sleep(0.5)  # be polite to the server

# Flatten and save to CSV
rows = []
for item in component_data:
    for mat in item["materials"]:
        rows.append({"component_id": item["component_id"], "material": mat})

df = pd.DataFrame(rows)
df

⚪ 558048: No materials
⚪ 558049: No materials


In [None]:
import requests
from bs4 import BeautifulSoup

# Replace with the name/value from your browser exactly
COOKIE_NAME = "sid"
COOKIE_VALUE = "17f98ed892c219e814ccc434836ee213"

def fetch_component_materials(component_id: int):
    url = f"https://www.bauteileditor.de/elements/general/?e={component_id}&tab=general"
    
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Cookie": f"sid={COOKIE_VALUE}"  # replace key name if it's different
    }

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract materials from HTML structure
    materials = []
    for li in soup.select("#element-composite ol.sortable li .element-selector"):
        materials.append(li.text.strip())

    return materials

# Test it
component_id = 558048
materials = fetch_component_materials(component_id)
print(f"Component {component_id} materials:", materials)



Component 558048 materials: []


In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# 💡 1. Set your session cookie (grabbed from browser)
COOKIE_VALUE = "17f98ed892c219e814ccc434836ee213"

# 💡 2. Get UUID from a material's detail page
def get_material_uuid(material_id):
    url = f"https://www.bauteileditor.de/elements/general/?e={material_id}&tab=general"
    headers = {"User-Agent": "Mozilla/5.0"}
    cookies = {"sid": COOKIE_VALUE}

    try:
        response = requests.get(url, headers=headers, cookies=cookies, timeout=10)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, "html.parser")
        uuid_elem = soup.find("span", class_="selection-value", string=lambda x: x and "-" in x)
        return uuid_elem.text.strip() if uuid_elem else None
    except Exception as e:
        print(f"UUID fetch failed for {material_id}: {e}")
        return None

# 💡 3. For each component, get its name and linked materials
def fetch_component_data(component_id):
    url = f"https://www.bauteileditor.de/elements/general/?e={component_id}&tab=general"
    headers = {"User-Agent": "Mozilla/5.0"}
    cookies = {"sid": COOKIE_VALUE}

    try:
        response = requests.get(url, headers=headers, cookies=cookies, timeout=10)
        if response.status_code != 200:
            return []

        soup = BeautifulSoup(response.text, "html.parser")

        # Component name (e.g., "Außenwand / einschaliges Mauerwerk / erdberührt")
        name_input = soup.select_one("input[name='name']")
        component_name = name_input["value"] if name_input else None

        materials_block = soup.select("#element-composite ol.sortable li")
        results = []

        for li in materials_block:
            name_tag = li.select_one(".element-selector")
            link_tag = li.select_one("a[href*='/elements/'][href*='tab=general']")

            if name_tag and link_tag:
                material_name = name_tag.text.strip()
                material_id = link_tag["href"].split("/")[2]  # e.g., /elements/558049/
                material_uuid = get_material_uuid(material_id)

                results.append({
                    "component_id": component_id,
                    "component_name": component_name,
                    "material_name": material_name,
                    "material_id": material_id,
                    "material_uuid": material_uuid
                })

        return results

    except Exception as e:
        print(f"❌ Component {component_id} failed: {e}")
        return []

# 💡 4. Loop over component IDs
all_data = []
start_id = 1900
end_id = 1910  # expand this when stable

for cid in range(start_id, end_id):
    materials_info = fetch_component_data(cid)
    if materials_info:
        all_data.extend(materials_info)
        print(f"✅ {cid} → {len(materials_info)} materials")
    else:
        print(f"— {cid} skipped")
    time.sleep(0.25)  # delay to avoid hammering the server

# 💡 5. Save results
df = pd.DataFrame(all_data)
df.to_csv("component_materials_with_uuids.csv", index=False)
print("📦 Saved to component_materials_with_uuids.csv")


— 1900 skipped
— 1901 skipped
— 1902 skipped
— 1903 skipped
— 1904 skipped
— 1905 skipped
— 1906 skipped
— 1907 skipped
✅ 1908 → 3 materials
— 1909 skipped
📦 Saved to component_materials_with_uuids.csv


## Script to Scrape Category Overview Page
Here's an example script to scrape all components from one category (e.g. 330 exterior walls, t=246)
The URL you're referring to is a client-side rendered (JavaScript) app (with a #!/ in the path), which means that the content (like the component list) is not available in the raw HTML of the page source fetched with requests. That’s why BeautifulSoup is returning an empty result. We will use selenium instead

In [43]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re
import pandas as pd
import time

# Setup headless Chrome
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

def fetch_component_name_and_id_selenium(url):
    driver.get(url)
    time.sleep(5)  # wait for JS to load

    soup = BeautifulSoup(driver.page_source, "html.parser")

    components = []

    for h2 in soup.select("h2.headline.ref-element"):
        text = h2.get_text(strip=True)
        match = re.match(r"(.*)\[(\d+)\]$", text)
        if match:
            name, comp_id = match.groups()
            components.append({
                "component_id": int(comp_id),
                "component_name": name.strip()
            })

    return pd.DataFrame(components)

# Run
url = "https://www.bauteileditor.de/elements/#!/elements/list/?t=246"
df = fetch_component_name_and_id_selenium(url)
print(df)

driver.quit()


NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


In [49]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Set Chrome options
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# 👇 Correct way to provide the ChromeDriver path
service = Service("/usr/local/bin/chromedriver")

# Create driver with service and options
driver = webdriver.Chrome(service=service, options=options)

# Test run
driver.get("https://www.google.com")
print(driver.title)

driver.quit()




Google


In [52]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import pandas as pd
import time

# === STEP 1: Setup Chrome ===
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
service = Service("/usr/local/bin/chromedriver")

driver = webdriver.Chrome(service=service, options=options)

# === STEP 2: Visit base domain to enable cookie injection ===
driver.get("https://www.bauteileditor.de")
time.sleep(1)

# === STEP 3: Add your session cookie ===
driver.add_cookie({
    'name': 'sid',
    'value': '17f98ed892c219e814ccc434836ee213',  # <-- YOUR SID HERE
    'domain': 'www.bauteileditor.de',
    'path': '/',
})

# === STEP 4: Now visit the protected page ===
driver.get("https://www.bauteileditor.de/elements/#!/elements/list/?t=246")
time.sleep(3)

# === STEP 5: Scroll to load content ===
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# === STEP 6: Extract component names and IDs ===
WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, "ref-element"))
)

elements = driver.find_elements(By.CLASS_NAME, "ref-element")

component_data = []
for el in elements:
    text = el.text.strip()
    match = re.search(r"(.*)\[(\d+)\]", text)
    if match:
        name = match.group(1).strip()
        comp_id = int(match.group(2))
        component_data.append({"id": comp_id, "name": name})

print(driver.current_url)
print(driver.title)
print(driver.page_source[:1000])  # Show the first 1000 chars of the HTML


# === STEP 7: Convert to DataFrame ===
df = pd.DataFrame(component_data)
print(df.head())
driver.quit()


TimeoutException: Message: 
