## Web Scraping

In [None]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException


# === 0. Setup Selenium === 
options = Options()
# options.add_argument("--headless=new")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service("/usr/local/bin/chromedriver"), options=options)



# === 1. Session Cookie and Headers ===
SESSION_COOKIE = "17f98ed892c219e814ccc434836ee213"
HEADERS = {"User-Agent": "Mozilla/5.0"}
COOKIES = {"sid": SESSION_COOKIE}

# === 2. Extract subcomponent IDs and names from a composite component ===
def get_l1_subcomponents(component_id):
    url = f"https://www.bauteileditor.de/elements/general/?e={component_id}&tab=general"
    try:
        driver.get("https://www.bauteileditor.de/favicon.ico")
        driver.delete_all_cookies()
        driver.add_cookie({
            'name': 'sid',
            'value': SESSION_COOKIE,
            'domain': 'www.bauteileditor.de',
            'path': '/',
        })
        driver.get(url)
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, "html.parser")
        name_input = soup.select_one("input[name='name']")
        component_name = name_input["value"] if name_input else None

        linked = soup.select("a.function-link.edit-link[href*='/elements/']")
        sub_ids = [a['href'].split("/")[2] for a in linked]

        labels = soup.select("div.element .element-selector")
        sub_names = [label.get_text(strip=True) for label in labels]

        sub_map = dict(zip(sub_ids, sub_names))
        return component_name, sub_map
    except Exception as e:
        print(f"⚠️ Failed to extract subcomponents from {component_id}: {e}")
        return None, {}

# === 3. Extract layered material data (UUIDs) from a subcomponent ===
def get_l2_materials_layers_selenium(component_id):
    url = f"https://www.bauteileditor.de/elements/{component_id}?tab=general"
    try:
        driver.get("https://www.bauteileditor.de/favicon.ico")
        driver.delete_all_cookies()
        driver.add_cookie({
            'name': 'sid',
            'value': SESSION_COOKIE,
            'domain': 'www.bauteileditor.de',
            'path': '/',
        })
        driver.get(url)
        time.sleep(4)

        toggles = driver.find_elements(By.CSS_SELECTOR, "a.toggle-link[title='Werte anzeigen']")
        for idx, toggle in enumerate(toggles):
            try:
                driver.execute_script("arguments[0].scrollIntoView(true);", toggle)
                time.sleep(0.3)
                driver.execute_script("arguments[0].click();", toggle)
                time.sleep(1)
                print(f"   ✅ Expanded toggle {idx+1}/{len(toggles)}")
            except StaleElementReferenceException:
                print(f"   ⚠️ Toggle {idx+1} skipped due to StaleElementReference")

        soup = BeautifulSoup(driver.page_source, "html.parser")
        materials = []
        layers = soup.select("ol.sortable li.sortable-item")
        print(f"🔍 Component {component_id} → {len(layers)} material blocks")

        for i, layer in enumerate(layers):
            name_tag = layer.select_one(".process-config-selector span")
            material_name = name_tag.get_text(strip=True) if name_tag else None #REVIEW - may need to remove it
            thickness_input = layer.select_one('input[name^="size"]')
            thickness = thickness_input["value"] if thickness_input else None

            print(f"  📦 Layer {i+1}: {material_name} | Thickness: {thickness}")

            rows = layer.select("table.process-databases tr")
            print(f"    🧾 {len(rows)-1} rows found")

            for row in rows[1:]:
                cols = row.find_all("td")
                if len(cols) >= 6:
                    database = cols[0].get_text(strip=True)
                    life_cycle = cols[1].get_text(strip=True)
                    process = cols[3].get_text(strip=True)
                    uuid = cols[5].get_text(strip=True)
                    if uuid:
                        materials.append({
                            "material_name": material_name,
                            "thickness_mm": thickness,
                            "database": database,
                            "life_cycle": life_cycle,
                            "process": process,
                            "uuid": uuid,
                            "subcomponent_id": component_id
                        })

        return materials

    except Exception as e:
        print(f"⚠️ Selenium error on {component_id}: {e}")
        return []

# === 4. Master function: from main components to UUIDs ===
def extract_all_materials_from_main_components(main_component_ids):
    all_data = []
    for main_id in main_component_ids:
        l1_name, sub_map = get_l1_subcomponents(main_id)
        if not sub_map:
            print(f"❌ No subcomponents found for {main_id}")
            continue
        print(f"\n🔗 Main {main_id} → {len(sub_map)} subcomponents")

        for sub_id, sub_name in sub_map.items():
            materials = get_l2_materials_layers_selenium(sub_id)
            if not materials:
                print(f"   — No materials in subcomponent {sub_id}")
                continue
            print(f"   ✅ Subcomponent {sub_id}: {len(materials)} materials")
            for mat in materials:
                mat["main_component_id"] = main_id
                mat["main_component_name"] = l1_name
                mat["subcomponent_name"] = sub_name
                all_data.append(mat)
            time.sleep(0.25)
    return pd.DataFrame(all_data)



# === 5. Run It ===
main_component_ids = all_ids
all_records = []
for t, component_ids in all_ids.items():
    df = extract_all_materials_from_main_components(component_ids)
    df["t_value"] = t
    all_records.append(df)

# Combine everything
df_all = pd.concat(all_records, ignore_index=True)
# df_all = extract_all_materials_from_main_components(main_component_ids)
# df_all.to_csv("all_uuid_materials_from_components.csv", index=False)
# print("\n✅ Saved to all_uuid_materials_from_components.csv")

driver.quit()



🔗 Main 2780 → 3 subcomponents
🔍 Component 1156 → 5 material blocks
  📦 Layer 1: Silicone resin plaster | Thickness: 5
    🧾 -1 rows found
  📦 Layer 2: Zementestrich - IWM | Thickness: 65
    🧾 -1 rows found
  📦 Layer 3: PE foil, dimpled | Thickness: 0.2
    🧾 -1 rows found
  📦 Layer 4: Perlites 0-1 | Thickness: 20
    🧾 -1 rows found
  📦 Layer 5: Bitumen sheets V 60 | Thickness: 5
    🧾 -1 rows found
   — No materials in subcomponent 1156
🔍 Component 1646 → 1 material blocks
  📦 Layer 1: Ready-mix concrete C25/30 | Thickness: 250
    🧾 -1 rows found
   — No materials in subcomponent 1646
🔍 Component 2678 → 1 material blocks
  📦 Layer 1: Extruded polystyrene (XPS) | Thickness: 260
    🧾 -1 rows found
   — No materials in subcomponent 2678

🔗 Main 3632 → 4 subcomponents
🔍 Component 2680 → 2 material blocks
  📦 Layer 1: Natural stone slab, flexible, indoor usage (20 mm) | Thickness: 30
    🧾 -1 rows found
  📦 Layer 2: Tile adhesive | Thickness: 10
    🧾 -1 rows found
   — No materials in

In [9]:
df_all.to_csv("all_uuid_materials_from_components.csv", index=False)