In [8]:
import getpass
import os

# Optionally load environment variables from a `.env` file. `python-dotenv`
# is a soft dependency: when it is not installed the import fails and the
# notebook simply continues with the variables already in the environment.
try:
    from dotenv import load_dotenv
except ImportError:
    pass
else:
    load_dotenv()

In [9]:
# LangSmith configuration.
# The API key is optional: a blank answer must not be stored as an empty
# key, and tracing is only switched on when a key is actually available.
# Values that are present but empty are treated the same as missing ones.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass(
        prompt="Enter your LangSmith API key (optional): "
    ).strip()

if os.environ["LANGSMITH_API_KEY"]:
    os.environ["LANGSMITH_TRACING"] = "true"
else:
    # No key supplied: drop the blank placeholder and keep tracing off.
    del os.environ["LANGSMITH_API_KEY"]
    os.environ["LANGSMITH_TRACING"] = "false"

if not os.environ.get("LANGSMITH_PROJECT"):
    # The project name is not a secret, so read it with a visible prompt
    # and fall back to "default" when nothing is entered.
    os.environ["LANGSMITH_PROJECT"] = (
        input('Enter your LangSmith Project Name (default = "default"): ').strip()
        or "default"
    )

In [10]:
import getpass
import os

# Prompt for the OpenAI key only when the environment does not already
# provide a non-empty one.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass(prompt="Enter API key for OpenAI: ")

from langchain.chat_models import init_chat_model

# Chat model handle used by the cells that call `model.invoke(...)`.
model = init_chat_model(model="gpt-4o-mini", model_provider="openai")

In [11]:
import csv

# Collect the product page URLs stored in the second column of the CSV.
# csv.reader returns [] for blank lines and short lists for rows with a
# single column, so rows without a second column are skipped instead of
# raising IndexError; cells that are empty or only whitespace are ignored.
with open('specbook.csv', 'r', newline='') as csvfile:
    product_urls = [
        row[1].strip()
        for row in csv.reader(csvfile)
        if len(row) > 1 and row[1].strip()
    ]

In [14]:
import requests

# Fetch every product URL. `html_pages` holds the `requests` Response
# objects, in the same order as `product_urls`.
html_pages = []

# Without headers, website responds with 403 b/c it suspects you're a bot
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
}

# requests applies no timeout unless one is passed, so a stalled server
# would block this cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

for url in product_urls:
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    if not response.ok:
        # Keep the response so the list stays aligned with `product_urls`,
        # but make the failure visible instead of silently storing an
        # error page.
        print(f"Warning: {url} returned HTTP {response.status_code}")
    html_pages.append(response)

In [15]:
# Sanity check: number of URLs read, then number of responses fetched,
# one per line.
print(len(product_urls), len(html_pages), sep="\n")

87
87


In [16]:
from bs4 import BeautifulSoup

# Parse the text of each fetched response with the built-in "html.parser"
# backend; the result keeps the order of `html_pages`.
html_soups = [BeautifulSoup(page.text, "html.parser") for page in html_pages]

In [17]:
# Tags that are removed (together with everything inside them) before any
# text is extracted.
REMOVE_TAGS = [
    "script", "style", "noscript", "svg", "footer", "header",
    "nav", "form", "iframe", "aside", "canvas", "button", "input", "select", "option"
]

# Substrings meant for filtering elements by class/id. That filter is not
# applied by the current pipeline, so this constant is presently unused.
GARBAGE_KEYWORDS = ["cookie", "newsletter", "subscribe", "banner", "social", "share", "advert"]


def _preprocess_soup(soup):
    """Reduce one parsed page to a small JSON-serialisable record.

    The soup is modified in place: every tag listed in REMOVE_TAGS is
    decomposed before extraction.

    Returns a dict with the keys:
      - "title": stripped <title> string, or "" when there is none
      - "metadata": content of <meta> tags keyed by `property` or `name`
      - "text": visible text, one non-empty stripped line per row
      - "images": list of {"src", "alt"} dicts for <img> tags with a src
    """
    # Remove noise tags
    for noise_tag in soup(REMOVE_TAGS):
        noise_tag.decompose()

    # Extract visible text, dropping lines that are empty after stripping
    raw_text = soup.get_text(separator="\n", strip=True)
    visible_text = "\n".join(
        line.strip() for line in raw_text.splitlines() if line.strip()
    )

    # Extract metadata. Tags that carry content but neither a `property`
    # nor a `name` attribute are skipped; otherwise they would all be
    # stored under a single None key. A later tag with the same key
    # overwrites an earlier one.
    metadata = {}
    for meta in soup.find_all("meta"):
        key = meta.get("property") or meta.get("name")
        content = meta.get("content")
        if key and content:
            metadata[key] = content

    # Extract images with alt text (images without a src are ignored)
    images = [
        {"src": img.get("src"), "alt": (img.get("alt") or "").strip()}
        for img in soup.find_all("img")
        if img.get("src")
    ]

    has_title = soup.title and soup.title.string
    return {
        "title": soup.title.string.strip() if has_title else "",
        "metadata": metadata,
        "text": visible_text,
        "images": images
    }


# One record per page, in the same order as `html_soups`.
preprocessed_html = [_preprocess_soup(soup) for soup in html_soups]


In [26]:
import json
# Length in characters of the largest page record once it is serialised
# to JSON (i.e. after pre-processing).
print(max(map(len, map(json.dumps, preprocessed_html))))

# with open('saved_product_pages.json', 'w') as json_file:
#     json.dump(preprocessed_html, json_file)

# with open('saved_product_pages.json', 'r') as json_file:
#     preprocessed_html = json.load(json_file)

25048


In [50]:
from langchain_core.prompts import ChatPromptTemplate


# System prompt sent with every page. `{data}` is the only template
# variable; it receives the JSON dump of one pre-processed page record.
system_template = """
You are a helpful architectural assistant.
Extract structured product information relevant for an architect's specification book from the following pre-processed html product pages.
If you are not 99.9% sure that the information is correct, return the value with the highest probability, including the probability in the value field.

TITLE:
data['title']

METADATA:
data['metadata']

TEXT CONTENT:
data['text']

IMAGES:
data['images']

Return a JSON object with fields from data. You **don’t add extra formatting instructions yourself**
- image_url
- type
- description
- model_no

data:
{data}
"""

prompt_template = ChatPromptTemplate.from_messages(
    [("system", system_template)]
)

# One formatted prompt per pre-processed page, in the same order.
# The input must be a dict keyed by the template variable name:
# `{"data", value}` (with a comma) is a set literal, which would put the
# repr of a set into the prompt instead of the JSON itself.
prompts = [
    prompt_template.invoke({"data": json.dumps(webpage)}) for webpage in preprocessed_html
]

In [51]:
# Collected model replies; re-running this cell starts again from empty.
msgs = list()

In [52]:
# Send three prompts to the model and append the replies to `msgs`.
# The indices used are 1, 2 and 3, so prompts[0] is not sent.
msgs.extend(model.invoke(prompts[idx]) for idx in range(1, 4))

In [55]:
# Show the text content of the first reply stored in `msgs`.
print(msgs[0].content)

{
  "Image URL": "http://www.coloratelierpaint.com/cdn/shop/files/Tadelakt_Stone_White_ea0d9bed-e633-4968-a159-7001007f1e9c.jpg?v=1747186065",
  "Type": "Shower Plaster",
  "Description": "Color Atelier Tadelakt is designed for use in wet areas like showers, tub surrounds, and backsplashes. It can also be used for decorative dry area applications, and outdoors. Tadelakt, an ancient lime plaster finish, dates back to Romans, originated in Morocco. For millennia, it has been appreciated for its beauty, and function. Color Atelier Tadelakt is manufactured in America with the highest quality natural ingredients. The artisanal techniques behind the Tadelakt finish have been modernized, making it a popular choice for both luxury residences and commercial properties. Its burnished smooth, waxed, and water repellent finish results in seamless, elegant, and durable applications. Naturally resistant to mold, mildew, and fungus.",
  "Model No": "",
  "Locations": ""
}
