In [1]:
%%capture --no-stderr
%pip install --quiet -U langgraph lxml chromadb langchain-chroma

In [2]:
#Fix notebook asyncio bug
import nest_asyncio
nest_asyncio.apply()

In [3]:
import os

os.environ["USER_AGENT"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"

## Website scraper

In [4]:
from langchain_community.document_loaders.sitemap import SitemapLoader
from bs4 import BeautifulSoup
import re

products_the_row_us = []

class Product:
    def __init__(self, title, image, price_amount_regular, price_currency_regular, description, gender):
        self.title = title
        self.image = image
        self.price_amount_regular = price_amount_regular
        self.price_currency_regular = price_currency_regular
        self.description = description
        self.gender = gender

    def as_string(self):
        return getattr(self, "title", "null") + "|" + getattr(self, "price_currency_regular", "null") + " " + getattr(self, "price_amount_regular", "null") + "|" + getattr(self, "description", "null") + "|" + getattr(self, "gender", "null")+ "|" + getattr(self, "image", "null")

def get_gender(value: BeautifulSoup) -> str:
    if "women" in (value["data-tags"].lower()):
        result_gender = "women"
    elif "men" in (value["data-tags"].lower()):
        result_gender = "men"
    else:
        result_gender = "neutral"
    return result_gender

def sanitize_html(value: BeautifulSoup) -> str:
    if value is None:
        return "null"
    if isinstance (value, str):
        return value.strip().replace("|", " ").replace("\u00A0", " ").replace("\n", " ").replace("\r", " ")
    return value.get_text().strip().replace("|", " ").replace("\u00A0", " ").replace("\n", " ").replace("\r", " ")

def get_content_from_property(property: BeautifulSoup) -> str:
    return property.attrs['content']
    
def the_row_product_parser_us(content: BeautifulSoup) -> str:
    result_title = content.find("meta", property="og:title")
    result_image = content.find("meta", property="og:image")
    result_price_amount_regular = content.find("meta", property="product:price:amount")
    result_price_currency_regular = content.find("meta", property="product:price:currency")
    result_description = content.find("meta", property="og:description")
    result_gender_decider = content.find("div", attrs = {"data-app": "eastsideco_sizeGuides"})
    result_gender = get_gender(result_gender_decider)
    product = Product(title = get_content_from_property(result_title), 
                      image = get_content_from_property(result_image), 
                      price_amount_regular = get_content_from_property(result_price_amount_regular), 
                      price_currency_regular = get_content_from_property(result_price_currency_regular), 
                      description = get_content_from_property(result_description),  
                      gender = sanitize_html(result_gender))
    products_the_row_us.append(product)
    return product.as_string()

# Regex pattern
pattern = r"^https:\/\/www\.therow\.com\/products\/.*"


the_row_sitemap_loader_us = SitemapLoader(
    web_path="https://www.therow.com/sitemap_products_1.xml?from=6958080032873&to=14625140081012",
    filter_urls=[pattern],
    parsing_function=the_row_product_parser_us
)

## Scrape the website -- The Row US

In [5]:
documents_the_row_us = the_row_sitemap_loader_us.load()


etching pages: 100%|#########################| 848/848 [04:06<00:00,  3.44it/s]

In [6]:
print(documents_the_row_us[0])
print(products_the_row_us[0].as_string())

page_content='Luke T-Shirt Black in Cotton – The Row|EUR 380,00|Short sleeve t-shirt in soft, fluid Supima cotton jersey with ribbed neckline. 100% Cotton Made in Italy Style: 120K291BLK|men|http://www.therow.com/cdn/shop/products/120K291BLKF.jpg?v=1665786846' metadata={'source': 'https://www.therow.com/products/luke-t-shirt-black', 'loc': 'https://www.therow.com/products/luke-t-shirt-black', 'lastmod': '2025-02-19T10:34:34-05:00', 'changefreq': 'daily'}
Luke T-Shirt Black in Cotton – The Row|EUR 380,00|Short sleeve t-shirt in soft, fluid Supima cotton jersey with ribbed neckline. 100% Cotton Made in Italy Style: 120K291BLK|men|http://www.therow.com/cdn/shop/products/120K291BLKF.jpg?v=1665786846


## Storage setup

## Create the .csv file

In [7]:
with open('products_the_row_us.csv', "a+") as f:
    data = f.read()

with open('products_the_row_us.csv', 'w') as f:
    for i, p in enumerate(products_the_row_us):
        f.write(str(i) +"|" + p.as_string() + "\n")

## Create the embeddings database with ChromaDB

source: https://python.langchain.com/docs/integrations/vectorstores/chroma/

In [8]:
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
import chromadb

# Custom class to fix the signature mismatch for the embeddings function
class CustomOllamaEmbeddings(OllamaEmbeddings):

    def __init__(self, model, *args, **kwargs):
        super().__init__(model=model, *args, **kwargs)
        
    def _embed_documents(self, texts):
        return super().embed_documents(texts)  # <--- use OllamaEmbeddings's embedding function

    def __call__(self, input):
        return self._embed_documents(input)    # <--- get the embeddings

embeddings = CustomOllamaEmbeddings(model="mxbai-embed-large")

collection_name = "the-row-products"

persistent_client = chromadb.PersistentClient(path="./chroma_products_the_row_us")

vector_store_from_client = Chroma(
    client=persistent_client,
    collection_name=collection_name,
    embedding_function=embeddings
)

collection = persistent_client.get_collection(name=collection_name, embedding_function=embeddings)

retriever = vector_store_from_client.as_retriever()

## Enriching documents' metadata

In [9]:
# Setting the locale to handle US price numbers with comma separators, e.g. 1.900,00
import locale
locale.setlocale(locale.LC_ALL, 'de_DE')

for i, d in enumerate(documents_the_row_us):
    d.metadata["title"] = products_the_row_us[i].title
    d.metadata["image"] = products_the_row_us[i].image
    d.metadata["price_amount_regular"] = float(locale.atof(products_the_row_us[i].price_amount_regular.replace('.','')))
    d.metadata["price_currency_regular"] = products_the_row_us[i].price_currency_regular
    d.metadata["gender"] = products_the_row_us[i].gender

### Add products to the vector store

In [10]:
from uuid import uuid4

uuids = [str(uuid4()) for _ in range(len(documents_the_row_us))];

vector_store_from_client.add_documents(ids=uuids, documents=documents_the_row_us);

In [11]:
retriever_output = retriever.invoke("pants black wool")
print(retriever_output)

[Document(id='db724b00-552f-49dd-97b0-060740f6e7ee', metadata={'changefreq': 'daily', 'gender': 'men', 'image': 'http://www.therow.com/cdn/shop/files/279W1842BLKF.jpg?v=1682604735', 'lastmod': '2025-01-14T08:24:06-05:00', 'loc': 'https://www.therow.com/products/elijah-pants-black', 'price_amount_regular': 1410.0, 'price_currency_regular': 'EUR', 'source': 'https://www.therow.com/products/elijah-pants-black', 'title': 'Elijah Pants Black in Wool – The Row'}, page_content='Elijah Pants Black in Wool – The Row|EUR 1.410,00|Mid-rise straight leg pant in soft wool gabardine with tailored waistband construction and side slash pockets. 100% Wool Lined in 100% Cotton Made in USA Style: 279W1842BLK|men|http://www.therow.com/cdn/shop/files/279W1842BLKF.jpg?v=1682604735'), Document(id='3a693ee0-156e-4066-902d-1edf86c6a4b4', metadata={'changefreq': 'daily', 'gender': 'men', 'image': 'http://www.therow.com/cdn/shop/files/279W1842BLKF.jpg?v=1682604735', 'lastmod': '2025-02-19T10:34:34-05:00', 'loc':

In [12]:
# Query using metadata filter

collection.query(
    query_texts=["slim fit black wool trouser"],
    n_results=2,
    where={"$and":[{"gender": "men"},{"price_amount_regular": {"$lt":1300}}]}
)

{'ids': [['ff90eb8f-d46d-4adc-8a2e-61803a2c3be8',
   'bb5c006b-e113-4647-9c45-ec1267900ca0']],
 'embeddings': None,
 'documents': [['Rolf Pant Black in Cotton – The Row|EUR 920,00|Cropped, slim-fit trouser in soft brushed cotton with front double pleats, pressed front and back creases, and double welt pockets at back. 100% Cotton Made in Italy Style: 754W3241BLK|men|http://www.therow.com/cdn/shop/files/754W3241BLKF.jpg?v=1738442282',
   'Rolf Pant Black in Cotton – The Row|EUR 920,00|Cropped, slim-fit trouser in soft brushed cotton with front double pleats, pressed front and back creases, and double welt pockets at back. 100% Cotton Made in Italy Style: 754W3241BLK|men|http://www.therow.com/cdn/shop/files/754W3241BLKF.jpg?v=1720020243']],
 'uris': None,
 'data': None,
 'metadatas': [[{'changefreq': 'daily',
    'gender': 'men',
    'image': 'http://www.therow.com/cdn/shop/files/754W3241BLKF.jpg?v=1738442282',
    'lastmod': '2025-02-19T10:34:34-05:00',
    'loc': 'https://www.therow.co