In [None]:
%%capture --no-stderr
%pip install --quiet -U langgraph lxml chromadb langchain-chroma

In [None]:
#Fix notebook asyncio bug
import nest_asyncio
nest_asyncio.apply()

In [None]:
import os, getpass

def _set_env(var: str):
    # Check if the variable is set in the OS environment
    env_value = os.environ.get(var)
    if not env_value:
        # If not set, prompt the user for input
        env_value = getpass.getpass(f"{var}: ")
    
    # Set the environment variable for the current process
    os.environ[var] = env_value

_set_env("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "souer-product-store"

os.environ["USER_AGENT"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"

## Llama Client

In [None]:
from langchain_ollama import ChatOllama
# Initialize the Ollama LLM with the Llama 3 model
llm = ChatOllama(model="llama3.2", temperature=0, num_ctx = 24576)

## Website scraper

In [None]:
from langchain_community.document_loaders.sitemap import SitemapLoader
from bs4 import BeautifulSoup
import re


soeur_products = []

class Product:
    def __init__(self, title, price_regular, description, fabrication):
        self.title = title
        self.price_regular = price_regular
        self.description = description
        self.fabrication = fabrication

    def as_string(self):
        return str(getattr(self, "title", "null") + "|" + getattr(self, "price_regular", "null") + "|" + getattr(self, "description", "null") + "|" + getattr(self, "fabrication", "null"))

def sanitize_html(value: BeautifulSoup) -> str:
    if value is None:
        return "null"
    if isinstance (value, str):
        return value.strip().replace("|", " ").replace("\u00A0", " ").replace("\n", " ").replace("\r", " ")
    return value.get_text().strip().replace("|", " ").replace("\u00A0", " ").replace("\n", " ").replace("\r", " ")

def soeur_product_parser(content: BeautifulSoup) -> str:
    result_title = content.find("div", attrs={"class": "product__title"})
    result_price_regular = content.find("span", attrs={"class": "price-item price-item--regular"})
    result_description = content.find("div", attrs={"class": "tab", "data-target": "description"})
    result_fabrication = content.find("div", attrs={"class": "tab", "data-target": "fabrication"})
    product = Product(title = sanitize_html(result_title), price_regular = sanitize_html(result_price_regular), description = sanitize_html(result_description), fabrication = sanitize_html(result_fabrication))
    soeur_products.append(product)
    return product.as_string()

# Regex pattern
pattern = r"https?://[^/]+/en/products/[^/]+"


soeur_sitemap_loader = SitemapLoader(
    web_path="https://www.soeur.fr/en/sitemap_products_1.xml?from=5151806455948&to=14995974619510",
    filter_urls=[pattern],
    parsing_function=soeur_product_parser
)

## Scrape the website -- Soeur

In [None]:
documents_soeur = soeur_sitemap_loader.load()

In [None]:
print(documents_soeur[0])
print(soeur_products[0].as_string())

## Storage setup

## Create the .csv file

In [None]:
with open('products_souer.csv', "a+") as f:
    data = f.read()

with open('products_souer.csv', 'w') as f:
    for i, p in enumerate(soeur_products):
        f.write(str(i) +"|" + p.as_string() + "\n")

## Create the embeddings database with ChromaDB

source: https://python.langchain.com/docs/integrations/vectorstores/chroma/

In [78]:
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
import chromadb

# Custom class to fix the signature mismatch for the embeddings function
class CustomOllamaEmbeddings(OllamaEmbeddings):

    def __init__(self, model, *args, **kwargs):
        super().__init__(model=model, *args, **kwargs)
        
    def _embed_documents(self, texts):
        return super().embed_documents(texts)  # <--- use OllamaEmbeddings's embedding function

    def __call__(self, input):
        return self._embed_documents(input)    # <--- get the embeddings

embeddings = CustomOllamaEmbeddings(model="mxbai-embed-large")

collection_name = "soeur-products"

# Emptying the collection
persistent_client = chromadb.PersistentClient(path="./chroma_products_souer")
persistent_client.delete_collection(name=collection_name)

collection = persistent_client.get_or_create_collection(name=collection_name, embedding_function=embeddings)

vector_store_from_client = Chroma(
    client=persistent_client,
    collection_name=collection_name,
    embedding_function=embeddings
)

retriever = vector_store_from_client.as_retriever()

## Enriching documents' metadata

In [79]:
# Setting the locale to handle price numbers
import locale
locale.setlocale(locale.LC_ALL, 'de_DE')

# Import the function to get the product category using a LLM
from product_category import get_product_category

for i, d in enumerate(documents_soeur):
    d.metadata["key"] = i
    d.metadata["title"] = soeur_products[i].title
    d.metadata["price_regular"] = float(locale.atof(soeur_products[i].price_regular.replace("€","").replace(',','')))
    d.metadata["gender"] = "women"
    d.metadata["category"] = get_product_category(d.page_content)

### Add products to the vector store

In [80]:
from uuid import uuid4

uuids = [str(uuid4()) for _ in range(len(documents_soeur))];

vector_store_from_client.add_documents(ids=uuids, documents=documents_soeur);

In [81]:
retriever_output = retriever.invoke("pants black wool")
print(retriever_output)

[Document(id='344e5235-c8f3-42ab-97dd-de47dbb73635', metadata={'category': 'Trousers', 'changefreq': 'daily', 'gender': 'women', 'key': 70, 'lastmod': '2025-02-26T09:42:46+01:00', 'loc': 'https://www.soeur.fr/en/products/pantalon-felix-e23', 'price_regular': 145.0, 'source': 'https://www.soeur.fr/en/products/pantalon-felix-e23', 'title': 'felix trousers Black wool gabardine trousers    FELIX BLACK TROUSERS'}, page_content="felix trousers Black wool gabardine trousers    FELIX BLACK TROUSERS|145.00 €|Black woollen gabardine trousers - Straight and loose cut- Adjustable lower leg thanks to two buttons- Slant pockets on the front - Welt pockets on the back  The model is 1m80 (5'11) tall and wears a size 38.Reference: PAN1102FELIX24WNOI01|55% POLYESTER 45% NEW WOOL Made In Romania"), Document(id='35a03e9f-ee03-442e-9dfe-0c04b7fe6878', metadata={'category': 'Trousers', 'changefreq': 'daily', 'gender': 'women', 'key': 37, 'lastmod': '2025-02-26T09:42:46+01:00', 'loc': 'https://www.soeur.fr/e

In [94]:
# Query using metadata filter

product_query = "slim fit black wool pants"
product_category = get_product_category(product_query)

collection.query(
    query_texts=product_query,
    n_results=2,
    where={"$and":[{"gender": "women"},{"category": product_category},{"price_regular": {"$lt":130}}]}
)

product_ids = [999999999]

collection.query(
    query_texts=product_query,
    where={"key": { "$in": product_ids }}
)

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'data': None,
 'metadatas': [[]],
 'distances': [[]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}