# Generate Components Catalog

## Setup

In [1]:
import getpass
import os

from dotenv import load_dotenv

load_dotenv(override=True)

if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

In [15]:
%pip install -qU langchain-google-genai

Note: you may need to restart the kernel to use updated packages.


In [2]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro-preview-03-25",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)

In [3]:
messages = [
    (
        "system",
        "You are a helpful assistant that translates English to French. Translate the user sentence.",
    ),
    ("human", "I love programming."),
]
ai_msg = llm.invoke(messages)
ai_msg

AIMessage(content='Here are a few options, depending on the nuance you want:\n\n1.  **J\'adore programmer.** (This is the most common and natural way to express strong liking or "love" for an activity like programming. "Adorer" means "to adore" or "to love" in this context.)\n2.  **J\'aime beaucoup programmer.** (This means "I like programming a lot" or "I really like programming", which is also a very good translation for "I love programming".)\n3.  **J\'aime la programmation.** (This uses the noun "programming" and means "I like programming" or "I love programming" - focusing slightly more on the field itself.)\n4.  **J\'adore la programmation.** (Similar to the previous one, but with stronger emphasis using "adorer".)\n\nThe most idiomatic and frequently used translation is **J\'adore programmer.**', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.5-pro-preview-03-25', 'safety_rat

## Simulate Content

In [4]:
from langchain_core.prompts import PromptTemplate

PROMPT_TEMPLATE = PromptTemplate.from_template("""
You are helping simulate a bill of materials (BOM) dataset that will demonstrate the power of GraphRAG (Graph Retrieval Augmented Generation) for BOM and supply chain analysis.
A big topic/concern now is finding the impact of tariffs and supply chain shortages. Specifically how such prices and disruptions for upstream components may effect downstream product costs and availability.

This specific supply chain we are simulating is focused on manufacturing agricultural equipment.

Below is a list of components that are part of a bill of materials. For each component please provide a component catalog entry. The entry should

1. be in the narrative style of a catalog section
2. include information about the raw materials used to make the component and where they are sourced (to provide for the analytics described above)
3. include descriptions about what the component is and its uses.


## Component List
{componentList}
""")

In [5]:
from typing import List
from pydantic import BaseModel, Field


#define inputs
class Component(BaseModel):
    skuId: str = Field(..., description="The unique identifier for the component")
    name: str = Field(..., description="The descriptive name of the component")

class ComponentList(BaseModel):
    components: list[Component] = Field(..., description="A list of components to be included in the catalog")

#define outputs
class CatalogEntry(BaseModel):
    """
    Represents a catalog entry with detailed information for a single component
    """
    skuId: str = Field(..., description="The provided component sku_id to trace back this entry to the product in downstream data processing")
    name: str = Field(..., description="The provided component name to trace back this entry to the product in downstream data processing")
    entry: str = Field(..., description="The catalog entry that includes the raw materials, sourcing, and descriptions")

class CatalogEntries(BaseModel):
    """
    A list of catalog entries
    """
    entries: List[CatalogEntry] = Field(..., description="A list of catalog entries for each component")

In [6]:
llm_for_catalog = llm.with_structured_output(CatalogEntries)

In [7]:
from tqdm.asyncio import tqdm as tqdm_async
import asyncio


def chunks(xs, n=10):
    n = max(1, n)
    return [xs[i:i + n] for i in range(0, len(xs), n)]

async def placeholder_llm(components:List[Component]) -> List[CatalogEntry]:
    component_list = ComponentList(components=components)
    entries = []
    for component in component_list.components:
        entries.append(CatalogEntry(skuId=component.skuId, name=component.name, entry="placeholder entry...."))
    return entries

async def sim_entries(components:List[Component], semaphore) -> List[CatalogEntry]:
    async with semaphore:
        prompt = PROMPT_TEMPLATE.invoke({'componentList': ComponentList(components=components).model_dump_json(indent=4)})
        # pprint(prompt.text)
        # Use structured LLM for extraction
        catalog_entries:CatalogEntries = await llm_for_catalog.ainvoke(prompt)
    return catalog_entries.entries

async def sim_all_entries(components:List[Component], chunk_size=10, max_workers=10) -> List[CatalogEntry]:

    # Create a semaphore with the desired number of workers
    semaphore = asyncio.Semaphore(max_workers)

    # Create tasks with the semaphore
    component_chunks = chunks(components, chunk_size)
    tasks = [sim_entries(component_chunk, semaphore) for component_chunk in component_chunks]

    # Explicitly update progress using `tqdm` as tasks complete
    entries: List[CatalogEntry] = []
    with tqdm_async(total=len(tasks), desc="Simulating Catalog Entries") as pbar:
        for future in asyncio.as_completed(tasks):
            result = await future
            entries.extend(result)
            pbar.update(1)  # Increment progress bar for each completed task
    return entries


In [8]:
import pandas as pd

df = pd.read_csv("source-csvs/items-estimated-tier-6.csv")
df

Unnamed: 0,sku_id,name
0,M1519819X,Fastener_TDE9S
1,KM100605,Bolt_HZU3P
2,M5637H75FX,Fastener_UJIDT
3,KM100373,RubberSeal_QLYES
4,M7687Q89,RubberSeal_7Y9HE
...,...,...
289,KM100368,Rod_A11CF
290,KM100335,RawWire_B2QZM
291,M5272R68X,PrecisionBolt_OP7DY
292,M7669U85,Casting_KKZJL


In [9]:
components = [Component(skuId=comp['sku_id'], name=comp['name']) for comp in df.to_dict('records')]
components[:3]

[Component(skuId='M1519819X', name='Fastener_TDE9S'),
 Component(skuId='KM100605', name='Bolt_HZU3P'),
 Component(skuId='M5637H75FX', name='Fastener_UJIDT')]

In [10]:
entries = await sim_all_entries(components)
entries[:3]

Simulating Catalog Entries: 100%|██████████| 30/30 [02:12<00:00,  4.41s/it]


[CatalogEntry(skuId='M6673J13QX', name='PrecisionBolt_LKBIG', entry='The PrecisionBolt_LKBIG is a high-strength fastener engineered for critical applications within heavy agricultural machinery. Manufactured from a specialized high-carbon steel alloy containing chromium and molybdenum, it offers exceptional tensile strength and fatigue resistance. The primary steel component is sourced from mills in China, utilizing iron ore from Australia. Alloying elements are procured globally, with chromium originating from South Africa and molybdenum sourced from mines in Chile. These bolts are essential for securing high-stress assemblies such as engine mounts, transmission housings, and main chassis connections, ensuring operational integrity under demanding field conditions.'),
 CatalogEntry(skuId='M4882B36X', name='Rod_6TIHG', entry='Rod_6TIHG is a solid medium-carbon steel rod, providing robust performance in linkage and structural applications. The steel is produced in Germany using high-qua

## Write to PDF and Store in Google Cloud Storage

In [17]:
from fpdf import FPDF
from typing import List


# Create a PDF generator
class PDF(FPDF):
    def header(self):
        self.set_font("Arial", size=12)
        self.set_font("Arial", size=16, style="B")  # Larger font size and bold for a professional look
        self.cell(0, 10, "AG Inc. Components Catalog", align="C", ln=True)  # Updated title
        self.ln(10)

    def footer(self):
        # Add a page number at the bottom
        self.set_y(-15)
        self.set_font("Arial", size=8)
        self.cell(0, 10, f"Page {self.page_no()}", align="C")


# Generate PDF
pdf = PDF()

# Add each entry to the PDF in a new page
for entry in entries:
    pdf.add_page()
    # Enlarge and bold the SKU field
    pdf.set_font("Arial", size=14, style="B")
    pdf.cell(0, 10, f"SKU: {entry.skuId}", ln=True)

    # Write the entry's data with normal font
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, entry.entry)

# Output the PDF to a file
pdf.output("component-catalog.pdf")

''

In [20]:
!gsutil cp component-catalog.pdf gs://neo4j-workshop-data/genai-bom

I0000 00:00:1745342058.761571 31881519 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


Copying file://component-catalog.pdf [Content-Type=application/pdf]...
/ [1 files][232.1 KiB/232.1 KiB]                                                
Operation completed over 1 objects/232.1 KiB.                                    
