In [31]:
!gcloud auth application-default login > /dev/null 2>&1

In [46]:
import os
import random

os.makedirs("source-data", exist_ok=True)


In [47]:
tier_archetypes = {
    "Tier6": {
        "RSEL": "RubberSeal",
        "PBLT": "PrecisionBolt",
        "BOLT": "Bolt",
        "TUBE": "Tubing",
        "FAST": "Fastener",
        "WIRE": "RawWire",
        "CAST": "Casting",
        "MROD": "Rod"
    },
    "Tier5": {
        "HFIT": "HydraulicFitting",
        "WRLM": "WiringLoom",
        "BEAR": "Bearing",
        "GSET": "GearSet",
        "GEAR": "Gear",
        "BELT": "Belt",
        "SENS": "Sensor",
        "HARN": "Harness",
        "PULL": "Pulley",
        "SPRG": "Spring",
        "BUSH": "Bushing",
        "CLMP": "Clamp"
    },
    "Tier4": {
        "PUMP": "PumpAssembly",
        "SUNT": "SensorUnit",
        "CVLV": "ControlValve",
        "MDRV": "MotorDrive",
        "CONN": "Connector",
        "CYLR": "Cylinder"
    },
    "Tier3": {
        "ENGC": "EngineCore",
        "AXLE": "AxleAssembly",
        "SCLM": "SteeringColumn",
        "EBOX": "ElectricalControlBox",
        "FRME": "FrameSegment",
        "CMOD": "ComponentModule",
        "SMOD": "SensorModule",
        "CTRL": "ControlUnit",
        "BORD": "Board"
    },
    "Tier2": {
        "MFRM": "MachineFrame",
        "MARM": "MachineArm",
        "OCAB": "OperatorCab",
        "PMOD": "PowerSystemModule",
        "TMKT": "ToolMountKit"
    },
    "Tier1": {
        "DPLT": "DrivePlatform",
        "HSYS": "HydraulicSystem",
        "CFRM": "ChassisFrame",
        "CWRE": "CabWiringUnit",
        "MRIG": "MachineRig",
        "CASM": "ControlAssembly"
    },
    "Tier0": {
        "FTRC": "FarmTractor",
        "CHRV": "CropHarvester",
        "HCOL": "HayCollector",
        "FSPR": "FieldSprayer",
        "SPLT": "SeedPlanter"
    }
}

items_per_tier_arch = {"Tier6": (20,50), "Tier5": (40,90), "Tier4": (40,90), "Tier3": (20,50), "Tier2": (10,40), "Tier1": (8,20), "Tier0": (3,10)}

In [48]:
import pandas as pd

random.seed(7474) #set seed so simulation is consistent
bom_items = []
for tier, archetypes in tier_archetypes.items():
    sample_size_window = items_per_tier_arch[tier]
    for arch_sku, archetype in archetypes.items():
        num_items = random.randint(sample_size_window[0], sample_size_window[1])
        for i in range(num_items):
            sku_id = f"{arch_sku}{tier.replace('Tier', '')}_{i:04d}"
            bom_items.append({
                "sku_id": sku_id,
                "family": archetype,
                "tier":tier})
items_df = pd.DataFrame(bom_items)
items_df.to_csv("source-data/items-stage.csv", index=False)
items_df

Unnamed: 0,sku_id,family,tier
0,RSEL6_0000,RubberSeal,Tier6
1,RSEL6_0001,RubberSeal,Tier6
2,RSEL6_0002,RubberSeal,Tier6
3,RSEL6_0003,RubberSeal,Tier6
4,RSEL6_0004,RubberSeal,Tier6
...,...,...,...
1923,SPLT0_0002,SeedPlanter,Tier0
1924,SPLT0_0003,SeedPlanter,Tier0
1925,SPLT0_0004,SeedPlanter,Tier0
1926,SPLT0_0005,SeedPlanter,Tier0


In [49]:
import getpass
from dotenv import load_dotenv

load_dotenv(override=True)

if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

In [50]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro-preview-05-06",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

messages = [
    (
        "system",
        "You are a helpful assistant that translates English to French. Translate the user sentence.",
    ),
    ("human", "I love programming."),
]
ai_msg = llm.invoke(messages)
ai_msg

AIMessage(content='J\'aime la programmation.\n\nYou could also say:\n*   **J\'adore la programmation.** (This implies a stronger feeling, like "I adore programming.")\n*   **J\'aime programmer.** (This means "I love to program," focusing on the action.)\n*   **J\'adore programmer.** (This means "I adore programming/to program," also focusing on the action with strong emphasis.)\n\nThe first option, **J\'aime la programmation.**, is a direct and common translation.', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'models/gemini-2.5-pro-preview-05-06', 'safety_ratings': []}, id='run-71b999f6-d9a4-48a7-8ff3-b22778bf9fe6-0', usage_metadata={'input_tokens': 21, 'output_tokens': 110, 'total_tokens': 4600, 'input_token_details': {'cache_read': 0}})

In [51]:
from langchain_core.prompts import PromptTemplate

TIER_6_PROMPT_TEMPLATE = PromptTemplate.from_template("""
You are helping simulate a bill of materials (BOM) dataset that will demonstrate the power of GraphRAG (Graph Retrieval Augmented Generation) for BOM and supply chain analysis.
A big topic/concern now is finding the impact of tariffs and supply chain shortages. Specifically how such prices and disruptions for upstream components may effect downstream product costs and availability.

This specific supply chain we are simulating is focused on manufacturing agricultural equipment.

Below is a list of components that are part of a bill of materials. For each component please provide a component catalog entry. The entry should

1. be in the narrative style of a catalog section
2. include information about the raw materials used to make the component and where they are sourced (to provide for the analytics described above)
3. include descriptions about what the component is and its uses.

Do not include any text that describes opinions/concerns about tariffs or potential risks. This is just information about the item and its sourcing.




## Component List
{componentList}
""")

In [52]:
from typing import List
from pydantic import BaseModel, Field


#define inputs
class Component(BaseModel):
    skuId: str = Field(..., description="The unique identifier for the component")
    family: str = Field(..., description="The family the component belongs to describing what type of component it is")


class ComponentList(BaseModel):
    components: list[Component] = Field(..., description="A list of components to be included in the catalog")


#define outputs
class CatalogEntry(BaseModel):
    """
    Represents a catalog entry with detailed information for a single component
    """
    skuId: str = Field(...,
                       description="The provided component sku_id to trace back this entry to the product in downstream data processing")
    name: str = Field(...,
                      description="a descriptive name for the component based on the entry")
    entry: str = Field(..., description="The catalog entry that includes the raw materials, sourcing, and descriptions")


class CatalogEntries(BaseModel):
    """
    A list of catalog entries
    """
    entries: List[CatalogEntry] = Field(..., description="A list of catalog entries for each component")


llm_for_catalog = llm.with_structured_output(CatalogEntries)

In [53]:
from tqdm.asyncio import tqdm as tqdm_async
import asyncio


def chunks(xs, n=10):
    n = max(1, n)
    return [xs[i:i + n] for i in range(0, len(xs), n)]


async def sim_entries(components: List[Component], prompt_template, semaphore) -> List[CatalogEntry]:
    async with semaphore:
        prompt = prompt_template.invoke(
            {'componentList': ComponentList(components=components).model_dump_json(indent=4)})
        # pprint(prompt.text)
        # Use structured LLM for extraction
        catalog_entries: CatalogEntries = await llm_for_catalog.ainvoke(prompt)
    return catalog_entries.entries


async def sim_all_entries(components: List[Component], prompt_template, chunk_size=10, max_workers=10) -> List[CatalogEntry]:
    # Create a semaphore with the desired number of workers
    semaphore = asyncio.Semaphore(max_workers)

    # Create tasks with the semaphore
    component_chunks = chunks(components, chunk_size)
    tasks = [sim_entries(component_chunk, prompt_template,semaphore) for component_chunk in component_chunks]

    # Explicitly update progress using `tqdm` as tasks complete
    entries: List[CatalogEntry] = []
    with tqdm_async(total=len(tasks), desc="Simulating Catalog Entries") as pbar:
        for future in asyncio.as_completed(tasks):
            result = await future
            entries.extend(result)
            pbar.update(1)  # Increment progress bar for each completed task
    return entries

In [58]:
components = [Component(skuId=comp['sku_id'], family=comp['family']) for comp in items_df[items_df.tier=='Tier6'].to_dict('records')]
components[:3]

[Component(skuId='RSEL6_0000', family='RubberSeal'),
 Component(skuId='RSEL6_0001', family='RubberSeal'),
 Component(skuId='RSEL6_0002', family='RubberSeal')]

In [59]:
entries = await sim_all_entries(components, TIER_6_PROMPT_TEMPLATE)
entries[:3]

Simulating Catalog Entries: 100%|██████████| 30/30 [01:48<00:00,  3.60s/it]


[CatalogEntry(skuId='MROD6_0009', name='Hydraulic Piston Rod - 4140 Steel', entry='The MROD6_0009 is a high-strength, precision-engineered steel rod, primarily used as a piston rod in hydraulic systems for heavy-duty agricultural machinery. This component is crafted from chromium-molybdenum steel (4140 grade), renowned for its toughness and wear resistance. The raw steel is predominantly sourced from mills in the United States, with supplementary quantities imported from Germany to ensure consistent supply. Its robust construction allows it to withstand high-pressure environments and repetitive stress cycles, making it ideal for applications such as tractor hydraulic lifts and harvester actuator mechanisms.'),
 CatalogEntry(skuId='MROD6_0010', name='Linkage Rod - Medium Carbon Steel', entry='Introducing the MROD6_0010, a versatile linkage rod designed for connecting various mechanical parts in agricultural implements. This rod is manufactured from medium-carbon steel (1045 grade), chos

In [62]:
from fpdf import FPDF
from typing import List


# Create a PDF generator
class PDF(FPDF):
    def header(self):
        self.set_font("Arial", size=12)
        self.set_font("Arial", size=16, style="B")  # Larger font size and bold for a professional look
        self.cell(0, 10, "AG Inc. Components Catalog", align="C", ln=True)  # Updated title
        self.ln(10)

    def footer(self):
        # Add a page number at the bottom
        self.set_y(-15)
        self.set_font("Arial", size=8)
        self.cell(0, 10, f"Page {self.page_no()}", align="C")


# Generate PDF
pdf = PDF()

# Add each entry to the PDF in a new page
for entry in entries:
    pdf.add_page()
    # Add product name as title with larger font
    pdf.set_font("Arial", size=18, style="B")
    pdf.cell(0, 15, entry.name, ln=True, align="C")  # Centered alignment

    # Add some space after the title
    pdf.ln(5)

    # Enlarge and bold the SKU field
    pdf.set_font("Arial", size=14, style="B")


    pdf.cell(0, 10, f"SKU: {entry.skuId}", ln=True)

    # Write the entry's data with normal font
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, entry.entry)

# Output the PDF to a file
pdf.output("source-data/component-catalog.pdf")

''

In [71]:
TIER_5_TO_1_PROMPT_TEMPLATE = PromptTemplate.from_template("""
You are helping simulate a bill of materials (BOM) dataset that will demonstrate the power of GraphRAG (Graph Retrieval Augmented Generation) for BOM and supply chain analysis. This is for a fictional company "Ag Inc." that manufactures agricultural equipment.

Below is a list of items that are part of the BOM. For each item please provide a
1. name
2. short description
3. a list of inputs from the inputPartsList

This BOM works in tiers that represents depth in the BOM chain.  the lower the tier, the closer it is to finished product, the higher it is, the more simple the item and closer to raw inputs it is.

The list of items below are at Tier{tier}. When choosing inputs, prioritize those in the next highest tier: Tier{nextHighestTier} (this should maker up your biggest portion of inputs) followed by Tier{nextHighestTier2}, Tier{nextHighestTier3}, etc. each level making up a smaller portion of inputs.

## Item List
{inputItemList}

## Input Part List
{inputPartsList}`
""")

In [72]:
# items
class InputItem(BaseModel):
    skuId: str = Field(..., description="The unique identifier for the item")
    family: str = Field(..., description="The family the item belongs to describing what type of item it is")

class InputItemList(BaseModel):
    items: list[InputItem] = Field(..., description="A list of items to be included in the BOM")

#parts
class InputPart(BaseModel):
    tier:str = Field(..., description="The BOM tier of the part")
    skuId: str = Field(..., description="The unique identifier for the item")
    name: str = Field(...,
                      description="a descriptive name for the part")
    description: str = Field(..., description="A short description of the part")

class InputPartsList(BaseModel):
    parts: list[InputPart] = Field(..., description="A list of parts to be included in the BOM")

#outputs
class OutputItem(BaseModel):
    skuId: str = Field(..., description="The provided item sku_id to trace back this item in downstream data processing")
    name: str = Field(...,
                      description="a descriptive name for the item.")
    description: str = Field(..., description="A short description of the item (a couple sentences). Don't mention the tier. ")
    inputs: List[str] = Field(..., description="A list of input part sku_ids for the item")

class OutputItemList(BaseModel):
    items: list[OutputItem] = Field(..., description="A list of items to be included in the BOM")


llm_for_bom = llm.with_structured_output(OutputItemList)

async def sim_bom_items(
        items: List[InputItem],
        parts: List[InputPart],
        tier: int,
        semaphore) -> List[OutputItem]:
    async with semaphore:
        prompt = TIER_5_TO_1_PROMPT_TEMPLATE.invoke(
            {
                'tier': tier,
                'nextHighestTier': tier + 1,
                'nextHighestTier2': tier + 2,
                'nextHighestTier3': tier + 3,
                'inputItemList': InputItemList(items=items).model_dump_json(indent=4),
                'inputPartsList': InputPartsList(parts=parts).model_dump_json(indent=4),
             })
        # pprint(prompt.text)
        # Use structured LLM for extraction
        output_times: OutputItemList = await llm_for_bom.ainvoke(prompt)
    return output_times.items

async def sim_all_bom_items_at_tier(
        items: List[InputItem],
        parts: List[InputPart],
        tier: int,
        chunk_size=10,
        max_workers=10) -> List[OutputItem]:
    # Create a semaphore with the desired number of workers
    semaphore = asyncio.Semaphore(max_workers)

    # Create tasks with the semaphore
    item_chunks = chunks(items, chunk_size)
    tasks = [sim_bom_items(item_chunk, parts, tier,semaphore) for item_chunk in item_chunks]

    # Explicitly update progress using `tqdm` as tasks complete
    output_items: List[OutputItem] = []
    with tqdm_async(total=len(tasks), desc="Simulating BOM Items") as pbar:
        for future in asyncio.as_completed(tasks):
            result = await future
            output_items.extend(result)
            pbar.update(1)  # Increment progress bar for each completed task
    return output_items

In [82]:
import json

os.makedirs("source-data/tmp", exist_ok=True)

bom_items = []
input_parts = [InputPart(tier='Tier6', skuId=part.skuId, name=part.name, description='') for part in entries][:3]
for i in range(5,-1, -1):
    print(f"Simulating BOM Items at Tier{i}")
    input_items = [InputItem(skuId=comp['sku_id'], family=comp['family']) for comp in items_df[items_df.tier==f'Tier{i}'].to_dict('records')][:3]
    output_items = await sim_all_bom_items_at_tier(input_items, input_parts, i)
    pd.DataFrame([part.model_dump() for part in output_items]).to_csv()
    with open(f"source-data/tmp/bom-items-tier{i}.csv", "w") as f:
        json.dump(output_items.model, f, indent=4)
    input_parts.extend([InputPart(tier=f'Tier{i}', skuId=part.skuId, name=part.name, description=part.description) for part in output_items])
    bom_items.extend(output_items)

Simulating BOM Items at Tier5


Simulating BOM Items: 100%|██████████| 1/1 [00:18<00:00, 18.27s/it]


Simulating BOM Items at Tier4


Simulating BOM Items: 100%|██████████| 1/1 [00:20<00:00, 20.90s/it]


Simulating BOM Items at Tier3


Simulating BOM Items: 100%|██████████| 1/1 [00:14<00:00, 14.32s/it]


Simulating BOM Items at Tier2


Simulating BOM Items: 100%|██████████| 1/1 [02:19<00:00, 139.49s/it]


Simulating BOM Items at Tier1


Simulating BOM Items: 100%|██████████| 1/1 [02:25<00:00, 145.72s/it]


Simulating BOM Items at Tier0


Simulating BOM Items: 100%|██████████| 1/1 [02:00<00:00, 120.32s/it]


In [87]:
pd.DataFrame([part.model_dump() for part in input_parts]).to_csv()

Unnamed: 0,skuId,name,description
0,MROD6_0009,Hydraulic Piston Rod - 4140 Steel,
1,MROD6_0010,Linkage Rod - Medium Carbon Steel,
2,MROD6_0011,Actuator Rod - 6061 Aluminum,
3,HFIT5_0000,Straight Hydraulic Connector Fitting - Steel,"A Tier 5 straight connector fitting, machined ..."
4,HFIT5_0001,90-Degree Elbow Hydraulic Fitting - Carbon Steel,"A Tier 5 90-degree elbow hydraulic fitting, cr..."
5,HFIT5_0002,Bulkhead Hydraulic Fitting - Aluminum,"A Tier 5 hydraulic bulkhead fitting, manufactu..."
6,PUMP4_0000,Standard Duty Hydraulic Pump Assembly,A Tier 4 standard duty hydraulic pump assembly...
7,PUMP4_0001,Heavy Duty Hydraulic Pump Assembly,A Tier 4 heavy-duty hydraulic pump assembly en...
8,PUMP4_0002,Compact Hydraulic Pump Assembly,"A Tier 4 compact hydraulic pump assembly, opti..."
9,ENGC3_0000,Standard Performance Engine Core Assembly,A Tier 3 standard performance engine core for ...


In [88]:
pd.DataFrame([item.model_dump() for item in bom_items])

Unnamed: 0,skuId,name,description,inputs
0,HFIT5_0000,Straight Hydraulic Connector Fitting - Steel,"A Tier 5 straight connector fitting, machined ...",[MROD6_0009]
1,HFIT5_0001,90-Degree Elbow Hydraulic Fitting - Carbon Steel,"A Tier 5 90-degree elbow hydraulic fitting, cr...",[MROD6_0010]
2,HFIT5_0002,Bulkhead Hydraulic Fitting - Aluminum,"A Tier 5 hydraulic bulkhead fitting, manufactu...",[MROD6_0011]
3,PUMP4_0000,Standard Duty Hydraulic Pump Assembly,A Tier 4 standard duty hydraulic pump assembly...,"[HFIT5_0000, HFIT5_0001, MROD6_0009]"
4,PUMP4_0001,Heavy Duty Hydraulic Pump Assembly,A Tier 4 heavy-duty hydraulic pump assembly en...,"[HFIT5_0000, HFIT5_0002, MROD6_0010]"
5,PUMP4_0002,Compact Hydraulic Pump Assembly,"A Tier 4 compact hydraulic pump assembly, opti...","[HFIT5_0001, HFIT5_0002, MROD6_0011]"
6,ENGC3_0000,Standard Performance Engine Core Assembly,A Tier 3 standard performance engine core for ...,"[PUMP4_0000, HFIT5_0000, HFIT5_0001, MROD6_0009]"
7,ENGC3_0001,High-Output Engine Core Assembly,A Tier 3 high-output engine core designed for ...,"[PUMP4_0001, HFIT5_0000, HFIT5_0002, MROD6_0010]"
8,ENGC3_0002,Compact Engine Core Assembly,A Tier 3 compact engine core optimized for Ag ...,"[PUMP4_0002, HFIT5_0001, HFIT5_0002, MROD6_0011]"
9,MFRM2_0000,Standard Machine Frame Module,A Tier 2 standard machine frame module for Ag ...,[ENGC3_0000]


In [73]:
input_parts = [InputPart(skuId=part.skuId, name=part.name, description='') for part in entries]
input_parts[:3]

[InputPart(skuId='MROD6_0009', name='Hydraulic Piston Rod - 4140 Steel', description=''),
 InputPart(skuId='MROD6_0010', name='Linkage Rod - Medium Carbon Steel', description=''),
 InputPart(skuId='MROD6_0011', name='Actuator Rod - 6061 Aluminum', description='')]

In [74]:
input_items = [InputItem(skuId=comp['sku_id'], family=comp['family']) for comp in items_df[items_df.tier=='Tier5'].to_dict('records')]
input_items[:3]

[InputItem(skuId='HFIT5_0000', family='HydraulicFitting'),
 InputItem(skuId='HFIT5_0001', family='HydraulicFitting'),
 InputItem(skuId='HFIT5_0002', family='HydraulicFitting')]

In [78]:
output_items = await sim_all_bom_items_at_tier(input_items, input_parts, 5)

Simulating BOM Items: 100%|██████████| 79/79 [09:14<00:00,  7.02s/it] 


In [79]:
output_items[:10]

[OutputItem(skuId='BELT5_0034', name='Heavy-Duty V-Belt BELT5_0034', description='A durable V-belt for power transmission in heavy-duty agricultural machinery. Assembled from various Tier 6 components like rods, wires, seals, and fasteners.', inputs=['MROD6_0000', 'WIRE6_0000', 'RSEL6_0000', 'FAST6_0000', 'PBLT6_0000', 'TUBE6_0000']),
 OutputItem(skuId='BELT5_0035', name='Conveyor Belt Segment BELT5_0035', description='A segment of a larger conveyor belt system used in harvesting equipment. Constructed with Tier 6 materials including castings, wires, seals, and bolts for resilience.', inputs=['CAST6_0000', 'WIRE6_0001', 'RSEL6_0001', 'BOLT6_0000', 'MROD6_0001']),
 OutputItem(skuId='BELT5_0036', name='Timing Belt BELT5_0036', description='Precision timing belt for synchronized shaft rotation in Ag Inc. engines and implements. This Tier 5 assembly incorporates precision bolts, specialized wiring, and durable seals.', inputs=['PBLT6_0011', 'WIRE6_0010', 'RSEL6_0010', 'FAST6_0013', 'TUBE6_