# Map Unstructured Text to Neo4j Graph

![doc-to-neo4j](img/doc-to-neo4j.png)

## Setup

In [1]:
!gcloud auth application-default login > /dev/null 2>&1

In [2]:
import getpass
from dotenv import load_dotenv
import os

# Load connection settings from a local .env file when one is present.
env_file = '.env'
if os.path.exists(env_file):
    load_dotenv(env_file, override=True)
else:
    print(f"File {env_file} not found.")

# Read the Neo4j settings outside the branch above so the names are always
# defined: values exported in the shell are still picked up when there is no
# .env file, and a missing value is None instead of an undefined name.
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')

# Prompt for the Google AI key only if it is not already in the environment.
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

## Text Extraction

In [46]:
from langchain_core.prompts import PromptTemplate

# Prompt used for structured extraction of components and their country
# dependencies. It takes two inputs: `catalogEntries` (the catalog text) and
# `countryCodes` (the ISO 3166 code-to-name mapping as text).
# The literal "{code:name}" hint is written with doubled braces so the
# template engine emits it verbatim instead of treating `code` as a third,
# never-supplied input variable.
PROMPT_TEMPLATE = PromptTemplate.from_template("""
You are responsible for extracting data from a parts catalog into structured part entities connected to countries that they are dependent on.

Every Component requires a skuId (i.e. SKU) which can be found in the catalog. You must use this exact sku so we can resolve components to their suppliers in downstream processing.

When entering information for countries please use the standardized country names and codes as defined in the ISO 3166 standards.

## Catalog Entries
{catalogEntries}

## ISO 3166 Standard Country Code and Names
__Organized as {{code:name}} combinations:__
{countryCodes}
""")

In [132]:
import json
from iso3166 import countries

# Map every ISO 3166 alpha-3 code to its country name, then preview the
# first 202 characters of the pretty-printed JSON.
country_dict = {entry.alpha3: entry.name for entry in countries}
print(json.dumps(country_dict, indent=2)[:202])

{
  "AFG": "Afghanistan",
  "ALA": "\u00c5land Islands",
  "ALB": "Albania",
  "DZA": "Algeria",
  "ASM": "American Samoa",
  "AND": "Andorra",
  "AGO": "Angola",
  "AIA": "Anguilla",
  "ATA": "Antarcti


In [94]:
from typing import List
from pydantic import Field, BaseModel

# define Outputs
# A country referenced by a dependency: its ISO 3166 alpha-3 code and name.
class Country(BaseModel):
    code: str = Field(description="the standardized ISO 3166 alpha-3 code")
    name: str = Field(description="the standardized ISO 3166 country name")

# One dependency of a component on a country, with a free-text reason.
class DependsOnCountry(BaseModel):
    country: Country = Field(description="the country the component depends on")
    description:str = Field(description="the description of the dependency i.e. for sourcing materials or manufacturing")

# A catalog component: its SKU, its catalog description, and the countries
# it depends on.
class Component(BaseModel):
    skuId: str = Field(description="the skuId of the component")
    description: str = Field(description="the detailed description of the component from the catalog")
    dependsOn: List[DependsOnCountry] = Field(description="the countries the component depends on and why")

# Top-level container for a batch of extracted components.
# NOTE: documented with a comment rather than a docstring on purpose; Pydantic
# copies a class docstring into the model's JSON schema, which would change
# what a schema-driven consumer of this model sees.
class ComponentList(BaseModel):
    components: List[Component] = Field(description="the list of components")

In [95]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model: temperature 0, no explicit token or time limit, and up
# to two retries per request.
# NOTE(review): the model id is a dated preview release - confirm it is still
# being served before re-running.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro-preview-03-25",
    max_retries=2,
    timeout=None,
    max_tokens=None,
    temperature=0,
)

In [96]:
# Smoke test: send a small system + human message pair to confirm the model
# is reachable and the API key works.
messages = [
    (
        "system",
        "You are a helpful assistant that translates English to French. Translate the user sentence.",
    ),
    ("human", "I love programming."),
]
llm.invoke(messages)

AIMessage(content='Here are a few options, depending on the nuance you want:\n\n1.  **J\'adore programmer.** (This is the most common and natural way to express strong liking or "love" for an activity like programming. "Adorer" means "to adore" or "to love" in this context.)\n2.  **J\'aime beaucoup programmer.** (This means "I like programming a lot" or "I really like programming", which is also a very good translation for "I love programming".)\n3.  **J\'aime la programmation.** (This uses the noun "programming" and means "I like programming" or "I love programming" - focusing slightly more on the field itself.)\n4.  **J\'adore la programmation.** (Similar to the previous one, but with stronger emphasis using "adorer".)\n\nThe most idiomatic and frequently used translation is **J\'adore programmer.**', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.5-pro-preview-03-25', 'safety_rat

In [97]:
# Variant of the chat model bound to the ComponentList schema, so extraction
# calls yield structured output rather than free text.
llm_for_components = llm.with_structured_output(ComponentList)

In [82]:
from tqdm.asyncio import tqdm as tqdm_async
import asyncio


def chunks(xs, n=10):
    """Split a sliceable sequence into consecutive batches.

    Args:
        xs: Any object supporting ``len()`` and slicing.
        n: Batch size; values below 1 are treated as 1.

    Returns:
        A list of slices of ``xs``, each of length ``n`` except possibly
        the last one. Empty input gives an empty list.
    """
    size = max(1, n)
    batches = []
    for start in range(0, len(xs), size):
        batches.append(xs[start:start + size])
    return batches

async def extract(texts: List[str], semaphore) -> List[Component]:
    """Extract components from one batch of catalog texts.

    Args:
        texts: Catalog texts; they are joined with blank lines into a single
            ``catalogEntries`` prompt input.
        semaphore: Async context manager held for the duration of the prompt
            build and the model call, used to cap concurrent requests.

    Returns:
        The ``components`` list of the structured model response.
    """
    async with semaphore:
        catalog_text = '\n\n'.join(texts)
        country_json = json.dumps(country_dict, indent=2)
        prompt = PROMPT_TEMPLATE.invoke(
            {'catalogEntries': catalog_text, 'countryCodes': country_json}
        )
        # The structured-output model returns a ComponentList.
        extracted: ComponentList = await llm_for_components.ainvoke(prompt)
    return extracted.components


async def extract_all(texts: List[str], chunk_size=10, max_workers=10) -> List[Component]:
    """Extract components from all texts, batch by batch, concurrently.

    Args:
        texts: Catalog texts to process.
        chunk_size: Number of texts sent to the model per request.
        max_workers: Maximum number of requests in flight at once.

    Returns:
        All extracted components, appended in the order the batches finish
        (not necessarily the order of ``texts``).
    """
    # One shared semaphore caps how many batches run at the same time.
    limiter = asyncio.Semaphore(max_workers)
    pending = [extract(batch, limiter) for batch in chunks(texts, chunk_size)]

    collected: List[Component] = []
    with tqdm_async(total=len(pending), desc="Simulating Catalog Entries") as progress:
        # Advance the bar once per finished batch.
        for finished in asyncio.as_completed(pending):
            collected.extend(await finished)
            progress.update(1)
    return collected

In [98]:
from io import BytesIO
import requests
from PyPDF2 import PdfReader

# Download the component catalog PDF. requests applies no timeout unless one
# is given, so set one explicitly to keep the cell from hanging forever on a
# stalled connection.
response = requests.get(
    'https://storage.googleapis.com/neo4j-workshop-data/genai-bom/component-catalog.pdf',
    timeout=60,
)
response.raise_for_status()  # Raise HTTP errors if any
pdf_bytes = BytesIO(response.content)

# Extract the text of every page; a page with no extractable text becomes "".
reader = PdfReader(pdf_bytes)
catalog_pages = [page.extract_text() or ""  for page in reader.pages]
catalog_pages[:3]



['AG Inc. Components Catalog\nSKU: M6673J13QX\nThe PrecisionBolt_LKBIG is a high-strength fastener engineered for critical applications within heavy\nagricultural machinery. Manufactured from a specialized high-carbon steel alloy containing\nchromium and molybdenum, it offers exceptional tensile strength and fatigue resistance. The\nprimary steel component is sourced from mills in China, utilizing iron ore from Australia. Alloying\nelements are procured globally, with chromium originating from South Africa and molybdenum\nsourced from mines in Chile. These bolts are essential for securing high-stress assemblies such as\nengine mounts, transmission housings, and main chassis connections, ensuring operational integrity\nunder demanding field conditions.\nPage 1',
 'AG Inc. Components Catalog\nSKU: M4882B36X\nRod_6TIHG is a solid medium-carbon steel rod, providing robust performance in linkage and\nstructural applications. The steel is produced in Germany using high-quality iron ore impor

In [102]:
# Run the extraction over every catalog page with the default batch size and
# concurrency. Top-level `await` works here because the notebook already has
# a running event loop.
components = await extract_all(catalog_pages)
components

Simulating Catalog Entries: 100%|██████████| 30/30 [01:54<00:00,  3.82s/it]


[Component(skuId='M8104R57X', description='This high-tensile fastener is engineered for critical structural joints in heavy agricultural machinery, ensuring reliability under high-vibration and heavy-load conditions typical of tractors and harvesting equipment. It provides superior clamping force and fatigue resistance.', dependsOn=[DependsOnCountry(country=Country(code='DEU', name='Germany'), description='Sourcing base high-carbon steel alloy'), DependsOnCountry(country=Country(code='CHN', name='China'), description='Sourcing zinc for protective coating')]),
 Component(skuId='M7675V76X', description='This component is a reinforced polymer tubing designed for demanding hydraulic fluid transfer applications within agricultural equipment. It offers excellent flexibility, pressure resistance, and durability against abrasion and weathering, suitable for connecting hydraulic pumps, cylinders, and valves.', dependsOn=[DependsOnCountry(country=Country(code='KOR', name='South Korea'), descript

In [118]:
# Sanity check: how many components were extracted in total.
len(components)

294

## Write Extracted Entities to Neo4j

In [114]:
from neo4j import GraphDatabase

# Construct the Neo4j driver from the connection settings read during setup
# (basic auth with username and password).
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [120]:
from tqdm import tqdm

# Country nodes are keyed by their code; create the constraint once
# (plain string: the statement has no placeholders, so no f-string is needed).
driver.execute_query("CREATE CONSTRAINT IF NOT EXISTS FOR (n:Country) REQUIRE (n.code) IS NODE KEY")

# Write the extracted components in batches of 100 records per query.
# - MATCH only updates Item nodes that already exist; a component whose SKU
#   has no matching Item is skipped without error.
# - Country nodes are merged on code, and their name is set on every write.
# - NOTE(review): the DEPENDS_ON relationship is merged on its description,
#   so re-running with a differently worded description adds another
#   relationship rather than updating the existing one - confirm intended.
for records in tqdm(chunks([component.model_dump() for component in components], 100)):
    driver.execute_query("""
    UNWIND $records AS rec
    MATCH (n:Item {sku_id: rec.skuId})
    SET n.description = rec.description
    WITH n, rec
    FOREACH (dependsOn IN rec.dependsOn |
            MERGE(c:Country {code: dependsOn.country.code})
            MERGE (n)-[:DEPENDS_ON {description:dependsOn.description}]->(c)
            SET c.name = dependsOn.country.name
            )
    """,records=records)

100%|██████████| 3/3 [00:00<00:00,  4.54it/s]


In [124]:
# Create a dataframe with one row per extracted component (SKU and
# description) so the descriptions can be embedded in batches.
import pandas as pd
desc_df = pd.DataFrame([{'sku_id':c.skuId, 'description':c.description} for c in components ])

## Text Embedding & Vector Index for Semantic Search

In [125]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embedder = GoogleGenerativeAIEmbeddings(model='models/embedding-001')

# Embed the descriptions 30 at a time and flatten the per-batch results into
# one list aligned with the dataframe rows.
res = [
    vector
    for batch in chunks(desc_df.description, n=30)
    for vector in embedder.embed_documents(list(batch))
]
desc_df['desc_embedding'] = res
desc_df

Unnamed: 0,sku_id,description,desc_embedding
0,M8104R57X,This high-tensile fastener is engineered for c...,"[0.023949019610881805, -0.04034088924527168, -..."
1,M7675V76X,This component is a reinforced polymer tubing ...,"[0.00923293735831976, -0.020914115011692047, -..."
2,M4882G26X,An industrial-grade bolt designed for secure a...,"[-0.004318585153669119, -0.062175851315259933,..."
3,M1519Q13X,This custom-molded rubber seal is specifically...,"[-0.0010546328267082572, -0.027247631922364235..."
4,M7307V67X,This product is a high-conductivity raw copper...,"[0.032294947654008865, -0.04866011440753937, -..."
...,...,...,...
289,M7687Q88,Our M7687Q88 Tubing (Model 17KWW) is a versati...,"[0.010898726060986519, -0.06329736858606339, -..."
290,M7307P31X,The M7307P31X Precision Bolt (Model GN1A1) is ...,"[0.012401655316352844, -0.07435838878154755, -..."
291,M7687U30X,Specify the M7687U30X Tubing (Model 8950Z) for...,"[0.0016307869227603078, -0.09196709096431732, ..."
292,M9119X46X,The M9119X46X Tubing (Model YT0BA) offers a li...,"[0.005185646004974842, -0.07479916512966156, -..."


In [126]:
# Store each embedding on its Item node, matched by sku_id, 200 rows per query.
embedding_rows = desc_df.to_dict(orient='records')
for records in tqdm(chunks(embedding_rows, 200)):
    driver.execute_query("""
    UNWIND $records AS rec
    MATCH(n:Item {sku_id: rec.sku_id})
    CALL db.create.setNodeVectorProperty(n, 'desc_embedding', rec.desc_embedding)
    """,records=records)


100%|██████████| 2/2 [00:01<00:00,  1.49it/s]


In [127]:
# Create a cosine-similarity vector index on Item.desc_embedding, sized to
# the length of the first embedding in the dataframe.
embedding_dimension = len(desc_df.desc_embedding[0])
driver.execute_query('''
CREATE VECTOR INDEX component_text_embeddings IF NOT EXISTS FOR (n:Item) ON (n.desc_embedding)
OPTIONS {indexConfig: {
 `vector.dimensions`: toInteger($dimension),
 `vector.similarity_function`: 'cosine'
}}
''', dimension=embedding_dimension)

# Block until the index is online (second argument: timeout of 300).
driver.execute_query('CALL db.awaitIndex("component_text_embeddings", 300)')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x12f6d3290>, keys=[])

In [6]:
from langchain_neo4j import Neo4jVector
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Same embedding model as was used to embed the descriptions, so query
# vectors are comparable with the stored ones.
embedder = GoogleGenerativeAIEmbeddings(model='models/embedding-001')

# Vector store over the existing Item nodes and the index created for them.
store = Neo4jVector.from_existing_graph(
    embedder,
    node_label="Item",
    text_node_properties=["description"],
    embedding_node_property="desc_embedding",
    index_name="component_text_embeddings",
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

In [33]:
# Plain semantic search: retrieve the 5 nearest items and display the first 2.
store.similarity_search("Nickel Sourcing", k=5)[:2]

[Document(metadata={'family': 'Rod', 'is_finished_product': False, 'name': 'Rod_RON86', 'sku_id': 'KM100378'}, page_content='\ndescription: The Rod_RON86 is a solid, high-strength steel alloy rod designed for demanding applications. The base steel is typically sourced from Germany, while critical alloying elements like chromium and vanadium are procured from South Africa and Kazakhstan, respectively. These rods are precision-machined for use as piston rods in hydraulic cylinders, linkage components in steering systems, or structural tie rods, offering excellent resistance to bending and fatigue. Supply chain vulnerability exists concerning the availability of specific alloying metals.'),
 Document(metadata={'family': 'Bolt', 'is_finished_product': False, 'name': 'Bolt_ZYOPF', 'sku_id': 'M7687Q83'}, page_content='\ndescription: This specialized bolt is designed for applications requiring enhanced corrosion resistance, such as attaching ground-engaging tools (e.g., plowshares, cultivator

In [37]:
# Same vector index as the plain store, but with a custom retrieval query that
# adds graph context to every hit:
# - each vector hit arrives as `node` with its `score`;
# - from that component, BOM relationships are followed (any depth) to Items
#   flagged as finished products; hits with no such path are dropped by MATCH;
# - the result text is "<sku_id>:\n<description>", and the metadata lists the
#   finished products that depend on the component.
graph_store = Neo4jVector.from_existing_graph(
    embedder,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name="component_text_embeddings",
    node_label="Item",
    text_node_properties=["description"],
    embedding_node_property="desc_embedding",
    retrieval_query="""
        WITH node AS component, score
        MATCH (product:Item {is_finished_product:True})<-[:BOM*]-(component)
        RETURN avg(score) AS score, component.sku_id + ':\n' + component.description AS text, { dependant_products: collect(product.sku_id + '( ' + product.name + ' )' )} as metadata
        """
)

In [39]:
# Graph-aware search: same query, but each hit carries the finished products
# that depend on the component; display only the first result.
graph_store.similarity_search("Nickel Sourcing", k=5)[:1]

[Document(metadata={'dependant_products': ['35775983( HayCollector_ZA35T )', '35775982( FarmTractor_VPA13 )', '35860396( FieldSprayer_5DOMP )', '35425199( FieldSprayer_BX5C7 )', '35425196( SeedPlanter_OD14I )', '35357313( CropHarvester_B0CWX )', '35419081( SeedPlanter_HZJQD )', '35780333( FieldSprayer_V7NT5 )', '35415398( FarmTractor_ILX2S )', '35780325( FarmTractor_B25J0 )', '35780324( FieldSprayer_4FEBQ )', 'PE531732( SeedPlanter_72RG6 )', '35775980( FieldSprayer_ZHJWI )', '35419080( CropHarvester_YNMUE )', '35764350( HayCollector_K1LYI )', '35415400( CropHarvester_RCWJK )', '35805298( HayCollector_TT4XO )', '35805301( FieldSprayer_IS8RE )', '35686544( FarmTractor_7EDBW )', '35779489( CropHarvester_L2OVS )', '35853578( HayCollector_PLYBF )', '35764354( HayCollector_EOM9L )', '35764351( HayCollector_IJUQR )', '35779490( SeedPlanter_2GNQG )', '35455049( FieldSprayer_TSSPF )', '35544759( CropHarvester_9KQ17 )', '35780327( FieldSprayer_Q0DQO )', '35780332( FarmTractor_Q09GX )', '35775728