# Coding Setup
## Installation and Import

In [1]:
%pip install --upgrade --quiet  langchain langchain-community langchain-ollama langchain-experimental neo4j tiktoken yfiles_jupyter_graphs python-dotenv json-repair langchain-openai langchain_core pandas

Note: you may need to restart the kernel to use updated packages.


In [1]:
from langchain_core.runnables import  RunnablePassthrough

from pydantic import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
from langchain_community.graphs import Neo4jGraph
from langchain_community.chat_models import ChatOllama
from neo4j import GraphDatabase
from langchain_community.vectorstores import Neo4jVector
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_ollama import OllamaEmbeddings
import os
from langchain_openai import AzureChatOpenAI
from neo4j import  Driver
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import json

load_dotenv()



True

# Data Upload

In [None]:
def dataUploadAndMappingToGraph():
    # Initialize Neo4j driver
    driver = GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )

    def clear_database(tx):
        """Clear all nodes and relationships"""
        tx.run("MATCH (n) DETACH DELETE n")

    def import_fmea_data(csv_file_path):
        """Import FMEA data from CSV into Neo4j"""
        
        # Read CSV
        df = pd.read_csv(csv_file_path)
        
        # Track ID counters for each entity type
        entity_counters = {
            'product': 0,
            'subsystem': 0,
            'system_element': 0,
            'function': 0,
            'failure_mode': 0,
            'failure_cause': 0,
            'failure_effect': 0,
            'measure': 0,

        }
        
        # Track existing entities to avoid duplicates and ID conflicts
        existing_entities = {
            'product': {},
            'subsystem': {},
            'system_element': {},
            'function': {},
            'failure_mode': {},
            'failure_cause': {},
            'failure_effect': {},
            'measure': {},

        }
        
        severity_conflicts = []
        
        with driver.session() as session:
            # Clear existing data
            session.execute_write(clear_database)
            print("Database cleared.")
            
            for index, row in df.iterrows():
                print(f"Processing row {index + 1}/{len(df)}")
                
                # Create/get Product
                product_key = row['product']
                if product_key not in existing_entities['product']:
                    entity_counters['product'] += 1
                    product_id = entity_counters['product']
                    existing_entities['product'][product_key] = product_id
                    session.execute_write(create_product_node, product_id, row['product'])
                else:
                    product_id = existing_entities['product'][product_key]
                
                # Create/get Subsystem
                subsystem_key = f"{product_key}_{row['subsystem']}"
                if subsystem_key not in existing_entities['subsystem']:
                    entity_counters['subsystem'] += 1
                    subsystem_id = entity_counters['subsystem']
                    existing_entities['subsystem'][subsystem_key] = subsystem_id
                    session.execute_write(create_subsystem_node, subsystem_id, row['subsystem'], product_id)
                else:
                    subsystem_id = existing_entities['subsystem'][subsystem_key]
                
                # Create/get SystemElement
                system_element_key = f"{subsystem_key}_{row['system_element']}"
                if system_element_key not in existing_entities['system_element']:
                    entity_counters['system_element'] += 1
                    system_element_id = entity_counters['system_element']
                    existing_entities['system_element'][system_element_key] = system_element_id
                    session.execute_write(create_system_element_node, system_element_id, row['system_element'], subsystem_id)
                else:
                    system_element_id = existing_entities['system_element'][system_element_key]
                
                # Create/get Function
                function_key = f"{system_element_key}_{row['function']}"
                if function_key not in existing_entities['function']:
                    entity_counters['function'] += 1
                    function_id = entity_counters['function']
                    existing_entities['function'][function_key] = function_id
                    session.execute_write(create_function_node, function_id, row['function'], system_element_id)
                else:
                    function_id = existing_entities['function'][function_key]
                
                # Create/get FailureMode
                failure_mode_key = f"{function_key}_{row['failure_mode']}"
                if failure_mode_key not in existing_entities['failure_mode']:
                    entity_counters['failure_mode'] += 1
                    failure_mode_id = entity_counters['failure_mode']
                    existing_entities['failure_mode'][failure_mode_key] = failure_mode_id
                    session.execute_write(create_failure_mode_node, failure_mode_id, row['failure_mode'], function_id)
                else:
                    failure_mode_id = existing_entities['failure_mode'][failure_mode_key]
                
                # Create/get FailureEffect with severity check
                failure_effect_key = f"{product_key}_{row['failure_effect']}"
                if failure_effect_key not in existing_entities['failure_effect']:
                    entity_counters['failure_effect'] += 1
                    failure_effect_id = entity_counters['failure_effect']
                    existing_entities['failure_effect'][failure_effect_key] = failure_effect_id
                    session.execute_write(create_failure_effect_node, failure_effect_id, row['failure_effect'], 
                                        row['severity_rating'], failure_mode_id)
                else:
                    failure_effect_id = existing_entities['failure_effect'][failure_effect_key]
                    # Check for severity rating conflicts
                    existing_severity = session.execute_read(get_failure_effect_severity, failure_effect_id)
                    if existing_severity != row['severity_rating']:
                        severity_conflicts.append({
                            'failure_effect': row['failure_effect'],
                            'existing_severity': existing_severity,
                            'new_severity': row['severity_rating'],
                            'row': index + 1
                        })
                        # Add both severity ratings as properties
                        session.execute_write(add_severity_conflict, failure_effect_id, row['severity_rating'])
                    else:
                        # Create relationship if it doesn't exist
                        session.execute_write(create_failure_mode_effect_relationship, failure_mode_id, failure_effect_id)
                
                # Create/get FailureCause
                failure_cause_key = f"{product_key}_{row['failure_cause']}"
                if failure_cause_key not in existing_entities['failure_cause']:
                    entity_counters['failure_cause'] += 1
                    failure_cause_id = entity_counters['failure_cause']
                    existing_entities['failure_cause'][failure_cause_key] = failure_cause_id
                    session.execute_write(create_failure_cause_node, failure_cause_id, row['failure_cause'], 
                                        row['occurrence_rating'], row['detection_rating'], failure_mode_id)
                else:
                    failure_cause_id = existing_entities['failure_cause'][failure_cause_key]
                    # Create relationship if it doesn't exist
                    session.execute_write(create_failure_mode_cause_relationship, failure_mode_id, failure_cause_id)
                
                # Create/get Measure (replaces both preventive and detective sections)
                measure_key = f"{product_key}_{row['measure_name']}"
                if measure_key not in existing_entities['measure']:
                    entity_counters['measure'] += 1
                    measure_id = entity_counters['measure']
                    existing_entities['measure'][measure_key] = measure_id
                    session.execute_write(create_measure_node, measure_id, row['measure_name'], 
                                        row['measure_description'], row['measure_type'], failure_cause_id)
                else:
                    measure_id = existing_entities['measure'][measure_key]
                    session.execute_write(create_measure_relationship, failure_cause_id, measure_id, row['measure_type'])
                
        print(f"\nImport completed successfully!")
        print(f"Created {entity_counters['product']} products")
        print(f"Created {entity_counters['subsystem']} subsystems")
        print(f"Created {entity_counters['system_element']} system elements")
        print(f"Created {entity_counters['function']} functions")
        print(f"Created {entity_counters['failure_mode']} failure modes")
        print(f"Created {entity_counters['failure_cause']} failure causes")
        print(f"Created {entity_counters['failure_effect']} failure effects")
        print(f"Created {entity_counters['measure']} measures")

        
        if severity_conflicts:
            print(f"\nWARNING: Found {len(severity_conflicts)} severity rating conflicts:")
            for conflict in severity_conflicts:
                print(f"  Row {conflict['row']}: '{conflict['failure_effect']}' - "
                    f"Existing: {conflict['existing_severity']}, New: {conflict['new_severity']}")

    # Node creation functions
    def create_product_node(tx, product_id, name):
        query = """
        MERGE (p:Product {id: $product_id, name: $name})
        """
        tx.run(query, product_id=product_id, name=name)

    def create_subsystem_node(tx, subsystem_id, name, product_id):
        query = """
        MERGE (s:Subsystem {id: $subsystem_id, name: $name})
        MERGE (p:Product {id: $product_id})
        MERGE (p)-[:hasSubsystem]->(s)
        """
        tx.run(query, subsystem_id=subsystem_id, name=name, product_id=product_id)

    def create_system_element_node(tx, system_element_id, name, subsystem_id):
        query = """
        MERGE (se:SystemElement {id: $system_element_id, name: $name})
        MERGE (s:Subsystem {id: $subsystem_id})
        MERGE (s)-[:hasSystemElement]->(se)
        """
        tx.run(query, system_element_id=system_element_id, name=name, subsystem_id=subsystem_id)

    def create_function_node(tx, function_id, name, system_element_id):
        query = """
        MERGE (f:Function {id: $function_id, name: $name})
        MERGE (se:SystemElement {id: $system_element_id})
        MERGE (se)-[:hasFunction]->(f)
        """
        tx.run(query, function_id=function_id, name=name, system_element_id=system_element_id)

    def create_failure_mode_node(tx, failure_mode_id, name, function_id):
        query = """
        MERGE (fm:FailureMode {id: $failure_mode_id, name: $name})
        MERGE (f:Function {id: $function_id})
        MERGE (f)-[:hasFailureMode]->(fm)
        """
        tx.run(query, failure_mode_id=failure_mode_id, name=name, function_id=function_id)

    def create_failure_effect_node(tx, failure_effect_id, name, severity_rating, failure_mode_id):
        query = """
        MERGE (fe:FailureEffect {id: $failure_effect_id, name: $name, severity_rating: $severity_rating})
        MERGE (fm:FailureMode {id: $failure_mode_id})
        MERGE (fm)-[:resultsInFailureEffect]->(fe)
        """
        tx.run(query, failure_effect_id=failure_effect_id, name=name, 
            severity_rating=severity_rating, failure_mode_id=failure_mode_id)

    def create_failure_cause_node(tx, failure_cause_id, name, occurrence_rating, detection_rating, failure_mode_id):
        query = """
        MERGE (fc:FailureCause {id: $failure_cause_id, name: $name, 
                            occurrence_rating: $occurrence_rating, 
                            detection_rating: $detection_rating})
        MERGE (fm:FailureMode {id: $failure_mode_id})
        MERGE (fm)-[:isDueToFailureCause]->(fc)
        """
        tx.run(query, failure_cause_id=failure_cause_id, name=name, 
            occurrence_rating=occurrence_rating, detection_rating=detection_rating, 
            failure_mode_id=failure_mode_id)

    def create_measure_node(tx, measure_id, name, description, measure_type, failure_cause_id):
        if measure_type == 'preventive':
            relationship = 'isImprovedByPreventiveMeasure'
        else:  # detective
            relationship = 'isImprovedByDetectiveMeasure'
        
        query = f"""
        MERGE (m:Measure {{id: $measure_id, name: $name, description: $description, type: $measure_type}})
        MERGE (fc:FailureCause {{id: $failure_cause_id}})
        MERGE (fc)-[:{relationship}]->(m)
        """
        tx.run(query, measure_id=measure_id, name=name, description=description, 
            measure_type=measure_type, failure_cause_id=failure_cause_id)

    # Relationship creation functions for existing nodes
    def create_failure_mode_effect_relationship(tx, failure_mode_id, failure_effect_id):
        query = """
        MERGE (fm:FailureMode {id: $failure_mode_id})
        MERGE (fe:FailureEffect {id: $failure_effect_id})
        MERGE (fm)-[:resultsInFailureEffect]->(fe)
        """
        tx.run(query, failure_mode_id=failure_mode_id, failure_effect_id=failure_effect_id)

    def create_failure_mode_cause_relationship(tx, failure_mode_id, failure_cause_id):
        query = """
        MERGE (fm:FailureMode {id: $failure_mode_id})
        MERGE (fc:FailureCause {id: $failure_cause_id})
        MERGE (fm)-[:isDueToFailureCause]->(fc)
        """
        tx.run(query, failure_mode_id=failure_mode_id, failure_cause_id=failure_cause_id)

    def create_measure_relationship(tx, failure_cause_id, measure_id, measure_type):
        if measure_type == 'preventive':
            relationship = 'isImprovedByPreventiveMeasure'
        else:  # detective
            relationship = 'isImprovedByDetectiveMeasure'
        
        query = f"""
        MERGE (fc:FailureCause {{id: $failure_cause_id}})
        MERGE (m:Measure {{id: $measure_id}})
        MERGE (fc)-[:{relationship}]->(m)
        """
        tx.run(query, failure_cause_id=failure_cause_id, measure_id=measure_id)

    # Utility functions
    def get_failure_effect_severity(tx, failure_effect_id):
        query = "MATCH (fe:FailureEffect {id: $failure_effect_id}) RETURN fe.severity_rating"
        result = tx.run(query, failure_effect_id=failure_effect_id)
        record = result.single()
        return record["fe.severity_rating"] if record else None

    def add_severity_conflict(tx, failure_effect_id, new_severity):
        query = """
        MATCH (fe:FailureEffect {id: $failure_effect_id})
        SET fe.severity_rating_conflict = $new_severity
        """
        tx.run(query, failure_effect_id=failure_effect_id, new_severity=new_severity)

    csv_file_path = "washing_machine2.csv"  # Replace with your CSV file path
    import_fmea_data(csv_file_path)
    driver.close()

# Data Pre-Processing

## Create Vector Embedding for Graph Data

In [None]:
def create_failure_mode_embeddings():
    """Generate rich contextual embeddings for FailureMode nodes"""
    
    driver = GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )
    
    with driver.session() as session:
        # Get all FailureMode nodes with their rich context
        failure_modes = session.execute_read(get_failure_modes_with_context)
        
        print(f"Found {len(failure_modes)} failure modes to process")
        
        # Generate text chunks for each failure mode
        for failure_mode in failure_modes:
            text_chunk = generate_failure_mode_text_chunk(failure_mode)
            
            # Create VectorEmbedding node with the text chunk
            session.execute_write(create_vector_embedding_node, 
                                failure_mode['failure_mode_id'], 
                                text_chunk)
            
            print(f"Created embedding for: {failure_mode['failure_mode_name']}")
    
    driver.close()
    print("Embedding generation completed!")

def get_failure_modes_with_context(tx):
    """Retrieve all FailureMode nodes with their complete context"""
    
    query = """
    MATCH (p:Product)-[:hasSubsystem]->(s:Subsystem)-[:hasSystemElement]->(se:SystemElement)
          -[:hasFunction]->(f:Function)-[:hasFailureMode]->(fm:FailureMode)
    
    OPTIONAL MATCH (fm)-[:isDueToFailureCause]->(fc:FailureCause)
    OPTIONAL MATCH (fc)-[:isImprovedByPreventiveMeasure]->(pm:Measure {type: 'preventive'})
    OPTIONAL MATCH (fc)-[:isImprovedByDetectiveMeasure]->(dm:Measure {type: 'detective'})
    OPTIONAL MATCH (fm)-[:resultsInFailureEffect]->(fe:FailureEffect)
    
    WITH fm, p, s, se, f, fc, fe,
         collect(DISTINCT pm.name) as preventive_measures_for_cause,
         collect(DISTINCT dm.name) as detective_measures_for_cause
    
    WITH fm, p, s, se, f,
         collect(DISTINCT {
             name: fc.name,
             occurrence_rating: fc.occurrence_rating,
             detection_rating: fc.detection_rating,
             preventive_measures: preventive_measures_for_cause,
             detective_measures: detective_measures_for_cause
         }) as causes_with_measures,
         collect(DISTINCT {
             name: fe.name,
             severity_rating: fe.severity_rating
         }) as effects_data
    
    RETURN fm.id as failure_mode_id,
           fm.name as failure_mode_name,
           p.name as product_name,
           s.name as subsystem_name,
           se.name as system_element_name,
           f.name as function_name,
           causes_with_measures,
           effects_data
    """
    
    result = tx.run(query)
    
    # Transform results with proper cause-measure relationships
    transformed_results = []
    for record in result:
        data = record.data()
        
        # Clean up causes data - filter out null entries and format properly
        causes = []
        for cause_data in data['causes_with_measures']:
            if cause_data['name'] is not None:  # Only include actual causes
                formatted_cause = {
                    'cause_name': cause_data['name'],
                    'occurrence_rating': cause_data['occurrence_rating'],
                    'detection_rating': cause_data['detection_rating'],
                    'preventive_measures': [m for m in cause_data['preventive_measures'] if m is not None],
                    'detective_measures': [m for m in cause_data['detective_measures'] if m is not None]
                }
                causes.append(formatted_cause)
        
        # Clean up effects data - filter out null entries
        effects = []
        for effect_data in data['effects_data']:
            if effect_data['name'] is not None:  # Only include actual effects
                formatted_effect = {
                    'effect_name': effect_data['name'],
                    'severity_rating': effect_data['severity_rating']
                }
                effects.append(formatted_effect)
        
        # Create the final data structure
        transformed_data = {
            'failure_mode_id': data['failure_mode_id'],
            'failure_mode_name': data['failure_mode_name'],
            'product_name': data['product_name'],
            'subsystem_name': data['subsystem_name'],
            'system_element_name': data['system_element_name'],
            'function_name': data['function_name'],
            'causes': causes,
            'effects': effects
        }
        
        transformed_results.append(transformed_data)
    
    return transformed_results

def generate_failure_mode_text_chunk(failure_mode_data):
    """Generate rich contextual text chunk for a FailureMode"""
    
    # Extract basic information
    fm_name = failure_mode_data['failure_mode_name']
    product = failure_mode_data['product_name']
    subsystem = failure_mode_data['subsystem_name']
    system_element = failure_mode_data['system_element_name']
    function = failure_mode_data['function_name']
    causes = failure_mode_data['causes']
    effects = failure_mode_data['effects']
    
    # Build the text chunk
    text_parts = []
    
    # System hierarchy context
    text_parts.append(f"The failure mode '{fm_name}' occurs in the '{system_element}' component, "
                     f"which is part of the '{subsystem}' subsystem in the '{product}' system.")
    
    # Function context
    text_parts.append(f"This failure affects the '{function}' function of the '{system_element}'.")
    
    # Failure causes context
    if causes and any(cause['cause_name'] for cause in causes if cause['cause_name']):
        text_parts.append(f"The failure mode '{fm_name}' can be caused by the following failure causes: ")
        for cause in causes:
            if cause['cause_name']:
                cause_text = f" Failure cause '{cause['cause_name']}'"
                if cause['occurrence_rating']:
                    cause_text += f" with an occurrence rating of {cause['occurrence_rating']}"
                if cause['detection_rating']:
                    cause_text += f" and a detection rating of {cause['detection_rating']}."
                elif cause['occurrence_rating']:
                    cause_text += ")"
                
                # Add preventive measures
                preventive_measures = [m for m in cause['preventive_measures'] if m]
                if preventive_measures:
                    cause_text += f" Preventive measures for the failure cause '{cause['cause_name']}' are: '{', '.join(preventive_measures)}'."
                
                # Add detective measures
                detective_measures = [m for m in cause['detective_measures'] if m]
                if detective_measures:
                    cause_text += f" Detective measures for the failure cause '{cause['cause_name']}' are: '{', '.join(detective_measures)}.'"
                
                text_parts.append(cause_text)
    
    # Failure effects context
    if effects and any(effect['effect_name'] for effect in effects if effect['effect_name']):
        text_parts.append(f" The failure mode '{fm_name}' results in the following failure effects:")
        for effect in effects:
            if effect['effect_name']:
                effect_text = f" Failure effect '{effect['effect_name']}'"
                if effect['severity_rating']:
                    effect_text += f" with a severity rating of {effect['severity_rating']}."
                text_parts.append(effect_text)
    
    # Join all parts with proper spacing
    full_text = " ".join(text_parts)
    
    return full_text

def create_vector_embedding_node(tx, failure_mode_id, text_chunk):
    """Create a VectorEmbedding node linked to the FailureMode"""
    
    query = """
    MATCH (fm:FailureMode {id: $failure_mode_id})
    MERGE (ve:VectorEmbedding {
        failure_mode_id: $failure_mode_id,
        text_chunk: $text_chunk
    })
    MERGE (fm)-[:HAS_EMBEDDING]->(ve)
    """
    
    tx.run(query, failure_mode_id=failure_mode_id, text_chunk=text_chunk)


## Create Vector Index

In [8]:
def create_vector_index():
    """Create the Neo4j vector index - run this ONLY after new data is added"""
    
    # Initialize embeddings
    # This is where I could set more parameters for the embeddings like: model='mxbai-embed-large' validate_model_on_init=False base_url=None client_kwargs={} async_client_kwargs={} sync_client_kwargs={} mirostat=None mirostat_eta=None mirostat_tau=None num_ctx=None num_gpu=None keep_alive=None num_thread=None repeat_last_n=None repeat_penalty=None temperature=None stop=None tfs_z=None top_k=None top_p=None
    embeddings = OllamaEmbeddings(
        model="mxbai-embed-large",
        # Add other params as needed for production
    )
    
    # Create the actual vector index in Neo4j
    # This creates the index structure in the database
    vector_index = Neo4jVector.from_existing_graph(
        embeddings,
        search_type="hybrid",
        node_label="VectorEmbedding", 
        text_node_properties=["text_chunk"],
        embedding_node_property="embedding",
        index_name="failure_mode_context_index",
        keyword_index_name="failure_mode_keyword_index"
    )
    
    print("Vector index 'failure_mode_context_index' created successfully!")
    print("Vector index 'failure_mode_keyword_index' created successfully!")
    return True


def get_retriever(amountResults):
    
    # Initialize the same embeddings configuration
    embeddings = OllamaEmbeddings(
        model="mxbai-embed-large",
    )
    
    # Connect to existing index (doesn't recreate it)
    vector_store = Neo4jVector.from_existing_index(
        embeddings,
        search_type="hybrid",
        index_name="failure_mode_context_index",
        keyword_index_name="failure_mode_keyword_index"
    )
    
    return vector_store.as_retriever(search_kwargs={"k": amountResults})



## Creation of Full text index


In [9]:
def create_fmea_fulltext_indexes():
    """Create fulltext indexes for all FMEA entity types."""
    
    driver = GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )
    
    def create_fulltext_index(tx):
        # Single comprehensive index for all FMEA entities
        query = '''
        CREATE FULLTEXT INDEX `fulltext_entity_id` IF NOT EXISTS
        FOR (n:SystemElement|Subsystem|FailureMode|Function|FailureCause|FailureEffect|Measure|Product) 
        ON EACH [n.name]
        '''
        tx.run(query)
    
    try:
        with driver.session() as session:
            session.execute_write(create_fulltext_index)
            print("FMEA fulltext index created successfully.")
    except Exception as e:
        print(f"Index creation failed or already exists: {e}")
    finally:
        driver.close()

# RAG
## Component 1: Entity Extraction
Purpose: Identifies character names from user questions

In [32]:
def extract_fmea_entities(question: str, llm, debug: bool) -> dict:
    fmea_entity_prompt_old = ChatPromptTemplate.from_messages([
        ("system", 
         "You are an FMEA entity extraction specialist. Extract relevant entities from user questions "
         "to enable precise knowledge graph queries. Map entities to these exact types: "
         "FailureCause, FailureEffect, FailureMode, Function, Measure, Product, Subsystem, SystemElement."
         "\n\nPRODUCT ENTITY SPECIAL RULES:"
         "\n- Extract Product entities ONLY when user explicitly requests system structure generation"
         "\n- Generation keywords: 'create', 'develop', 'build', 'generate', 'design', 'structure'"
         "\n- For Product entities, provide 2 linguistic variations: synonyms and abstraction levels"
         "\n- Example: 'Electric Vehicle' → variations: ['EV', 'Electric Car'] (synonym + abstraction)"
         "\n- Analytical questions about existing systems should have Product: []"
         "\n\nENTITY MAPPING GUIDELINES:"
         "\n- SystemElement: Physical components (motor, pump, brake, sensor, valve)"
         "\n- Subsystem: Groups of components (brake system, hydraulic system, control system)"
         "\n- Product: Complete systems or end products"
         "\n- Function: What something does (braking, pumping, cooling, monitoring)"
         "\n- FailureMode: Ways things can fail (overheating, leakage, fracture, jamming)"
         "\n- FailureCause: Root causes (wear, corrosion, overload, contamination)"
         "\n- FailureEffect: Consequences (loss of function, safety risk, performance degradation)"
         "\n- Measure: Preventive/corrective actions (inspection, maintenance, design change)"
         "\n\nSYNONYM HANDLING:"
         "\n- Technical variants: breakdown→failure, component→SystemElement"
         "\n- Context-aware: 'failure' alone → look for context clues"
         "\n\nFORMAT REQUIREMENTS:"
         "\n- Return ONLY valid JSON, no explanations or additional text"
         "\n- Use empty arrays [] for entity types with no matches"
         "\n- All entity type keys must be present in output"
         "\n- Entity names should be clean and standardized"
         "\n\nEXAMPLE OUTPUTS:"
         "\n- Question: 'What failures occur in brake system?'"
         "\n  Response: {\"FailureCause\": [], \"FailureEffect\": [], \"FailureMode\": [], \"Function\": [], \"Measure\": [], \"Product\": [], \"Subsystem\": [\"Brake System\"], \"SystemElement\": []}"
         "\n- Question: 'What measures prevent motor overheating?'"
         "\n  Response: {\"FailureCause\": [], \"FailureEffect\": [], \"FailureMode\": [\"Overheating\"], \"Function\": [], \"Measure\": [], \"Product\": [], \"Subsystem\": [], \"SystemElement\": [\"Motor\"]}"
         "\n- Question: 'Create FMEA structure for electric vehicle'"
         "\n  Response: {\"FailureCause\": [], \"FailureEffect\": [], \"FailureMode\": [], \"Function\": [], \"Measure\": [], \"Product\": [\"Electric Vehicle\", \"EV\", \"Electric Car\"], \"Subsystem\": [], \"SystemElement\": []}"
        ),
        ("human", 
         "Extract FMEA entities from this question: {question}"
         "\nReturn only the JSON with all entity types, using empty arrays for unmatched types."
        )
    ])

    fmea_entity_prompt = ChatPromptTemplate.from_messages([
        ("system", 
         "You are an FMEA entity extraction specialist. Extract relevant entities from user questions "
         "to enable precise knowledge graph queries. Map entities to these exact types: "
         "FailureCause, FailureEffect, FailureMode, Function, Measure, Product, Subsystem, SystemElement. "
         "\n\nPRODUCT ENTITY SPECIAL RULES:"
         "\n- When user explicitly requests system structure generation ONLY the Product entity should be extracted."
         "\n- Generation keywords: 'create', 'develop', 'build', 'generate', 'design', 'structure'"
         "\n- For Product entities, provide 2 additional linguistic variations: synonyms and abstraction levels"
         "\n- Example: ['Electric Vehicle', 'EV', 'Electric Car'] (extracted entity, synonym, abstraction)"
         "\n\nENTITY MAPPING GUIDELINES:"
         "\n- SystemElement: Physical components (motor, pump, brake, sensor, valve)"
         "\n- Subsystem: Groups of components (brake system, hydraulic system, control system)"
         "\n- Product: Complete systems or end products"
         "\n- Function: What something does (braking, pumping, cooling, monitoring)"
         "\n- FailureMode: Ways things can fail (overheating, leakage, fracture, jamming)"
         "\n- FailureCause: Root causes (wear, corrosion, overload, contamination)" 
         "\n- FailureEffect: Consequences (loss of function, safety risk, performance degradation)"
         "\n- Measure: Preventive/corrective actions (inspection, maintenance, design change)"
         "\n\nSYNONYM HANDLING:"
         "\n- Technical variants: breakdown→failure, component→SystemElement"
         "\n- Context-aware: 'failure' alone → look for context clues"
         "\n\nFORMAT REQUIREMENTS:"
         "\n- Return ONLY valid JSON, no explanations or additional text"
         "\n- Use empty arrays [] for entity types with no matches"
         "\n- All entity type keys must be present in output"
         "\n- Entity names should be clean and standardized"
         "\n- Entity types must match the defined categories exactly and are the key of the output, following a list with extracted entities."
        ),
        ("human", 
         "Extract FMEA entities from this question: {question}"
         "\nReturn only the JSON with all entity types, using empty arrays for unmatched types."
        )
    ])
    
    
    try:
        response = llm.invoke(fmea_entity_prompt.format(question=question))
        print(response)
        json_text = response.content.strip()
        if debug:
            print(f"Raw LLM response: '{json_text}'") 
            print(f"Response length: {len(json_text)}")
        
        # Try to clean common issues
        json_text = json_text.replace('```json', '').replace('```', '')
        json_text = json_text.strip()
        
        entities = json.loads(json_text)
        if debug:
            print("Entities: ", entities)
            print("Successful entity extraction")
        return entities
    
    except json.JSONDecodeError as e:
        if debug:
            print(f"JSON parsing failed: {e}")
            print(f"Problematic text: '{json_text}'") 
        # Return fallback
        return {
            "FailureCause": [],
            "FailureEffect": [],
            "FailureMode": [],
            "Function": [],
            "Measure": [],
            "Product": [],
            "Subsystem": [],
            "SystemElement": []
        }
    except Exception as e:
        if debug:
            print(f"General entity extraction failed: {e}")
        return {
            "FailureCause": [],
            "FailureEffect": [],
            "FailureMode": [],
            "Function": [],
            "Measure": [],
            "Product": [],
            "Subsystem": [],
            "SystemElement": []
        }


# Component 3: Graph neighborhood retrievel
Purpose: Find character relationships using graph traversal

In [55]:
def generate_cypher_query(question: str, entities: dict, llm) -> str:
    """Generate executable Cypher query from extracted FMEA entities."""
    
    cypher_prompt = ChatPromptTemplate.from_messages([
        (
            "system", 
            "Generate valid Cypher queries for FMEA knowledge graphs. Output ONLY the query.\n\n"
            "EXACT SCHEMA:\n"
            "Nodes: Product, Subsystem, SystemElement, Function, FailureMode, FailureCause, FailureEffect, Measure\n"
            "Relationships: hasSubsystem, hasSystemElement, hasFunction, hasFailureMode, isDueToFailureCause, resultsInFailureEffect, isImprovedByPreventiveMeasure, isImprovedByDetectiveMeasure\n"
            "Properties: name, description, severity_rating, occurrence_rating, detection_rating, id, type, failure_mode_id, text_chunk\n\n"
            "MANDATORY RULES:\n"
            "1. Use ONLY relationships from schema above\n"
            "2. Use ONLY node labels from schema above\n" 
            "3. Use ONLY properties from schema above\n"
            "4. ALWAYS exclude VectorEmbedding: WHERE NOT 'VectorEmbedding' IN labels(n)\n"
            "5. Use fuzzy matching: WHERE n.name CONTAINS 'EntityName'\n"
            "6. Include ratings ONLY when question contains: rating, severity, occurrence, detection, S, O, D\n"
            "7. Always LIMIT 20\n"
            "8. Return as: node.property_name\n\n"
            "EXAMPLES:\n"
            "Entities: {'SystemElement': ['Motor']}\n"
            "Query: MATCH (se:SystemElement) WHERE se.name CONTAINS 'Motor' AND NOT 'VectorEmbedding' IN labels(se) OPTIONAL MATCH (se)-[:hasFunction]->(f:Function)-[:hasFailureMode]->(fm:FailureMode) RETURN se.name, f.name, fm.name, fm.description LIMIT 20\n\n"
            "OUTPUT: Return ONLY the Cypher query."
        ),
        (
            "human",
            "Question: {question}\nEntities: {entities}"
        )
    ])
    
    response = llm.invoke(cypher_prompt.format(question=question, entities=str(entities)))
    return response.content.strip()


def generate_fallback_query(entities: dict) -> str:
    """Generate simple fallback query retrieving all connected nodes."""
    print(entities)
    for entity_type, entity_list in entities.items():
        if entity_list:
            for entity in entity_list:
                return f"""
                MATCH (n:{entity_type})
                WHERE toLower(n.name) CONTAINS toLower({entity})
                OPTIONAL MATCH (n)-[r]-(connected)
                RETURN n.name as entity_name,
                    properties(n) as entity_properties,
                    type(r) as relationship_type,
                    connected.name as connected_name,
                    properties(connected) as connected_properties
                LIMIT 20
                """
    
    return "MATCH (n) WHERE NOT 'VectorEmbedding' IN labels(n) RETURN n.name, labels(n) LIMIT 10"

def execute_fmea_query(question: str, entities: dict, llm, graph, debug: bool) -> dict:
    """Execute Cypher query with fallback on failure."""
    try:
        cypher_query = generate_cypher_query(question, entities, llm)
        print("Cypher Query: ", cypher_query)
        raw_results = graph.query(cypher_query)
        if debug:
            print(cypher_query)
            print(raw_results)
        
        return {
            "query": cypher_query,
            "results_count": len(raw_results),
            "fmea_data": [dict(record) for record in raw_results]
        }
        
    except Exception as e:
        print("exception fallback")
        fallback_query = generate_fallback_query(entities)
        
        try:
            fallback_results = graph.query(fallback_query)
            if debug:
                print("Primary query failed, executed fallback query.")
                print(fallback_query)
                print(fallback_results)
            return {
                "query": fallback_query, 
                "results_count": len(fallback_results),
                "fmea_data": [dict(record) for record in fallback_results],
                "fallback_used": True,
                "original_error": str(e)
            }
        except Exception as fallback_error:
            print("fallback, fallback")
            return {
                "error": f"Primary: {str(e)}, Fallback: {str(fallback_error)}",
                "results_count": 0,
                "fmea_data": []
            }


# Component 4: hybrid retrieval Combination
Purpose: Combines Graph and vector search results

In [None]:
def comprehensive_got_retriever(question: str) -> str:
    """
    Combines multiple retrieval strategies for comprehensive information gathering.
    
    This hybrid approach leverages:
    1. Graph traversal for explicit relationship data
    2. Vector search for semantic similarity and relationship descriptions
    3. Character profile information for context
    
    Why hybrid retrieval is powerful:
    - Graph data provides factual, structured relationships
    - Vector data provides rich, descriptive context
    - Combination gives both precision and comprehensiveness
    
    For FMEA adaptation:
    - Graph data would provide explicit failure mode linkages
    - Vector search would find semantically similar failure patterns
    - Could combine quantitative risk data with qualitative descriptions
    """
    # Get structured relationship data from graph traversal
    graph_data = get_character_relationships(question)
    
    # Get semantic relationship information from vector search
    # Using both character and relationship retrievers for comprehensive coverage

    relationship_vector_data = relationship_retriever.invoke(question)
    
    # Combine character profiles and relationship descriptions

    relationship_descriptions = [doc.page_content for doc in relationship_vector_data]
    
    # Structure the combined data for the language model
    final_context = f"""
=== GRAPH RELATIONSHIP DATA ===
{graph_data}

=== RELATIONSHIP DESCRIPTIONS ===  
{chr(10).join(relationship_descriptions)}
"""
    
    print("Retrieved context length:", len(final_context))
    return final_context

# Component 5: Respones Generation Chain
Purpose: Generates natural language answers using retrieved information

In [None]:
# Create specialized prompt template for Game of Thrones questions
got_rag_template = """You are a Game of Thrones expert assistant. Answer the question based only on the provided context information.

Context Information:
{context}

Question: {question}

Instructions:
- Use only the information provided in the context
- Be specific about book numbers when relationships are mentioned
- Include interaction strengths when relevant
- If the context doesn't contain enough information, say so clearly
- Provide a natural, conversational response

Answer:"""

got_prompt = ChatPromptTemplate.from_template(got_rag_template)

# Create the complete RAG chain
got_rag_chain = (
    {
        "context": comprehensive_got_retriever,
        "question": RunnablePassthrough(),
    }
    | got_prompt
    | chat_llm
    | StrOutputParser()
)

# Component 6: Enhanced Question Answering Interface
Purpose: provides user Friendly interface with detailed response

In [None]:
def test_entity_extraction():
    """
    Test function to verify entity extraction is working correctly.
    This kind of component testing is essential for complex AI systems.
    """
    print("Testing entity extraction component...")
    
    test_questions = [
        "Who is Jon Snow?",
        "Tell me about Jon Snow and Tyrion Lannister",
        "What about the Stark family?",
        "Who does Jon Snow interact with and how strong are those relationships?"
    ]
    
    for question in test_questions:
        print(f"\nTesting: '{question}'")
        try:
            entities = extract_got_entities(question)
            print(f"✅ Success: {entities}")
        except Exception as e:
            print(f"❌ Failed: {e}")

def ask_got_question(question: str, show_context: bool = False):
    """
    User-friendly interface for asking Game of Thrones questions.
    
    This wrapper function:
    - Executes the full RAG pipeline
    - Optionally shows the retrieved context for transparency
    - Provides clear formatting for responses
    
    For FMEA adaptation:
    - Would include risk level indicators in responses
    - Could show confidence scores for failure predictions
    - Might include regulatory compliance information
    """
    print(f"Question: {question}")
    print("=" * 50)
    
    if show_context:
        context = comprehensive_got_retriever(question)
        print("RETRIEVED CONTEXT:")
        print(context)
        print("=" * 50)
    
    answer = got_rag_chain.invoke(question)
    print("ANSWER:")
    print(answer)
    print("=" * 50)
    
    return answer

# Main

## Testing

=== Testing LLM Invoke Directly ===
✅ Simple invoke works: content='Hello! How can I assist you today?' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 9, 'total_tokens': 19, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_4f3d32ad4e', 'id': 'chatcmpl-CBdyYHXeCvO3y5jpAh8rqoVRsPsnx', 'service_tier': None, 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results':

In [56]:
# Variable definitions
dataUploadRequired = False # active if new data should be uploaded to knowledge graph

# Setup
graph = Neo4jGraph()

# For entity extraction (keep the functions model)
entity_llm = ChatOllama(model="gemma3:4b", temperature=0 
)

# For conversational responses (new standard chat model)
chat_llm = ChatOllama(model="gemma3:4b", temperature=0.3)

azure_llm = AzureChatOpenAI(
    api_version="2024-12-01-preview",
    azure_deployment="gpt-4.1-mini",
    model_name="gpt-4.1-mini"
)

if dataUploadRequired:
    dataUploadAndMappingToGraph()
    create_failure_mode_embeddings()
    create_vector_index()
    create_fmea_fulltext_indexes()

# initiate retriever for vector query
retriever = get_retriever(amountResults=10)

question = "What are the failure causes for the failure mode Excessive vibration?"

 # Extract entities using your existing function
#entities = extract_fmea_entities(question, azure_llm, debug=False)

entities = {'FailureCause': [], 'FailureEffect': [], 'FailureMode': ['Excessive vibration'], 'Function': [], 'Measure': [], 'Product': [], 'Subsystem': [], 'SystemElement': []}

# Execute graph query
graph_results = execute_fmea_query(question, entities, azure_llm, graph, debug=True)


exception fallback
{'FailureCause': [], 'FailureEffect': [], 'FailureMode': ['Excessive vibration'], 'Function': [], 'Measure': [], 'Product': [], 'Subsystem': [], 'SystemElement': []}
fallback, fallback


# Testing

In [None]:
# Direct character relationship question
ask_got_question(
        "Who talks Tyrion Lannister the most to?",
        show_context=True
    )