In [103]:
!pip install neo4j>=5.16.0 qdrant-client>=1.7.3 openai>=1.7.2 neo4j-graphrag>=0.1.1 python-dotenv>=1.0.0 pydantic>=2.5.0 google
from neo4j import GraphDatabase
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv
from pydantic import BaseModel
from openai import OpenAI
from collections import defaultdict
from neo4j_graphrag.retrievers import QdrantNeo4jRetriever
import uuid
import os
import json
import pandas as pd
import os
import google.generativeai as genai
import json 


In [104]:
# Load variables from a dotenv file into the process environment, then read
# every credential the notebook uses. os.getenv yields None for a missing name.
load_dotenv()

qdrant_key, qdrant_url = os.getenv("QDRANT_KEY"), os.getenv("QDRANT_URL")
neo4j_uri, neo4j_username, neo4j_password = (
    os.getenv(var_name)
    for var_name in ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
)
openai_key = os.getenv("OPENAI_API_KEY")

In [105]:
# Open the Neo4j driver with the credentials read from the environment.
neo4j_driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))

# Name of the Qdrant collection used by the ingestion and retrieval cells.
collection_name = "graphRAGstoreds"
qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_key)

# Start from a clean slate: drop the collection left over from a previous run.
# Any exception is reported and ignored so the notebook keeps going.
# NOTE(review): the success message is printed whenever delete_collection
# returns without raising -- confirm it actually raises for a missing collection.
try:
    qdrant_client.delete_collection(collection_name=collection_name)
    print(f"Collection '{collection_name}' has been deleted successfully.")
except Exception as e:
    print(f"Could not delete collection (it might not exist, which is OK): {e}")

Collection 'graphRAGstoreds' has been deleted successfully.


In [106]:
class single(BaseModel):
    """One extracted relationship between two named nodes."""
    # Name of the source node.
    node: str
    # Name of the node the relationship points to.
    target_node: str
    # Label describing how `node` relates to `target_node`.
    relationship: str

class GraphComponents(BaseModel):
    """Container for the relationships extracted from one piece of text."""
    # Every relationship returned by the LLM parser, in response order.
    graph: list[single]

In [107]:
# OpenAI-compatible client pointed at the OpenRouter endpoint. It authenticates
# with the value read from the OPENAI_API_KEY environment variable.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=openai_key,
)

def openai_llm_parser(prompt: str) -> GraphComponents:
    """
    Extract graph relationships from text using a free model on OpenRouter.

    The raw model reply is cleaned before validation so that a malformed
    response degrades to fewer (or zero) relationships instead of an exception.

    Args:
        prompt: The full user prompt, including the text to analyse.

    Returns:
        GraphComponents: The validated relationships. The graph is empty when
        the reply has no choices, is not valid JSON, is not a JSON object, or
        its 'graph' value is not a list.
    """
    completion = client.chat.completions.create(
        model="qwen/qwen3-235b-a22b:free",
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "system",
                "content":
                    """You are a precise graph relationship extractor. Extract all 
                    relationships from the text and format them as a JSON object 
                    with this exact structure:
                    {
                        "graph": [
                            {"node": "Person/Entity", 
                             "target_node": "Related Entity", 
                             "relationship": "Type of Relationship"},
                            ...more relationships...
                        ]
                    }
                    Every object in the 'graph' list MUST contain 'node', 'target_node', and 'relationship'.
                    If you cannot determine all three parts of a relationship, omit it entirely.
                    Include ALL valid relationships mentioned in the text. Be thorough and precise."""
            },
            {
                "role": "user",
                "content": prompt
            }
        ]
    )

    # A reply without choices cannot be parsed; treat it like any other bad reply.
    choices = getattr(completion, "choices", None)
    if not choices:
        print("Error: Model returned no choices.")
        return GraphComponents(graph=[])

    response_content = choices[0].message.content
    print("Raw response from model:", response_content)

    # 1. Parse the JSON string into a Python object first.
    #    TypeError covers a reply whose content is None.
    try:
        data = json.loads(response_content)
    except (json.JSONDecodeError, TypeError):
        print("Error: Model did not return valid JSON.")
        return GraphComponents(graph=[])  # Return an empty graph on JSON error

    # Valid JSON may still be a list, string or number rather than an object.
    if not isinstance(data, dict):
        print("Error: Model response is not a JSON object.")
        return GraphComponents(graph=[])

    # 2. Filter the list to keep only valid entries.
    raw_graph_list = data.get("graph", [])
    if not isinstance(raw_graph_list, list):
        print("Error: 'graph' key is not a list.")
        return GraphComponents(graph=[])

    # A valid entry is a dict whose three required values are all strings;
    # anything else would make the pydantic validation below raise.
    required_keys = ("node", "target_node", "relationship")
    cleaned_graph_list = []
    for entry in raw_graph_list:
        if isinstance(entry, dict) and all(isinstance(entry.get(key), str) for key in required_keys):
            cleaned_graph_list.append(entry)
        else:
            print(f"Skipping malformed entry: {entry}")

    # 3. Validate the cleaned data using Pydantic's model_validate (for dicts).
    return GraphComponents.model_validate({"graph": cleaned_graph_list})

In [108]:
def extract_graph_components(raw_data):
    """Turn free text into a node-id mapping and a list of relationships.

    The text is sent to ``openai_llm_parser`` and each returned relationship
    is indexed by node name.

    NOTE(review): this function is defined again in the following cell; that
    later definition is the one in effect once both cells have run.

    Args:
        raw_data: Text to extract nodes and relationships from.

    Returns:
        tuple: ``(nodes, relationships)``. ``nodes`` maps each node name to a
        freshly generated UUID string. ``relationships`` is a list of dicts
        with ``"source"`` and ``"target"`` (node UUIDs) and ``"type"``.
    """
    prompt = f"Extract nodes and relationships from the following text:\n{raw_data}"
    graph_entries = openai_llm_parser(prompt).graph  # list of `single` models

    nodes = {}
    relationships = []

    for item in graph_entries:
        source_name = item.node
        target_name = item.target_node
        relation_label = item.relationship

        # The source is always registered, the first time its name is seen.
        if source_name not in nodes:
            nodes[source_name] = str(uuid.uuid4())

        # Without a target there is neither a second node nor an edge to record.
        if not target_name:
            continue

        if target_name not in nodes:
            nodes[target_name] = str(uuid.uuid4())

        # Edges reference nodes by generated id, not by name.
        if relation_label:
            relationships.append({
                "source": nodes[source_name],
                "target": nodes[target_name],
                "type": relation_label
            })

    return nodes, relationships

In [109]:
def extract_graph_components(raw_data):
    """Extract nodes and relationships from text via the LLM parser.

    Names and relationship labels are stripped of surrounding whitespace so
    that variants of one name map to a single node. Entries whose source name
    is blank are skipped entirely.

    Args:
        raw_data: Text to extract nodes and relationships from.

    Returns:
        tuple: ``(nodes, relationships)``. ``nodes`` maps each node name to a
        generated UUID string. ``relationships`` is a list of dicts with
        ``"source"`` and ``"target"`` (node UUIDs) and ``"type"``.
    """
    prompt = f"Extract nodes and relationships from the following text:\n{raw_data}"

    # openai_llm_parser returns a GraphComponents model; .graph is its list of entries.
    parsed_response = openai_llm_parser(prompt).graph

    nodes = {}
    relationships = []

    for entry in parsed_response:
        node = (entry.node or "").strip()
        target_node = (entry.target_node or "").strip()
        relationship = (entry.relationship or "").strip()

        # An entry without a source name cannot anchor a node or an edge.
        if not node:
            continue

        # Add nodes to the dictionary with a unique ID
        if node not in nodes:
            nodes[node] = str(uuid.uuid4())

        if not target_node:
            continue

        if target_node not in nodes:
            nodes[target_node] = str(uuid.uuid4())

        # Add relationship to the relationships list with node IDs
        if relationship:
            relationships.append({
                "source": nodes[node],
                "target": nodes[target_node],
                "type": relationship
            })

    return nodes, relationships

In [110]:
def ingest_to_neo4j(nodes, relationships):
    """
    Write the extracted nodes and relationships to Neo4j.

    Each node becomes an ``Entity`` with ``id`` and ``name`` properties. Each
    relationship becomes a ``RELATIONSHIP`` edge whose ``type`` property holds
    the extracted label. Uses the module-level ``neo4j_driver``.

    Args:
        nodes: Mapping of node name -> node id.
        relationships: Iterable of dicts with "source", "target" and "type".

    Returns:
        The ``nodes`` mapping, unchanged.
    """
    create_entity = "CREATE (n:Entity {id: $id, name: $name})"
    create_edge = (
        "MATCH (a:Entity {id: $source_id}), (b:Entity {id: $target_id}) "
        "CREATE (a)-[:RELATIONSHIP {type: $type}]->(b)"
    )

    with neo4j_driver.session() as session:
        # Nodes first, so the MATCH in the edge statement can find both endpoints.
        for entity_name, entity_id in nodes.items():
            session.run(create_entity, id=entity_id, name=entity_name)

        for edge in relationships:
            session.run(
                create_edge,
                source_id=edge["source"],
                target_id=edge["target"],
                type=edge["type"],
            )

    return nodes

In [111]:
def create_collection(client, collection_name, vector_dimension):
    """Create a cosine-distance Qdrant collection unless it already exists.

    Args:
        client: Qdrant client exposing ``get_collection`` and ``create_collection``.
        collection_name: Name of the collection to look up or create.
        vector_dimension: Size of the vectors the collection will store.

    Raises:
        Exception: Any lookup error that does not mean "collection not found"
            is reported and re-raised, so the caller never proceeds as if the
            collection had been verified.
    """
    try:
        client.get_collection(collection_name)
    except Exception as e:
        # "Not found" is recognised by HTTP status when the exception carries
        # one, otherwise by a case-insensitive message match. This is a
        # superset of the original exact 'Not found: Collection' check.
        is_missing = getattr(e, "status_code", None) == 404 or "not found" in str(e).lower()
        if not is_missing:
            print(f"Error while checking collection: {e}")
            raise

        print(f"Collection '{collection_name}' not found. Creating it now...")

        client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=vector_dimension, distance=models.Distance.COSINE)
        )

        print(f"Collection '{collection_name}' created successfully.")
    else:
        print(f"Skipping creating collection; '{collection_name}' already exists.")

In [112]:
# Read the Gemini API key from the environment and configure the
# google.generativeai SDK with it before any embedding call is made.
api_key = os.getenv("GEMINI_API_KEY")
    
genai.configure(api_key=api_key)

def gemini_embeddings(text: str, task: str = "RETRIEVAL_DOCUMENT") -> list[float]:
    """
    Create an embedding vector for a piece of text with a Gemini model.

    Args:
        text (str): The text to embed.
        task (str): Embedding task type. Common values include:
                    - "RETRIEVAL_QUERY": for search queries.
                    - "RETRIEVAL_DOCUMENT": for documents stored in a vector database.
                    - "SEMANTIC_SIMILARITY": for semantic similarity comparison.
                    - "CLASSIFICATION": for classification tasks.
                    - "CLUSTERING": for clustering tasks.

    Returns:
        list[float]: The embedding vector, or an empty list when the call or
        the response lookup raises.
    """
    request = {
        "model": "models/embedding-001",
        "content": text,
        "task_type": task,
    }

    # Both the API call and the 'embedding' lookup stay inside the try, so any
    # failure is reported and turned into an empty list.
    try:
        return genai.embed_content(**request)['embedding']
    except Exception as e:
        print(f"Đã xảy ra lỗi khi tạo embedding: {e}")
        return []

Defaulting to user installation because normal site-packages is not writeable


In [113]:
def ingest_to_qdrant(collection_name, node_id_mapping):
    """
    Embed the name of every extracted node and upsert the vectors into Qdrant.

    Each point gets its own UUID and a payload ``{"id": <node id>}`` that
    links it back to the corresponding Neo4j node. Uses the module-level
    ``qdrant_client``.

    Args:
        collection_name: Target Qdrant collection.
        node_id_mapping: Mapping of node name -> node id.
    """
    print(f"Preparing to create embeddings for {len(node_id_mapping)} nodes...")

    points_to_upsert = []

    # Walk over the extracted nodes.
    for node_name, node_id in node_id_mapping.items():
        # A missing or whitespace-only name has nothing to embed.
        if not (node_name and node_name.strip()):
            print(f"Skipping empty node name for id {node_id}")
            continue

        # Embed the node's name; an empty result signals a failed embedding.
        embedding = gemini_embeddings(node_name, task="RETRIEVAL_DOCUMENT")
        if not embedding:
            print(f"Failed to create embedding for node: {node_name}")
            continue

        point = models.PointStruct(
            id=str(uuid.uuid4()),      # id of the vector point inside Qdrant
            vector=embedding,
            payload={"id": node_id}    # Neo4j node id used to join the two stores
        )
        points_to_upsert.append(point)

    if not points_to_upsert:
        print("No valid points to ingest into Qdrant.")
        return

    # Upsert all valid points in a single call.
    print(f"Ingesting {len(points_to_upsert)} points into Qdrant...")
    qdrant_client.upsert(
        collection_name=collection_name,
        points=points_to_upsert
    )

In [114]:
def retriever_search(neo4j_driver, qdrant_client, collection_name, query):
    """Run a vector search for ``query`` and return the matched graph nodes.

    The query is embedded with ``gemini_embeddings`` and searched through a
    ``QdrantNeo4jRetriever`` that joins the Qdrant payload field ``id`` to the
    Neo4j node property ``id``.

    Args:
        neo4j_driver: Neo4j driver handed to the retriever.
        qdrant_client: Qdrant client handed to the retriever.
        collection_name: Qdrant collection to search.
        query: Natural-language query text.

    Returns:
        The retriever's search result for the top 5 matches.

    Raises:
        ValueError: If no embedding could be created for the query.
    """
    # gemini_embeddings returns [] on failure; stop here rather than send an
    # empty vector to the vector store.
    query_vector = gemini_embeddings(query, task="RETRIEVAL_QUERY")
    if not query_vector:
        raise ValueError("Could not create an embedding for the query; cannot run the retriever search.")

    retriever = QdrantNeo4jRetriever(
        driver=neo4j_driver,
        client=qdrant_client,
        collection_name=collection_name,
        id_property_external="id",
        id_property_neo4j="id",
    )

    return retriever.search(query_vector=query_vector, top_k=5)

In [115]:
def fetch_related_graph(neo4j_client, entity_ids):
    """Fetch the one- and two-hop neighbourhood of the given entities.

    Args:
        neo4j_client: Object whose ``session()`` returns a context manager
            with a ``run(query, **params)`` method (the Neo4j driver).
        entity_ids: Values of the ``id`` property of the starting entities.

    Returns:
        list[dict]: One ``{"entity", "relationship", "related_node"}`` dict per
        distinct relationship, in the order first encountered.
    """
    query = """
    MATCH (e:Entity)-[r1]-(n1)-[r2]-(n2)
    WHERE e.id IN $entity_ids
    RETURN e, r1 as r, n1 as related, r2, n2
    UNION
    MATCH (e:Entity)-[r]-(related)
    WHERE e.id IN $entity_ids
    RETURN e, r, related, null as r2, null as n2
    """
    subgraph = []
    seen_relationships = set()

    def _add(entity, relationship, related_node):
        # The same first-hop relationship comes back once per two-hop expansion
        # and again from the one-hop branch; keep only its first occurrence.
        key = getattr(relationship, "element_id", None) or id(relationship)
        if key in seen_relationships:
            return
        seen_relationships.add(key)
        subgraph.append({
            "entity": entity,
            "relationship": relationship,
            "related_node": related_node
        })

    with neo4j_client.session() as session:
        for record in session.run(query, entity_ids=entity_ids):
            _add(record["e"], record["r"], record["related"])
            # Compare against None explicitly: a mapping-like graph object with
            # no properties is falsy and would otherwise be dropped.
            if record["r2"] is not None and record["n2"] is not None:
                _add(record["related"], record["r2"], record["n2"])
    return subgraph

In [116]:
def format_graph_context(subgraph):
    """Flatten subgraph triples into node names and readable edge strings.

    Args:
        subgraph: Iterable of dicts with "entity", "relationship" and
            "related_node". Nodes must support ``["name"]`` and relationships
            ``["type"]``.

    Returns:
        dict: ``{"nodes": [...], "edges": [...]}`` with duplicates removed and
        first-seen order preserved, so the output is the same on every run.
        Each edge reads ``"<start name> <type> <end name>"``.
    """
    # Dicts are used as insertion-ordered sets.
    nodes = {}
    edges = {}

    for entry in subgraph:
        entity = entry["entity"]
        related = entry["related_node"]
        relationship = entry["relationship"]

        # The subgraph is matched without regard to direction, so a triple may
        # arrive reversed. When the relationship exposes its start node (Neo4j
        # relationship objects do), put that node first in the edge text.
        start = getattr(relationship, "start_node", None)
        if start is not None and start == related and start != entity:
            entity, related = related, entity

        nodes[entity["name"]] = None
        nodes[related["name"]] = None

        edges[f"{entity['name']} {relationship['type']} {related['name']}"] = None

    return {"nodes": list(nodes), "edges": list(edges)}

In [118]:
def graphRAG_run(graph_context, user_query):
    """Answer ``user_query`` with an LLM, using the graph context as knowledge.

    Args:
        graph_context: Dict with "nodes" (list of names) and "edges" (list of
            edge strings).
        user_query: The question to answer.

    Returns:
        The ``message`` object of the first completion choice on success, or a
        string starting with "Error querying LLM: " when the request raises.
    """
    node_listing = ", ".join(graph_context["nodes"])
    edge_listing = "; ".join(graph_context["edges"])
    prompt = f"""
    You are an intelligent assistant with access to the following knowledge graph:

    Nodes: {node_listing}

    Edges: {edge_listing}

    Using this graph, Answer the following question:

    User Query: "{user_query}"
    """

    conversation = [
        {"role": "system", "content": "Provide the answer for the following question:"},
        {"role": "user", "content": prompt},
    ]

    # Only the remote call is guarded; a malformed graph_context fails above.
    try:
        completion = client.chat.completions.create(
            model='meta-llama/llama-3-8b-instruct',
            messages=conversation,
        )
        return completion.choices[0].message
    except Exception as e:
        return f"Error querying LLM: {str(e)}"

In [119]:
# End-to-end demo: extract a graph from a small table, store it in Neo4j and
# Qdrant, retrieve the neighbourhood relevant to a question, and answer it.
if __name__ == "__main__":
    print("Script started")
    print("Loading environment variables...")
    # NOTE(review): the credential variables used below were read from the
    # environment in an earlier cell, so values that only exist in .env.local
    # are not picked up by this call -- confirm which file is meant to win.
    load_dotenv('.env.local')
    print("Environment variables loaded")
    
    print("Initializing clients...")
    neo4j_driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))
    qdrant_client = QdrantClient(
        url=qdrant_url,
        api_key=qdrant_key
    )
    print("Clients initialized")
    
    print("Creating collection...")
    collection_name = "graphRAGstoreds"
    vector_dimension = 768
    create_collection(qdrant_client, collection_name, vector_dimension)
    print("Collection created/verified")
    
    print("Extracting graph components...")
    
    raw_data_table = """
    Here is a table of lap records:

    | Category        | Time          | Driver             | Vehicle             | Event              |
    |-----------------|---------------|--------------------|---------------------|--------------------|
    | LMP1            | 1:07.056 [16] | Christian Klien    | Peugeot 908 HDi FAP | 2008 Petit Le Mans |
    | LMP2            | 1:08.489 [16] | Ryan Briscoe       | Porsche RS Spyder Evo | 2008 Petit Le Mans |
    | DPi             | 1:08.869 [17] | Felipe Nasr        | Cadillac DPi-V.R    | 2019 Petit Le Mans |
    | LMDh            | 1:10.917 [18] | Sébastien Bourdais | Cadillac V-Series.R | 2023 Petit Le Mans |
    | LMP900          | 1:11.782 [19] | Allan McNish       | Audi R8             | 2000 Petit Le Mans |
    | LMP             | 1:12.653 [20] | J.J. Lehto         | BMW V12 LMR         | 1999 Petit Le Mans |
    | LMP675          | 1:12.781 [21] | James Weaver       | Lola EX257          | 2002 Petit Le Mans |
    | DP              | 1:13.478 [22] | Olivier Pla        | Ligier JS P2        | 2016 Petit Le Mans |
    | GT1 (Prototype) | 1:15.239 [23] | Allan McNish       | Porsche 911 GT1-98  | 1998 Petit Le Mans |
    """

    nodes, relationships = extract_graph_components(raw_data_table)
    print("Nodes:", nodes)
    print("Relationships:", relationships)
    
    print("Ingesting to Neo4j...")
    node_id_mapping = ingest_to_neo4j(nodes, relationships)
    print("Neo4j ingestion complete")
    
    
    print("Ingesting to Qdrant...")
    ingest_to_qdrant(collection_name, node_id_mapping)
    print("Qdrant ingestion complete")

    query = "What vehicle did Allan McNish drive in the 2000 Petit Le Mans?"
    print("Starting retriever search...")
    retriever_result = retriever_search(neo4j_driver, qdrant_client, collection_name, query)
    print("Retriever results:", retriever_result)
    
    print("Extracting entity IDs...")
    # Each item's content is text that embeds the node's properties. Take the
    # value that follows the "'id': '" marker and skip items without the
    # marker, instead of failing with an IndexError.
    entity_ids = []
    for item in retriever_result.items:
        _, marker, tail = item.content.partition("'id': '")
        if not marker:
            print(f"Skipping retriever item without an id: {item.content}")
            continue
        entity_ids.append(tail.split("'")[0])
    print("Entity IDs:", entity_ids)
    
    print("Fetching related graph...")
    subgraph = fetch_related_graph(neo4j_driver, entity_ids)
    print("Subgraph:", subgraph)
    
    print("Formatting graph context...")
    graph_context = format_graph_context(subgraph)
    print("Graph context:", graph_context)
    
    print("Running GraphRAG...")
    answer = graphRAG_run(graph_context, query)
    print("Final Answer:", answer)

Script started
Loading environment variables...
Environment variables loaded
Initializing clients...
Clients initialized
Creating collection...
Collection 'graphRAGstoreds' not found. Creating it now...
Collection 'graphRAGstoreds' created successfully.
Collection created/verified
Extracting graph components...
Raw response from model: {
  "graph": [
    {"node": "Christian Klien", "target_node": "Peugeot 908 HDi FAP", "relationship": "drives"},
    {"node": "Christian Klien", "target_node": "2008 Petit Le Mans", "relationship": "achieved_at"},
    {"node": "Peugeot 908 HDi FAP", "target_node": "2008 Petit Le Mans", "relationship": "recorded_at"},
    {"node": "Ryan Briscoe", "target_node": "Porsche RS Spyder Evo", "relationship": "drives"},
    {"node": "Ryan Briscoe", "target_node": "2008 Petit Le Mans", "relationship": "achieved_at"},
    {"node": "Porsche RS Spyder Evo", "target_node": "2008 Petit Le Mans", "relationship": "recorded_at"},
    {"node": "Felipe Nasr", "target_node": 