In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from graphiti.graph import KnowledgeGraph
from neo4j_graphrag.retrievers import QdrantNeo4jRetriever

import os
from pathlib import Path
from dotenv import load_dotenv, find_dotenv

In [None]:
# Populate the process environment from the .env file located by find_dotenv().
load_dotenv(find_dotenv())

# Qdrant (vector store) connection settings; each is None when the variable is unset.
qdrant_url = os.environ.get("QDRANT_URL")
qdrant_key = os.environ.get("QDRANT_KEY")

# Neo4j (graph database) connection settings.
neo4j_uri = os.environ.get("NEO4J_URI")
neo4j_username = os.environ.get("NEO4J_USERNAME")
neo4j_password = os.environ.get("NEO4J_PASSWORD")

# API key handed to the chat-completion client created further down.
openai_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# NOTE(review): `GraphDatabase` is not imported in the visible cells —
# presumably `from neo4j import GraphDatabase`; confirm before running.
neo4j_driver = GraphDatabase.driver(
    neo4j_uri,
    auth=(neo4j_username, neo4j_password),
)

qdrant_client = QdrantClient(api_key=qdrant_key, url=qdrant_url)
collection_name = "graphRAGstoreds"

# Drop any existing collection with this name so later cells start from a
# clean slate. A failure here is reported but deliberately not re-raised.
try:
    qdrant_client.delete_collection(collection_name=collection_name)
except Exception as exc:
    print(f"Could not delete collection (it might not exist, which is OK): {exc}")
else:
    print(f"Collection '{collection_name}' has been deleted successfully.")

In [None]:
# Chat-completion client pointed at the OpenRouter endpoint rather than the
# default OpenAI base URL; the key comes from OPENAI_API_KEY.
# NOTE(review): `OpenAI` is not imported in the visible cells — presumably
# `from openai import OpenAI`; confirm before running.
client = OpenAI(api_key=openai_key, base_url="https://openrouter.ai/api/v1")

def openai_llm_parser(prompt_input: str) -> KnowledgeGraph:
    """
    Extract a knowledge graph from free text with a single chat-completion call.

    The instructions and the text to analyse are sent together as one system
    message, and the model is asked for a JSON object of the form
    ``{"graph": [{"h", "type_h", "r", "o", "type_t"}, ...]}``.

    Args:
        prompt_input: Raw text from which entities and relations are extracted.

    Returns:
        The ``KnowledgeGraph`` parsed from the model's JSON reply. This is a
        best-effort parser: when the reply is missing, empty, or fails
        validation, an empty graph (``KnowledgeGraph(graph=[])``) is returned
        instead of raising.
    """
    # Literal braces in the JSON examples are doubled because this is an
    # f-string; only {prompt_input} is interpolated.
    system_prompt = f"""Your task is to act as an expert information extractor. From the provided INPUT_TEXT, you will extract a knowledge graph.

    The output must be a JSON object with a single key "graph", which contains a list of structured objects. Each object represents a relationship triplet and must have the following keys: 'h', 'type_h', 'r', 'o', 'type_t'.

    GUIDELINES:
    1.  'h' (head) and 'o' (tail) are the entities.
    2.  'type_h' and 'type_t' are the general categories. You must infer these types. Types should be concise, capitalized, singular nouns (e.g., PERSON, COMPANY, VEHICLE, LOCATION, PRODUCT).
    3.  **Crucially, identify abstract concepts like EVENTS (e.g., 'Battle of New York', 'Ultron's Attack') and PROTOCOLS (e.g., 'Sokovia Accords').**
    4.  'r' (relationship) is a short, active verb.
      - For actions between entities, use verbs like: Drove, Invented, Created, Wields, Led, Defeated.
      - **For cause-and-effect, use verbs like: Caused, LedTo, ResultedIn.**
      - **For participation, use: ParticipatedIn.**
    5.  **Entity Disambiguation**: Consolidate different names for the same entity.
    6.  **Simplicity**: Keep entity names short and specific.

    EXAMPLE 1 (Business):
    - Input: 'The 2008 financial crisis led to the creation of the Dodd-Frank Act.'
    - Output:
    {{
      "graph": [
        {{ "h": "2008 Financial Crisis", "type_h": "EVENT", "r": "LedTo", "o": "Dodd-Frank Act", "type_t": "PROTOCOL" }}
      ]
    }}

    EXAMPLE 2 (MCU - a more relevant example for you):
    - Input: 'The Battle of New York was a major conflict where the Avengers first assembled to fight Loki.'
    - Output:
    {{
      "graph": [
          {{ "h": "Avengers", "type_h": "GROUP", "r": "ParticipatedIn", "o": "Battle of New York", "type_t": "EVENT" }},
          {{ "h": "Loki", "type_h": "PERSON", "r": "ParticipatedIn", "o": "Battle of New York", "type_t": "EVENT" }}
      ]
    }}

    Your output MUST be a valid JSON object. Do not add any text before or after the JSON.

    ===========================================================
    INPUT_TEXT:
    {prompt_input}
    """

    completion = client.chat.completions.create(
        model="qwen/qwen3-235b-a22b:free",
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "system",
                "content": system_prompt
            }
        ]
    )

    # `choices` can be missing or empty and `content` can be null; indexing
    # blindly would raise outside the best-effort handling below.
    choices = completion.choices
    response_content = choices[0].message.content if choices else None
    print("Raw response from model:", response_content)

    if not response_content:
        print("Model returned no content; returning an empty graph.")
        return KnowledgeGraph(graph=[])

    try:
        return KnowledgeGraph.model_validate_json(response_content)
    except Exception as e:
        # Deliberately broad: any malformed reply degrades to an empty graph
        # so one bad chunk does not abort a whole extraction run.
        print(f"Pydantic validation failed: {e}")
        return KnowledgeGraph(graph=[])

In [None]:
class HybridRAGPipeline:
    """Index PDFs into Qdrant and a knowledge graph, then answer questions.

    Ingestion steps are separate methods meant to be called in order:
    ``load_pdfs`` -> ``split_documents`` -> ``index_embeddings_to_qdrant`` ->
    ``create_ontology_graph``. ``hybrid_retrieve_and_answer`` then combines
    vector-search hits and graph hits into one LLM prompt.
    """

    def __init__(self, pdf_folder: str, qdrant_url: str, graph_name: str):
        """Create the clients and models used by the pipeline.

        Args:
            pdf_folder: Directory searched (non-recursively) for ``*.pdf`` files.
            qdrant_url: URL of the Qdrant server.
            graph_name: Used both as the Qdrant collection name and as the
                name of the knowledge graph.
        """
        self.pdf_folder = pdf_folder
        self.qdrant_url = qdrant_url
        self.graph_name = graph_name

        self.embedding_model = OpenAIEmbeddings()
        self.llm = ChatOpenAI(temperature=0)
        # NOTE(review): elsewhere in this file KnowledgeGraph is used like a
        # pydantic model (graph=[] / model_validate_json); confirm which API
        # this name actually refers to.
        self.graph = KnowledgeGraph(name=graph_name, db="mongo")
        self.qdrant_client = QdrantClient(qdrant_url)
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=30)

        self.documents = []  # documents returned by the PDF loader
        self.chunks = []     # split documents, each tagged with metadata["id"]

    @staticmethod
    def _qdrant_point_id(chunk_id: str) -> str:
        """Map a readable chunk id (e.g. ``"chunk_0"``) to a deterministic UUID string.

        Qdrant only accepts unsigned integers or UUIDs as point ids, so the
        readable id cannot be used directly. UUIDv5 keeps the mapping stable
        across runs.
        """
        import uuid  # local import keeps this class self-contained

        return str(uuid.uuid5(uuid.NAMESPACE_URL, chunk_id))

    def load_pdfs(self):
        """Load every ``*.pdf`` in ``pdf_folder`` and append the results to ``self.documents``."""
        # Sorted so that load order, and therefore the chunk ids assigned in
        # split_documents, does not depend on filesystem enumeration order.
        pdf_paths = sorted(Path(self.pdf_folder).glob("*.pdf"))
        for pdf_path in pdf_paths:
            self.documents.extend(PyPDFLoader(str(pdf_path)).load())
        print(f"✅ Loaded {len(self.documents)} documents from {len(pdf_paths)} PDFs.")

    def split_documents(self):
        """Split ``self.documents`` into chunks and tag each with ``metadata["id"] = "chunk_<i>"``."""
        self.chunks = self.text_splitter.split_documents(self.documents)
        for i, chunk in enumerate(self.chunks):
            chunk.metadata["id"] = f"chunk_{i}"
        print(f"🧩 Split into {len(self.chunks)} chunks.")

    def index_embeddings_to_qdrant(self):
        """Embed all chunks and store them in the Qdrant collection named ``graph_name``.

        The readable chunk id stays in each chunk's metadata; the Qdrant point
        id is the UUID derived from it by ``_qdrant_point_id``.
        """
        # from_documents builds its own client from connection kwargs, so the
        # server is passed as `url=`; a `qdrant_client=` kwarg is not one of
        # its parameters.
        Qdrant.from_documents(
            documents=self.chunks,
            embedding=self.embedding_model,
            url=self.qdrant_url,
            collection_name=self.graph_name,
            ids=[self._qdrant_point_id(chunk.metadata["id"]) for chunk in self.chunks],
        )
        print("📦 Stored vectors into Qdrant.")

    def create_ontology_graph(self):
        """Add one graph node per chunk, annotated with LLM-extracted ontology concepts.

        Makes one LLM call per chunk, then asks the graph to link similar
        nodes and to save itself.
        """
        for chunk in self.chunks:
            content = chunk.page_content
            ontology_prompt = f"Extract key ontology concepts from the text:\n\n{content}"
            ontology = self.llm.predict(ontology_prompt)
            self.graph.add_node(content, metadata={"id": chunk.metadata["id"], "ontology": ontology})

        self.graph.link_similar_nodes()
        self.graph.save()
        print("🌐 Created semantic graph in Graphiti.")

    def hybrid_retrieve_and_answer(self, query: str, top_k: int = 3):
        """Answer ``query`` from vector-search and graph context combined.

        Args:
            query: The user question.
            top_k: Number of hits requested from EACH retriever, so the prompt
                may contain up to ``2 * top_k`` context snippets.

        Returns:
            The LLM's answer as returned by ``self.llm.predict``.
        """
        # Semantic (vector) retrieval from the existing collection.
        vectorstore = Qdrant(
            client=self.qdrant_client,
            collection_name=self.graph_name,
            embeddings=self.embedding_model
        )
        semantic_docs = vectorstore.similarity_search(query, k=top_k)

        # Graph retrieval.
        # presumably each hit is a mapping with a "content" key — verify
        # against the graph library.
        graph_docs = self.graph.find_related(query, top_k=top_k)

        # Combine both result sets into a single context string.
        context = "\n\n".join(
            [doc.page_content for doc in semantic_docs] +
            [node["content"] for node in graph_docs]
        )

        prompt = f"""Answer the question based on the following context:\n\n{context}\n\nQuestion: {query}"""
        answer = self.llm.predict(prompt)
        return answer


In [None]:
# Build the pipeline against a local Qdrant instance.
# NOTE(review): this hard-coded URL differs from the QDRANT_URL read from the
# environment earlier in the file — confirm which server is intended.
pipeline = HybridRAGPipeline(
    graph_name="carbon_hybrid_rag",       # name of the collection / graph
    qdrant_url="http://localhost:6333",   # URL of the local Qdrant server
    pdf_folder="pdf_docs",                # folder containing the .pdf files
)

# Ingestion, in dependency order: load -> split -> vector index -> graph.
pipeline.load_pdfs()
pipeline.split_documents()
pipeline.index_embeddings_to_qdrant()
pipeline.create_ontology_graph()
