In [50]:
# Install the notebook's dependencies into the active kernel environment.
# NOTE(review): none of these packages are version-pinned, so a re-run may resolve
# to different releases (the recorded output shows numpy being replaced to satisfy
# the `numpy<2` requirement of `unstructured`). Restart the kernel after installing.
%pip install unstructured
%pip install unstructured-ingest
%pip install llama-index
%pip install llama-index-readers-json
%pip install llama-index-readers-file
%pip install llama-index-graph-stores-neo4j
%pip install llama-index-embeddings-azure-openai
%pip install llama-index-llms-azure-openai
%pip install llama-index-llms-huggingface
%pip install llama-index-vector-stores-neo4jvector
%pip install llama-index-extractors-entity
%pip install neo4j
%pip install "transformers[torch]" "huggingface_hub[inference]"
%pip install accelerate
%pip install llama-index-embeddings-huggingface
%pip install llama-index-embeddings-instructor
%pip install llama-index-embeddings-instructor

Collecting numpy<2 (from unstructured)
  Using cached numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl (14.0 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.1.1
    Uninstalling numpy-2.1.1:
      Successfully uninstalled numpy-2.1.1
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mythril 0.24.8 requires eth-abi<5.0.0,>=4.0.0, but you have eth-abi 5.1.0 which is incompatible.
mythril 0.24.8 requires eth-hash<0.4.0,>=0.3.1, but you have eth-hash 0.7.0 which is incompatible.
mythril 0.24.8 requires hexbytes<0.3.0, but you have hexbytes 1.2.1 which is incompatible.
Successfully installed numpy-1.26.4
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use up

In [53]:
import warnings

# Suppress every Python warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation notices emitted by the installed libraries.
warnings.filterwarnings('ignore')

import logging

# Only let CRITICAL records from the "neo4j" logger through.
neo4j_log = logging.getLogger("neo4j")
neo4j_log.setLevel(logging.CRITICAL)

In [54]:
from src.classes.utils.DebugLogger import DebugLogger
from src.classes.utils.EnvLoader import EnvLoader
import os
import nest_asyncio
import nltk
from llama_index.core import PropertyGraphIndex, StorageContext
from llama_index.core import SimpleDirectoryReader
from llama_index.core.base.response.schema import Response
from llama_index.core.indices.property_graph import SimpleLLMPathExtractor, SchemaLLMPathExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SimpleFileNodeParser
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.schema import TransformComponent, NodeWithScore, Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.extractors.entity import EntityExtractor
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.neo4jvector import Neo4jVectorStore
from neo4j import GraphDatabase
from llama_index.embeddings.openai import OpenAIEmbedding
from abc import ABC, abstractmethod
from typing import List, Literal, Tuple, Type, Union, Optional

from llama_index.core.graph_stores import PropertyGraphStore
from llama_index.core.indices.base import BaseIndex
from llama_index.legacy.chat_engine.types import BaseChatEngine, AgentChatResponse
from llama_index.legacy.llms import HuggingFaceLLM

# Patch asyncio via nest_asyncio.
# NOTE(review): presumably needed because the notebook kernel already runs an event loop — confirm.
nest_asyncio.apply()
# Download the NLTK `punkt_tab` and `averaged_perceptron_tagger_eng` resources
# (the recorded output shows both were already up to date).
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# Load environment files through the project's EnvLoader helper, pointed at "config".
# NOTE(review): which files are picked up is defined by EnvLoader, not visible here.
EnvLoader(env_dir="config").load_env_files()

# Module-level logger instance (project helper).
logger = DebugLogger(use_panel_for_errors=True)

# Backend name handed to ModelManager.get_llm when the graph index is built;
# get_llm accepts "openai", "azure" or "local".
LLM_MODE = "azure"

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteorizzo/PycharmProjects/smart-contracts-
[nltk_data]     vulnerabilities-ml-detector/venv/lib/python3.10/site-
[nltk_data]     packages/llama_index/legacy/_static/nltk_cache...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matteorizzo/PycharmProjects/smart-contracts-
[nltk_data]     vulnerabilities-ml-detector/venv/lib/python3.10/site-
[nltk_data]     packages/llama_index/legacy/_static/nltk_cache...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/matteorizzo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/matteorizzo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [55]:
class ModelManager:
    """
    Central access point for the LLMs and embedding models used in this notebook
    (OpenAI, Azure OpenAI and Hugging Face).

    Model names and Azure connection details are read from environment variables
    when the manager is constructed; a variable that is not set yields ``None``.
    Each model object is only built the first time it is requested and is cached
    on the instance afterwards.
    """

    def __init__(self) -> None:
        env = os.getenv

        # Chat model names: OpenAI and Azure OpenAI read the same variable.
        self.openai_model = env("OPENAI_MODEL_NAME_CHAT")
        self.azure_model = env("OPENAI_MODEL_NAME_CHAT")

        # Azure OpenAI connection details.
        self.azure_deployment_name = env("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")
        self.azure_api_key = env("AZURE_OPENAI_API_KEY")
        self.azure_endpoint = env("AZURE_OPENAI_ENDPOINT")
        self.azure_api_version = env("AZURE_OPENAI_API_VERSION")

        # Hugging Face model identifiers.
        self.huggingface_model_name = env("HUGGINGFACE_LLM_MODEL")
        self.huggingface_embed_model_name = env("HUGGINGFACE_EMBED_MODEL")

        # Caches for the lazily constructed model objects.
        self._openai_llm = None
        self._azure_llm = None
        self._local_llm = None
        self._openai_embed_model = None
        self._local_embed_model = None

    @property
    def openai_llm(self) -> OpenAI:
        """OpenAI chat LLM, created on first access."""
        if self._openai_llm is not None:
            return self._openai_llm
        self._openai_llm = OpenAI(model=self.openai_model)
        return self._openai_llm

    @property
    def azure_llm(self) -> AzureOpenAI:
        """Azure OpenAI chat LLM, created on first access."""
        if self._azure_llm is not None:
            return self._azure_llm
        self._azure_llm = AzureOpenAI(
            model=self.azure_model,
            deployment_name=self.azure_deployment_name,
            api_key=self.azure_api_key,
            azure_endpoint=self.azure_endpoint,
            api_version=self.azure_api_version,
        )
        return self._azure_llm

    @property
    def local_llm(self) -> HuggingFaceLLM:
        """Hugging Face LLM, created on first access."""
        if self._local_llm is not None:
            return self._local_llm
        self._local_llm = HuggingFaceLLM(model_name=self.huggingface_model_name, device_map="auto")
        return self._local_llm

    @property
    def openai_embed_model(self) -> OpenAIEmbedding:
        """OpenAI embedding model, created on first access."""
        if self._openai_embed_model is not None:
            return self._openai_embed_model
        self._openai_embed_model = OpenAIEmbedding(model="text-embedding-ada-002")
        return self._openai_embed_model

    @property
    def local_embed_model(self) -> HuggingFaceEmbedding:
        """Hugging Face embedding model, created on first access."""
        if self._local_embed_model is not None:
            return self._local_embed_model
        self._local_embed_model = HuggingFaceEmbedding(model_name=self.huggingface_embed_model_name)
        return self._local_embed_model

    def get_llm(self, llm_type: str) -> Union[OpenAI, AzureOpenAI, HuggingFaceLLM]:
        """
        Return the LLM that corresponds to ``llm_type``.

        :param llm_type: One of "openai", "azure" or "local".
        :return: The matching (lazily created) LLM instance.
        :raises ValueError: If ``llm_type`` is not one of the supported names.
        """
        if llm_type not in ("openai", "azure", "local"):
            raise ValueError(f"Invalid llm_type '{llm_type}'. Expected 'openai', 'azure', or 'local'.")
        # Resolve through the property so only the requested model is instantiated.
        return getattr(self, f"{llm_type}_llm")

    def get_embedding_model(self, embed_type: str) -> Union[OpenAIEmbedding, HuggingFaceEmbedding]:
        """
        Return the embedding model that corresponds to ``embed_type``.

        :param embed_type: Either "openai" or "local".
        :return: The matching (lazily created) embedding model instance.
        :raises ValueError: If ``embed_type`` is not one of the supported names.
        """
        if embed_type not in ("openai", "local"):
            raise ValueError(f"Invalid embed_type '{embed_type}'. Expected 'openai' or 'local'.")
        # Resolve through the property so only the requested model is instantiated.
        return getattr(self, f"{embed_type}_embed_model")

In [56]:
class Neo4jDBManager:
    """
    Holds the Neo4j connection settings and builds the graph and vector stores
    that are backed by that database.
    """

    def __init__(self, url: str = None, username: str = None, password: str = None, database: str = None):
        """
        Resolve the connection settings, preferring explicit arguments over the environment.

        :param url: Neo4j URL; falls back to ``NEO4J_URL``, then "bolt://localhost:7687".
        :param username: Neo4j user; falls back to ``NEO4J_USERNAME``, then "neo4j".
        :param password: Neo4j password; falls back to ``NEO4J_PASSWORD`` (no default).
        :param database: Database name; falls back to ``NEO4J_DATABASE``, then "neo4j".
        :raises ValueError: If no password could be resolved.
        """
        self.logger = DebugLogger(use_panel_for_errors=True)

        self.url = url if url else os.getenv("NEO4J_URL", "bolt://localhost:7687")
        self.username = username if username else os.getenv("NEO4J_USERNAME", "neo4j")
        self.password = password if password else os.getenv("NEO4J_PASSWORD")
        self.database = database if database else os.getenv("NEO4J_DATABASE", "neo4j")

        self._validate_password()
        self.logger.success(f"Neo4jDBManager initialized with URL: '{self.url}', Database: '{self.database}'.")

    def _validate_password(self):
        """
        Ensure a non-empty Neo4j password is available.

        :raises ValueError: If the password is missing or empty.
        """
        if self.password:
            return

        error_message = "Neo4j password is required. Set it in the environment or pass it directly."
        self.logger.error(error_message)
        raise ValueError(error_message)

    def create_graph_store(self) -> Neo4jPropertyGraphStore:
        """
        Build a Neo4jPropertyGraphStore using the stored connection settings.

        :return: The configured Neo4jPropertyGraphStore.
        :raises RuntimeError: If the store cannot be created.
        """
        return self._create_store(store_class=Neo4jPropertyGraphStore, store_name="Neo4jPropertyGraphStore")

    def create_vector_store(self, embedding_dimension: int = 1536, hybrid_search: bool = True) -> Neo4jVectorStore:
        """
        Build a Neo4jVectorStore using the stored connection settings.

        :param embedding_dimension: Embedding dimension passed to the store, defaults to 1536.
        :param hybrid_search: Hybrid-search flag passed to the store, defaults to True.
        :return: The configured Neo4jVectorStore.
        :raises RuntimeError: If the store cannot be created.
        """
        vector_options = {
            "embedding_dimension": embedding_dimension,
            "hybrid_search": hybrid_search,
        }
        return self._create_store(Neo4jVectorStore, "Neo4jVectorStore", **vector_options)

    def _create_store(self, store_class: type, store_name: str, **kwargs):
        """
        Instantiate ``store_class`` with the connection settings plus any extra options.

        :param store_class: Store class to instantiate.
        :param store_name: Human-readable store name used in log and error messages.
        :param kwargs: Extra keyword arguments forwarded to ``store_class``.
        :return: The created store instance.
        :raises RuntimeError: If instantiation fails for any reason (original error is chained).
        """
        try:
            connection = {
                "username": self.username,
                "password": self.password,
                "url": self.url,
                "database": self.database,
            }
            store_instance = store_class(**connection, **kwargs)
            self.logger.success(f"{store_name} instance created successfully.")
            return store_instance
        except Exception as e:
            error_message = f"Failed to create {store_name}: {e}"
            self.logger.error(error_message)
            raise RuntimeError(error_message) from e

In [57]:
# Define your database configuration.
# With no arguments, Neo4jDBManager resolves URL/username/password/database from the
# NEO4J_* environment variables and raises ValueError when no password is available.
db_config = Neo4jDBManager()

# Connect to the Neo4j database with the raw driver (used below to reset the database).
# NOTE(review): db_config.database is not passed to the driver here.
driver = GraphDatabase.driver(db_config.url, auth=(db_config.username, db_config.password))

In [58]:
# Function to reset the database completely
def reset_database(driver, database=None):
    """
    Wipe a Neo4j database: delete all nodes/relationships, then drop every
    constraint and every index reported by SHOW CONSTRAINTS / SHOW INDEXES.

    This is destructive and cannot be undone.

    :param driver: An open Neo4j driver (anything exposing ``session()`` as a context manager).
    :param database: Optional database name to open the session against. When omitted,
        ``driver.session()`` is called without arguments, exactly as before.
    """

    def _quote(name):
        # Backtick-quote the identifier (doubling embedded backticks) so names with
        # dashes, spaces or other special characters cannot break the DROP statement.
        return "`" + str(name).replace("`", "``") + "`"

    session_kwargs = {"database": database} if database else {}

    with driver.session(**session_kwargs) as session:
        # Clear all nodes and relationships
        session.run("MATCH (n) DETACH DELETE n")
        print("Data cleared.")

        # Drop all constraints. Names are collected first so the listing is fully
        # read before any DROP statement is issued on the same session.
        constraint_names = [record["name"] for record in session.run("SHOW CONSTRAINTS")]
        for constraint_name in constraint_names:
            session.run(f"DROP CONSTRAINT {_quote(constraint_name)}")
        print("All constraints dropped.")

        # Drop all indexes (listed after the constraints are gone)
        index_names = [record["name"] for record in session.run("SHOW INDEXES")]
        for index_name in index_names:
            session.run(f"DROP INDEX {_quote(index_name)}")
        print("All indexes dropped.")


# Execute the reset function; the driver is closed even if the reset raises,
# so a failed reset does not leave connections open in the kernel.
try:
    reset_database(driver)
finally:
    # Close the driver connection
    driver.close()

Data cleared.
All constraints dropped.
All indexes dropped.


In [59]:
class UnstructuredTransform(TransformComponent):
    """
    Transform component that parses documents into nodes with SimpleFileNodeParser
    and then runs EntityExtractor over them, via a freshly built IngestionPipeline.
    """

    def __call__(self, docs, **kwargs):
        """
        Run the parsing + entity-extraction pipeline over ``docs``.

        :param docs: Documents handed to the ingestion pipeline.
        :param kwargs: Accepted for interface compatibility; not used.
        :return: The nodes produced by the pipeline.
        """
        extractors = [EntityExtractor()]
        stages = [SimpleFileNodeParser(), *extractors]

        return IngestionPipeline(transformations=stages).run(documents=docs, show_progress=True)

In [60]:
class SchemaHandler:
    """
    Static provider of the knowledge-graph schema: the allowed entity labels,
    the allowed relation labels, and the (entity, relation, entity) triples
    considered valid.
    """

    @staticmethod
    def get_validation_schema() -> List[Tuple[str, str, str]]:
        """
        Build the list of valid (subject, relation, object) triples.

        The triples are declared per subject entity and flattened in declaration
        order; a new list is returned on every call.

        :return: A list of (entity, relation, entity) tuples.
        """
        allowed_by_subject = {
            "PRODUCT": (
                ("USED_BY", "PRODUCT"),
                ("USED_FOR", "MARKET"),
                ("HAS", "TECHNOLOGY"),
                ("MENTIONS", "CONCEPT"),
                ("PART_OF", "ORGANIZATION"),
            ),
            "MARKET": (
                ("LOCATED_IN", "LOCATION"),
                ("HAS", "TECHNOLOGY"),
                ("PART_OF", "ORGANIZATION"),
                ("REPRESENTS", "CONCEPT"),
            ),
            "TECHNOLOGY": (
                ("USED_BY", "PRODUCT"),
                ("USED_FOR", "MARKET"),
                ("LOCATED_IN", "LOCATION"),
                ("PART_OF", "ORGANIZATION"),
                ("IS_A", "PRODUCT"),
                ("ASSOCIATED_WITH", "CONCEPT"),
            ),
            "EVENT": (
                ("LOCATED_IN", "LOCATION"),
                ("PART_OF", "ORGANIZATION"),
                ("MENTIONS", "TOPIC"),
                ("REPRESENTS", "CONCEPT"),
            ),
            "CONCEPT": (
                ("USED_BY", "TECHNOLOGY"),
                ("USED_FOR", "PRODUCT"),
                ("INFLUENCES", "ORGANIZATION"),
                ("REPRESENTS", "TOPIC"),
                ("DERIVED_FROM", "DATA"),
            ),
            "ORGANIZATION": (
                ("LOCATED_IN", "LOCATION"),
                ("PART_OF", "ORGANIZATION"),
                ("PART_OF", "MARKET"),
                ("SUPPORTS", "PRODUCT"),
                ("ASSOCIATED_WITH", "CONCEPT"),
            ),
            "PERSON": (
                ("BORN_IN", "LOCATION"),
                ("BORN_IN", "TIME"),
                ("DIED_IN", "LOCATION"),
                ("DIED_IN", "TIME"),
                ("WORKED_ON", "EVENT"),
                ("WORKED_ON", "PRODUCT"),
                ("WORKED_ON", "CONCEPT"),
                ("WORKED_ON", "TECHNOLOGY"),
                ("ASSOCIATED_WITH", "ORGANIZATION"),
            ),
            "LOCATION": (
                ("LOCATED_IN", "LOCATION"),
                ("PART_OF", "LOCATION"),
                ("REPRESENTS", "MARKET"),
                ("MENTIONS", "EVENT"),
            ),
            "DATA": (
                ("MEASURES", "STATISTIC"),
                ("REPRESENTS", "TOPIC"),
                ("DERIVED_FROM", "DOCUMENT"),
                ("CONTAINS", "CELL"),
            ),
            "DOCUMENT": (
                ("HAS", "DATA"),
                ("MENTIONS", "QUESTION"),
                ("REFERENCES", "ANSWER"),
                ("SUPPORTS", "TASK"),
                ("SUMMARIZES", "TOPIC"),
            ),
            # Tabular structure
            "TABLE": (
                ("CONTAINS", "ROW"),
                ("HAS_COLUMN", "COLUMN"),
            ),
            "ROW": (("PART_OF", "TABLE"),),
            "COLUMN": (("HEADER_OF", "ROW"),),
            "CELL": (("VALUE_OF", "COLUMN"),),
        }

        return [
            (subject, relation, target)
            for subject, edges in allowed_by_subject.items()
            for relation, target in edges
        ]

    @staticmethod
    def get_entities() -> Type[str]:
        """
        Build the ``Literal`` type enumerating every entity label allowed in the graph.

        :return: A ``typing.Literal`` parameterized with the entity labels.
        """
        entity_labels = (
            "PERSON", "PLACE", "THING", "STATISTIC", "EVENT",
            "ORGANIZATION", "CONCEPT", "DATA", "DOCUMENT",
            "QUESTION", "ANSWER", "TASK", "TOPIC", "TABLE",
            "ROW", "COLUMN", "CELL", "PRODUCT", "MARKET",
            "TECHNOLOGY", "LOCATION", "TIME", "MISCELLANEOUS",
        )
        # Subscripting Literal with a tuple is the same as listing its items inline.
        return Literal[entity_labels]

    @staticmethod
    def get_relations() -> Type[str]:
        """
        Build the ``Literal`` type enumerating every relation label allowed in the graph.

        :return: A ``typing.Literal`` parameterized with the relation labels.
        """
        relation_labels = (
            "PART_OF", "HAS", "IS_A", "BELONGS_TO",
            "LOCATED_IN", "ASSOCIATED_WITH", "CREATED_BY",
            "INFLUENCES", "USES", "MEASURES", "REPRESENTS",
            "MENTIONS", "REFERENCES", "DERIVED_FROM",
            "EXPLAINS", "SUPPORTS", "SUMMARIZES", "CONTAINS",
            "HAS_COLUMN", "HAS_ROW", "HAS_CELL", "HEADER_OF", "VALUE_OF",
            "USED_BY", "USED_FOR", "WORKED_ON", "BORN_IN",
            "DIED_IN", "HAS_ALIAS",
        )
        # Subscripting Literal with a tuple is the same as listing its items inline.
        return Literal[relation_labels]

In [61]:
class KnowledgeManager(ABC):
    """
    Abstract base class for managing indexing, retrieval, and querying of knowledge data.
    Provides methods for indexing documents, creating a chat engine, and executing queries.

    Subclasses supply the storage-specific parts by implementing ``create_index``
    and ``load_index``.
    """

    def __init__(self, store: PropertyGraphStore, storage_context: StorageContext) -> None:
        """
        Initialize the KnowledgeManager with a store and storage context.

        :param store: The storage backend for managing knowledge data.
        :param storage_context: Configuration context for the storage backend.
        """
        self.store = store
        self.storage_context = storage_context
        self.persist_dir: str = ""
        # Both are populated lazily: the index by create_index/load_index,
        # the chat engine by create_chat_engine.
        self.index: Optional[BaseIndex] = None
        self.chat_engine: Optional[BaseChatEngine] = None
        self.logger = DebugLogger(use_panel_for_errors=True)

    def get_index(self) -> Optional[BaseIndex]:
        """
        Get the current index, if available.

        :return: The current index or None if not initialized.
        """
        return self.index

    def get_query_engine(self) -> Optional[BaseChatEngine]:
        """
        Get the current chat engine, if available.

        :return: The current chat engine or None if not initialized.
        """
        return self.chat_engine

    def index_documents(self, documents: List[Document], reload_index: bool = False) -> None:
        """
        Index a list of documents into the knowledge store.

        When ``reload_index`` is True an existing index is loaded first; the documents
        are only indexed if that load does not succeed.

        :param documents: List of Document objects to be indexed.
        :param reload_index: Whether to reload an existing index if available.
        :raises Exception: Re-raises any error that occurs during loading or indexing.
        """
        try:
            if reload_index and self.load_index():
                self.logger.success("Index loaded successfully. Skipping re-indexing.")
                return

            self.logger.info("Starting document indexing... This may take a while.")
            self.create_index(documents)
            self.logger.success("Document indexing completed successfully.")
        except Exception:
            self.logger.error("Error during document indexing:", exc_info=True)
            raise

    def refresh_index(self, documents: List[Document]):
        """
        Refresh the current index with the given documents.

        If no index is held yet, an attempt is made to load one from storage first.

        :param documents: List of Document objects to refresh the index with.
        :raises ValueError: If no index is available and none could be loaded.
        :raises Exception: Re-raises any error raised while refreshing.
        """
        try:
            if not self.index and self.load_index():
                self.logger.success("Index loaded successfully.")

            # Fail with an explicit message instead of an AttributeError on None
            # when there is still no index to refresh.
            if self.index is None:
                raise ValueError("Cannot refresh index: no index is initialized and none could be loaded.")

            self.logger.info("Re-indexing with new document.")
            self.index.refresh(documents, transformations=[UnstructuredTransform()])

        except Exception:
            self.logger.error("Error during new document indexing:", exc_info=True)
            raise

    def create_chat_engine(self) -> None:
        """
        Initialize the chat engine from the current index.

        :raises ValueError: If the index is not initialized or the chat engine setup fails.
        """
        if not self.index:
            error_message = "Cannot create chat engine: Index is not initialized."
            self.logger.error(error_message)
            raise ValueError(error_message)

        try:
            self.chat_engine = self.index.as_chat_engine(verbose=True)
            self.logger.success("Query engine initialized successfully.")
        except Exception as e:
            error_message = f"Failed to initialize chat engine: {e}"
            self.logger.error(error_message, exc_info=True)
            raise ValueError(error_message) from e

    def execute_query(self, query: str) -> Optional[AgentChatResponse]:
        """
        Execute a query on the knowledge store and return the result.

        A chat engine is created on demand when none exists yet.

        :param query: The query string to execute.
        :return: The response object if the query is successful, or None if the chat call fails.
        :raises ValueError: If a chat engine has to be created and that creation fails
            (this happens before the guarded chat call).
        """
        if not self.chat_engine:
            self.logger.info("Query engine not initialized. Creating a new chat engine...")
            self.create_chat_engine()

        try:
            return self.chat_engine.chat(query)
        except Exception:
            # Best-effort: failures of the chat call are logged and reported as None.
            self.logger.error("Query execution failed:", exc_info=True)
            return None

    @abstractmethod
    def create_index(self, documents: List[Document]) -> None:
        """
        Abstract method to create an index from a list of documents.

        :param documents: List of Document objects to be indexed.
        """
        pass

    @abstractmethod
    def load_index(self) -> bool:
        """
        Abstract method to load an existing index from storage.

        :return: True if the index was loaded successfully, False otherwise.
        """
        pass

In [62]:
class GraphManager(KnowledgeManager):
    """
    Manages graph store operations, including indexing and loading of a property graph index.
    Extends KnowledgeManager to support configurations specific to graph-based indexing.
    """

    def __init__(self, store: PropertyGraphStore, storage_context: StorageContext) -> None:
        """
        Initialize the GraphManager with a graph-based store and storage context.

        :param store: The graph data storage backend.
        :param storage_context: Context or configuration settings for storage management.
        """
        super().__init__(store, storage_context)
        self.persist_dir = "graph_index"
        self.logger.info("GraphManager initialized with a graph store.")

    def create_index(self, documents: List[Document]) -> None:
        """
        Index a list of documents into the knowledge graph.

        Two extractors are applied: a free-form SimpleLLMPathExtractor and a
        SchemaLLMPathExtractor constrained (non-strictly) by SchemaHandler. Both use
        the LLM selected by ``LLM_MODE``.

        :param documents: List of Document objects to be indexed.
        :raises RuntimeError: If indexing fails for any reason (original error is chained).
        """
        self.logger.info("Starting document indexing into the graph store. This may take some time.")

        try:
            # Resolve the configured LLM once and share it, so the free-form extractor
            # does not silently fall back to the library-wide default LLM while the
            # schema extractor uses the LLM_MODE one.
            extraction_llm = ModelManager().get_llm(LLM_MODE)

            self.index = PropertyGraphIndex.from_documents(
                documents=documents,
                kg_extractors=[
                    SimpleLLMPathExtractor(llm=extraction_llm),
                    SchemaLLMPathExtractor(
                        llm=extraction_llm,
                        possible_entities=SchemaHandler.get_entities(),
                        possible_relations=SchemaHandler.get_relations(),
                        kg_validation_schema=SchemaHandler.get_validation_schema(),
                        strict=False
                    ),
                ],
                property_graph_store=self.store,
                storage_context=self.storage_context,
                embed_kg_nodes=True,
                show_progress=True,
                transformations=[UnstructuredTransform()],
            )
            self.logger.success("Document indexing completed successfully.")
        except Exception as e:
            self.logger.error("Error during document indexing:", exc_info=True)
            raise RuntimeError("Graph indexing failed.") from e

    def load_index(self) -> bool:
        """
        Load the index from the graph store if available.

        Failures are logged and reported through the return value rather than raised.

        :return: True if the index was successfully loaded, False otherwise.
        """
        if not self.store:
            self.logger.warning("No graph store is available. Unable to load index.")
            return False

        try:
            self.logger.info("Attempting to load the index from the graph store.")
            self.index = PropertyGraphIndex.from_existing(
                property_graph_store=self.store, embed_kg_nodes=True
            )
            self.logger.success("Index loaded successfully from the graph store.")
            return True
        except Exception:
            self.logger.error("Error while loading the index:", exc_info=True)
            return False

In [63]:
class RAG:
    """
    A hybrid retrieval pipeline utilizing Neo4j for both vector-based document retrieval
    and knowledge graph storage. Supports structured and unstructured query handling.

    The base class does not create a knowledge manager itself; subclasses are expected
    to set ``self.knowledge_manager`` in ``_initialize_managers``.
    """

    def __init__(self, logger: Optional[DebugLogger] = None, db_manager: Optional[Neo4jDBManager] = None) -> None:
        """
        Initializes the RAG pipeline with components for graph and vector-based retrieval.

        :param logger: Logger to use; a new DebugLogger is created when omitted.
        :param db_manager: Instance of Neo4jDBManager for database interaction (optional;
            a default Neo4jDBManager is created when omitted).
        """
        self.logger = logger or DebugLogger(use_panel_for_errors=True)
        self.db_manager = db_manager or Neo4jDBManager()
        self.chat_engine = None
        self.knowledge_manager = None
        self._initialize_managers()

    def _initialize_managers(self) -> None:
        """
        Hook for setting up the knowledge manager.

        The base implementation only logs; subclasses override it to assign
        ``self.knowledge_manager``.
        """
        self.logger.info("Initializing RAG managers...")

    def _try_load_existing_index(self) -> bool:
        """Try to load an already persisted index; never raises, returns False on any failure."""
        if self.knowledge_manager is None:
            return False
        try:
            return bool(self.knowledge_manager.load_index())
        except Exception as e:
            self.logger.error(f"Error while loading existing index: {e}", exc_info=True)
            return False

    def load_and_index_documents(self, folder_path: str, reload_index: bool = False) -> None:
        """
        Loads and indexes documents into the Neo4j-backed knowledge store.

        :param folder_path: Path to the folder containing document files.
        :param reload_index: If True, first try to reuse the existing index; the documents
            in ``folder_path`` are only read and indexed when that load fails.
        """
        self.logger.info(f"{'Reloading' if reload_index else 'Loading'} documents from: {folder_path}")

        if reload_index:
            if self._try_load_existing_index():
                self.logger.success("Index loaded successfully. Skipping re-indexing.")
                return
            # Previously a failed reload went on to index an empty document list,
            # which "succeeded" without indexing anything. Fall back to the sources.
            self.logger.warning("Existing index could not be loaded. Indexing documents from the folder instead.")

        docs = self._load_docs(folder_path)

        if not docs:
            self.logger.warning("No documents available for indexing.")
            return

        self._index(docs, reload_index=False)

    @staticmethod
    @DebugLogger.profile
    def _load_docs(folder_path: str) -> List[Document]:
        """
        Loads documents from the specified folder.

        :param folder_path: Path to the folder containing document files.
        :return: List of loaded Document objects.
        """
        reader = SimpleDirectoryReader(input_dir=folder_path, errors="strict")
        return reader.load_data(show_progress=True)

    @DebugLogger.profile
    def _index(self, docs: List[Document], reload_index: bool) -> None:
        """
        Indexes the given documents through the knowledge manager.

        Errors are logged rather than raised.

        :param docs: List of Document objects to index.
        :param reload_index: Forwarded to the knowledge manager's ``index_documents``.
        """
        if self.knowledge_manager is None:
            # Explicit message instead of relying on a swallowed AttributeError.
            self.logger.error("Error during indexing: knowledge manager is not initialized.")
            return

        try:
            self.logger.info("Indexing documents into Neo4j...")
            self.knowledge_manager.index_documents(docs, reload_index)
            self.logger.success("Document indexing completed successfully.")
        except Exception as e:
            self.logger.error(f"Error during indexing: {e}", exc_info=True)

    def _initialize_chat_engine(self) -> None:
        """
        Sets up the chat engine via the knowledge manager. Failures are logged, not raised,
        in which case ``self.chat_engine`` stays unset.
        """
        try:
            self.logger.info("Initializing chat engine...")
            self.knowledge_manager.create_chat_engine()
            self.chat_engine = self.knowledge_manager.get_query_engine()
            self.logger.success("Query engine initialized successfully.")
        except Exception as e:
            self.logger.error(f"Failed to initialize chat engine: {e}", exc_info=True)

    def as_chat_engine(self) -> RetrieverQueryEngine:
        """
        Return the chat engine, initializing it on first use.

        NOTE(review): the value returned is whatever the knowledge manager's
        ``get_query_engine`` provides (or None if initialization failed); the return
        annotation is kept unchanged for compatibility.
        """
        if not self.chat_engine:
            self._initialize_chat_engine()
        return self.chat_engine

    @DebugLogger.profile
    def query(self, question: str) -> Optional[Response]:
        """
        Executes a query through the knowledge manager.

        :param question: The input query as a string.
        :return: Query response or None in case of an error.
        """
        if not self.chat_engine:
            self._initialize_chat_engine()

        try:
            self.logger.info(f"Executing query: {question}")
            response = self.knowledge_manager.execute_query(question)
            # execute_query reports failures as None; do not log those as a success.
            if response is None:
                self.logger.warning("Query returned no response.")
            else:
                self.logger.success("Query executed successfully.")
            return response
        except Exception as e:
            self.logger.error(f"Error during query execution: {e}", exc_info=True)
            return None

    @staticmethod
    def fetch_sources(nodes: List[NodeWithScore]) -> List[str]:
        """
        Filters nodes by similarity score and extracts unique source filenames.

        :param nodes: List of nodes from the query response.
        :return: List of unique filenames from nodes passing the 0.75 similarity cutoff
            (order is not guaranteed, since duplicates are removed through a set).
        """
        processor = SimilarityPostprocessor(similarity_cutoff=0.75)
        filtered_nodes = processor.postprocess_nodes(nodes)
        return list({node.node.metadata["file_name"] for node in filtered_nodes if "file_name" in node.node.metadata})

In [64]:
class GraphRAG(RAG):
    """
    A specialized implementation of the RAG (Retrieval-Augmented Generation) pipeline
    that stores and retrieves knowledge through a Neo4j property graph.

    It plugs a GraphManager, backed by the graph store created from the database
    manager, into the base RAG pipeline.
    """

    def __init__(self, logger=None, db_manager=None) -> None:
        """
        Initializes the GraphRAG pipeline.

        :param logger: Optional logger forwarded to RAG; a default one is created when omitted.
        :param db_manager: Optional Neo4jDBManager forwarded to RAG; a default one is
            created when omitted. Allows callers to supply explicit connection settings.
        """
        super().__init__(logger=logger, db_manager=db_manager)

    def _initialize_managers(self) -> None:
        """
        Creates the graph store, its storage context, and the GraphManager that is
        assigned to ``self.knowledge_manager``.

        Failures are logged and not re-raised; in that case ``self.knowledge_manager``
        keeps the value set by the base initializer (None).
        """
        try:
            self.logger.info("Initializing graph manager...")

            # Create the graph store and its associated storage context
            graph_store = self.db_manager.create_graph_store()
            graph_storage_context = StorageContext.from_defaults(graph_store=graph_store)

            # Initialize the GraphManager
            self.knowledge_manager = GraphManager(graph_store, graph_storage_context)

            self.logger.success("Graph manager initialized successfully.")
        except Exception as e:
            self.logger.error(f"Failed to initialize graph manager: {e}", exc_info=True)