In [None]:
%pip install autoflow-ai==0.0.1.dev4
%pip install dotenv
%pip install sqlalchemy

In [42]:
import logging
import os
import dotenv
from sqlalchemy import create_engine

# Logger for this notebook, named after the running module.
logger = logging.getLogger(__name__)

# Load environment variables from a .env file (presumably where DATABASE_URL,
# read via os.getenv in a later cell, is defined — confirm for your setup).
# The call is the cell's last expression, so its return value is displayed.
dotenv.load_dotenv()

True

#### Create knowledge base

In [44]:
from autoflow import Autoflow
from autoflow.schema import IndexMethod
from autoflow.llms.chat_models import ChatModel
from autoflow.llms.embeddings import EmbeddingModel


# Read the connection string up front and fail fast with an actionable message:
# os.getenv returns None when the variable is unset, and passing None straight
# to create_engine produces an error that does not mention DATABASE_URL.
database_url = os.getenv("DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in a .env "
        "file before running this cell."
    )

db_engine = create_engine(database_url)
af = Autoflow(db_engine=db_engine)

# Models handed to the knowledge base below.
chat_model = ChatModel("gpt-4o-mini")
embed_model = EmbeddingModel(model_name="text-embedding-3-small", dimensions=1536)

# Create a knowledge base with both the vector-search and knowledge-graph
# index methods enabled.
kb = af.create_knowledge_base(
    name="New KB",
    description="This is a knowledge base for testing",
    index_methods=[IndexMethod.VECTOR_SEARCH, IndexMethod.KNOWLEDGE_GRAPH],
    chat_model=chat_model,
    embedding_model=embed_model,
)
# Display the created knowledge base as the cell output.
kb

INFO:autoflow.storage.doc_store.tidb.tidb_doc_store:Document table <documents> is already exists, no action to do.
INFO:autoflow.storage.doc_store.tidb.tidb_doc_store:Chunk table <chunks_fcf4123e-7c90-43d0-b5cd-746966ca8207> has been created successfully.
INFO:autoflow.storage.graph_store.tidb.tidb_graph_store:Entities table <entities_fcf4123e-7c90-43d0-b5cd-746966ca8207> has been created successfully.
INFO:autoflow.storage.graph_store.tidb.tidb_graph_store:Relationships table <relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207> has been created successfully.


KnowledgeBase(id=UUID('fcf4123e-7c90-43d0-b5cd-746966ca8207'), name='New KB', index_methods=[<IndexMethod.VECTOR_SEARCH: 'VECTOR_SEARCH'>, <IndexMethod.KNOWLEDGE_GRAPH: 'KNOWLEDGE_GRAPH'>], description='This is a knowledge base for testing', chunking_config=GeneralChunkingConfig(mode=<ChunkingMode.GENERAL: 'general'>, chunk_size=1200, chunk_overlap=200, paragraph_separator='\n\n\n'), data_sources=[])

#### Import documents from files

In [45]:
import os

# Directory used to locate the test fixtures. The previous expression,
# dirname(abspath("__file__")), used the quoted string "__file__", so it never
# resolved the notebook's own location; it was just the kernel's working
# directory in disguise. Say so explicitly.
# NOTE: run the notebook from the directory that contains `fixtures/`.
current_dir = os.getcwd()
current_dir

'/Users/liangzhiyuan/Projects/tidb.ai/core/tests'

In [46]:
from pathlib import Path

# Path of the markdown fixture to ingest, resolved under `current_dir`.
overview_doc = Path(current_dir).joinpath("fixtures", "tidb-overview.md")

# Import the single file into the knowledge base; the call's return value is
# displayed as the cell output.
kb.import_documents_from_files(files=[overview_doc])

INFO:autoflow.storage.graph_store.tidb.tidb_graph_store:Save entities for relationship: TiDB -> TiDB can be deployed in a Self-Managed model, providing users with control over their database setup. -> Self-Managed
INFO:autoflow.storage.graph_store.tidb.tidb_graph_store:Save entities for relationship: TiDB -> TiDB Self-Managed is a deployment option of TiDB that allows users to manage the database on their own infrastructure. -> TiDB Self-Managed
INFO:autoflow.storage.graph_store.tidb.tidb_graph_store:Save entities for relationship: TiDB -> TiDB utilizes TiKV as its row-based storage engine to support real-time data replication. -> TiKV
INFO:autoflow.storage.graph_store.tidb.tidb_graph_store:Save entities for relationship: TiDB -> TiDB employs TiFlash as its columnar storage engine to ensure consistent data storage and real-time replication from TiKV. -> TiFlash
INFO:autoflow.storage.graph_store.tidb.tidb_graph_store:Save entities for relationship: TiDB -> TiDB Operator facilitates the 

[]

In [47]:
# Query the knowledge base's documents, asking for the top 2 hits by similarity.
result = kb.search_documents(query="What is TiDB?", similarity_top_k=2)

# Display each returned chunk as a (score, text) pair.
[(hit.score, hit.chunk.text) for hit in result.chunks]

[(0.7382171054172685,
  'What is TiDB Self-Managed Key features\n<!-- Localization note for TiDB:\n- English: use distributed SQL, and start to emphasize HTAP\n- Chinese: can keep "NewSQL" and emphasize one-stop real-time HTAP ("一栈式实时 HTAP")\n- Japanese: use NewSQL because it is well-recognized\n-->\nTiDB (/\'taɪdiːbi:/, "Ti" stands for Titanium) is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. It is MySQL compatible and features horizontal scalability, strong consistency, and high availability. The goal of TiDB is to provide users with a one-stop database solution that covers OLTP (Online Transactional Processing), OLAP (Online Analytical Processing), and HTAP services. TiDB is suitable for various use cases that require high availability and strong consistency with large-scale data.\nTiDB Self-Managed is a product option of TiDB, where users or organizations can deploy and manage TiDB on their own infrastructure

In [50]:
# Run the same question against the knowledge graph.
kg = kb.search_knowledge_graph(query="What is TiDB?")

# Display the `rag_description` of every relationship returned.
[rel.rag_description for rel in kg.relationships]

INFO:autoflow.storage.graph_store.tidb.tidb_graph_store:Debug - SQL Query: 
SELECT `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.id, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.description, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.description_vec, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.source_entity_id, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.target_entity_id, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.meta, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.weight, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.chunk_id, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.document_id, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.created_at, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.updated_at, %s - anon_1.embedding_distance AS similarity_score, `entities_fcf4123e-7c90-43d0-b5cd-746966ca8207_1`.id AS id_1, `entities_fcf4123e-7c90-43d0-b5cd-746966ca8207_1`.entity_type, `entities_fcf4123e-7c90-43

INFO:autoflow.storage.graph_store.tidb.tidb_graph_store:Debug - SQL Query: 
SELECT `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.id, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.description, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.description_vec, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.source_entity_id, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.target_entity_id, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.meta, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.weight, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.chunk_id, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.document_id, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.created_at, `relationships_fcf4123e-7c90-43d0-b5cd-746966ca8207`.updated_at, %s - anon_1.embedding_distance AS similarity_score, `entities_fcf4123e-7c90-43d0-b5cd-746966ca8207_1`.id AS id_1, `entities_fcf4123e-7c90-43d0-b5cd-746966ca8207_1`.entity_type, `entities_fcf4123e-7c90-43

['TiDB -> TiDB can be deployed in a Self-Managed model, providing users with control over their database setup. -> Self-Managed',
 'TiDB -> TiDB Self-Managed is a deployment option of TiDB that allows users to manage the database on their own infrastructure. -> TiDB Self-Managed',
 'TiDB -> TiDB utilizes TiKV as its row-based storage engine to support real-time data replication. -> TiKV',
 'TiDB -> TiDB employs TiFlash as its columnar storage engine to ensure consistent data storage and real-time replication from TiKV. -> TiFlash',
 'TiDB -> TiDB Operator facilitates the management of TiDB on Kubernetes, automating operational tasks. -> TiDB Operator',
 'TiDB -> TiDB Cloud is the fully-managed service that allows users to deploy and run TiDB clusters in the cloud. -> TiDB Cloud',
 'TiDB Self-Managed -> TiDB Self-Managed utilizes TiDB Architecture to structure its deployment and operations. -> TiDB Architecture',
 'TiDB Self-Managed -> TiDB Self-Managed relies on TiDB Storage for data m