Parsing, Text Chunking and Indexing (Ex. 1): Fast Start to RAG


In [None]:
%pip install "grpcio<=1.60.0,>=1.49.1" --no-cache-dir ## just to be safe with enviroments
!pip install -q llmware


Collecting grpcio<=1.60.0,>=1.49.1
  Downloading grpcio-1.60.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: grpcio
Successfully installed grpcio-1.60.0


## If deploying locally, use these database options


In [1]:
from llmware.configs import LLMWareConfig
# Database options intended for local deployment:
# "sqlite" is set as the active database and "faiss" as the vector database.
LLMWareConfig().set_active_db("sqlite")
LLMWareConfig().set_vector_db("faiss")

In [2]:
import os
from llmware.library import Library  # used to create the library and add files to it
from llmware.retrieval import Query  # used to run queries against the library
from llmware.setup import Setup  # used to load the llmware sample files
# NOTE(review): LLMWareConfig is also imported in the database configuration
# cell; the repeated import is harmless.
from llmware.configs import LLMWareConfig  # configuration settings


In [3]:
# Name under which the new llmware library is created
library_name = "example1_library"
print(f"\nStep 1 - creating library {library_name}")

# Create the library and keep the returned handle; later cells use `library`
# to add files, read the library card, and run queries.
library = Library().create_new_library(library_name)



Step 1 - creating library example1_library


In [4]:
# Folder names available in the llmware sample files; the first entry is
# picked as the folder to work with.
sample_folders = [
    "Agreements",
    "Invoices",
    "UN-Resolutions-500",
    "SmallLibrary",
    "FinDocs",
    "AgreementsLarge",
]
selected_folder = sample_folders[0]

In [5]:
# Display the selected folder name as the cell output
selected_folder

'Agreements'

In [6]:
# Load the llmware sample files and remember the folder they are saved in.
# NOTE(review): over_write=False presumably keeps an existing local copy
# instead of replacing it — confirm against the Setup API.
sample_files_path = Setup().load_sample_files(over_write=False)
print(f"Step 2 - loading the llmware sample files and saving at: {sample_files_path}")

Step 2 - loading the llmware sample files and saving at: /root/llmware_data/sample_files


In [10]:
# Reuse the folder chosen earlier in `selected_folder` instead of hard-coding
# the name a second time, so that changing the selection actually changes
# which sample folder gets parsed. `sample_folder` is kept as a name in case
# other cells refer to it.
sample_folder = selected_folder
ingestion_folder_path = os.path.join(sample_files_path, sample_folder)
print(f"Step 3 - parsing and indexing files from {ingestion_folder_path}")

Step 3 - parsing and indexing files from /root/llmware_data/sample_files/Agreements


In [11]:
# Parse and index the files in the ingestion folder into the library. The
# returned summary is printed as-is; in the recorded run it reports counts such
# as docs_added, blocks_added, pages_added and a rejected_files list.
# NOTE(review): re-running this cell may add the same files again — confirm
# before executing it more than once on the same library.
parsing_output = library.add_files(ingestion_folder_path)
print(f"Step 4 - completed parsing - {parsing_output}")

Step 4 - completed parsing - {'docs_added': 15, 'blocks_added': 1653, 'images_added': 0, 'pages_added': 204, 'tables_added': 0, 'rejected_files': []}


In [12]:
# Re-read the library card after ingestion and pull out the document and block
# totals so they can be reported next to the full card.
updated_library_card = library.get_library_card()
doc_count, block_count = updated_library_card["documents"], updated_library_card["blocks"]
print(f"Step 5 - updated library card - documents - {doc_count} - blocks - {block_count} - {updated_library_card}")

Step 5 - updated library card - documents - 15 - blocks - 1653 - {'_id': 1, 'library_name': 'example1_library', 'embedding': [{'embedding_status': 'no', 'embedding_model': 'none', 'embedding_db': 'none', 'embedded_blocks': 0, 'embedding_dims': 0, 'time_stamp': 'NA'}], 'knowledge_graph': 'no', 'unique_doc_id': 15, 'documents': 15, 'blocks': 1653, 'images': 0, 'pages': 204, 'tables': 0, 'account_name': 'llmware'}


In [13]:
# Folder path where the library's artifacts (including extracted images) are saved
library_path = library.library_main_path
print(f"Step 6 - library artifacts - including extracted images - saved at folder path - {library_path}")

Step 6 - library artifacts - including extracted images - saved at folder path - /root/llmware_data/accounts/llmware/example1_library


In [14]:
# Run a simple text query against the library to confirm that parsing and
# indexing produced searchable content.
test_query = "base salary"
print(f"\nStep 7 - running a test query - {test_query}\n")

# Build the query handle first, then ask for at most 10 results.
library_query = Query(library)
query_results = library_query.text_query(test_query, result_count=10)

# Print each result together with its position in the result list.
for i, result in enumerate(query_results):
    print("query results: ", i, result)


Step 7 - running a test query - base salary

query results:  0 {'query': 'base salary', '_id': '25', 'text': " Executive's base salary shall be reviewed annually   by the Board (or the compensation committee of the Board), pursuant to Employer's normal   compensation and performance review policies for senior level executives, and may be increased but not   decreased. The amount of any increase for each year shall be determined accordingly. For purposes of this   Agreement, the term “Base Salary” shall mean the amount of Executive's base salary established from   time to time pursuant to this Section 2.2.", 'doc_ID': 1, 'block_ID': 24, 'page_num': 3, 'content_type': 'text', 'author_or_speaker': '', 'special_field1': '', 'file_source': 'Eileithyia EXECUTIVE EMPLOYMENT AGREEMENT.pdf', 'added_to_collection': 'Mon Apr 15 21:18:56 2024', 'table': '', 'coords_x': 1093, 'coords_y': -1825, 'coords_cx': 2, 'coords_cy': 134, 'external_files': '', 'score': -8.027992635076878, 'similarity': 0.0, 