# **If you are using Colab for free, we highly recommend you activate the T4 GPU hardware accelerator. Our models are designed to run with at least 16GB of RAM; activating T4 will grant the notebook 16GB of GDDR6 RAM, as opposed to the 13GB Colab gives by default.**
# **To activate T4:**
# **1.   click on the "Runtime" tab**
# **2.   click on "Change runtime type"**
# **3.   select T4 GPU under Hardware Accelerator**
# **NOTE: there is a weekly usage limit on using T4 for free**





In [None]:
!pip install llmware

Collecting llmware
  Downloading llmware-0.3.0-py3-none-any.whl (56.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.0/56.0 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3>=1.24.53 (from llmware)
  Downloading boto3-1.34.120-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m698.5 kB/s[0m eta [36m0:00:00[0m
Collecting pymongo>=4.7.0 (from llmware)
  Downloading pymongo-4.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (669 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m669.1/669.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting psycopg-binary==3.1.17 (from llmware)
  Downloading psycopg_binary-3.1.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting psycopg==3.1.17 (from llmware)
 

In [None]:
import os
from llmware.library import Library
from llmware.retrieval import Query
from llmware.setup import Setup
from llmware.configs import LLMWareConfig

In [None]:
# Select SQLite as the active database for llmware.
LLMWareConfig().set_active_db("sqlite")
# Set the "debug_mode" config option to 2.
# NOTE(review): presumably this controls how much progress/log output is shown — confirm against the llmware docs.
LLMWareConfig().set_config("debug_mode", 2)

In [None]:
# Name under which the example library is created.
library_name = "example1_library"

# Sample document folders to choose from; change the index below to ingest
# a different folder.
sample_folders = [
    "Agreements",
    "Invoices",
    "UN-Resolutions-500",
    "SmallLibrary",
    "FinDocs",
    "AgreementsLarge",
]
selected_folder = sample_folders[0]

In [None]:
# Step 1: print the example banner, then create a new library under
# `library_name` and keep the returned object in `library`.
print("\nExample - Parsing Files into Library")
print(f"\nStep 1 - creating library {library_name}")

library = Library().create_new_library(library_name)


Example - Parsing Files into Library

Step 1 - creating library example1_library


In [None]:
# Step 2: load the llmware sample files; the call returns the folder path
# where they are stored, which is printed below.
# NOTE(review): over_write=False presumably reuses files already present instead of fetching them again — confirm.
sample_files_path = Setup().load_sample_files(over_write=False)
print(f"Step 2 - loading the llmware sample files and saving at: {sample_files_path}")

Step 2 - loading the llmware sample files and saving at: /root/llmware_data/sample_files


In [None]:
# Step 3: build the path of the folder to ingest by joining the sample files
# directory with the folder chosen in `selected_folder`.
ingestion_folder_path = os.path.join(sample_files_path, selected_folder)
print(f"Step 3 - parsing and indexing files from {ingestion_folder_path}")

Step 3 - parsing and indexing files from /root/llmware_data/sample_files/Agreements


In [None]:
# Step 4: add the files in the ingestion folder to the library.
# The return value is a summary of the parsing run; it is printed as-is
# (the recorded run shows counts such as docs_added and blocks_added, plus rejected_files).
parsing_output = library.add_files(ingestion_folder_path)
print(f"Step 4 - completed parsing - {parsing_output}")

Step 4 - completed parsing - {'docs_added': 15, 'blocks_added': 2211, 'images_added': 0, 'pages_added': 204, 'tables_added': 0, 'rejected_files': []}


In [None]:
# Step 5: fetch the library card after parsing and read the document and
# block counts from its "documents" and "blocks" entries.
updated_library_card = library.get_library_card()
doc_count = updated_library_card["documents"]
block_count = updated_library_card["blocks"]
# Print the two counts followed by the full library card for inspection.
print(f"Step 5 - updated library card - documents - {doc_count} - blocks - {block_count} - {updated_library_card}")

Step 5 - updated library card - documents - 15 - blocks - 2211 - {'_id': 1, 'library_name': 'example1_library', 'embedding': [{'embedding_status': 'no', 'embedding_model': 'none', 'embedding_db': 'none', 'embedded_blocks': 0, 'embedding_dims': 0, 'time_stamp': 'NA'}], 'knowledge_graph': 'no', 'unique_doc_id': 15, 'documents': 15, 'blocks': 2211, 'images': 0, 'pages': 204, 'tables': 0, 'account_name': 'llmware'}


In [None]:
# Step 6: read the library's main folder path and report it as the location
# of the saved library artifacts.
library_path = library.library_main_path
print(f"Step 6 - library artifacts - including extracted images - saved at folder path - {library_path}")

Step 6 - library artifacts - including extracted images - saved at folder path - /root/llmware_data/accounts/llmware/example1_library


In [None]:
# Step 7: run a text query against the library, asking for up to 10 results.
test_query = "base salary"
print(f"\nStep 7 - running a test query - {test_query}\n")

query_results = Query(library).text_query(test_query, result_count=10)

for i, result in enumerate(query_results):
    # Each result is indexed by key; a few of the available keys are pulled
    # out by name here to show what a result contains.
    text = result["text"]
    file_source = result["file_source"]
    page_number = result["page_num"]
    doc_id = result["doc_ID"]
    block_id = result["block_ID"]
    matches = result["matches"]

    # Print the position of the result together with the full result record.
    print("query results: ", i, result)


Step 7 - running a test query - base salary

query results:  0 {'query': 'base salary', '_id': '1663', 'text': " Base Salary. For all the services rendered by Executive hereunder, during the Employment Period,   Employer shall pay Executive a base salary at the annual rate of $200,000, payable semimonthly in   accordance with Employer's normal payroll practices. Executive's base salary shall be reviewed annually   by the Board (or the compensation committee of the Board), pursuant to Employer's normal   compensation ", 'doc_ID': 12, 'block_ID': 33, 'page_num': 3, 'content_type': 'text', 'author_or_speaker': '', 'special_field1': '', 'file_source': 'Metis EXECUTIVE EMPLOYMENT AGREEMENT.pdf', 'added_to_collection': 'Thu Jun  6 13:51:29 2024', 'table': '', 'coords_x': 427, 'coords_y': -1681, 'coords_cx': 2, 'coords_cy': 123, 'external_files': '', 'score': -8.71512754073446, 'similarity': 0.0, 'distance': 0.0, 'matches': [[1, 'base'], [6, 'salary'], [131, 'base'], [136, 'salary'], [265, '