In [12]:
# Install the example's dependencies with the %pip magic rather than "!pip":
# %pip installs into the environment of the running kernel, whereas a shell
# "pip" may belong to a different Python installation.
%pip install llmware
%pip install lancedb
%pip install transformers
%pip install torch

Collecting lancedb
  Downloading lancedb-0.8.2-cp38-abi3-manylinux_2_28_x86_64.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deprecation (from lancedb)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting pylance==0.12.1 (from lancedb)
  Downloading pylance-0.12.1-cp39-abi3-manylinux_2_28_x86_64.whl (23.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.1/23.1 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ratelimiter~=1.0 (from lancedb)
  Downloading ratelimiter-1.2.0.post0-py3-none-any.whl (6.6 kB)
Collecting retry>=0.9.2 (from lancedb)
  Downloading retry-0.9.2-py2.py3-none-any.whl (8.0 kB)
Collecting overrides>=0.7 (from lancedb)
  Downloading overrides-7.7.0-py3-none-any.whl (17 kB)
Collecting py<2.0.0,>=1.4.26 (from retry>=0.9.2->lancedb)
  Downloading py-1.11.0-py2.py3-none-any.whl (98 kB)
[2K     [90m━━━━━━━━━━━━━━

In [13]:
import os
from llmware.library import Library
from llmware.retrieval import Query
from llmware.setup import Setup
from llmware.status import Status
from llmware.models import ModelCatalog
from llmware.configs import LLMWareConfig, MilvusConfig

from importlib import util


# Pre-flight dependency checks. They only print a hint; execution continues either way.

# Hint when torch or transformers cannot be found (both are required by the message below).
if not all(util.find_spec(module_name) for module_name in ("torch", "transformers")):
  print("\nto run this example, with the selected embedding model, please install transformers and torch, e.g., "
        "\n`pip install torch`"
        "\n`pip install transformers`")

# Hint when none of the supported vector db driver packages can be found.
if not any(util.find_spec(driver_name) for driver_name in ("chromadb", "pymilvus", "lancedb", "faiss")):
  print("\nto run this example, you will need to pip install the vector db drivers. see comments above,")

In [27]:
# Configure llmware for this run: active db -> "sqlite", vector db -> "lancedb".
# NOTE(review): the first setter is called on the class and the second on a fresh
# instance; presumably both are class-level setters - confirm and unify the call style.
LLMWareConfig.set_active_db("sqlite")
LLMWareConfig().set_vector_db("lancedb")

In [28]:
# Create the library used by the rest of the notebook. The name is held in one
# place so the progress message and the actual call cannot drift apart.
new_library_name = "example2_library"
print(f"\nupdate: Creating library: {new_library_name}")
library = Library().create_new_library(new_library_name)


update: Creating library: example2_library


In [29]:
# Read the library's embedding record before this notebook installs an embedding
# and print it as-is; the same call is repeated in the final cell for comparison.
embedding_record = library.get_embedding_status()
print("embedding record - before embedding ", embedding_record)

embedding record - before embedding  [{'embedding_status': 'no', 'embedding_model': 'none', 'embedding_db': 'none', 'embedded_blocks': 0, 'embedding_dims': 0, 'time_stamp': 'NA'}]


In [30]:
# Fetch the llmware sample files; the returned path is joined with "Agreements"
# in the parsing cell to locate the documents to index.
# NOTE(review): over_write=False presumably reuses an existing local copy - confirm.
print("update: Downloading Sample Files")
sample_files_path = Setup().load_sample_files(over_write=False)

update: Downloading Sample Files


In [31]:
print("update: Parsing and Text Indexing Files")

# Add the "Agreements" sample folder to the library. The call stays the last
# expression of the cell so that its result is displayed as the cell output.
library.add_files(
    input_folder_path=os.path.join(sample_files_path, "Agreements"),
    chunk_size=400,
    max_chunk_size=600,
    smart_chunking=1,
)

update: Parsing and Text Indexing Files


{'docs_added': 0,
 'blocks_added': 0,
 'images_added': 0,
 'pages_added': 0,
 'tables_added': 0,
 'rejected_files': []}

In [32]:
# List the embedding models known to the model catalog. The list is not used by
# any other visible cell; it is kept for interactive inspection.
embedding_models = ModelCatalog().list_embedding_models()
# Name of the embedding model used for the embedding and status cells below.
embedding_model = "mini-lm-sbert"

In [33]:
# Collect the values that identify this embedding run; library_name and vector_db
# are reused by the embedding and status cells that follow.
library_name = library.library_name
vector_db = LLMWareConfig().get_vector_db()

# Announce the run on a single line: library, vector db and model.
print("\nupdate: Starting the Embedding: "
      "library - {} - vector_db - {} - model - {}".format(library_name, vector_db, embedding_model))


update: Starting the Embedding: library - example2_library - vector_db - lancedb - model - mini-lm-sbert


In [34]:
# Create the embeddings for the library with the selected model and vector db,
# passing batch_size=100. The call stays the last expression of the cell so its
# result is displayed.
library.install_new_embedding(
    embedding_model_name=embedding_model,
    vector_db=vector_db,
    batch_size=100,
)

INFO:llmware.embeddings:update: embedding_handler - Lancedb - Embeddings Created: 100 of 2211
INFO:llmware.embeddings:update: embedding_handler - Lancedb - Embeddings Created: 200 of 2211
INFO:llmware.embeddings:update: embedding_handler - Lancedb - Embeddings Created: 300 of 2211
INFO:llmware.embeddings:update: embedding_handler - Lancedb - Embeddings Created: 400 of 2211
INFO:llmware.embeddings:update: embedding_handler - Lancedb - Embeddings Created: 500 of 2211
INFO:llmware.embeddings:update: embedding_handler - Lancedb - Embeddings Created: 600 of 2211
INFO:llmware.embeddings:update: embedding_handler - Lancedb - Embeddings Created: 700 of 2211
INFO:llmware.embeddings:update: embedding_handler - Lancedb - Embeddings Created: 800 of 2211
INFO:llmware.embeddings:update: embedding_handler - Lancedb - Embeddings Created: 900 of 2211
INFO:llmware.embeddings:update: embedding_handler - Lancedb - Embeddings Created: 1000 of 2211
INFO:llmware.embeddings:update: embedding_handler - Lancedb

{'embeddings_created': 2211,
 'embedded_blocks': 2211,
 'embedding_dims': 384,
 'time_stamp': 'Thu Jun  6 16:09:59 2024'}

In [35]:
# Ask the Status() tracker for the embedding status of this library/model pair
# and print whatever it returns, as a check after the embedding cell has finished.
update = Status().get_embedding_status(library_name, embedding_model)
print("update: Embeddings Complete - Status() check at end of embedding - ", update)

update: Embeddings Complete - Status() check at end of embedding -  [{'_id': 2, 'key': 'example2_library_embedding_mini-lm-sbert', 'summary': '2211 of 2211 blocks', 'start_time': '1717690179.087806', 'end_time': '1717690199.5373614', 'total': 2211, 'current': 2211, 'units': 'blocks'}]


In [36]:
# Query text used by the semantic search in the next cell.
sample_query = "incentive compensation"
# Progress message (typo "vectory" corrected to "vector").
print("\n\nupdate: Run a sample semantic/vector query: {}".format(sample_query))



update: Run a sample semantic/vectory query: incentive compensation


In [37]:
# Run the semantic query against the library, asking for 20 results, then print
# for each hit its source document, page number, distance and a short text preview.
query_results = Query(library).semantic_query(sample_query, result_count=20)

for i, entries in enumerate(query_results):
  text = entries["text"]
  document_source, page_num, vector_distance = (
      entries[field] for field in ("file_source", "page_num", "distance")
  )

  # Preview only: keep the first 125 characters and mark the cut.
  if len(text) > 125:
    text = text[:125] + " ... "

  print(f"\nupdate: query results - {i} - document - {document_source} - page num - {page_num} distance - {vector_distance} ")

  print("update: text sample - ", text)


update: query results - 0 - document - Artemis Poseidon EXECUTIVE EMPLOYMENT AGREEMENT.pdf - page num - 4 distance - 0.24837934970855713 
update: text sample -  actual   incentive bonus (“Incentive Bonus”) for any fiscal year as determined by the Board (or the compensation   committee  ... 

update: query results - 1 - document - Athena EXECUTIVE EMPLOYMENT AGREEMENT.pdf - page num - 4 distance - 0.24837934970855713 
update: text sample -  actual   incentive bonus (“Incentive Bonus”) for any fiscal year as determined by the Board (or the compensation   committee  ... 

update: query results - 2 - document - Amphitrite EXECUTIVE EMPLOYMENT AGREEMENT.pdf - page num - 4 distance - 0.2483793944120407 
update: text sample -  actual   incentive bonus (“Incentive Bonus”) for any fiscal year as determined by the Board (or the compensation   committee  ... 

update: query results - 3 - document - Apollo EXECUTIVE EMPLOYMENT AGREEMENT.pdf - page num - 4 distance - 0.24960115551948547 
update: t

In [38]:
# Re-read the library's embedding record now that the embedding has been installed
# and print it, so it can be compared with the record printed before embedding.
embedding_record = library.get_embedding_status()
print("\nupdate: embedding record - ", embedding_record)


update: embedding record -  [{'embedding_status': 'yes', 'embedding_model': 'mini-lm-sbert', 'embedding_db': 'lancedb', 'embedding_dims': 384, 'embedded_blocks': 2211, 'time_stamp': 'Thu Jun  6 16:09:59 2024'}]
