### This NB shows how to add another book and query using the enhanced knowledge base
* storing is managed using **OracleVectorStore**

In [1]:
import logging
import sys
import re
from tqdm import tqdm
import hashlib

from typing import List, Any, Optional, Dict, Tuple
from llama_index.vector_stores.types import (
    VectorStore,
    VectorStoreQuery,
    VectorStoreQueryResult,
)
from llama_index import SimpleDirectoryReader
from llama_index import StorageContext, VectorStoreIndex, ServiceContext
from llama_index.schema import TextNode, BaseNode, Document

import oci
import ads

# only
import oracledb
from oci_utils import load_oci_config
from ads.llm import GenerativeAIEmbeddings, GenerativeAI
from oracle_vector_db import OracleVectorStore

from config_private import COMPARTMENT_OCID, ENDPOINT
from config import ID_GEN_METHOD, EMBED_MODEL

In [2]:
# version I'm using
# Print the versions of the installed `oracledb` and `oci` packages so the
# environment that produced the saved outputs is recorded with the notebook.
print(f"oracledb version: {oracledb.__version__}")
print(f"oci version: {oci.__version__}")

oracledb version: 2.0.0.dev20231121
oci version: 2.112.1+preview.1.1649


In [3]:
# File name of the PDF to add; it is handed to SimpleDirectoryReader (directly
# and through read_and_split_in_pages) in the cells below.
BOOK_TO_ADD = "Fare_grafica_editoriale_Progettare_il_libro_storia,_teorie,_tecniche.pdf"

In [4]:
# some simple text preprocessing
# TODO: this function must be customized to fit your pdf
def preprocess_text(text):
    """Flatten the raw text of a page into a single, blank-separated line.

    The literal substitutions run in a fixed order: tabs become blanks, a
    hyphen followed by a line break is dropped (also when a blank precedes
    the hyphen), and the remaining line breaks become blanks. Finally every
    run of whitespace is collapsed into one blank.
    """
    # order matters: tabs must already be blanks when the " -<newline>" rule runs
    substitutions = (
        ("\t", " "),
        (" -\n", ""),
        ("-\n", ""),
        ("\n", " "),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # remove repeated blanks
    return re.sub(r"\s+", " ", text)


# remove pages with num words < threshold
def remove_short_pages(pages, threshold):
    """Drop, in place, every page whose text has fewer than `threshold` words.

    Words are counted by splitting `pag.text` on single blanks.

    Args:
        pages: mutable list of objects exposing a `.text` string.
        threshold: minimum number of blank-separated tokens a page needs
            in order to be kept.

    Returns:
        The same list object that was passed in, now holding only the pages
        that were kept (original order preserved).
    """
    # Build the filtered list first. Calling pages.remove() while iterating
    # over `pages` makes the iterator skip the element that follows each
    # removal, so consecutive short pages were not all removed.
    kept = [pag for pag in pages if len(pag.text.split(" ")) >= threshold]
    n_removed = len(pages) - len(kept)

    # slice assignment keeps the in-place contract for callers that hold a
    # reference to the original list
    pages[:] = kept

    logging.info(f"Removed {n_removed} short pages...")

    return pages


def read_and_split_in_pages(input_files):
    """Load the given files and return page texts, page ids and page labels.

    Args:
        input_files: list of file paths handed to SimpleDirectoryReader.

    Returns:
        A tuple `(pages_text, pages_id, pages_num)` of three lists of equal
        length, one entry per page kept after the short-page filter:
        the preprocessed text, the id (llama-index id or SHA-256 hex digest
        of the text, depending on ID_GEN_METHOD) and the "page_label"
        metadata value.

    Raises:
        ValueError: if ID_GEN_METHOD is neither "LLINDEX" nor "HASH".
    """
    # Fail fast on a bad configuration. Before, an unsupported value only
    # surfaced at the return statement as an UnboundLocalError on pages_id.
    if ID_GEN_METHOD not in ("LLINDEX", "HASH"):
        raise ValueError(
            f"Unsupported ID_GEN_METHOD: {ID_GEN_METHOD!r} (expected 'LLINDEX' or 'HASH')"
        )

    pages = SimpleDirectoryReader(input_files=input_files).load_data()

    logging.info(f"Read total {len(pages)} pages...")

    # preprocess text
    for doc in pages:
        doc.text = preprocess_text(doc.text)

    # remove pages with num words < threshold
    pages = remove_short_pages(pages, threshold=10)

    # create a list of text (these are the chunks to be embedded and saved)
    pages_text = [doc.text for doc in pages]

    # 23/12 register the num of the page
    # must be a string
    pages_num = [doc.metadata["page_label"] for doc in pages]

    # extract list of id
    if ID_GEN_METHOD == "LLINDEX":
        # this way id have been generated by llama-index
        pages_id = [doc.id_ for doc in pages]
    else:
        # this way generated hashing the page (SHA-256 hex digest of its text)
        logging.info("Hashing to compute id...")
        pages_id = [
            hashlib.sha256(doc.text.encode()).hexdigest() for doc in tqdm(pages)
        ]

    return pages_text, pages_id, pages_num


# default number of texts passed to each embed_documents call
BATCH_SIZE = 20


def compute_embeddings(embed_model, pages_text, batch_size=None):
    """Embed `pages_text` batch by batch and return all embeddings in input order.

    Args:
        embed_model: object exposing `embed_documents(texts)`; its result for
            each batch is appended to the output with `list.extend`.
        pages_text: list of strings to embed.
        batch_size: number of texts per `embed_documents` call. When None
            (the default) the module-level BATCH_SIZE is used.

    Returns:
        A single flat list with the results of every batch, concatenated in
        the order of `pages_text`.

    Raises:
        ValueError: if the effective batch size is smaller than 1.
    """
    # resolve the default at call time so a later change to BATCH_SIZE is honoured
    if batch_size is None:
        batch_size = BATCH_SIZE

    # a negative step would silently yield an empty result; reject it explicitly
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    embeddings = []
    for i in tqdm(range(0, len(pages_text), batch_size)):
        batch = pages_text[i : i + batch_size]

        # here we compute embeddings for a batch
        embeddings_batch = embed_model.embed_documents(batch)
        # add to the final list
        embeddings.extend(embeddings_batch)

    return embeddings

In [5]:
# Configure logging
# INFO level is needed to see the progress messages emitted by the helper
# functions in this notebook (pages read, short pages removed, hashing).
# Each record is formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

In [6]:
# Load the book as a list of document objects; these are the objects that
# later receive the embeddings and are added to the vector store.
pages = SimpleDirectoryReader(input_files=[BOOK_TO_ADD]).load_data()

# apply the same text clean-up that read_and_split_in_pages applies,
# overwriting each document's text in place
for doc in pages:
    doc.text = preprocess_text(doc.text)

In [7]:
# Read the book again through the helper to obtain the lists of page texts,
# page ids and page labels (short pages already filtered out).
pages_text, pages_id, pages_num = read_and_split_in_pages([BOOK_TO_ADD])

2024-01-05 17:35:17,919 - INFO - Read total 178 pages...
2024-01-05 17:35:17,968 - INFO - Removed 5 short pages...
2024-01-05 17:35:17,968 - INFO - Hashing to compute id...
100%|█████████████████████████████████████████████████████████████████| 173/173 [00:00<00:00, 89959.66it/s]


In [8]:
# Apply to `pages` the same short-page filter (threshold 10) that
# read_and_split_in_pages applied when building `pages_text`.
# NOTE(review): the later pairing of pages with embeddings by position
# relies on both lists having been filtered identically.
pages = remove_short_pages(pages, 10)

2024-01-05 17:35:18,025 - INFO - Removed 5 short pages...


#### Create the wrappers

In [9]:
# setup: load the OCI configuration through the project helper
oci_config = load_oci_config()

# need to do this way
# build the API-key auth once and reuse it below instead of calling
# ads.auth.api_keys a second time
api_keys_config = ads.auth.api_keys(oci_config)

# the embedding model is selected by EMBED_MODEL (from config):
# english, or for other language use: multilingual

embed_model = GenerativeAIEmbeddings(
    compartment_id=COMPARTMENT_OCID,
    model=EMBED_MODEL,
    auth=api_keys_config,
    truncate="END",
    # Optionally you can specify keyword arguments for the OCI client, e.g. service_endpoint.
    client_kwargs={"service_endpoint": ENDPOINT},
)

In [10]:
# Create the vector store wrapper (verbose mode on); the pages are added to it
# and then persisted to the DB in the cells below.
v_store = OracleVectorStore(verbose=True)

In [11]:
# Compute the embeddings for the page texts, batch by batch; the result is a
# list in the same order as `pages_text`.
embeddings = compute_embeddings(embed_model, pages_text)

100%|████████████████████████████████████████████████████████████████████████| 9/9 [00:12<00:00,  1.43s/it]


In [14]:
# add embeddings to pages
# `pages` and `embeddings` come from two separate passes over the book, so
# check that they line up before pairing them by position; otherwise a
# mismatch would either fail mid-loop or leave pages paired incorrectly.
if len(pages) != len(embeddings):
    raise ValueError(
        f"Cannot attach embeddings: {len(pages)} pages but {len(embeddings)} embeddings"
    )

for doc, embedding in zip(pages, embeddings):
    doc.embedding = embedding

In [15]:
# Hand the pages (with their embeddings attached) to the vector store.
# NOTE(review): the return value is presumably the list of ids of the added
# entries; confirm against OracleVectorStore.add.
list_ids = v_store.add(pages)

In [16]:
# save to db
# As the log output shows, persist() first saves the embeddings and then the
# texts, reporting the number of errors for each step.
v_store.persist()

2024-01-05 17:39:16,324 - INFO - Persisting to DB...
2024-01-05 17:39:16,834 - INFO - Saving embeddings to DB...
100%|████████████████████████████████████████████████████████████████████| 173/173 [00:05<00:00, 30.23it/s]
2024-01-05 17:39:22,563 - INFO - Tot. errors in save_embeddings: 0
2024-01-05 17:39:22,563 - INFO - Saving texts to DB...
100%|████████████████████████████████████████████████████████████████████| 173/173 [00:16<00:00, 10.72it/s]
2024-01-05 17:39:38,704 - INFO - Tot. errors in save_chunks: 0
