### This NB shows ho to add another book and query using the enhanced knowledge base
* storing is managed using **OracleVectorStore**

In [2]:
import logging
import sys
import re
from tqdm import tqdm
import hashlib

from typing import List, Any, Optional, Dict, Tuple
from llama_index.vector_stores.types import (
    VectorStore,
    VectorStoreQuery,
    VectorStoreQueryResult,
)
from llama_index import SimpleDirectoryReader
from llama_index import StorageContext, VectorStoreIndex, ServiceContext
from llama_index.schema import TextNode, BaseNode, Document

import oci
import ads

# only
import oracledb
from oci_utils import load_oci_config
from ads.llm import GenerativeAIEmbeddings, GenerativeAI
from oracle_vector_db import OracleVectorStore

from config_private import COMPARTMENT_OCID, ENDPOINT
from config import ID_GEN_METHOD, EMBED_MODEL

In [3]:
# version I'm using
print(f"oracledb version: {oracledb.__version__}")
print(f"oci version: {oci.__version__}")

oracledb version: 2.0.0.dev20231121
oci version: 2.112.1+preview.1.1649


In [17]:
BOOK_TO_ADD = "Fare_grafica_editoriale_Progettare_il_libro_storia,_teorie,_tecniche.pdf"

In [18]:
# some simple text preprocessing
# TODO: this function must be customized to fit your pdf
def preprocess_text(text):
    text = text.replace("\t", " ")
    text = text.replace(" -\n", "")
    text = text.replace("-\n", "")
    text = text.replace("\n", " ")

    # remove repeated blanks
    text = re.sub(r"\s+", " ", text)

    return text


# remove pages with num words < threshold
def remove_short_pages(pages, threshold):
    n_removed = 0
    for pag in pages:
        if len(pag.text.split(" ")) < threshold:
            pages.remove(pag)
            n_removed += 1

    logging.info(f"Removed {n_removed} short pages...")

    return pages


def read_and_split_in_pages(input_files):
    pages = SimpleDirectoryReader(input_files=input_files).load_data()

    logging.info(f"Read total {len(pages)} pages...")

    # preprocess text
    for doc in pages:
        doc.text = preprocess_text(doc.text)

    # remove pages with num words < threshold
    pages = remove_short_pages(pages, threshold=10)

    # create a list of text (these are the chuncks to be embedded and saved)
    pages_text = [doc.text for doc in pages]

    # extract list of id
    # this way id have been generated by llama-index
    if ID_GEN_METHOD == "LLINDEX":
        pages_id = [doc.id_ for doc in pages]
    if ID_GEN_METHOD == "HASH":
        logging.info("Hashing to compute id...")
        pages_id = []
        for doc in tqdm(pages):
            encoded_text = doc.text.encode()
            hash_object = hashlib.sha256(encoded_text)
            hash_hex = hash_object.hexdigest()
            pages_id.append(hash_hex)

            # rewrite id
            doc.id_ = hash_hex

    return pages_text, pages_id, pages


BATCH_SIZE = 20


def compute_embeddings(embed_model, pages_text):
    embeddings = []
    for i in tqdm(range(0, len(pages_text), BATCH_SIZE)):
        batch = pages_text[i : i + BATCH_SIZE]

        # here we compute embeddings for a batch
        embeddings_batch = embed_model.embed_documents(batch)
        # add to the final list
        embeddings.extend(embeddings_batch)

    return embeddings

In [19]:
# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

In [20]:
pages = SimpleDirectoryReader(input_files=[BOOK_TO_ADD]).load_data()

pages[0]

Document(id_='e17d3b54-70d7-4fad-924b-b9c04ddebb06', embedding=None, metadata={'page_label': '1', 'file_name': 'Fare_grafica_editoriale_Progettare_il_libro_storia,_teorie,_tecniche.pdf', 'file_path': 'Fare_grafica_editoriale_Progettare_il_libro_storia,_teorie,_tecniche.pdf', 'file_type': 'application/pdf', 'file_size': 66510896, 'creation_date': '2023-12-30', 'last_modified_date': '2023-10-31', 'last_accessed_date': '2023-12-30'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, hash='f3af017d889ae879b6f6b0a3e24b8a86b6d47c0b1d8b02205ce3828f1837bdd4', text='Franco Achilli (Milano, 1957, anno in cui appare l’Helvetica) ha iniziato la sua attività \ndi visual designer durante gli studi di architettura al Politecnico di Milano, proget -\ntando libri, guide e collan

In [7]:
pages_text, pages_id, pages = read_and_split_in_pages([BOOK_TO_ADD])

2023-12-30 19:50:03,349 - INFO - Read total 303 pages...
2023-12-30 19:50:03,899 - INFO - Removed 152 short pages...
2023-12-30 19:50:03,899 - INFO - Hashing to compute id...
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [00:00<00:00, 279991.12it/s]


In [11]:
pages_text[100]

''

#### Create the wrappers

In [None]:
# setup
oci_config = load_oci_config()

# need to do this way
api_keys_config = ads.auth.api_keys(oci_config)

# english, or for other language use: multilingual

embed_model = GenerativeAIEmbeddings(
    compartment_id=COMPARTMENT_OCID,
    model=EMBED_MODEL,
    auth=ads.auth.api_keys(oci_config),
    # Optionally you can specify keyword arguments for the OCI client, e.g. service_endpoint.
    client_kwargs={"service_endpoint": ENDPOINT},
)

In [None]:
v_store = OracleVectorStore(verbose=True)

In [None]:
embeddings = compute_embeddings(embed_model, pages_text)

In [None]:
# add embeddings to pages
for i, doc in enumerate(pages):
    doc.embedding = embeddings[i]

In [None]:
list_ids = v_store.add(pages)

In [None]:
# save to db
v_store.persist()