In [3]:
import logging
import re
from typing import List
from tqdm import tqdm
import array
import numpy as np
import time

# to generate id from text
import hashlib

from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SentenceSplitter

import oracledb
import ads

from tokenizers import Tokenizer

# This is the wrapper for GenAI Embeddings
from ads.llm import GenerativeAIEmbeddings

from oci_utils import load_oci_config

# credentials and endpoints live in config_private so they are never shown or shared in the notebook
from config_private import (
    DB_USER,
    DB_PWD,
    DB_SERVICE,
    DB_HOST_IP,
    COMPARTMENT_OCID,
    ENDPOINT,
)

#
# Configs
#
from config import (
    INPUT_FILES,
    EMBED_MODEL,
    TOKENIZER,
    EMBEDDINGS_BITS,
    ID_GEN_METHOD,
    ENABLE_CHUNKING,
    MAX_CHUNK_SIZE,
    CHUNK_OVERLAP,
)

# number of texts sent to the embedding model in a single embed_documents call
BATCH_SIZE = 20

In [13]:
def generate_id(nodes_list: List, method=None):
    """
    get a list of nodes (pages, chunks) and generate one id per node

    nodes_list: list of nodes; each node must expose `id_` (for "LLINDEX")
        or `text` (for "HASH")
    method: "LLINDEX" to reuse the id already stored on each node,
        "HASH" to use the SHA-256 hex digest of the node text.
        When None (default) the configured ID_GEN_METHOD is used.

    return: list of id, in the same order as nodes_list
    raises: ValueError if the method is not one of the supported values
    """
    if method is None:
        method = ID_GEN_METHOD

    if method == "LLINDEX":
        # reuse the id attribute already present on the node
        return [doc.id_ for doc in nodes_list]

    if method == "HASH":
        # the id is derived from the content: identical text gives identical id
        logging.info("Hashing to compute id...")
        return [
            hashlib.sha256(doc.text.encode()).hexdigest() for doc in tqdm(nodes_list)
        ]

    # previously an unknown method fell through and failed with an
    # UnboundLocalError on the return statement; fail with a clear message
    raise ValueError(
        f"Unsupported id generation method: {method!r} (expected 'LLINDEX' or 'HASH')"
    )


# remove pages with num words < threshold
def remove_short_pages(pages, threshold):
    """
    remove the pages whose number of words is < threshold

    pages: list of objects exposing a `text` attribute; the list is
        updated in place (as before) and also returned
    threshold: minimum number of words a page must have to be kept

    return: the same list object, containing only the pages kept
    """
    # build the filtered list first: removing items from a list while
    # iterating over it skips the element that follows each removal, so
    # consecutive short pages were not all removed.
    # split() (no argument) counts real words: split(" ") also counted the
    # empty strings produced by repeated/leading/trailing blanks.
    kept = [pag for pag in pages if len(pag.text.split()) >= threshold]
    n_removed = len(pages) - len(kept)

    # slice assignment keeps the in-place semantics callers may rely on
    pages[:] = kept

    logging.info(f"Removed {n_removed} short pages...")

    return pages


def preprocess_text(text):
    """
    clean the raw text of a page: tabs become blanks, words hyphenated
    across a line break are joined, line breaks become blanks and every
    run of whitespace is collapsed into a single blank

    return: the cleaned text
    """
    # applied strictly in this order: the tab replacement can create a
    # " -\n" sequence that the next rule must then remove
    substitutions = (
        ("\t", " "),
        (" -\n", ""),
        ("-\n", ""),
        ("\n", " "),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # remove repeated blanks
    return re.sub(r"\s+", " ", text)


def read_and_split_in_pages(input_files):
    """
    load the input files with SimpleDirectoryReader, clean the text of
    every loaded page, drop the pages with fewer than 10 words and
    compute an id for each remaining page

    input_files: list of file paths, passed as-is to SimpleDirectoryReader

    return: (pages_text, pages_id, pages_num), three lists aligned by position
    """
    reader = SimpleDirectoryReader(input_files=input_files)
    loaded = reader.load_data()

    logging.info(f"Read total {len(loaded)} pages...")

    # clean the text in place, before the word count is evaluated
    for page in loaded:
        page.text = preprocess_text(page.text)

    # remove pages with num words < threshold
    kept = remove_short_pages(loaded, threshold=10)

    # the texts are the chunks to be embedded and saved
    pages_text = [page.text for page in kept]

    # the page number is read from the "page_label" metadata entry
    # (per the original note it must be a string)
    pages_num = [page.metadata["page_label"] for page in kept]

    # the id generation strategy is decided inside generate_id
    pages_id = generate_id(kept)

    return pages_text, pages_id, pages_num

In [14]:
# file(s) to load; the path is relative to the notebook working directory.
# NOTE(review): INPUT_FILES is imported from config but this cell uses its
# own hard-coded list instead - confirm which one is intended.
input_files = ["high-availability-23c.pdf"]

In [15]:
# load the files, clean the text, drop short pages and compute the ids;
# the three lists returned are aligned by position
pages_text, pages_id, pages_num = read_and_split_in_pages(input_files)

2024-02-05 23:33:09,525 - INFO - Read total 561 pages...
2024-02-05 23:33:09,565 - INFO - Removed 0 short pages...
2024-02-05 23:33:09,566 - INFO - Hashing to compute id...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 561/561 [00:00<00:00, 244544.23it/s]


In [20]:
# load the OCI configuration through the project helper in oci_utils
oci_config = load_oci_config()

# the embeddings client takes its credentials through the `auth` argument,
# so the loaded config is wrapped with ads.auth.api_keys first
api_keys_config = ads.auth.api_keys(oci_config)

# the embedding client (compartment, model and endpoint come from the config modules)
embed_model = GenerativeAIEmbeddings(
    compartment_id=COMPARTMENT_OCID,
    model=EMBED_MODEL,
    auth=api_keys_config,
    # NOTE(review): presumably controls how over-long inputs are truncated - confirm against the ADS docs
    truncate="END",
    # Optional keyword arguments for the underlying OCI client, e.g. service_endpoint.
    client_kwargs={"service_endpoint": ENDPOINT},
)

In [21]:
def compute_embeddings(embed_model, nodes_text, batch_size=None):
    """
    compute the embeddings for a list of texts, calling the model in batches

    embed_model: object exposing embed_documents(list of texts), returning
        one embedding for each text of the batch
    nodes_text: list of texts to embed
    batch_size: number of texts sent in each call; when None (default)
        the module-level BATCH_SIZE is used

    return: list of embeddings, accumulated batch by batch in input order
    raises: ValueError if batch_size is lower than 1
    """
    # the tokenizer that used to be loaded here was never used: it only
    # added a costly load at every call, so it has been removed
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size < 1:
        # a negative step would silently produce no batches at all
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    embeddings = []
    for start in tqdm(range(0, len(nodes_text), batch_size)):
        batch = nodes_text[start : start + batch_size]

        # here we compute embeddings for a batch and add them to the final list
        embeddings.extend(embed_model.embed_documents(batch))

    return embeddings

In [22]:
# embed the text of every page kept; the embeddings are accumulated
# batch by batch, following the order of pages_text
vet_embeddings = compute_embeddings(embed_model, pages_text)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:18<00:00,  1.59it/s]
