### Demo 1: add a book to the Vector Store
* using **LangChain**

In [45]:
import numpy as np
import oracledb

from tqdm import tqdm
import tiktoken
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# this is the class wrapping our Cohere embeddings model
from langchain_community.embeddings import OCIGenAIEmbeddings

from config import EMBED_MODEL

from config_private import (
    DB_USER,
    DB_PWD,
    DB_SERVICE,
    DB_HOST_IP,
    COMPARTMENT_OCID,
    ENDPOINT,
)

In [36]:
def num_tokens_from_string(string: str, model_name: str = "gpt-3.5-turbo") -> int:
    """Return the number of tokens `string` encodes to under a tiktoken encoding.

    The count comes from the tokenizer that tiktoken associates with
    `model_name`. NOTE(review): this notebook embeds text with the model named
    by EMBED_MODEL, whose own tokenizer may differ from the one selected here,
    so treat the result as an estimate — confirm before relying on it for a
    hard input limit.

    Args:
        string: Text to tokenize.
        model_name: Model name passed to `tiktoken.encoding_for_model`.
            Defaults to "gpt-3.5-turbo", the value that used to be hard-coded,
            so existing one-argument calls behave exactly as before.

    Returns:
        The length of the token sequence produced for `string`.

    Raises:
        KeyError: Propagated from tiktoken when it cannot map `model_name`
            to an encoding (see the tiktoken documentation).
    """
    encoding = tiktoken.encoding_for_model(model_name)

    return len(encoding.encode(string))

In [17]:
# PDF file to ingest; this name is passed unchanged to PyPDFLoader in the next cell.
BOOK_NAME = "covid19_treatment_guidelines.pdf"

In [39]:
# Chunking parameters handed to RecursiveCharacterTextSplitter. With the
# splitter's default length function these are counted in characters, not
# tokens (see the LangChain text-splitter documentation).
CHUNK_SIZE = 1024
CHUNK_OVERLAP = 50

# Read the PDF named by BOOK_NAME into LangChain documents
# (presumably one Document per PDF page — TODO confirm).
loader = PyPDFLoader(BOOK_NAME)
pages = loader.load()

# Split the loaded documents into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(pages)

# Fail fast: with no chunks every later cell would silently do nothing.
assert chunks, f"No text chunks were produced from {BOOK_NAME}"

In [40]:
# Tokenize every chunk (with a progress bar) and report the largest count found.
token_counts = [num_tokens_from_string(doc.page_content) for doc in tqdm(chunks)]

# default=0 keeps max_token at 0 when there are no chunks.
max_token = max(token_counts, default=0)

print(f"The maximum number of token per chunk is {max_token}")

100%|██████████████████████████████████████████████████████████████| 1755/1755 [00:00<00:00, 7707.34it/s]

The maximum number of token per chunk is 390





In [41]:
# Service URL used by the embeddings client.
# NOTE(review): config_private also exports ENDPOINT, which is not used here —
# confirm whether this URL should come from configuration instead.
GENAI_INFERENCE_ENDPOINT = (
    "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com"
)

# Embeddings client for the model named by EMBED_MODEL, scoped to our compartment.
embed_model = OCIGenAIEmbeddings(
    model_id=EMBED_MODEL,
    service_endpoint=GENAI_INFERENCE_ENDPOINT,
    compartment_id=COMPARTMENT_OCID,
)

In [42]:
# Number of chunk texts passed to each embed_documents call.
BATCH_SIZE = 50

txts = [doc.page_content for doc in chunks]
embeddings = []

# Walk txts in slices of BATCH_SIZE and append each batch's vectors in turn.
for start in tqdm(range(0, len(txts), BATCH_SIZE)):
    embeddings.extend(embed_model.embed_documents(txts[start : start + BATCH_SIZE]))

100%|████████████████████████████████████████████████████████████████████| 36/36 [00:38<00:00,  1.07s/it]


In [46]:
# Display the endpoint imported from config_private.
# NOTE(review): the embeddings client is built with a hard-coded service_endpoint
# rather than this value, and this looks like a leftover debugging cell —
# confirm whether it should be removed before sharing the notebook.
ENDPOINT

'https://generativeai.aiservice.us-chicago-1.oci.oraclecloud.com'