### Oracle AI Vector Search: test Loading the Vector Store
* based on the **LangChain** integration
* based on **OCI GenAI multi-lingual embeddings**
* Data will be stored in a single table (ORACLE_KNOWLEDGE)

In [1]:
import logging
from glob import glob
import pandas as pd
from tqdm.auto import tqdm

# to load and split txt documents
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# to compute embeddings vectors
from langchain_community.embeddings import OCIGenAIEmbeddings

# the class to integrate OCI AI Vector Search with LangChain
from oracle_vector_db_lc import OracleVectorStore

from config_private import COMPARTMENT_OCID

#### Setup

In [2]:
#
# Helper functions
#


# find the url from the file name, using references.csv
def find_ref(df, f_name):
    """Return the reference url registered for a file name.

    Args:
        df: DataFrame holding (at least) the columns ``file_name`` and ``url``.
        f_name: file name to look up; compared for exact equality with the
            values of the ``file_name`` column.

    Returns:
        The ``url`` value of the first row whose ``file_name`` equals ``f_name``.

    Raises:
        IndexError: if no row of ``df`` matches ``f_name``.
    """
    # select rows and column with a single .loc call (no chained indexing)
    urls = df.loc[df["file_name"] == f_name, "url"]

    if urls.empty:
        # same exception type the bare ``.values[0]`` raised on an empty
        # selection, but with a message that names the offending file
        raise IndexError(f"No url found for file_name '{f_name}'")

    return urls.values[0]


def set_url_in_docs(docs, df_ref):
    """Return copies of ``docs`` whose ``metadata["source"]`` holds the reference url.

    For every document the file name is extracted from the path stored in
    ``metadata["source"]`` and replaced with the url that ``find_ref`` returns
    for that file name.

    Args:
        docs: list of documents exposing a ``metadata`` dict with a ``"source"`` key.
        df_ref: DataFrame of references, passed as is to ``find_ref``.

    Returns:
        A new list of copied documents; the documents passed in are left untouched.

    Raises:
        IndexError: propagated from ``find_ref`` when a file name has no reference.
    """
    # stdlib only; imported here so the helper does not depend on the import cell
    import copy
    import os

    # list.copy() is shallow: the documents themselves were shared with the
    # caller and got modified in place. A deep copy keeps the input intact,
    # so the function can safely be called again on the same documents.
    docs = copy.deepcopy(docs)

    for doc in docs:
        # keep only the file name, dropping the directory part of the path
        only_name = os.path.basename(doc.metadata["source"])

        # find the url from the csv and store it in place of the path
        doc.metadata["source"] = find_ref(df_ref, only_name)

    return docs

In [3]:
# --- Knowledge base location ---
# folder holding the txt files of the Knowledge base
TXT_DIR = "./txt"
# csv file with the pairs f_name,url
REF_FILE = "references.csv"

# --- OCI GenAI Embeddings ---
# model used to compute the embeddings, and the endpoint serving it
EMBED_MODEL = "cohere.embed-multilingual-v3.0"
ENDPOINT = "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com"

# maximum input length for the embeddings, in tokens
MAX_LENGTH = 512

# --- Splitting ---
# maximum chunk size, in characters.
# This parameter needs to be adjusted for the Embed model
# (for example, lowered for Cohere)
CHUNK_SIZE = 1500

# --- Logging ---
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

In [4]:
# collect the txt files found directly under TXT_DIR, in sorted order:
# this is the list of files containing the Knowledge base
file_list = glob(f"{TXT_DIR}/*.txt")
file_list.sort()

print(f"There are {len(file_list)} files to be loaded...")

There are 75 files to be loaded...


#### Load all text files and then split them into chunks
Here we do some preprocessing on the txt files:
* we replace the file_name in source with the url the txt is coming from

In [5]:
# references table: for each file name, the url it refers to
df_ref = pd.read_csv(REF_FILE)

# read every txt file found under TXT_DIR (TextLoader makes this fast).
# At this point the documents are whole: they have not been split yet
origin_docs = DirectoryLoader(
    TXT_DIR,
    glob="**/*.txt",
    loader_cls=TextLoader,
    show_progress=True,
).load()

# in metadata["source"], swap the file name for its reference (url)
origin_docs = set_url_in_docs(origin_docs, df_ref)

# splitter cutting the documents into chunks;
# these params must be adapted to the Knowledge base
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=100,
    chunk_size=CHUNK_SIZE,
)

docs_splitted = text_splitter.split_documents(origin_docs)

print(f"We have splitted docs in {len(docs_splitted)} chunks...")

 99%|██████████████████████████████████████████████████████████████████████████████████▉ | 75/76 [00:00<00:00, 4345.59it/s]

We have splitted docs in 437 chunks...





#### Create Embed Model, Vector Store and load vectors + embeddings in the DB

In [6]:
# Clean the existing ORACLE_KNOWLEDGE table before loading it again.
# WARNING: destructive step. Be careful: do you really want to delete all the
# existing records? Skip this cell if they must be kept.
# NOTE(review): the log line recorded for this cell reads "truncated", so the
# rows are presumably removed while the table itself is kept; confirm in
# oracle_vector_db_lc.
OracleVectorStore.drop_collection(collection_name="ORACLE_KNOWLEDGE")

2024-02-26 12:50:53,766 - INFO - ORACLE_KNOWLEDGE truncated!!!


In [7]:
# Two steps: build the embedding model, then the vector store on top of it

# Step 1: the Embedding Model, served by OCI GenAI
embed_model = OCIGenAIEmbeddings(
    model_id=EMBED_MODEL,
    service_endpoint=ENDPOINT,
    compartment_id=COMPARTMENT_OCID,
    # authentication based on API keys: the keys must be provided.
    # NOTE(review): the previous comment said the code was meant to run in
    # OCI DS and to switch to API_KEY only outside of it, yet API_KEY is the
    # value already set; confirm which auth_type the target environment needs
    auth_type="API_KEY",
)

# Step 2: compute the embeddings and load texts + embeddings in the DB.
# It can take minutes (for the embeddings)
v_store = OracleVectorStore.from_documents(
    docs_splitted,
    embed_model,
    collection_name="ORACLE_KNOWLEDGE",
    verbose=True,
)

ValidationError: 1 validation error for OCIGenAIEmbeddings
__root__
  Could not authenticate with OCI client. Please check if ~/.oci/config exists. If INSTANCE_PRINCIPLE or RESOURCE_PRINCIPLE is used, Please check the specified auth_profile and auth_type are valid. (type=value_error)

#### Do a query for test

In [None]:
# Build a retriever on top of the vector store created above.
# k is the number of docs we want to retrieve
retriever = v_store.as_retriever(search_kwargs={"k": 5})

In [None]:
# sample question used to test the retrieval
question = "What is Autonomous Database on OCI"

# ask the retriever for the documents matching the question
# NOTE(review): recent LangChain releases are understood to deprecate
# get_relevant_documents in favor of invoke; confirm against the installed version
result_docs = retriever.get_relevant_documents(question)

In [None]:
# show each retrieved doc: its text, then its source (url),
# then a separator line followed by an empty line
for doc in result_docs:
    print(
        doc.page_content,
        doc.metadata["source"],
        "----------------------------",
        "",
        sep="\n",
    )