In [1]:
from glob import glob
import logging

import oracledb

from langchain_community.vectorstores import oraclevs
from langchain_community.vectorstores.oraclevs import OracleVS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_core.documents import Document

# to load and split txt documents
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# to compute embeddings vectors
# from langchain_community.embeddings import OCIGenAIEmbeddings

from oci_cohere_embeddings_utils import OCIGenAIEmbeddingsWithBatch
from config_private import DB_USER, DB_PWD, DB_HOST_IP, DB_SERVICE
from chunk_index_utils import load_books_and_split
from utils import enable_tracing

from config import OCI_EMBED_MODEL, ENDPOINT
from config_private import COMPARTMENT_ID

In [2]:
# Open the database connection that later cells reuse.
# The DSN has the form "<host>/<service>", built from the private config values.
DSN = f"{DB_HOST_IP}/{DB_SERVICE}"

try:
    connection = oracledb.connect(user=DB_USER, password=DB_PWD, dsn=DSN)
    print("Connection successful!")
except Exception:
    # Report the failure, then re-raise so the traceback shows the real cause.
    # Without the re-raise `connection` would stay undefined and a later cell
    # would fail with an unrelated-looking NameError.
    print("Connection failed!")
    raise

Connection successful!


In [3]:
# Switch on tracing through the project-local helper imported from `utils`.
# NOTE(review): what is traced, and where it is sent, is decided inside
# utils.enable_tracing and is not visible from this notebook.
enable_tracing()

In [4]:
# ---------------------------------------------------------------
# Notebook-wide settings
# ---------------------------------------------------------------

# Log at INFO level; each record shows timestamp, severity and message.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Folder that holds the knowledge-base source files.
BOOKS_DIR = "./books"

# Embedding client; model, endpoint and compartment come from the config modules.
embed_model = OCIGenAIEmbeddingsWithBatch(
    compartment_id=COMPARTMENT_ID,
    service_endpoint=ENDPOINT,
    model_id=OCI_EMBED_MODEL,
    # API-key authentication is used here. Per the original author's note,
    # use auth_type="RESOURCE_PRINCIPAL" when running inside OCI Data Science.
    auth_type="API_KEY",
)

In [None]:
# Collect the PDF files located directly under BOOKS_DIR, in sorted order.
# NOTE(review): only "*.pdf" is matched, while the import-cell comment talks
# about txt documents — confirm which file type the knowledge base uses.
file_list = glob(f"{BOOKS_DIR}/*.pdf")
file_list.sort()

print(f"There are {len(file_list)} files to be loaded...")
for pdf_path in file_list:
    print(pdf_path)

In [None]:
# Load the knowledge base found under BOOKS_DIR and split it into chunks,
# using the project-local helper from chunk_index_utils.
# NOTE(review): presumably returns a list of LangChain Document objects, since
# it is passed to OracleVS.from_documents below — confirm in the helper.
docs = load_books_and_split(BOOKS_DIR)

In [None]:
# Build the vector store from `docs`, using `embed_model` for the embeddings
# and the open `connection`; vectors go to table CHUNKS_VECTORS and are
# compared with cosine distance.
# NOTE(review): langchain_community's OracleVS.from_texts appears to drop an
# existing table with the same name before re-creating it — confirm before
# re-running this cell against a populated table.
vector_store_max = OracleVS.from_documents(
    docs,
    embed_model,
    client=connection,
    table_name="CHUNKS_VECTORS",
    distance_strategy=DistanceStrategy.COSINE,
)

In [None]:
# Display the table name the vector store is bound to.
vector_store_max.table_name

In [None]:
# Create an HNSW vector index named "hnsw_idx3" on the table behind
# `vector_store_max`, through the `oraclevs` module helper.
# `accuracy` and `parallel` are passed through unchanged to create_index.
# NOTE(review): presumably target accuracy (percent) and parallel degree —
# confirm against the OracleVS documentation.
oraclevs.create_index(
    connection,
    vector_store_max,
    params={
        "idx_name": "hnsw_idx3",
        "idx_type": "HNSW",
        "accuracy": 97,
        "parallel": 8,
    },
)

In [5]:
# Attach to the existing CHUNKS_VECTORS table without re-loading documents.
# NOTE(review): in langchain_community's OracleVS the `query` argument looks
# like a sample text embedded once to probe the embedding dimension, not a SQL
# statement to execute — confirm, and consider passing plain sample text.
vector_store_max = OracleVS(
    embedding_function=embed_model,
    client=connection,
    table_name="CHUNKS_VECTORS",
    query="Select * from CHUNKS_VECTORS",
    distance_strategy=DistanceStrategy.COSINE,
)

In [None]:
# Close the database connection currently held by the vector store.
# The store cannot query the database again until a new connection is assigned.
vector_store_max.client.close()

In [6]:
# Open a fresh database connection and hand it to the vector store.
try:
    connection = oracledb.connect(user=DB_USER, password=DB_PWD, dsn=DSN)
    print("Connection successful!")
except Exception:
    # Re-raise after reporting: otherwise the assignment below would silently
    # attach whatever `connection` held before (possibly an already-closed
    # connection) to the vector store.
    print("Connection failed!")
    raise

# Reached only when the connect call succeeded.
vector_store_max.client = connection

Connection successful!


In [7]:
# Check that the connection now attached to the vector store is usable.
vector_store_max.client.is_healthy()

True

In [8]:
%%time
# Time a similarity search that returns up to k=6 matches.
# The question is in Italian ("Can metformin be used to treat type 2 diabetes?").
query = "La metformina si può usare per curare il diabete di tipo 2?"

results = vector_store_max.similarity_search(query=query, k=6)

# Last expression: display the retrieved documents.
results

CPU times: user 12.8 ms, sys: 2.6 ms, total: 15.4 ms
Wall time: 224 ms


[]

In [None]:
# Wrap the vector store in a retriever with default settings.
# This cell must run before any cell that uses `retriever`.
retriever = vector_store_max.as_retriever()

In [9]:
# Run one retrieval on a dedicated, short-lived database connection.
# Requires `retriever` to have been created by the as_retriever() cell.
# NOTE(review): the client previously attached to the store is replaced without
# being closed here (the author left that close() call commented out) — confirm
# it has already been closed elsewhere.
retriever.vectorstore.client = oracledb.connect(user=DB_USER, password=DB_PWD, dsn=DSN)
print("Connection ok...")

# Italian: "Can metformin be used to treat type 2 diabetes?"
query = "La metformina si può usare per curare il diabete di tipo 2?"

try:
    results = retriever.invoke(query)
finally:
    # Always release the connection, even if the retrieval raises.
    retriever.vectorstore.client.close()

# Last expression: display the retrieved documents.
results

NameError: name 'retriever' is not defined

In [None]:
# Inspect the health of the retriever's connection.
# NOTE(review): the preceding cell closes this client, so this is expected to
# report an unhealthy connection when the notebook is run top to bottom — confirm.
retriever.vectorstore.client.is_healthy()

In [None]:
# Display the connection object currently attached to the retriever's store.
retriever.vectorstore.client