In [1]:
import glob
import os
from dotenv import load_dotenv

from langchain_community.embeddings.oci_generative_ai import OCIGenAIEmbeddings
from langchain_community.vectorstores.oraclevs import OracleVS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_community.document_loaders.oracleai import OracleTextSplitter

import oracledb

In [2]:
# load_dotenv() reads a .env file (if one is found) into the process
# environment; its return value is discarded.
_ = load_dotenv()

# Database credentials and connect string. Each lookup yields None when the
# variable is not set.
un = os.environ.get("ORACLE_USERNAME")
pw = os.environ.get("ORACLE_PASSWORD")
dsn = os.environ.get("ORACLE_DSN")

# The same wallet directory is used for both config_dir and wallet_location.
config_dir = wallet_location = "/tmp/wallet"
wallet_password = os.environ.get("WALLET_PASSWORD")

# Settings for the OCI Generative AI embedding client.
compartment_id = os.environ.get("COMPARTMENT_ID")
service_endpoint = os.environ.get("GENAI_ENDPOINT")

In [3]:
# Embedding client for OCI Generative AI. The endpoint and compartment come
# from the environment-derived settings; the auth type and model are fixed here.
embeddings = OCIGenAIEmbeddings(
    compartment_id=compartment_id,
    service_endpoint=service_endpoint,
    model_id="cohere.embed-multilingual-v3.0",
    auth_type="INSTANCE_PRINCIPAL",
)

In [None]:
# Ingest every *.txt file under ../../data/text into the THINKIT vector table.
# A single connection is shared by the vector store and the text splitter and
# is closed automatically when the `with` block exits.
with oracledb.connect(
    user=un,
    password=pw,
    dsn=dsn,
    config_dir=config_dir,
    wallet_location=wallet_location,
    wallet_password=wallet_password,
) as connection:
    oracle_vs = OracleVS(
        client=connection,
        embedding_function=embeddings,
        table_name="THINKIT",
        distance_strategy=DistanceStrategy.COSINE,
        # NOTE(review): presumably a sample query OracleVS embeds at
        # construction time -- confirm against the OracleVS documentation.
        query="What is a Oracle Database"
    )
    # Splitter configuration as passed to OracleTextSplitter: recursive
    # splitting, max 100 by words, overlap 10, normalize "all".
    splitter_params = {"split": "recursively", "max": 100, "by": "words", "overlap": 10, "normalize": "all"}
    splitter = OracleTextSplitter(conn=connection, params=splitter_params)
    # glob returns paths in arbitrary order; sort for a reproducible insert order.
    files = sorted(glob.glob("../../data/text/*.txt"))

    for file in files:
        # Title = title-cased file name without directory or extension.
        # The previous `file.title().strip("../../Data/Text/")` treated its
        # argument as a *set of characters* to trim from both ends, so names
        # beginning or ending with any of ". / D a t T e x" were mangled
        # (e.g. "data.txt" -> "", "text_data.txt" -> "_").
        title = os.path.splitext(os.path.basename(file))[0].title()

        # Explicit encoding so the result does not depend on the platform
        # default; the file is closed before the database/embedding calls.
        with open(file, encoding="utf-8") as f:
            content = f.read()

        chunks = splitter.split_text(text=content)
        if not chunks:
            # Nothing to insert for an empty file; skip the round trip.
            continue

        oracle_vs.add_texts(
            texts=chunks,
            # One independent metadata dict per chunk.
            metadatas=[{"event": "ochacafe", "title": title} for _ in chunks],
        )