### Finding duplicate candidates in the documentation using **AI Vector Search**

In [1]:
# we need this for the reranker
import ads
from typing import List

import oracledb
import time
import logging
from tqdm import tqdm

from langchain_core.documents import Document

from oci_utils import load_oci_config
from oci_baai_reranker_general import OCIBAAIRerankerGeneral

from config import RERANKER_ID
from config_private import (DB_HOST_IP, 
                            DB_USER, 
                            DB_PWD, 
                            DB_SERVICE)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root logger: emit INFO and above, each record prefixed by timestamp and level
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)

In [3]:
#
# This function returns the candidate pairs as (doc1, doc2, similarity) tuples,
# where similarity is the negated DOT distance between the two vectors
#
def self_similarity_search(top_k = 10, 
                           vector_table_name="vectors",
                           vector_field_name="vec",
                           text_table_name="chunks",
                           text_field_name="chunk",
                           verbose=False) -> List:
    """
    Find the top_k candidate duplicate pairs by self-joining the vector table.

    Every row of `vector_table_name` is compared with every other row
    (a.id < b.id, so each unordered pair is considered once) using
    vector_distance(..., DOT); the top_k pairs with the smallest distance
    are kept. For each selected id the text and page number are then read
    through find_text_and_metadata.

    Args:
        top_k: number of candidate pairs to return (converted with int()).
        vector_table_name: table holding the vectors; must have an `id` column.
        vector_field_name: name of the vector column in that table.
        text_table_name: table holding the text, looked up with the same ids.
        text_field_name: name of the text column in that table.
        verbose: when True, log the generated SQL and the elapsed time.

    Returns:
        A list of (doc1, doc2, d) tuples, where doc1/doc2 are Document
        objects carrying the text and a "page_num" metadata entry and d is
        the negated DOT distance rounded to 4 decimals.
        None if the similarity query itself fails (the error is logged).
        Pairs whose text lookup fails are skipped with a warning.

    Note:
        Table and column names are interpolated into the SQL text (identifiers
        cannot be bound), so they must come from trusted configuration only.
    """
    t_start = time.time()

    # build the DSN from data taken from the private config
    dsn = DB_HOST_IP + "/" + DB_SERVICE

    try:
        # force an integer, so that nothing but a number can reach the SQL text;
        # a non-numeric value ends up in the except branch below
        top_k = int(top_k)

        with oracledb.connect(user=DB_USER, password=DB_PWD, dsn=dsn) as connection:
            with connection.cursor() as cursor:

                # only constraint: table has to have an id field
                select = f"""select a.id, b.id, 
                vector_distance(a.{vector_field_name}, b.{vector_field_name}, DOT) as d 
                from {vector_table_name} a, {vector_table_name} b
                where a.id < b.id
                order by d asc
                fetch first {top_k} rows only"""

                if verbose:
                    logging.info(f"Select: {select}")

                cursor.execute(select)

                # one entry per candidate pair: ((id1, id2), distance)
                candidates = [((row[0], row[1]), row[2]) for row in cursor.fetchall()]

    except Exception as e:
        logging.error(f"Error occurred in self_similarity_search: {e}")

        return None

    docs_with_distance = []
    for (id1, id2), d in tqdm(candidates):
        found1 = find_text_and_metadata(id1, text_table_name, text_field_name)
        found2 = find_text_and_metadata(id2, text_table_name, text_field_name)

        # the lookup returns None on failure (and has already logged the cause):
        # skip the pair instead of crashing on the tuple unpacking
        if found1 is None or found2 is None:
            logging.warning(f"Skipping pair ({id1}, {id2}): text lookup failed")
            continue

        text1, page_num1 = found1
        text2, page_num2 = found2

        doc1 = Document(page_content=text1, metadata={"page_num": page_num1})
        doc2 = Document(page_content=text2, metadata={"page_num": page_num2})

        # the sign is flipped, so that a larger value means a closer pair
        docs_with_distance.append((doc1, doc2, round(-d, 4)))

    if verbose:
        elapsed = round(time.time() - t_start, 1)
        logging.info(f"self_similarity_search elapsed time: {elapsed} sec.")

    return docs_with_distance

#
# This function retrieves the text + metadata (page number) for a given id
#
def find_text_and_metadata(id, text_table_name="chunks", text_field_name="chunk", 
                           verbose=False):
    """
    Read the text and the page number stored for a given id.

    Args:
        id: value matched against the `id` column of `text_table_name`.
        text_table_name: table to read from; must have `id` and `page_num` columns.
        text_field_name: name of the column holding the text.
        verbose: when True, log the generated SQL.

    Returns:
        A (text, page_num) tuple, or None when the query fails or no row
        matches the id (in both cases the reason is logged).

    Note:
        Table and column names are interpolated into the SQL text (identifiers
        cannot be bound), so they must come from trusted configuration only.
        The id itself is passed as a bind variable.
    """
    dsn = DB_HOST_IP + "/" + DB_SERVICE

    try:
        with oracledb.connect(user=DB_USER, password=DB_PWD, dsn=dsn) as connection:
            with connection.cursor() as cursor:
                select = f"""select {text_field_name}, page_num 
                from {text_table_name}
                where id = :1
                """

                if verbose:
                    logging.info(f"select: {select}")

                cursor.execute(select, [id])

                rows = cursor.fetchall()

                # without this guard the return below would hit unbound locals
                if not rows:
                    logging.error(f"No row found in {text_table_name} for id {id}")

                    return None

                # if the id is not unique the last row wins
                text_value, page_num = rows[-1]

                # the driver may hand back a LOB locator or a plain string
                # depending on column type and settings; a LOB has to be read
                # while the connection is still open
                if isinstance(text_value, str):
                    full_text = text_value
                else:
                    full_text = text_value.read()

    except Exception as e:
        logging.error(f"Error occurred in find_text_and_metadata: {e}")

        return None

    return full_text, page_num

In [4]:
%%time
# Retrieve the 10 closest candidate pairs; verbose=True also logs the generated SQL.
# NOTE(review): the function returns None when the query fails, which the
# display cells below do not handle.
docs_with_distance = self_similarity_search(top_k = 10, verbose=True)

2024-02-02 14:10:35,403 - INFO - Select: select a.id, b.id, 
                vector_distance(a.vec, b.vec, DOT) as d 
                from vectors a, vectors b
                where a.id < b.id
                order by d asc
                fetch first 10 rows only
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.78it/s]

CPU times: user 388 ms, sys: 75.3 ms, total: 463 ms
Wall time: 56.3 s





In [8]:
# Quick look at the candidates: only the first MAX_LEN_DISP characters of
# each text are printed, followed by its page number
MAX_LEN_DISP = 1000

for doc1, doc2, d in docs_with_distance:
    print(doc1.page_content[:MAX_LEN_DISP], f"Pag: {doc1.metadata['page_num']}", "", sep="\n")
    print(doc2.page_content[:MAX_LEN_DISP], f"Pag: {doc2.metadata['page_num']}", sep="\n")
    print(f"Distance: {d}", "----------------------", "", sep="\n")

COVID-19 Treatment Guidelines 109Table 3d. Therapeutic Management of Hospitalized Pediatric Patients With MIS-C Panel’s Recommendations MIS-CInitial treatment for MIS-C includes both immunomodulatory and antithrombotic therapy. Initial Immunomodulatory Therapy •IVIG 2 g/kg IBW (up to a maximum total dose of 100 g) IV plus low to moderate dose methylprednisolone (1–2 mg/kg/day) IVa or another glucocorticoid at an equivalent dosea (AIIb) . •Glucocorticoid monotherapy , only if IVIG is unavailable or contraindicated (BIIa) . •IVIG monotherapy , only if glucocorticoids are contraindicated (BIIb) . Intensification Immunomodulatory Therapy •Intensification therapy is recommended for children with refractory MIS-C who do not improve within 24 hours of receiving initial immunomodulatory therapy (AIII) . One of the following can be used (listed in alphabetical order): •High-dose anakinra 5–10 mg/kg IV or SUBQ once daily (BIIb) •Higher-dose glucocorticoid (e.g., methylprednisolone 10–30 mg/kg/da

#### Display results

In [9]:
#
# Print every candidate pair in full and, while doing so, collect the
# [text1, text2] pairs that are later passed to the reranker
#
candidate_pairs = []

for doc1, doc2, d in docs_with_distance:
    print(doc1.page_content, f"Page num: {doc1.metadata['page_num']}", "", sep="\n")
    print(doc2.page_content, f"Page num: {doc2.metadata['page_num']}", "", sep="\n")
    print(f"Distance: {round(d, 3)}", "------------------", "", sep="\n")

    candidate_pairs.append([doc1.page_content, doc2.page_content])

COVID-19 Treatment Guidelines 109Table 3d. Therapeutic Management of Hospitalized Pediatric Patients With MIS-C Panel’s Recommendations MIS-CInitial treatment for MIS-C includes both immunomodulatory and antithrombotic therapy. Initial Immunomodulatory Therapy •IVIG 2 g/kg IBW (up to a maximum total dose of 100 g) IV plus low to moderate dose methylprednisolone (1–2 mg/kg/day) IVa or another glucocorticoid at an equivalent dosea (AIIb) . •Glucocorticoid monotherapy , only if IVIG is unavailable or contraindicated (BIIa) . •IVIG monotherapy , only if glucocorticoids are contraindicated (BIIb) . Intensification Immunomodulatory Therapy •Intensification therapy is recommended for children with refractory MIS-C who do not improve within 24 hours of receiving initial immunomodulatory therapy (AIII) . One of the following can be used (listed in alphabetical order): •High-dose anakinra 5–10 mg/kg IV or SUBQ once daily (BIIb) •Higher-dose glucocorticoid (e.g., methylprednisolone 10–30 mg/kg/da

#### Adding a reranker. Does it improve the results?

In [11]:
# Load the OCI configuration through the project helper
oci_config = load_oci_config()

# The reranker client is given an ADS api-keys auth object built from the
# OCI configuration, not the configuration itself
api_keys_config = ads.auth.api_keys(oci_config)

baai_reranker = OCIBAAIRerankerGeneral(
    auth=api_keys_config,
    deployment_id=RERANKER_ID,
    region="eu-frankfurt-1",
)

2024-02-02 14:11:49,278 - INFO - Created OCI reranker client...
2024-02-02 14:11:49,279 - INFO - Region: eu-frankfurt-1...
2024-02-02 14:11:49,279 - INFO - Deployment id: ocid1.datasciencemodeldeployment.oc1.eu-frankfurt-1.amaaaaaangencdyaulxbosgii6yajt2jdsrrvfbequkxt3mepz675uk3ui3q...
2024-02-02 14:11:49,279 - INFO - 


In [12]:
# Send the candidate text pairs to the deployed reranker.
# NOTE(review): top_n=6 presumably keeps only the 6 best-scoring pairs —
# confirm against OCIBAAIRerankerGeneral.rerank
results = baai_reranker.rerank(candidate_pairs, top_n=6)

In [13]:
# Bare last expression: the notebook displays the reranker output as-is
results

[{'doc1': 'Table 12-3 (Cont.) Serializable Transaction Session 1 Session 2 Explanation No action. SQL> ROLLBACK;Session 2 rolls back transaction 4, which ends the transaction. No action. SQL> SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;Session 2 begins transaction 5 and sets it to the SERIALIZABLE isolation level. No action. SQL> SELECT last_name, salary FROM employees WHERE last_name IN (\'Banda\', \'Greene\', \'Hintz\'); LAST_NAME SALARY ------------- ---------Banda 7000 Greene 9900 Hintz 7100Transaction 5 queries the salaries for Banda, Greene, and Hintz. The Hintz salary update committed by transaction 3 is visible. No action. SQL> UPDATE employees SET salary = 7200 WHERE last_name=\'Hintz\'; 1 row updated.Transaction 5 updates the Hintz salary to a different value. Because the Hintz update made by transaction 3 committed before the start of transaction 5, the serialized access problem is avoided. Note: If a different transaction updated and committed the Hintz row after transacti