### Finding duplicate candidates in the documentation using **AI Vector Search**

In [1]:
# we need this for the reranker
import ads
from typing import List

import oracledb
import time
import logging
from tqdm import tqdm

from langchain_core.documents import Document

from oci_utils import load_oci_config

from config_private import DB_HOST_IP, DB_USER, DB_PWD, DB_SERVICE

In [2]:
# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

In [3]:
#
# This function returns the (id1, id2) of the candidate pairs + distance (DOT)
#
def self_similarity_search(
    top_k=10,
    vector_table_name="vectors",
    vector_field_name="vec",
    text_table_name="chunks",
    text_field_name="chunk",
    verbose=False,
) -> List:
    """
    Executes a query against an Oracle database to find the top_k candidate duplicates.
    """
    tStart = time.time()

    # build the DSN from data taken from config.py
    DSN = DB_HOST_IP + "/" + DB_SERVICE

    try:
        with oracledb.connect(user=DB_USER, password=DB_PWD, dsn=DSN) as connection:
            with connection.cursor() as cursor:

                # only constraint: table has to have an id field
                select = f"""select a.id, b.id, 
                vector_distance(a.{vector_field_name}, b.{vector_field_name}, DOT) as d 
                from {vector_table_name} a, {vector_table_name} b
                where a.id < b.id
                order by d asc
                fetch first {top_k} rows only"""

                if verbose:
                    logging.info(f"Select: {select}")

                cursor.execute(select)

                rows = cursor.fetchall()

                ids_list = []
                d_list = []
                # prepare output
                for row in rows:
                    ids_list.append([row[0], row[1]])
                    d_list.append(row[2])

    except Exception as e:
        logging.error(f"Error occurred in self_similarity_search: {e}")

        return None

    docs_with_distance = []
    for (id1, id2), d in zip(tqdm(ids_list), d_list):
        d = round(-d, 4)
        text1, page_num1 = find_text_and_metadata(id1, text_table_name, text_field_name)
        text2, page_num2 = find_text_and_metadata(id2, text_table_name, text_field_name)

        doc1 = Document(page_content=text1, metadata={"page_num": page_num1})
        doc2 = Document(page_content=text2, metadata={"page_num": page_num2})

        docs_with_distance.append((doc1, doc2, d))

    tEla = time.time() - tStart

    return docs_with_distance


#
# This function retrieve text + metadata from id
#
def find_text_and_metadata(
    id, text_table_name="chunks", text_field_name="chunk", verbose=False
):
    DSN = DB_HOST_IP + "/" + DB_SERVICE

    try:
        with oracledb.connect(user=DB_USER, password=DB_PWD, dsn=DSN) as connection:
            with connection.cursor() as cursor:
                select = f"""select {text_field_name}, page_num 
                from {text_table_name}
                where id = :1
                """

                if verbose:
                    logging.info(f"select: {select}")

                cursor.execute(select, [id])

                rows = cursor.fetchall()

                # prepare output
                for row in rows:
                    clob_pointer = row[0]
                    page_num = row[1]
                    ifull_clob_data = clob_pointer.read()

    except Exception as e:
        logging.error(f"Error occurred in oracle_query: {e}")

        return None

    return ifull_clob_data, page_num

In [4]:
%%time
docs_with_distance = self_similarity_search(top_k=10, verbose=True)

2024-03-03 09:12:27,098 - INFO - Select: select a.id, b.id, 
                vector_distance(a.vec, b.vec, DOT) as d 
                from vectors a, vectors b
                where a.id < b.id
                order by d asc
                fetch first 10 rows only
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.76it/s]

CPU times: user 410 ms, sys: 64.2 ms, total: 474 ms
Wall time: 50 s





In [5]:
MAX_LEN_DISP = 1000

for doc1, doc2, d in docs_with_distance:
    print(doc1.page_content[:MAX_LEN_DISP])
    print(f"Pag: {doc1.metadata['page_num']}")
    print("")
    print(doc2.page_content[:MAX_LEN_DISP])
    print(f"Pag: {doc2.metadata['page_num']}")
    print(f"Distance: {d}")
    print("----------------------")
    print()

COVID-19 Treatment Guidelines 99Table 3c. Therapeutic Management of Hospitalized Children With COVID-19 Disease Severity Panel’s Recommendations Hospitalized for COVID-19For children aged ≥12 years admitted for COVID-19, use prophylactic anticoagulation unless contraindicated (BIII) .a Does Not Require Supplemental OxygenFor children admitted for COVID-19 who are at the highest risk of progression to severe COVID-19,b consider using remdesivirc for children aged 12–17 years (CIII) . There is insufficient evidence for using remdesivir in children aged 28 days to <12 years. For children admitted for reasons other than COVID-19 who have mild to moderate COVID-19 and are at the highest risk of progression,b refer to Therapeutic Management of Nonhospitalized Children With COVID-19 . Requires Conventional OxygendUse 1 of the following options: •Remdesivirc (BIII) •Dexamethasone plus remdesivirc for children with increasing oxygen needs, particularly adolescents (BIII) Requires Oxygen Through

#### Display results

In [6]:
#
# display results
#

# to be used with rerankink
candidate_pairs = []

for doc1, doc2, d in docs_with_distance:
    print(doc1.page_content)
    print(f"Page num: {doc1.metadata['page_num']}")
    print("")
    print(doc2.page_content)
    print(f"Page num: {doc2.metadata['page_num']}")
    print("")
    print(f"Distance: {round(d, 3)}")
    print("------------------")
    print("")

    candidate_pairs.append([doc1.page_content, doc2.page_content])

COVID-19 Treatment Guidelines 99Table 3c. Therapeutic Management of Hospitalized Children With COVID-19 Disease Severity Panel’s Recommendations Hospitalized for COVID-19For children aged ≥12 years admitted for COVID-19, use prophylactic anticoagulation unless contraindicated (BIII) .a Does Not Require Supplemental OxygenFor children admitted for COVID-19 who are at the highest risk of progression to severe COVID-19,b consider using remdesivirc for children aged 12–17 years (CIII) . There is insufficient evidence for using remdesivir in children aged 28 days to <12 years. For children admitted for reasons other than COVID-19 who have mild to moderate COVID-19 and are at the highest risk of progression,b refer to Therapeutic Management of Nonhospitalized Children With COVID-19 . Requires Conventional OxygendUse 1 of the following options: •Remdesivirc (BIII) •Dexamethasone plus remdesivirc for children with increasing oxygen needs, particularly adolescents (BIII) Requires Oxygen Through