<a href="https://colab.research.google.com/github/madhusudhanrao-ppm/dbdevrel/blob/main/source-codes/colab-code/similaritysearch/similaritysearch-colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q sentence-transformers faiss-cpu oracledb numpy

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import json
from typing import List, Dict, Optional
import numpy as np
import oracledb
from sentence_transformers import SentenceTransformer
import faiss
from google.colab import drive
drive.mount('/content/drive')

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

def fetch_from_oracle(
    table: str = "MYNOTES",
    column: str = "NOTES",
    max_rows: int = 1000,
    use_wallet: bool = True,
    wallet_dir: Optional[str] = None,
) -> List[str]:
    """Fetch text rows from Oracle. Uses env vars ORACLE_USER, ORACLE_PASSWORD, ORACLE_DSN.
    If use_wallet=True, provide wallet_dir and wallet_password via env vars.
    Returns list of strings (may be empty).
    """
    username = "DEMOUSER"  # Update with your username
    password = "Welcome123456#"  # Update with your password
    tns_name = "indeducation_high"
    wall_config_dir = "/content/drive/MyDrive/Wallet_IndEducation"
    wall_pwd = "welcome1"
    table = "MYNOTES"
    col = "NOTES"

    if not (username and password and tns_name):
        print("Oracle credentials (ORACLE_USER, ORACLE_PASSWORD, ORACLE_DSN) not set. Returning empty list.")
        return []

    conn = None
    try:
        conn = oracledb.connect(user=username,
                              password=password,
                              dsn=tns_name,
                              config_dir=wall_config_dir,
                              wallet_location=wall_config_dir,
                              wallet_password=wall_pwd)

        # Always use config_dir and wallet_location if wall_config_dir is set,
        # as TNS aliases often require it to find tnsnames.ora.

        print("✓ Successfully connected to Oracle Database")

        cur = conn.cursor()
        sql = f"SELECT {column} FROM {table} WHERE {column} IS NOT NULL AND ROWNUM <= :maxrows"
        cur.execute(sql, [max_rows])
        rows = cur.fetchall()
        texts = [r[0] for r in rows if r and r[0] is not None]
        cur.close()
        return texts
    except Exception as e:
        print(f"Error fetching from Oracle: {e}")
        return []
    finally:
        if conn:
            try:
                conn.close()
            except Exception:
                pass

def build_embeddings(texts: List[str], model_name: str = MODEL_NAME, batch_size: int = 64):
    """Return (model, numpy array of embeddings L2-normalized)."""
    model = SentenceTransformer(model_name)
    embs = model.encode(texts, batch_size=batch_size, convert_to_numpy=True, show_progress_bar=True)
    # normalize
    faiss.normalize_L2(embs)
    return model, embs

def build_faiss_index(embs: np.ndarray):
    """Create an inner-product index. Inputs should be L2-normalized vectors; inner product == cosine similarity."""
    dim = embs.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(embs)
    return index

def search_index(query: str, model: SentenceTransformer, index: faiss.IndexFlatIP, texts: List[str], k: int = 5):
    q_emb = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, k)
    # D contains inner products (cosine similarity between -1 and 1)
    results = []
    for score, idx in zip(D[0], I[0]):
        if idx < 0:
            continue
        results.append({"index": int(idx), "text": texts[idx], "score": float(score)})
    return results

def save_index(path_prefix: str, embs: np.ndarray, texts: List[str]):
    np.save(f"{path_prefix}_emb.npy", embs)
    with open(f"{path_prefix}_texts.json", "w", encoding="utf-8") as f:
        json.dump(texts, f, ensure_ascii=False)

def load_index(path_prefix: str):
    embs = np.load(f"{path_prefix}_emb.npy")
    with open(f"{path_prefix}_texts.json", "r", encoding="utf-8") as f:
        texts = json.load(f)
    index = build_faiss_index(embs)
    return index, embs, texts


# Configuration: switch to True to fetch from Oracle
use_oracle = True

texts = []
if use_oracle:
    print("Fetching from Oracle...")
    # Note: 'use_wallet=False' was passed here, but the DSN might still implicitly require a config_dir.
    # The hardcoded values for Oracle connection parameters inside fetch_from_oracle also need review.
    texts = fetch_from_oracle(table="MYNOTES", column="NOTES", max_rows=1000, use_wallet=False)
    print(f"Fetched {len(texts)} rows")
else:
    texts = [
        "I want to open an account.",
        "I want a credit card.",
        "I need to update my address.",
        "I want to apply for a loan.",
        "How do I check my balance?",
        "I lost my debit card."
    ]
    print(f"Using {len(texts)} sample texts")

# Build model + embeddings + index
model = None # Initialize model to None
embs = None # Initialize embs to None
index = None # Initialize index to None

if texts: # Only build embeddings and index if texts list is not empty
    model, embs = build_embeddings(texts)
    index = build_faiss_index(embs)
    print("Index ready. Use search_index(query, model, index, texts, k)")
else:
    print("No texts available to build embeddings and index.")
    print("Please check the Oracle connection details or provide sample texts.")


query = input("Enter search query: ")
if query.strip():
    results = search_index(query, model, index, texts, k=5)
    for i, r in enumerate(results, 1):
        print(f"{i}. score={r['score']:.4f}\n   {r['text'][:300]}\n")
else:
    print("No query provided.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Fetching from Oracle...
✓ Successfully connected to Oracle Database
Fetched 24 rows


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Index ready. Use search_index(query, model, index, texts, k)
Enter search query: Cancer
1. score=0.3734
   Lung cancer screening scheduled for next month.

2. score=0.3673
   Breast cancer screening scheduled for next month.

3. score=0.3186
   Started on gemcitabine + nab‑paclitaxel; mild neutropenia noted.

4. score=0.3097
   Debulking surgery successful; plan for platinum‑based chemo.

5. score=0.2727
   Bilateral lung metastases stable after 8 cycles of nivolumab.

