In [0]:
%pip install --upgrade "mlflow[databricks]>=3.1.0" openai "databricks-connect>=16.1"
%pip install databricks-vectorsearch
# Restart the Python process after the installs above so that the cells below
# import the freshly installed package versions.
dbutils.library.restartPython()

In [0]:
import mlflow
from openai import OpenAI
import pandas as pd
from pyspark.sql.functions import col, lit
from databricks.vector_search.client import VectorSearchClient



In [0]:
# Parameters — edit CATALOG / SCHEMA here; the table and index names are derived from them.
CATALOG = "mzervou"
SCHEMA = "healthcare"
SOURCE_TABLE = f"{CATALOG}.{SCHEMA}.synthetic_audio_transcripts_enriched"
# Derived from SOURCE_TABLE so the index name follows CATALOG / SCHEMA instead of
# being hard-coded separately (the resulting value is unchanged).
INDEX_NAME = f"{SOURCE_TABLE}_index"
EMBEDDING_MODEL = "databricks-bge-large-en"  # Embedding model endpoint used to embed the text column
VECTOR_ENDPOINT = "one-env-shared-endpoint-10"  # Vector Search endpoint that hosts the index

# Step 1: Load the source table
source_df = spark.table(SOURCE_TABLE)

In [0]:
# Vector Search client, constructed with default arguments; the cells below use it
# to create and fetch the index.
# NOTE(review): presumably picks up workspace credentials from the notebook context — confirm.
client = VectorSearchClient()

In [0]:
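# One-time setup, intentionally left disabled: uncomment and run only if the
# Vector Search endpoint named by VECTOR_ENDPOINT does not exist yet.
# client.create_endpoint(
#     name=VECTOR_ENDPOINT,
#     endpoint_type="STANDARD"  # or "STORAGE_OPTIMIZED"
# )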

In [0]:
%sql

-- Enable Change Data Feed on the source table before the Delta Sync index is created.
-- NOTE(review): per the Databricks Vector Search docs, Delta Sync indexes need CDF on
-- the source table to pick up changes — confirm against the current documentation.
ALTER TABLE mzervou.healthcare.synthetic_audio_transcripts_enriched
SET TBLPROPERTIES (delta.enableChangeDataFeed = true);

In [0]:
# Create the Delta Sync index over the source table, or reuse it when it already exists.
# Guarding the create keeps this cell re-runnable (Restart & Run All): asking the
# service to create an index under a name that is already taken raises an error.
existing_index_names = {
    entry.get("name")
    for entry in client.list_indexes(VECTOR_ENDPOINT).get("vector_indexes", [])
}

if INDEX_NAME in existing_index_names:
    # Index is already registered on this endpoint — just fetch a handle to it.
    index = client.get_index(
        endpoint_name=VECTOR_ENDPOINT,
        index_name=INDEX_NAME,
    )
else:
    index = client.create_delta_sync_index(
        endpoint_name=VECTOR_ENDPOINT,
        source_table_name=SOURCE_TABLE,
        index_name=INDEX_NAME,
        pipeline_type="TRIGGERED",
        # NOTE(review): assumes patient_id is unique per row of the source table — confirm.
        primary_key="patient_id",
        # Text column whose contents are embedded by EMBEDDING_MODEL.
        embedding_source_column="audio_transcript",
        embedding_model_endpoint_name=EMBEDDING_MODEL,
    )

In [0]:

# Fetch a handle to the index by name, so this cell works whether or not the
# index was created in the current session.
index = client.get_index(
    endpoint_name=VECTOR_ENDPOINT,
    index_name=INDEX_NAME,
)

# Block until the index can serve queries; a freshly created index is not
# searchable until its initial sync has finished.
index.wait_until_ready()

# Example query
QUERY = "Patient reports chest pain and shortness of breath"

# Return the 2 transcripts most similar to QUERY, restricted to rows whose
# `department` is exactly "General Medicine" or "Cardiology".
results = index.similarity_search(
    query_text=QUERY,
    columns=["patient_id", "audio_transcript", "department"],
    filters={"department": ["General Medicine", "Cardiology"]},
    num_results=2,
)

print(results)


In [0]:
# Read the enriched transcripts table via the shared SOURCE_TABLE parameter rather
# than a second hard-coded copy of its name.
df_enriched = spark.read.table(SOURCE_TABLE)


# Use `vector_search` for semantic search: find transcripts or SOAP notes by meaning, not keywords.

## Why it's important:
- Finds similar patient cases instantly.
- Supports research, case reviews, and decision support.
- Bridges the gap between structured data and medical intuition.


In [0]:

# For every row of the source table, run the SQL VECTOR_SEARCH table function with
# that row's transcript as the query text and keep the 2 closest index entries.
# The table and index names come from the shared parameters, so this query follows
# any change to CATALOG / SCHEMA.
# NOTE: the LATERAL join issues one vector search per source row, so cost grows
# with the size of the table.
result_structured = spark.sql(
    f"""
    
      SELECT
        a.audio_transcript as original_audio_transcript,
        a.patient_id,
        search.*
      FROM
        {SOURCE_TABLE} as a,
        LATERAL(
      SELECT * FROM VECTOR_SEARCH(index => "{INDEX_NAME}", query_text => audio_transcript, num_results => 2)
        ) as search
    """
)

result_structured.display()
