# Create Mosaic AI Vector Search Index

Sets up a Vector Search endpoint and Delta Sync index for work embeddings.

- **Source**: `openalex.vector_search.work_embeddings`
- **Endpoint**: Storage-optimized for cost efficiency at scale
- **Sync**: Delta Sync (automatic updates from source table)

In [None]:
# Configuration
# All names used by the later cells are defined here so the notebook can be
# pointed at a different table or endpoint by editing this one cell.

# --- Source Delta table -----------------------------------------------------
SOURCE_TABLE = "openalex.vector_search.work_embeddings"
PRIMARY_KEY = "work_id"
EMBEDDING_COLUMN = "embedding"

# Metadata columns for filtering
METADATA_COLUMNS = [
    "publication_year",
    "type",
    "is_oa",
    "has_abstract",
]

# --- Vector Search resources ------------------------------------------------
ENDPOINT_NAME = "openalex-vector-search"
INDEX_NAME = "openalex.vector_search.work_embeddings_index"


## Step 1: Create Vector Search Endpoint (storage-optimized)

Storage-optimized endpoints are up to 7x cheaper than standard endpoints.
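- 1 unit = 64M vectors @ 768 dimensions
- 250M vectors @ 1536 dimensions ≈ 8 units (assuming per-unit capacity halves when the dimension doubles, i.e. ~32M vectors per unit: 250M / 32M ≈ 7.8)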

In [None]:
from databricks.vector_search.client import VectorSearchClient

# Initialize client
# Constructed with no arguments; credentials and workspace URL are presumably
# picked up from the notebook context -- confirm when running outside Databricks.
# `vsc` is the handle the later cells use for endpoint and index calls.
vsc = VectorSearchClient()

In [None]:
# Probe for the endpoint and report what is found.
# This cell only prints; it never creates or modifies anything.
try:
    endpoint = vsc.get_endpoint(ENDPOINT_NAME)
    print(f"Endpoint '{ENDPOINT_NAME}' already exists")
    endpoint_status_info = endpoint.get('endpoint_status', {})
    print(f"  Status: {endpoint_status_info.get('state')}")
    endpoint_kind = endpoint.get('endpoint_type')
    print(f"  Type: {endpoint_kind}")
except Exception as lookup_error:
    # Every failure of the lookup lands here, not only "not found", so the
    # underlying error is echoed for the reader to judge.
    print(f"Endpoint does not exist, will create: {lookup_error}")

In [None]:
# Create the storage-optimized endpoint unless one is already there.
# The lookup doubles as the existence test: if it raises, the endpoint is
# created; if it succeeds, creation is skipped.

try:
    vsc.get_endpoint(ENDPOINT_NAME)
except Exception:
    endpoint = vsc.create_endpoint(
        name=ENDPOINT_NAME,
        endpoint_type="STORAGE_OPTIMIZED",  # 7x cheaper than STANDARD
    )
    print(f"Created endpoint: {endpoint}")
else:
    print(f"Endpoint '{ENDPOINT_NAME}' already exists, skipping creation")

In [None]:
# Wait for endpoint to be ready
# Polls the endpoint until it reports ONLINE. Raises if the endpoint reports a
# terminal failure state, or if it is still not ONLINE when the deadline passes.
import time

# Upper bound on the wait so a stuck endpoint cannot hang the notebook forever.
ENDPOINT_WAIT_TIMEOUT_SECONDS = 60 * 60
ENDPOINT_POLL_INTERVAL_SECONDS = 30

# monotonic() is unaffected by wall-clock adjustments during the wait.
endpoint_deadline = time.monotonic() + ENDPOINT_WAIT_TIMEOUT_SECONDS

while True:
    endpoint = vsc.get_endpoint(ENDPOINT_NAME)
    # Treat a missing or null 'endpoint_status' as "no state reported yet"
    # instead of failing on None.get(...).
    state = (endpoint.get('endpoint_status') or {}).get('state')
    print(f"Endpoint state: {state}")

    if state == 'ONLINE':
        print("Endpoint is ready!")
        break
    if state in ('OFFLINE', 'FAILED'):
        raise Exception(f"Endpoint failed to start: {endpoint}")
    if time.monotonic() >= endpoint_deadline:
        raise TimeoutError(
            f"Endpoint '{ENDPOINT_NAME}' was not ONLINE after "
            f"{ENDPOINT_WAIT_TIMEOUT_SECONDS} seconds (last state: {state})"
        )

    time.sleep(ENDPOINT_POLL_INTERVAL_SECONDS)

## Step 2: Enable Change Data Feed on source table

Delta Sync requires Change Data Feed to be enabled on the source table.

In [None]:
%%sql
-- Enable Change Data Feed if not already enabled
-- The table name is hard-coded here and must stay in step with SOURCE_TABLE
-- from the configuration cell, since that is the table the index is built from.
ALTER TABLE openalex.vector_search.work_embeddings 
SET TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true');

## Step 3: Create Delta Sync Index

In [None]:
# Check if index exists
# get_index() returns a VectorSearchIndex handle rather than a dict, so the
# index metadata (including its status) has to be fetched with describe().
# Calling .get() on the handle raises AttributeError, which the broad except
# below would misreport as "Index does not exist".
try:
    index = vsc.get_index(ENDPOINT_NAME, INDEX_NAME)
    index_info = index.describe()
    print(f"Index '{INDEX_NAME}' already exists")
    print(f"  Status: {(index_info.get('status') or {}).get('ready')}")
except Exception as e:
    # Any lookup failure lands here, not only "not found"; the underlying
    # error is printed so it can be checked.
    print(f"Index does not exist, will create: {e}")

In [None]:
# Create the Delta Sync index unless it is already there.
# The lookup doubles as the existence test: if it raises, the index is
# created; if it succeeds, creation is skipped.

index_spec = dict(
    endpoint_name=ENDPOINT_NAME,
    index_name=INDEX_NAME,
    source_table_name=SOURCE_TABLE,
    primary_key=PRIMARY_KEY,
    embedding_dimension=1536,
    embedding_vector_column=EMBEDDING_COLUMN,
    # Enable filtering by metadata columns
    columns_to_sync=METADATA_COLUMNS,
    # Use triggered sync for cost control (vs continuous)
    pipeline_type="TRIGGERED",
)

try:
    vsc.get_index(ENDPOINT_NAME, INDEX_NAME)
except Exception:
    index = vsc.create_delta_sync_index(**index_spec)
    print(f"Created index: {index}")
else:
    print(f"Index '{INDEX_NAME}' already exists, skipping creation")

In [None]:
# Wait for index to sync
# Polls the index until its status reports ready. Raises if the index reports
# a failed state while not ready, or if the deadline passes first.
import time

# The initial sync of a large table can take hours, so the bound is generous;
# it exists only to stop a stuck sync from hanging the notebook forever.
INDEX_WAIT_TIMEOUT_SECONDS = 24 * 60 * 60
INDEX_POLL_INTERVAL_SECONDS = 60

# monotonic() is unaffected by wall-clock adjustments during the wait.
index_deadline = time.monotonic() + INDEX_WAIT_TIMEOUT_SECONDS

while True:
    index = vsc.get_index(ENDPOINT_NAME, INDEX_NAME)
    # get_index() returns a VectorSearchIndex handle, not a dict; the status
    # lives in the metadata returned by describe().
    status = index.describe().get('status') or {}
    ready = status.get('ready', False)
    # `or 0` also covers an explicit null, which would break the `:,` format.
    indexed_count = status.get('indexed_row_count') or 0
    detailed_state = status.get('detailed_state')

    print(f"Index ready: {ready}, Indexed rows: {indexed_count:,}")

    if ready:
        print("Index is ready!")
        break
    # Checked only while not ready, so a ready index is never rejected here.
    if detailed_state and 'FAILED' in detailed_state:
        raise RuntimeError(f"Index sync failed: {status}")
    if time.monotonic() >= index_deadline:
        raise TimeoutError(
            f"Index '{INDEX_NAME}' was not ready after "
            f"{INDEX_WAIT_TIMEOUT_SECONDS} seconds (last status: {status})"
        )

    time.sleep(INDEX_POLL_INTERVAL_SECONDS)

## Step 4: Test similarity search

In [None]:
# Embed a sample query so the index can be exercised with a real vector
import mlflow.deployments

test_query = "climate change impacts on coral reef ecosystems"

# Deployments client for the "databricks" target; it serves the embedding call
mlflow_client = mlflow.deployments.get_deploy_client("databricks")

embedding_request = {"input": test_query}
embedding_response = mlflow_client.predict(
    endpoint="openai-embedding-3-small",
    inputs=embedding_request,
)

# The vector is read from the first entry of the response's "data" list
first_embedding_item = embedding_response["data"][0]
query_embedding = first_embedding_item["embedding"]
print(f"Query embedding dimensions: {len(query_embedding)}")

In [None]:
# Run an unfiltered nearest-neighbour query against the index
index = vsc.get_index(ENDPOINT_NAME, INDEX_NAME)

requested_columns = ["work_id", "publication_year", "type", "is_oa"]
results = index.similarity_search(
    query_vector=query_embedding,
    num_results=10,
    columns=requested_columns,
)

# Each row is read as the four requested columns followed by the score
result_rows = results.get('result', {}).get('data_array', [])

print(f"Found {len(result_rows)} results")
for row in result_rows:
    print(f"  work_id: {row[0]}, year: {row[1]}, type: {row[2]}, is_oa: {row[3]}, score: {row[4]:.4f}")

In [None]:
# Test with metadata filter
# The endpoint created in this notebook is STORAGE_OPTIMIZED. Storage-optimized
# endpoints take filters as a SQL-like string; the dict form
# ({"publication_year >": 2020}) is the standard-endpoint syntax.
# Relies on `index` and `query_embedding` from the preceding cells.
results_filtered = index.similarity_search(
    query_vector=query_embedding,
    num_results=10,
    filters="publication_year > 2020",
    columns=["work_id", "publication_year", "type", "is_oa"]
)

# Each row is read as the four requested columns followed by the score
filtered_rows = results_filtered.get('result', {}).get('data_array', [])

print(f"Found {len(filtered_rows)} results (year > 2020)")
for row in filtered_rows:
    print(f"  work_id: {row[0]}, year: {row[1]}, type: {row[2]}, is_oa: {row[3]}, score: {row[4]:.4f}")

## Step 5: Trigger manual sync (for updates)

Run this cell after new embeddings are added to the source table so that the index picks them up.

In [None]:
# Trigger manual sync (for TRIGGERED pipeline type)
# Re-fetches the index handle rather than reusing one from an earlier cell.
index = vsc.get_index(ENDPOINT_NAME, INDEX_NAME)
# This cell does not poll for completion afterwards -- presumably the sync
# continues in the background; confirm before relying on fresh results.
sync_result = index.sync()
print(f"Sync triggered: {sync_result}")

## Index Info

In [None]:
# Get current index info
# get_index() returns a VectorSearchIndex handle, not a dict; the configuration
# and status are in the metadata returned by describe().
index = vsc.get_index(ENDPOINT_NAME, INDEX_NAME)
index_info = index.describe()
sync_spec = index_info.get('delta_sync_index_spec') or {}
vector_columns = sync_spec.get('embedding_vector_columns') or []

# The dimension is reported per embedding vector column; fall back to a
# spec-level key in case the response carries it there instead.
column_dimensions = [
    column.get('embedding_dimension')
    for column in vector_columns
    if isinstance(column, dict)
]
if column_dimensions:
    embedding_dimension = ", ".join(str(dim) for dim in column_dimensions)
else:
    embedding_dimension = sync_spec.get('embedding_dimension')

print("Index configuration:")
print(f"  Name: {index_info.get('name')}")
print(f"  Source table: {sync_spec.get('source_table')}")
print(f"  Embedding column: {sync_spec.get('embedding_vector_columns')}")
print(f"  Embedding dimension: {embedding_dimension}")
print(f"  Pipeline type: {sync_spec.get('pipeline_type')}")
print(f"  Status: {index_info.get('status')}")