In [2]:
# Import required libraries
import io
import os

import boto3
import pandas as pd
import requests
from botocore.client import Config
from pymilvus import connections, utility, Collection
from sentence_transformers import SentenceTransformer

print("✓ All libraries imported successfully")

✓ All libraries imported successfully


In [3]:
# MinIO Configuration (using host port from .env)
# The endpoint is the host-side port mapping, so it stays a literal here.
MINIO_ENDPOINT = "http://localhost:9100"  # Host port mapped from container 9000
# Credentials can be overridden through environment variables of the same
# name so real secrets never have to be committed with the notebook; the
# fallbacks are the values this demo has always used.
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "password")

# Create S3 client pointed at the MinIO endpoint instead of AWS
s3 = boto3.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=MINIO_ACCESS_KEY,
    aws_secret_access_key=MINIO_SECRET_KEY,
    config=Config(signature_version="s3v4"),
    region_name="us-east-1",
)

# Test connection: listing buckets fails fast if the endpoint or credentials are wrong
buckets = s3.list_buckets()
print("✓ MinIO Connected")
print(f"  Buckets: {[b['Name'] for b in buckets['Buckets']]}")

✓ MinIO Connected
  Buckets: ['lakehouse']


In [4]:
# Check RavenDB
# Best-effort probe: any failure is reported on one line instead of raising,
# so the remaining service checks can still run.
ravendb_url = "http://localhost:8080"
try:
    resp = requests.get(
        f"{ravendb_url}/databases",
        headers={"Accept-Encoding": "identity"},
        timeout=5,
    )
    print(
        "✓ RavenDB Connected"
        if resp.status_code == 200
        else f"⚠ RavenDB returned status {resp.status_code}"
    )
except Exception as e:
    print(f"✗ RavenDB: {e}")

✓ RavenDB Connected


In [5]:
# Check Nessie
# Best-effort probe of the Nessie config endpoint; failures are printed, not raised.
nessie_url = "http://localhost:19120"
try:
    resp = requests.get(f"{nessie_url}/api/v1/config", timeout=5)
    # Without this, a 4xx/5xx reply carrying a JSON error body would be parsed
    # successfully and reported as "Connected" with the fallback branch name.
    resp.raise_for_status()
    config = resp.json()
    print("✓ Nessie Connected")
    print(f"  Default branch: {config.get('defaultBranch', 'main')}")
except Exception as e:
    print(f"✗ Nessie: {e}")

✓ Nessie Connected
  Default branch: main


In [6]:
# Check Milvus
# Opens the default pymilvus connection (reused by later cells) and lists
# the collections it can see; failures are printed rather than raised.
try:
    connections.connect(host="localhost", port="19530")
    collections = utility.list_collections()
    print("✓ Milvus Connected", f"  Collections: {collections}", sep="\n")
except Exception as e:
    print(f"✗ Milvus: {e}")

✓ Milvus Connected
  Collections: ['orders_vector_index']


In [7]:
# Check Dremio
# Best-effort probe of the Dremio web UI; failures are printed, not raised.
dremio_url = "http://localhost:9047"
try:
    resp = requests.get(dremio_url, timeout=5)
    if resp.status_code == 200:
        print("✓ Dremio UI accessible")
        print(f"  Web UI: {dremio_url}")
        print("  ODBC/JDBC: localhost:31010")
        print("  Arrow Flight: localhost:32010")
        print("\n  Note: First-time setup requires creating an admin account via the UI")
    else:
        # A non-200 reply used to produce no output at all, which made the
        # cell look like it had not run; report it like the RavenDB check does.
        print(f"⚠ Dremio returned status {resp.status_code}")
except Exception as e:
    print(f"✗ Dremio: {e}")

✓ Dremio UI accessible
  Web UI: http://localhost:9047
  ODBC/JDBC: localhost:31010
  Arrow Flight: localhost:32010

  Note: First-time setup requires creating an admin account via the UI


## 2. Explore the Data Lake

In [8]:
# List contents of the lakehouse bucket
def list_s3_prefix(prefix, max_keys=20):
    """Return object keys in the ``lakehouse`` bucket that start with ``prefix``.

    Issues a single ``list_objects_v2`` request with ``MaxKeys=max_keys``;
    no further pages are fetched. An empty list is returned when the
    response has no ``Contents`` entry.
    """
    listing = s3.list_objects_v2(Bucket="lakehouse", Prefix=prefix, MaxKeys=max_keys)
    keys = []
    for entry in listing.get('Contents', []):
        keys.append(entry['Key'])
    return keys

# One (heading, prefix) pair per medallion layer; the leading "\n" on the
# later headings separates the sections in the printed output.
for layer_title, layer_prefix in (
    ("Bronze Layer (Raw Files):", "bronze/"),
    ("\nSilver Layer (RavenDB Landing):", "silver/"),
    ("\nGold Layer (Vectors):", "gold/"),
):
    print(layer_title)
    for key in list_s3_prefix(layer_prefix):
        print(f"  {key}")

Bronze Layer (Raw Files):
  bronze/files/

Silver Layer (RavenDB Landing):
  silver/ravendb_landing/orders/
  silver/ravendb_landing/orders/2023-12-13/data.parquet
  silver/ravendb_landing/orders/2023-12-14/data.parquet
  silver/ravendb_landing/orders/2023-12-21/data.parquet
  silver/ravendb_landing/orders/2023-12-25/data.parquet
  silver/ravendb_landing/orders/2023-12-26/data.parquet
  silver/ravendb_landing/orders/2023-12-27/data.parquet
  silver/ravendb_landing/orders/2023-12-30/data.parquet
  silver/ravendb_landing/orders/2024-01-03/data.parquet
  silver/ravendb_landing/orders/2024-01-04/data.parquet
  silver/ravendb_landing/orders/2024-01-05/data.parquet
  silver/ravendb_landing/orders/2024-01-06/data.parquet
  silver/ravendb_landing/orders/2024-01-07/data.parquet
  silver/ravendb_landing/orders/2024-01-08/data.parquet
  silver/ravendb_landing/orders/2024-01-10/data.parquet
  silver/ravendb_landing/orders/2024-01-11/data.parquet
  silver/ravendb_landing/orders/2024-01-12/data.parq

In [7]:
# Read order data from landing zone
def read_parquet_from_s3(key):
    """Fetch ``key`` from the ``lakehouse`` bucket and parse it as Parquet.

    The whole object body is read into memory and handed to
    ``pd.read_parquet`` through an in-memory buffer; the parsed result is
    returned.
    """
    response = s3.get_object(Bucket="lakehouse", Key=key)
    payload = response['Body'].read()
    buffer = io.BytesIO(payload)
    return pd.read_parquet(buffer)

# Find first Parquet file
# The listing also contains folder-marker keys, so keep only real data files.
parquet_files = [
    candidate
    for candidate in list_s3_prefix("silver/ravendb_landing/orders", 100)
    if candidate.endswith('.parquet')
]
if not parquet_files:
    print("No order data found. Run ravendb_sync.py first.")
else:
    df_orders = read_parquet_from_s3(parquet_files[0])
    print(f"Sample orders from: {parquet_files[0]}")
    display(df_orders.head())

Sample orders from: silver/ravendb_landing/orders/2023-12-13/data.parquet


Unnamed: 0,OrderId,CustomerId,OrderDate,TotalAmount,Status,ShipCity,ShipCountry,LineCount,SyncedAt
0,orders/0220-A,customers/18-A,2023-12-13 16:54:41.696,1479.87,Delivered,Chicago,USA,2,2025-12-11 17:10:12.464
1,orders/0300-A,customers/34-A,2023-12-13 16:54:41.919,799.96,Processing,Boston,USA,1,2025-12-11 17:10:12.464


## 3. Vector Search Demo

In [8]:
# SentenceTransformer and Collection are imported in the notebook's first
# cell (which later cells need anyway for `utility`), so they are not
# re-imported here.

# Load embedding model
# The model identifier is named so it is easy to find and change in one place.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("✓ Loaded embedding model")

✓ Loaded embedding model


In [9]:
# Connect to Milvus collection
COLLECTION_NAME = "orders_vector_index"

if utility.has_collection(COLLECTION_NAME):
    collection = Collection(COLLECTION_NAME)
    collection.load()
    print(f"✓ Collection '{COLLECTION_NAME}' loaded")
    print(f"  Entities: {collection.num_entities}")
else:
    print(f"✗ Collection not found. Run milvus_bulk_load.py first.")

✓ Collection 'orders_vector_index' loaded
  Entities: 500


In [10]:
def semantic_search(query: str, limit: int = 5):
    """
    Run a vector similarity search over the orders collection.

    The query text is embedded with the notebook-level ``model`` and searched
    against the ``embedding`` field of the notebook-level ``collection``.
    A ranked summary is printed as a side effect.

    Args:
        query: Natural language search query
        limit: Number of results to return

    Returns:
        The ``order_id`` of each hit, in the order Milvus returned them.
    """
    # A single query is embedded, so the search yields exactly one hit list.
    query_vectors = model.encode([query]).tolist()
    hits = collection.search(
        data=query_vectors,
        anns_field="embedding",
        param={"metric_type": "COSINE", "params": {"nprobe": 10}},
        limit=limit,
        output_fields=["order_id"],
    )[0]

    print(f"Query: '{query}'")
    print(f"\nTop {limit} matching orders:")
    print("-" * 50)

    matched_ids = []
    for rank, hit in enumerate(hits, start=1):
        order_id = hit.entity.get('order_id')
        matched_ids.append(order_id)
        print(f"{rank}. {order_id} (similarity: {hit.distance:.4f})")

    return matched_ids

In [11]:
# Try some semantic searches
# The call prints the ranked hits; its return value (the list of matched
# order IDs) is the cell's last expression, so the notebook echoes it too.
semantic_search("high value orders shipped to New York")

Query: 'high value orders shipped to New York'

Top 5 matching orders:
--------------------------------------------------
1. orders/0335-A (similarity: 0.7426)
2. orders/0350-A (similarity: 0.7383)
3. orders/0376-A (similarity: 0.7333)
4. orders/0159-A (similarity: 0.7307)
5. orders/0204-A (similarity: 0.7302)


['orders/0335-A',
 'orders/0350-A',
 'orders/0376-A',
 'orders/0159-A',
 'orders/0204-A']

In [None]:
# Second example query, phrased around order status and item count.
# NOTE(review): how well these attributes match depends on what text was
# embedded when the collection was built, which this notebook does not show.
semantic_search("pending orders with multiple items")

In [None]:
# Third example query, phrased around order status and ship city.
# Uses the default limit of 5 results.
semantic_search("delivered orders from Seattle")

## 4. Full Read Path: Vector Search → Metadata Lookup

In [None]:
def get_order_details(order_ids: list) -> pd.DataFrame:
    """
    Look up full order details from the landing zone.

    In production, this would query Iceberg via Spark/Trino.
    For demo, we read directly from Parquet.

    Args:
        order_ids: Order IDs to look up (matched against the ``OrderId`` column).

    Returns:
        The rows of the landing-zone Parquet files whose ``OrderId`` is in
        ``order_ids``. If the landing zone contains no Parquet files, an
        empty DataFrame with only an ``OrderId`` column is returned.
    """
    # Page through the complete listing. A single capped list request would
    # silently drop every date partition beyond the cap, so orders stored in
    # later partitions could never be found.
    paginator = s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket="lakehouse", Prefix="silver/ravendb_landing/orders")
    parquet_keys = [
        obj['Key']
        for page in pages
        for obj in page.get('Contents', [])  # pages without objects have no 'Contents'
        if obj['Key'].endswith('.parquet')   # skip folder-marker keys
    ]

    # pd.concat raises ValueError on an empty list; return an empty result instead.
    if not parquet_keys:
        return pd.DataFrame(columns=['OrderId'])

    df_all = pd.concat(
        [read_parquet_from_s3(key) for key in parquet_keys],
        ignore_index=True,
    )

    # Filter to requested order IDs
    return df_all[df_all['OrderId'].isin(order_ids)]

In [None]:
# End-to-end: Search → Lookup
# Step 1: vector search yields candidate order IDs (and prints the ranking).
query = "cancelled orders"
order_ids = semantic_search(query)

# Banner between the search output and the detail table.
print("\n" + "=" * 50, "Full Order Details:", "=" * 50, sep="\n")

# Step 2: resolve the IDs to full rows from the landing-zone Parquet files.
df_details = get_order_details(order_ids)
display(df_details)

## 5. Generate Presigned URLs (for External Compute)

In [None]:
def generate_presigned_url(key: str, expiration: int = 3600) -> str:
    """
    Build a presigned ``get_object`` URL for an object in the ``lakehouse`` bucket.

    Args:
        key: S3 object key
        expiration: URL expiration in seconds

    Returns:
        The URL produced by the notebook-level ``s3`` client.
    """
    request_params = {'Bucket': 'lakehouse', 'Key': key}
    return s3.generate_presigned_url('get_object', Params=request_params, ExpiresIn=expiration)

# Example: Generate URLs for Parquet files
parquet_files = [
    name
    for name in list_s3_prefix("gold/milvus_import", 10)
    if name.endswith('.parquet')
]

print("Presigned URLs for External Compute:", "-" * 50, sep="\n")
# Show at most three files, and only the first 80 characters of each URL
# so the signature query string does not flood the output.
for key in parquet_files[:3]:
    url = generate_presigned_url(key)
    print(f"\n{key}:")
    print(f"  {url[:80]}...")

## 6. Architecture Summary

```
┌─────────────┐     ┌──────────────┐     ┌─────────────────┐
│  RavenDB    │────▶│    MinIO     │────▶│  Apache Iceberg │
│  (Source)   │     │  (Storage)   │     │  (Table Format) │
└─────────────┘     └──────────────┘     └─────────────────┘
                           │                      │
                           ▼                      ▼
                    ┌─────────────┐      ┌────────────────┐
                    │   Nessie    │◀────▶│    Dremio      │
                    │  (Catalog)  │      │  (SQL Engine)  │
                    └─────────────┘      └────────────────┘
                           │                      │
                           ▼                      ▼
                    ┌─────────────┐      ┌────────────────┐
                    │   Milvus    │      │   BI Tools     │
                    │(Vector DB)  │      │ (ODBC/JDBC)    │
                    └─────────────┘      └────────────────┘
```

### Service URLs

| Service | URL |
|---------|-----|
| RavenDB Studio | http://localhost:8080 |
| MinIO Console | http://localhost:9101 |
| MinIO S3 API | http://localhost:9100 |
| Nessie API | http://localhost:19120 |
| Milvus | localhost:19530 |
| Spark UI | http://localhost:4040 |
| **Dremio UI** | http://localhost:9047 |
| Dremio ODBC/JDBC | localhost:31010 |
| Dremio Arrow Flight | localhost:32010 |