In [1]:
# Cell 1: Setup - ALL IN ONE CELL TO RUN AND WALK AWAY
"""
Complete RAG Pipeline Test
Run this cell and take a break - it handles everything
"""

# Fix path issues for notebook
import sys
import os
from pathlib import Path
from qdrant_client import QdrantClient
import pandas as pd
import logging
import polars as pl

# Handle both local and Docker environments
if os.path.exists('/opt/airflow/plugins'):
    # Running in Docker
    sys.path.insert(0, '/opt/airflow/plugins')
    DATA_PATH = Path('/opt/airflow/data/exports')
else:
    # Running locally - adjust this path to YOUR structure
    notebook_dir = Path.cwd()
    project_root = notebook_dir.parent  # Go up from notebooks/ to project root
    sys.path.insert(0, str(project_root / 'plugins'))
    DATA_PATH = project_root / 'data' / 'exports'

# For autoreload in notebook (optional but helpful)
%load_ext autoreload
%autoreload 2


In [2]:

# Now imports should work
from rag_utils import (
    load_and_enrich_data,
    create_smart_chunks,
    generate_embeddings_batch,
    init_qdrant_collection,
    upsert_to_qdrant,
    test_retrieval
)

import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("Imports successful")

# --- QUICK VALIDATION SECTION ---
print("\n" + "="*50)
print("STEP 1: Testing with 100 sentences first")
print("="*50)

# Load and sample
df = load_and_enrich_data(min_tokens=3)
print(f"Loaded {len(df)} total sentences")

df_sample = df.sample(100, seed=42)
print(f"Using {len(df_sample)} sentences for test")

# Create chunks
chunks = create_smart_chunks(df_sample, window_size=3, stride=2)
print(f"Created {len(chunks)} chunks from sample")

# Generate embeddings for sample
print("\nGenerating embeddings for sample (30 seconds)...")
chunks_with_embeddings = generate_embeddings_batch(
    chunks, 
    model_name="sentence-transformers/all-mpnet-base-v2",
    batch_size=8
)
print(f"Generated {len(chunks_with_embeddings)} embeddings")

# Test Qdrant connection
from qdrant_client import QdrantClient

try:
    client = QdrantClient(host="localhost", port=6333)
    collections = client.get_collections()
    print(f"\nConnected to Qdrant. Existing collections: {len(collections.collections)}")
except Exception as e:
    print(f"ERROR: Cannot connect to Qdrant: {e}")
    print("Make sure docker-compose is running!")
    raise

# Store sample in test collection
print("\nStoring sample in test collection...")
test_collection = "sec_filings_test"
client = init_qdrant_collection(collection_name=test_collection, vector_size=768)
client = upsert_to_qdrant(chunks_with_embeddings, collection_name=test_collection)
print(f"Stored {len(chunks_with_embeddings)} chunks in test collection")

# Test retrieval
print("\nTesting retrieval...")
results = test_retrieval(query="What are the main business risks?", n_results=3, collection_name="sec_filings_test")
print(f"Found {len(results)} results - search working!")

print("\n" + "="*50)
print("VALIDATION COMPLETE - All systems working!")
print("="*50)


  from .autonotebook import tqdm as notebook_tqdm
INFO:rag_utils:Running in Local environment
INFO:rag_utils:Data path: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 18008\MasterRepo & LabRepo\mlops-labs-portfolio\Airflow_Docker_VDB_Lab\data\exports\sec_filings_small_full.parquet
INFO:rag_utils:Qdrant host: localhost
INFO:rag_utils:Loaded 200000 total sentences


Imports successful

STEP 1: Testing with 100 sentences first


INFO:rag_utils:Using all 191418 sentences
INFO:rag_utils:Created 90 chunks from 100 sentences
INFO:rag_utils:Chunk stats - Min chars: 17, Max: 1431, Avg: 238
INFO:rag_utils:Loading embedding model: sentence-transformers/all-mpnet-base-v2
INFO:rag_utils:First run downloads ~420MB model - cached for future use
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Loaded 191418 total sentences
Using 100 sentences for test
Created 90 chunks from sample

Generating embeddings for sample (30 seconds)...


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:rag_utils:Model max sequence length: 384 tokens
INFO:rag_utils:Generating embeddings for 90 chunks (batch_size=8)
INFO:rag_utils:Expected time: ~25-35 minutes for 70k chunks
Batches: 100%|██████████| 12/12 [00:04<00:00,  2.47it/s]
INFO:rag_utils:Generated 90 embeddings of dimension 768
INFO:httpx:HTTP Request: GET http://localhost:6333/collections "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:6333/collections "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:6333/collections/sec_filings_test "HTTP/1.1 200 OK"
INFO:rag_utils:Collection sec_filings_test exists with 90 points
INFO:rag_utils:Keeping existing collection (dimensions match)
INFO:httpx:HTTP Request: GET http://localhost:6333/collections "HTTP/1.1 200 OK"


Generated 90 embeddings

Connected to Qdrant. Existing collections: 1

Storing sample in test collection...


INFO:httpx:HTTP Request: GET http://localhost:6333/collections/sec_filings_test "HTTP/1.1 200 OK"
INFO:rag_utils:Collection sec_filings_test exists with 90 points
INFO:rag_utils:Keeping existing collection (dimensions match)
INFO:rag_utils:Preparing to upsert 90 chunks to Qdrant...
INFO:httpx:HTTP Request: PUT http://localhost:6333/collections/sec_filings_test/points?wait=true "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:6333/collections/sec_filings_test "HTTP/1.1 200 OK"
INFO:rag_utils:Collection 'sec_filings_test' now has 180 vectors
INFO:rag_utils:Data persisted in ./qdrant_storage/
INFO:rag_utils:
INFO:rag_utils:Testing retrieval with query: 'What are the main business risks?'
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Stored 90 chunks in test collection

Testing retrieval...


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 31.82it/s]
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/sec_filings_test/points/search "HTTP/1.1 200 OK"
INFO:rag_utils:
Top 3 Results:
INFO:rag_utils:--------------------------------------------------------------------------------
INFO:rag_utils:--------------------------------------------------------------------------------
INFO:rag_utils:Search completed successfully



1. [Score: 0.4613]
   Company: ADVANCED MICRO DEVICES INC
   Section: 8 (MD&A)
   Date: 2015-12-26
   Priority: high
   Text Preview: Potential events or circumstance that could reasonably be expected to negatively affect the key assumptions we used in estimating the fair value of our reporting units include adverse changes in our industry, increased competition, an inability to successfully introduce new products in the marketpla...
   Chunk ID: 0000002488_10-K_2015_sec8_full

2. [Score: 0.4613]
   Company: ADVANCED MICRO DEVICES INC
   Section: 8 (MD&A)
   Date: 2015-12-26
   Priority: high
   Text Preview: Potential events or circumstance that could reasonably be expected to negatively affect the key assumptions we used in estimating the fair value of our reporting units include adverse changes in our industry, increased competition, an inability to successfully introduce new products in the marketpla...
   Chunk ID: 0000002488_10-K_2015_sec8_full

3. [Score: 0.4594]
   Company: CE

In [4]:
# Cell 2: Qdrant Database Operations

from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

client = QdrantClient(host="localhost", port=6333)

# 1. Show all collections (like SHOW TABLES)
collections = client.get_collections().collections
print("Collections (tables):")
for col in collections:
    info = client.get_collection(col.name)
    print(f"  {col.name}: {info.points_count} vectors")

# 2. Count vectors in specific collection (like SELECT COUNT(*))
collection_name = "sec_filings_test"  # Change to your collection
try:
    info = client.get_collection(collection_name)
    print(f"\n{collection_name}: {info.points_count} total vectors")
    print(f"Vector dimensions: {info.config.params.vectors.size}")
except:
    print(f"Collection {collection_name} doesn't exist")

# 3. Get sample records (like SELECT * LIMIT 5)
if info.points_count > 0:
    records = client.scroll(
        collection_name=collection_name,
        limit=5
    )[0]  # Returns (records, next_offset)
    
    print(f"\nFirst 5 records:")
    for record in records:
        print(f"ID: {record.id}")
        print(f"  Company: {record.payload.get('company')}")
        print(f"  Section: {record.payload.get('section')}")
        print(f"  Text preview: {record.payload.get('text', '')[:50]}...")

# 4. Query with filters (like SELECT * WHERE section=8)
filtered = client.scroll(
    collection_name=collection_name,
    scroll_filter=Filter(
        must=[
            FieldCondition(
                key="section",
                match=MatchValue(value=8)
            )
        ]
    ),
    limit=10
)[0]
print(f"\nFound {len(filtered)} records with section=8")

# 5. Get collection statistics
stats = client.get_collection(collection_name)
print(f"\nCollection Stats:")
print(f"  Vectors: {stats.points_count}")
print(f"  Indexed: {stats.indexed_vectors_count}")
print(f"  Status: {stats.status}")

# 6. Delete specific vectors (like DELETE WHERE)
# client.delete(
#     collection_name=collection_name,
#     points_filter=Filter(
#         must=[FieldCondition(key="company", match=MatchValue(value="ACME CORP"))]
#     )
# )

INFO:httpx:HTTP Request: GET http://localhost:6333/collections "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:6333/collections/sec_filings_test "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:6333/collections/sec_filings_test "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/sec_filings_test/points/scroll "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/sec_filings_test/points/scroll "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:6333/collections/sec_filings_test "HTTP/1.1 200 OK"


Collections (tables):
  sec_filings_test: 180 vectors

sec_filings_test: 180 total vectors
Vector dimensions: 768

First 5 records:
ID: 118610
  Company: ADVANCED MICRO DEVICES INC
  Section: 8
  Text preview: Historically, the ATI business had lower gross mar...
ID: 1903369
  Company: ADVANCED MICRO DEVICES INC
  Section: 9
  Text preview: During the first quarter of 2011, we reassessed ou...
ID: 2400525
  Company: ACME UNITED CORP
  Section: 8
  Text preview: MANAGEMENT'S DISCUSSION AND ANALYSIS OF FINANCIAL ...
ID: 2947496
  Company: ABBOTT LABORATORIES
  Section: 0
  Text preview: Abbott retained the branded generics pharmaceutica...
ID: 3112637
  Company: BK Technologies Corp
  Section: 8
  Text preview: In July 2015, the FASB issued ASU 2015-11, “Simpli...

Found 10 records with section=8

Collection Stats:
  Vectors: 180
  Indexed: 0
  Status: green


In [5]:
# Cell 1: Delete All Collections
from qdrant_client import QdrantClient

client = QdrantClient(host="localhost", port=6333)

# Get all collections
collections = client.get_collections().collections
print(f"Found {len(collections)} collections:")
for col in collections:
    print(f"  - {col.name}")

# Delete each collection
for col in collections:
    client.delete_collection(col.name)
    print(f"Deleted: {col.name}")

print("\nAll collections deleted!")

INFO:httpx:HTTP Request: GET http://localhost:6333/collections "HTTP/1.1 200 OK"


Found 1 collections:
  - sec_filings_test


INFO:httpx:HTTP Request: DELETE http://localhost:6333/collections/sec_filings_test "HTTP/1.1 200 OK"


Deleted: sec_filings_test

All collections deleted!


In [6]:
## Do pipeline for LAB, Perform with 5k sentences or lesser.

from rag_utils import (
    load_and_enrich_data,
    create_smart_chunks,
    generate_embeddings_batch,
    init_qdrant_collection,
    upsert_to_qdrant,
    test_retrieval
)

import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

df = load_and_enrich_data(min_tokens=3, sample_size=2000)

chunks = create_smart_chunks(df)
chunks_with_embeddings = generate_embeddings_batch(chunks, batch_size=64)
upsert_to_qdrant(chunks_with_embeddings, collection_name="sec_filings_sample")

INFO:rag_utils:Loaded 200000 total sentences
INFO:rag_utils:Randomly sampled 2000 sentences
INFO:rag_utils:Using 2000 sentences
INFO:rag_utils:Created 989 chunks from 2000 sentences
INFO:rag_utils:Chunk stats - Min chars: 12, Max: 2120, Avg: 373
INFO:rag_utils:Split 4 long chunks (likely tables/lists)
INFO:rag_utils:Loading embedding model: sentence-transformers/all-mpnet-base-v2
INFO:rag_utils:First run downloads ~420MB model - cached for future use
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:rag_utils:Model max sequence length: 384 tokens
INFO:rag_utils:Generating embeddings for 989 chunks (batch_size=64)
INFO:rag_utils:Expected time: ~25-35 minutes for 70k chunks
Batches: 100%|██████████| 16/16 [01:23<00:00,  5.21s/it]
INFO:rag_utils:Generated 989 embeddings of dimension 768
INFO:httpx:HTTP Request: GET http://localhost:6333

<qdrant_client.qdrant_client.QdrantClient at 0x2b6776db990>

## Validation complete earlier. Bottom Error - ignore.
#### Bottom part was trying to force 95k embeddings and chunking, it was taking more than 5 hours potentially. Not needed.
#### Wrote func to filter df. 

In [3]:
# --- DECISION POINT ---
print("\nReady for FULL PIPELINE?")
print("This will process:")
print("- 200,000 sentences")
print("- Create ~70,000 chunks")
print("- Generate embeddings (35 minutes)")
print("- Store in Qdrant (10 minutes)")
print("\nTOTAL TIME: ~45 minutes")
print("="*50)

# Auto-proceed (comment out if you want manual confirmation)
proceed = "YES"  # Change to "NO" to skip full pipeline

if proceed == "YES":
    print("\n" + "="*50)
    print("STARTING FULL PIPELINE")
    print("="*50)
    
    # 1. Load all data
    print("\n1. Loading all data...")
    df_full = load_and_enrich_data(min_tokens=3)
    print(f"   Loaded {len(df_full)} sentences")
    
    # 2. Create all chunks
    print("\n2. Creating chunks...")
    chunks_full = create_smart_chunks(df_full, window_size=3, stride=2)
    print(f"   Created {len(chunks_full)} chunks")
    
    # 3. Generate embeddings (LONG PROCESS)
    print("\n3. Generating embeddings...")
    print("   THIS WILL TAKE ~35 MINUTES")
    print("   Started at:", pd.Timestamp.now().strftime("%H:%M:%S"))
    
    chunks_with_embeddings_full = generate_embeddings_batch(
        chunks_full,
        model_name="sentence-transformers/all-mpnet-base-v2",
        batch_size=16
    )
    
    print(f"   Generated {len(chunks_with_embeddings_full)} embeddings")
    print("   Completed at:", pd.Timestamp.now().strftime("%H:%M:%S"))
    
    # 4. Store in Qdrant
    print("\n4. Storing in Qdrant...")
    print("   THIS WILL TAKE ~10 MINUTES")
    
    client = upsert_to_qdrant(
        chunks_with_embeddings_full,
        collection_name="sec_filings"  # Real collection name
    )
    
    print("\n" + "="*50)
    print("PIPELINE COMPLETE!")
    print("="*50)
    
    # 5. Final verification
    print("\nFinal verification...")
    final_results = test_retrieval(
        query="revenue growth factors",
        n_results=5
    )
    print(f"Retrieval test successful - {len(final_results)} results found")
    
    # Check collection stats
    info = client.get_collection("sec_filings")
    print(f"\nCollection statistics:")
    print(f"  Total vectors: {info.points_count}")
    print(f"  Vector dimensions: {info.config.params.vectors.size}")
    print(f"  Distance metric: {info.config.params.vectors.distance}")
    
    print("\n" + "="*50)
    print("ALL DONE - Embeddings stored in ./qdrant_storage/")
    print("Data persists even after Docker restart")
    print("="*50)
    
else:
    print("Skipped full pipeline")

INFO:rag_utils:Loaded 200000 total sentences



Ready for FULL PIPELINE?
This will process:
- 200,000 sentences
- Create ~70,000 chunks
- Generate embeddings (35 minutes)
- Store in Qdrant (10 minutes)

TOTAL TIME: ~45 minutes

STARTING FULL PIPELINE

1. Loading all data...


INFO:rag_utils:Using all 191418 sentences


   Loaded 191418 sentences

2. Creating chunks...


INFO:rag_utils:Created 94991 chunks from 191418 sentences
INFO:rag_utils:Chunk stats - Min chars: 17, Max: 4850, Avg: 525
INFO:rag_utils:Split 587 long chunks (likely tables/lists)
INFO:rag_utils:Loading embedding model: sentence-transformers/all-mpnet-base-v2
INFO:rag_utils:First run downloads ~420MB model - cached for future use
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


   Created 94991 chunks

3. Generating embeddings...
   THIS WILL TAKE ~35 MINUTES
   Started at: 06:54:20


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:rag_utils:Model max sequence length: 384 tokens
INFO:rag_utils:Generating embeddings for 94991 chunks (batch_size=16)
INFO:rag_utils:Expected time: ~25-35 minutes for 70k chunks
Batches:  14%|█▍        | 818/5937 [38:29<4:00:54,  2.82s/it]


KeyboardInterrupt: 

In [None]:
## LATERRRR.

# Cell 2: Verify Persistence (RUN SEPARATELY AFTER RESTART)
"""
Run this cell after docker-compose restart to verify persistence
"""
from qdrant_client import QdrantClient

client = QdrantClient(host="localhost", port=6333)
collections = client.get_collections().collections

print("Collections in Qdrant:")
for collection in collections:
    info = client.get_collection(collection.name)
    print(f"  {collection.name}: {info.points_count} vectors")

# Test search still works
if any(c.name == "sec_filings" for c in collections):
    from sentence_transformers import SentenceTransformer
    
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    query = "What are the main risk factors?"
    query_embedding = model.encode(query, normalize_embeddings=True).tolist()
    
    results = client.search(
        collection_name="sec_filings",
        query_vector=query_embedding,
        limit=3
    )
    
    print(f"\nSearch test: Found {len(results)} results")
    print("Data successfully persisted!")