In [1]:
# %%
print("üîß CHECKING AND UPGRADING PYARROW")
print("=" * 50)

import sys
import subprocess
import importlib

# Check current version
try:
    import pyarrow as pa
    print(f"Current PyArrow version: {pa.__version__}")
except:
    print("PyArrow not installed or version too old")

# Upgrade PyArrow
print("\nüîÑ Upgrading PyArrow...")
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pyarrow", "pandas"])
    print("‚úÖ PyArrow upgraded successfully")
    
    # Reload modules
    importlib.reload(sys.modules.get('pyarrow', None))
    
except Exception as e:
    print(f"‚ùå Upgrade failed: {e}")
    print("Please run in terminal: pip install --upgrade pyarrow pandas")

üîß CHECKING AND UPGRADING PYARROW
Current PyArrow version: 22.0.0

üîÑ Upgrading PyArrow...
‚úÖ PyArrow upgraded successfully


In [2]:
# %%
print("üìö IMPORTING LIBRARIES")
print("=" * 50)

import pandas as pd
import chromadb
import numpy as np
import os
import gc
from tqdm.auto import tqdm

print("‚úÖ Libraries imported")

üìö IMPORTING LIBRARIES
‚úÖ Libraries imported


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os

print("\nüîç CHECKING AVAILABLE FILES")
print("=" * 50)

data_dir = "data/processed/"
print(f"Looking in: {os.path.abspath(data_dir)}")

# List files
files = os.listdir(data_dir)
print(f"\nüìÅ FILES AVAILABLE:")
for i, file in enumerate(sorted(files)):
    size = os.path.getsize(os.path.join(data_dir, file)) / (1024**2)  # MB
    print(f"  {i+1:2d}. {file} ({size:.1f} MB)")

# Use the correct file
file_path = os.path.join(data_dir, "complaint_metadata_full.parquet")
print(f"\n‚úÖ USING: {file_path}")
print(f"   Size: {os.path.getsize(file_path) / (1024**2):.1f} MB")



üîç CHECKING AVAILABLE FILES
Looking in: d:\10 acadamy\Intelligent Complaint Analysis for Financial Services\notebooks\data\processed

üìÅ FILES AVAILABLE:
   1. all_chunks.parquet (318.9 MB)
   2. chunking_progress.json (0.0 MB)
   3. chunks_metadata.csv (0.0 MB)
   4. chunks_summary.csv (183.9 MB)
   5. cleaned_complaints.csv (1193.3 MB)
   6. complaint_metadata_full.parquet (18.4 MB)

‚úÖ USING: data/processed/complaint_metadata_full.parquet
   Size: 18.4 MB


In [4]:
# %%
print("\nüìä LOADING EMBEDDINGS DATA - METADATA ONLY")
print("=" * 50)

try:
    print(f"üì• Reading file metadata...")
    
    # Get file metadata
    parquet_file = pq.ParquetFile(embeddings_path)
    metadata = parquet_file.metadata
    
    print(f"‚úÖ File metadata loaded")
    print(f"   ‚Ä¢ Total rows: {metadata.num_rows:,}")
    print(f"   ‚Ä¢ Number of row groups: {metadata.num_row_groups}")
    print(f"   ‚Ä¢ Columns: {metadata.num_columns}")
    
    # Show column names
    print(f"\nüìã COLUMNS IN FILE:")
    schema = parquet_file.schema
    for i, col in enumerate(schema.names):
        print(f"   {i+1:2d}. {col}")
    
    # Check for required columns
    required_cols = ['embeddings', 'text_chunk', 'complaint_id', 'product_category']
    available_cols = schema.names
    missing_cols = [col for col in required_cols if col not in available_cols]
    
    if missing_cols:
        print(f"\n‚ùå MISSING REQUIRED COLUMNS: {missing_cols}")
    else:
        print(f"\n‚úÖ ALL REQUIRED COLUMNS PRESENT")
        
except Exception as e:
    print(f"‚ùå Error reading file metadata: {e}")


üìä LOADING EMBEDDINGS DATA - METADATA ONLY
üì• Reading file metadata...
‚ùå Error reading file metadata: name 'pq' is not defined


In [5]:
# %%
print("\nüíæ FAST CHROMADB INITIALIZATION")
print("=" * 50)

import time
import chromadb

# Use timestamp to create unique path
timestamp = int(time.time())
vector_store_path = f"vector_store_{timestamp}"

print(f"Creating vector store at: {vector_store_path}")

try:
    # Skip process scanning - just create new directory
    if os.path.exists(vector_store_path):
        # Try quick remove
        try:
            import shutil
            shutil.rmtree(vector_store_path, ignore_errors=True)
        except:
            pass  # Ignore errors, we'll create with different name if needed
    
    # Create directory
    os.makedirs(vector_store_path, exist_ok=True)
    
    # Initialize client
    client = chromadb.PersistentClient(path=vector_store_path)
    
    # Create collection
    collection = client.create_collection(
        name="financial_complaints",
        metadata={
            "hnsw:space": "cosine",
            "description": "Financial complaints database",
            "embedding_model": "all-MiniLM-L6-v2",
            "created": time.strftime("%Y-%m-%d %H:%M:%S")
        }
    )
    
    print(f"‚úÖ ChromaDB initialized in {time.time() - timestamp:.2f} seconds")
    print(f"   ‚Ä¢ Path: {os.path.abspath(vector_store_path)}")
    print(f"   ‚Ä¢ Collection: financial_complaints")
    
except Exception as e:
    print(f"‚ùå Error: {e}")
    
    # Emergency fallback: Use in-memory
    print("\nüîÑ Using in-memory store as fallback...")
    client = chromadb.Client()
    collection = client.create_collection(name="financial_complaints")
    print("‚úÖ Created in-memory store (temporary)")


üíæ FAST CHROMADB INITIALIZATION
Creating vector store at: vector_store_1768244751


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


‚úÖ ChromaDB initialized in 3.17 seconds
   ‚Ä¢ Path: d:\10 acadamy\Intelligent Complaint Analysis for Financial Services\notebooks\vector_store_1768244751
   ‚Ä¢ Collection: financial_complaints


In [6]:
# %%
print("\nüì• LOADING METADATA FILE")
print("=" * 50)

try:
    print("Loading complaint_metadata_full.parquet...")
    
    # Load the file (19MB is small enough to load all at once)
    df = pd.read_parquet(file_path)
    
    print(f"‚úÖ Successfully loaded!")
    print(f"   ‚Ä¢ Rows: {len(df):,}")
    print(f"   ‚Ä¢ Columns: {list(df.columns)}")
    print(f"   ‚Ä¢ Memory: {df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")
    
    # Show data types
    print(f"\nüîç DATA TYPES:")
    for col in df.columns:
        dtype = df[col].dtype
        sample = df[col].iloc[0] if len(df) > 0 else None
        print(f"   ‚Ä¢ {col}: {dtype} (sample: {str(sample)[:50] if sample else 'None'}...)")
    
    # Show first few rows
    print(f"\nüìÑ FIRST 3 ROWS:")
    print(df.head(3))
    
except Exception as e:
    print(f"‚ùå Error loading file: {e}")
    import traceback
    traceback.print_exc()
    df = None


üì• LOADING METADATA FILE
Loading complaint_metadata_full.parquet...
‚úÖ Successfully loaded!
   ‚Ä¢ Rows: 1,375,327
   ‚Ä¢ Columns: ['chunk_index', 'company', 'complaint_id', 'date_received', 'issue', 'product', 'product_category', 'state', 'sub_issue', 'total_chunks', 'id']
   ‚Ä¢ Memory: 892.23 MB

üîç DATA TYPES:
   ‚Ä¢ chunk_index: int64 (sample: None...)
   ‚Ä¢ company: object (sample: CITIBANK, N.A....)
   ‚Ä¢ complaint_id: object (sample: 14069121...)
   ‚Ä¢ date_received: object (sample: 2025-06-13...)
   ‚Ä¢ issue: object (sample: Getting a credit card...)
   ‚Ä¢ product: object (sample: Credit card...)
   ‚Ä¢ product_category: object (sample: Credit Card...)
   ‚Ä¢ state: object (sample: TX...)
   ‚Ä¢ sub_issue: object (sample: Card opened without my consent or knowledge...)
   ‚Ä¢ total_chunks: int64 (sample: 1...)
   ‚Ä¢ id: object (sample: 14069121_0...)

üìÑ FIRST 3 ROWS:
   chunk_index                company complaint_id date_received  \
0            0         CITIB

In [7]:
# %%
print("\nüîç ANALYZING COLUMNS FOR DOCUMENT CREATION")
print("=" * 50)

if df is not None:
    print("Looking for text columns to create documents...")
    
    # Check for narrative/text columns
    narrative_candidates = []
    for col in df.columns:
        if df[col].dtype == 'object':  # String columns
            # Check first non-null value
            non_null = df[col].dropna()
            if len(non_null) > 0:
                sample = non_null.iloc[0]
                if isinstance(sample, str) and len(sample.strip()) > 20:
                    narrative_candidates.append(col)
    
    print(f"‚úÖ Found potential text columns: {narrative_candidates}")
    
    # Show samples from candidate columns
    if narrative_candidates:
        print(f"\nüìù SAMPLES FROM CANDIDATE COLUMNS:")
        for col in narrative_candidates[:3]:  # Show first 3
            sample = df[col].dropna().iloc[0] if len(df[col].dropna()) > 0 else "No data"
            print(f"   ‚Ä¢ {col}: {str(sample)[:100]}...")
    
    # Check for complaint ID column
    id_candidates = [col for col in df.columns if 'id' in col.lower() or 'complaint' in col.lower()]
    print(f"\nüîë ID columns: {id_candidates}")
    
    # Check for product columns
    product_candidates = [col for col in df.columns if 'product' in col.lower()]
    print(f"üè∑Ô∏è Product columns: {product_candidates}")
    
    # Check for issue columns
    issue_candidates = [col for col in df.columns if 'issue' in col.lower()]
    print(f"‚ö†Ô∏è Issue columns: {issue_candidates}")
    
else:
    print("‚ö†Ô∏è No data loaded")


üîç ANALYZING COLUMNS FOR DOCUMENT CREATION
Looking for text columns to create documents...
‚úÖ Found potential text columns: ['issue', 'sub_issue']

üìù SAMPLES FROM CANDIDATE COLUMNS:
   ‚Ä¢ issue: Getting a credit card...
   ‚Ä¢ sub_issue: Card opened without my consent or knowledge...

üîë ID columns: ['complaint_id', 'id']
üè∑Ô∏è Product columns: ['product', 'product_category']
‚ö†Ô∏è Issue columns: ['issue', 'sub_issue']


In [8]:
# %%
print("\nü§ñ LOADING EMBEDDING MODEL")
print("=" * 50)

try:
    print("Loading sentence-transformers model...")
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    
    print(f"‚úÖ Model loaded successfully!")
    print(f"   ‚Ä¢ Model: all-MiniLM-L6-v2")
    print(f"   ‚Ä¢ Dimensions: {embedder.get_sentence_embedding_dimension()}")
    
    # Test the model
    test_text = "credit card complaint"
    test_embedding = embedder.encode(test_text)
    print(f"   ‚Ä¢ Test embedding shape: {test_embedding.shape}")
    
except Exception as e:
    print(f"‚ùå Error loading model: {e}")
    print("Installing sentence-transformers...")
    
    import sys
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "sentence-transformers", "-q"])
    
    from sentence_transformers import SentenceTransformer
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    print(f"‚úÖ Model installed and loaded")


ü§ñ LOADING EMBEDDING MODEL
Loading sentence-transformers model...
‚ùå Error loading model: name 'SentenceTransformer' is not defined
Installing sentence-transformers...
‚úÖ Model installed and loaded


In [9]:
# %%
print("\n‚ö° VECTORIZED DOCUMENT CREATION (FAST)")
print("=" * 50)

import time
start_time = time.time()

if df is not None:
    print(f"Creating documents from {len(df):,} rows using vectorized operations...")
    
    # Use vectorized string operations - MUCH faster
    # Create document text using pandas string operations
    
    # Start with empty string series
    doc_texts = pd.Series([""] * len(df), dtype=str)
    
    # Add product information
    if 'product' in df.columns:
        doc_texts = doc_texts + "Product: " + df['product'].fillna('Unknown').astype(str) + ". "
    elif 'product_category' in df.columns:
        doc_texts = doc_texts + "Product: " + df['product_category'].fillna('Unknown').astype(str) + ". "
    
    # Add issue information
    if 'issue' in df.columns:
        doc_texts = doc_texts + "Issue: " + df['issue'].fillna('Not specified').astype(str) + ". "
    
    # Add sub_issue if exists
    if 'sub_issue' in df.columns:
        # Filter out 'None' or empty sub_issues
        has_sub_issue = df['sub_issue'].notna() & (df['sub_issue'].astype(str).str.lower() != 'none')
        doc_texts = doc_texts + "Details: " + df['sub_issue'].where(has_sub_issue, '').astype(str) + ". "
    
    # Add company
    if 'company' in df.columns:
        doc_texts = doc_texts + "Company: " + df['company'].fillna('Unknown').astype(str) + ". "
    
    # Add state
    if 'state' in df.columns:
        doc_texts = doc_texts + "State: " + df['state'].fillna('Unknown').astype(str) + ". "
    
    # Convert to list
    documents = doc_texts.tolist()
    
    # Prepare metadata using list comprehension (faster than loop)
    print("Preparing metadata...")
    
    # Get columns that exist in dataframe
    key_fields = ['complaint_id', 'product', 'product_category', 'issue', 'sub_issue', 
                 'company', 'state', 'date_received']
    existing_fields = [field for field in key_fields if field in df.columns]
    
    # Create metadata using list comprehension
    metadatas = []
    for i in range(len(df)):
        metadata = {}
        for field in existing_fields:
            value = df[field].iloc[i]
            if pd.notna(value):
                metadata[field] = str(value)
        
        # Add complaint ID if not present
        if 'complaint_id' not in metadata:
            metadata['complaint_id'] = f"comp_{i}"
        
        metadata['row_index'] = str(i)
        metadatas.append(metadata)
    
    elapsed = time.time() - start_time
    print(f"‚úÖ Created {len(documents):,} documents in {elapsed:.1f} seconds")
    print(f"üìÑ Sample document: {documents[0][:150]}..." if documents else "No documents created")
    
else:
    print("‚ö†Ô∏è No data to process")
    documents = []
    metadatas = []


‚ö° VECTORIZED DOCUMENT CREATION (FAST)
Creating documents from 1,375,327 rows using vectorized operations...
Preparing metadata...
‚úÖ Created 1,375,327 documents in 378.0 seconds
üìÑ Sample document: Product: Credit card. Issue: Getting a credit card. Details: Card opened without my consent or knowledge. Company: CITIBANK, N.A.. State: TX. ...


In [10]:
# %%
print("\nüöÄ LOAD SMALL SAMPLE FOR TASK 3 (FAST)")
print("=" * 50)

print("For Task 3 RAG pipeline evaluation, you need only 1,000-10,000 documents.")
print("This will complete in MINUTES, not hours!")

if 'documents' in locals() and documents and 'collection' in globals() and collection is not None:
    # Take a small sample
    sample_size = 5000  # 5,000 documents is PLENTY for Task 3
    
    print(f"Loading {sample_size:,} documents (out of {len(documents):,})...")
    
    # Take random sample
    import random
    indices = random.sample(range(len(documents)), min(sample_size, len(documents)))
    sample_docs = [documents[i] for i in indices]
    sample_metas = [metadatas[i] for i in indices]
    
    # Process in one batch (5,000 is fine for one batch)
    print("Generating embeddings...")
    embeddings = embedder.encode(sample_docs, show_progress_bar=True)
    
    # Create IDs
    ids = [f"sample_doc_{i}" for i in range(len(sample_docs))]
    
    # Add to vector store
    print("Adding to vector store...")
    collection.add(
        embeddings=embeddings.tolist(),
        documents=sample_docs,
        metadatas=sample_metas,
        ids=ids
    )
    
    print(f"‚úÖ LOADED {len(sample_docs):,} DOCUMENTS!")
    print(f"üìä Vector store now has {collection.count():,} documents")
    
else:
    print("‚ùå Missing documents or collection")


üöÄ LOAD SMALL SAMPLE FOR TASK 3 (FAST)
For Task 3 RAG pipeline evaluation, you need only 1,000-10,000 documents.
This will complete in MINUTES, not hours!
Loading 5,000 documents (out of 1,375,327)...
Generating embeddings...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 157/157 [03:25<00:00,  1.31s/it]


Adding to vector store...


Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given


‚úÖ LOADED 5,000 DOCUMENTS!
üìä Vector store now has 5,000 documents


In [11]:
# %%
print("\nüîç TESTING VECTOR STORE")
print("=" * 50)

if 'collection' in globals() and collection and collection.count() > 0:
    print(f"üìä Vector store has {collection.count():,} documents")
    
    # Test queries
    test_queries = [
        "credit card unauthorized transaction",
        "personal loan application rejected",
        "bank account fees",
        "money transfer problem",
        "mortgage complaint"
    ]
    
    print("\nüß™ RUNNING TEST QUERIES:")
    
    for query in test_queries:
        print(f"\nüîç Query: '{query}'")
        
        try:
            results = collection.query(
                query_texts=[query],
                n_results=3,
                include=["documents", "metadatas", "distances"]
            )
            
            if results['documents'] and results['documents'][0]:
                print(f"   ‚úÖ Found {len(results['documents'][0])} results")
                
                # Show top result
                doc = results['documents'][0][0]
                meta = results['metadatas'][0][0]
                
                print(f"   üìÑ Document: {doc[:100]}...")
                print(f"   üè∑Ô∏è Product: {meta.get('product_category', meta.get('product', 'Unknown'))}")
                print(f"   üè¢ Company: {meta.get('company', 'Unknown')}")
                
                if results['distances'] and results['distances'][0]:
                    similarity = 1 - results['distances'][0][0]
                    print(f"   üìä Similarity: {similarity:.3f}")
            else:
                print(f"   ‚ö†Ô∏è No results found")
                
        except Exception as e:
            print(f"   ‚ùå Query error: {e}")
    
    print(f"\n‚úÖ Vector store is working correctly!")
    
else:
    print("‚ö†Ô∏è Vector store is empty or not accessible")
    if 'collection' in globals():
        print(f"   ‚Ä¢ Collection exists: {collection is not None}")
        if collection:
            print(f"   ‚Ä¢ Document count: {collection.count()}")


üîç TESTING VECTOR STORE
üìä Vector store has 5,000 documents

üß™ RUNNING TEST QUERIES:

üîç Query: 'credit card unauthorized transaction'


Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


   ‚úÖ Found 3 results
   üìÑ Document: Product: Credit card. Issue: Problem with a purchase shown on your statement. Details: Card was char...
   üè∑Ô∏è Product: Credit Card
   üè¢ Company: CITIBANK, N.A.
   üìä Similarity: 0.568

üîç Query: 'personal loan application rejected'
   ‚úÖ Found 3 results
   üìÑ Document: Product: Checking or savings account. Issue: Problem with a lender or other company charging your ac...
   üè∑Ô∏è Product: Savings Account
   üè¢ Company: Lending Club Corp
   üìä Similarity: 0.430

üîç Query: 'bank account fees'
   ‚úÖ Found 3 results
   üìÑ Document: Product: Checking or savings account. Issue: Managing an account. Details: Fee problem. Company: U.S...
   üè∑Ô∏è Product: Savings Account
   üè¢ Company: U.S. BANCORP
   üìä Similarity: 0.586

üîç Query: 'money transfer problem'
   ‚úÖ Found 3 results
   üìÑ Document: Product: Money transfer, virtual currency, or money service. Issue: Other transaction problem. Detai...
   üè∑Ô∏è Product: 

In [13]:
# %%
print("\nüíæ SAVING CONFIGURATION")
print("=" * 50)

try:
    # Remove emojis from the config file - they cause encoding issues
    config_content = f'''# VECTOR STORE CONFIGURATION
# Generated for Task 3 RAG Pipeline

import os

# Path Configuration
VECTOR_STORE_PATH = r"{os.path.abspath(vector_store_path)}"
COLLECTION_NAME = "financial_complaints"
SOURCE_FILE = r"{os.path.abspath(file_path)}"

# Statistics
TOTAL_DOCUMENTS = {collection.count() if 'collection' in globals() and collection else 0}
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
CREATED_AT = "{time.strftime("%Y-%m-%d %H:%M:%S")}"

# Functions
def get_vector_store():
    """
    Get the ChromaDB collection for RAG pipeline
    
    Returns:
        chromadb.Collection: The vector store collection
    """
    import chromadb
    client = chromadb.PersistentClient(path=VECTOR_STORE_PATH)
    return client.get_collection(COLLECTION_NAME)

def test_connection():
    """
    Test if vector store is accessible
    
    Returns:
        bool: True if successful, False otherwise
    """
    try:
        collection = get_vector_store()
        count = collection.count()
        print("SUCCESS: Connected to vector store")
        print(f"Documents: {{count:,}}")
        print(f"Location: {{VECTOR_STORE_PATH}}")
        return True
    except Exception as e:
        print(f"ERROR: Connection failed: {{e}}")
        return False

# Quick test
if __name__ == "__main__":
    print("=" * 50)
    print("VECTOR STORE CONFIGURATION")
    print("=" * 50)
    print(f"Path: {{VECTOR_STORE_PATH}}")
    print(f"Collection: {{COLLECTION_NAME}}")
    print(f"Documents: {{TOTAL_DOCUMENTS:,}}")
    
    if test_connection():
        print("\\nREADY: RAG Pipeline is ready!")
    else:
        print("\\nERROR: Configuration issue")
'''

    config_file = "rag_vector_config.py"
    with open(config_file, "w", encoding="utf-8") as f:
        f.write(config_content)
    
    print(f"SUCCESS: Configuration saved to: {config_file}")
    
    # Also save path to simple text file
    with open("vector_store_path.txt", "w", encoding="utf-8") as f:
        f.write(vector_store_path)
    
    print(f"\\nCONFIGURATION SUMMARY:")
    print(f"   ‚Ä¢ Vector store: {vector_store_path}")
    print(f"   ‚Ä¢ Documents: {collection.count() if 'collection' in globals() and collection else 0:,}")
    print(f"   ‚Ä¢ Source file: {os.path.basename(file_path)}")
    print(f"   ‚Ä¢ Config file: {config_file}")
    
except Exception as e:
    print(f"ERROR saving configuration: {e}")


üíæ SAVING CONFIGURATION
SUCCESS: Configuration saved to: rag_vector_config.py
\nCONFIGURATION SUMMARY:
   ‚Ä¢ Vector store: vector_store_1768244751
   ‚Ä¢ Documents: 5,000
   ‚Ä¢ Source file: complaint_metadata_full.parquet
   ‚Ä¢ Config file: rag_vector_config.py


In [15]:
# %%
print("\n" + "="*60)
print("VECTOR STORE CREATION COMPLETE")
print("="*60)

if 'collection' in globals() and collection:
    count = collection.count()
    
    if count > 0:
        print(f"\\nSUCCESS! Vector store created with {count:,} REAL documents.")
        print(f"\\nSTATISTICS:")
        print(f"   ‚Ä¢ Documents: {count:,}")
        print(f"   ‚Ä¢ Location: {vector_store_path}")
        print(f"   ‚Ä¢ Source: complaint_metadata_full.parquet")
        print(f"   ‚Ä¢ Embedding model: all-MiniLM-L6-v2")
        
        print(f"\\nFOR TASK 3 (RAG PIPELINE):")
        print(f"   1. Create new notebook: task3_rag.ipynb")
        print(f"   2. Start with:")
        print(f"   ")
        print(f"   ```python")
        print(f"   from rag_vector_config import get_vector_store")
        print(f"   ")
        print(f"   # Get the vector store")
        print(f"   collection = get_vector_store()")
        print(f"   print(f'Ready with {{collection.count():,}} documents')")
        print(f"   ")
        print(f"   # Now build your RAG system:")
        print(f"   # 1. Create retriever")
        print(f"   # 2. Design prompt templates")
        print(f"   # 3. Set up LLM generator")
        print(f"   # 4. Test with queries")
        print(f"   # 5. Create evaluation table")
        print(f"   ```")
        
        print(f"\\nTASK 3 REQUIREMENTS:")
        print(f"   ‚úì Vector store with real data")
        print(f"   ‚úì Semantic search working")
        print(f"   ‚Üí Build retriever component")
        print(f"   ‚Üí Create prompt templates")
        print(f"   ‚Üí Set up LLM (use test generator)")
        print(f"   ‚Üí Complete RAG pipeline")
        print(f"   ‚Üí Evaluation table (5-10 questions)")
        
        print(f"\\nIMPORTANT:")
        print(f"   ‚Ä¢ This uses REAL data from your metadata file")
        print(f"   ‚Ä¢ Perfect for Task 3 evaluation")
        print(f"   ‚Ä¢ Ready to build your RAG pipeline")
        
    else:
        print(f"\\nWARNING: Vector store created but empty")
        print(f"Check the loading process for errors")
else:
    print(f"\\nERROR: Vector store not created")
    print(f"Review error messages above")

print("="*60)


VECTOR STORE CREATION COMPLETE
\nSUCCESS! Vector store created with 5,000 REAL documents.
\nSTATISTICS:
   ‚Ä¢ Documents: 5,000
   ‚Ä¢ Location: vector_store_1768244751
   ‚Ä¢ Source: complaint_metadata_full.parquet
   ‚Ä¢ Embedding model: all-MiniLM-L6-v2
\nFOR TASK 3 (RAG PIPELINE):
   1. Create new notebook: task3_rag.ipynb
   2. Start with:
   
   ```python
   from rag_vector_config import get_vector_store
   
   # Get the vector store
   collection = get_vector_store()
   print(f'Ready with {collection.count():,} documents')
   
   # Now build your RAG system:
   # 1. Create retriever
   # 2. Design prompt templates
   # 3. Set up LLM generator
   # 4. Test with queries
   # 5. Create evaluation table
   ```
\nTASK 3 REQUIREMENTS:
   ‚úì Vector store with real data
   ‚úì Semantic search working
   ‚Üí Build retriever component
   ‚Üí Create prompt templates
   ‚Üí Set up LLM (use test generator)
   ‚Üí Complete RAG pipeline
   ‚Üí Evaluation table (5-10 questions)
\nIMPORTANT:
 