# 🔬 SENTINEL - RAG Proof of Concept with Synthetic Data

**Objective**: Demonstrate a working RAG pipeline for insider-trading compliance.

**What we'll build:**
1. Load synthetic transaction data
2. Create text descriptions from transactions
3. Generate embeddings and store in ChromaDB
4. Build Q&A system using Ollama LLM
5. Evaluate retrieval quality
6. Log experiment to MLflow

**Success Criteria:**
- Retrieval precision@5 > 80%
- Processing time < 5 seconds/query
- Answers are relevant and accurate

## 📦 Setup & Imports

In [None]:
# Standard library
import sys
from pathlib import Path
import time
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation/runtime warnings raised by the
# libraries imported below.
warnings.filterwarnings('ignore')

# Add <cwd>/../../src to the import path so the project imports below resolve
# (presumably where the `sentinel` package lives - confirm). The path is derived
# from the current working directory, so it depends on where the notebook runs.
sys.path.append(str(Path.cwd().parent.parent / 'src'))

# Data science
import pandas as pd
import numpy as np

# Project modules: data loading, data validation and the RAG building blocks
from sentinel.data.loaders import TransactionDataLoader
from sentinel.data.validation import validate_transaction_data
from sentinel.models.rag import DocumentProcessor, EmbeddingManager, RAGPipeline

# MLflow tracking: point the client at the tracking server on localhost:5000
# and select the experiment used by the logging cell of this notebook.
import mlflow
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("rag-poc-synthetic")

# Confirmation output, including the directory the relative paths resolve against.
print("‚úÖ All imports successful!")
print(f"üìÅ Working directory: {Path.cwd()}")

## 📊 Step 1: Load Synthetic Transaction Data

In [None]:
# Load the synthetic transactions through the project loader's
# `load_latest_synthetic()`.
loader = TransactionDataLoader()
df = loader.load_latest_synthetic()

# Report row count, dimensions and column names. The leading "\n" in the last
# two messages produces the blank separator lines in the output.
row_count = len(df)
column_names = list(df.columns)
print(f"Loaded {row_count} transactions")
print(f"\nData shape: {df.shape}", f"\nColumns: {column_names}", sep="\n")

# Trailing expression so the notebook renders the first rows as the cell output.
df.head()

In [None]:
# Run the project's validation over the loaded frame.
# NOTE(review): `validated_df` is not referenced again in this cell; the
# statistics below are computed from `df` - confirm that this is intended.
validated_df = validate_transaction_data(df)

# Split counts/percentages by the `is_suspicious` flag.
suspicious_mask = df['is_suspicious']
normal_mask = ~suspicious_mask

print("üìä Data Summary:")
print(f"Normal transactions: {normal_mask.sum()} ({normal_mask.mean()*100:.1f}%)")
print(f"Suspicious transactions: {suspicious_mask.sum()} ({suspicious_mask.mean()*100:.1f}%)")

# Frequency of each violation type among the flagged rows only.
violation_counts = df[suspicious_mask]['violation_type'].value_counts()
print("\nViolation types:")
print(violation_counts)

## 📝 Step 2: Create Text Descriptions from Transactions

Convert each structured transaction record into a natural-language description that the RAG pipeline can embed and retrieve.

In [None]:
def transaction_to_text(row) -> str:
    """Render one transaction record as an Indonesian text description.

    Args:
        row: Mapping-like record supporting ``row[key]`` and ``row.get(key)``
            (for example the Series passed by ``df.apply(..., axis=1)`` or a
            plain dict). Reads the keys ``action``, ``insider_role``,
            ``company``, ``insider_name``, ``date``, ``volume``, ``price``,
            ``total_value``, ``days_to_earnings`` and ``is_suspicious``; for
            suspicious rows also ``violation_type`` and, optionally,
            ``reason``.

    Returns:
        The description with surrounding whitespace stripped. Suspicious
        rows end with a status section listing the violation type and the
        reason; all other rows end with a single "normal" status line.
    """

    def _or_na(value):
        # A DataFrame row always carries the key, so a `.get` default alone
        # never fires for an empty cell; treat None and float NaN as missing
        # so the text does not contain a literal "nan"/"None".
        is_missing = value is None or (isinstance(value, float) and value != value)
        return 'N/A' if is_missing else value

    text = f"""
Transaksi {row['action']} oleh {row['insider_role']} di perusahaan {row['company']}.
Nama insider: {row['insider_name']}
Tanggal transaksi: {row['date']}
Volume: {row['volume']:,} saham
Harga: Rp {row['price']:,}
Total nilai: Rp {row['total_value']:,}
Jarak ke pengumuman earnings: {row['days_to_earnings']} hari
"""

    if row['is_suspicious']:
        violation_type = _or_na(row['violation_type'])
        # `reason` may be absent altogether; `.get` then yields None -> 'N/A'.
        reason = _or_na(row.get('reason'))
        text += f"""
‚ö†Ô∏è STATUS: SUSPICIOUS
Jenis pelanggaran: {violation_type}
Alasan: {reason}
"""
    else:
        text += "‚úÖ STATUS: NORMAL\n"

    # Drop the leading/trailing newlines introduced by the triple-quoted blocks.
    return text.strip()

# Render every row to text and store the result in a new column.
df['text_description'] = df.apply(transaction_to_text, axis=1)

# Preview the first generated description between two 60-character rulers.
ruler = "=" * 60
first_description = df['text_description'].iloc[0]
print("Sample text description:")
print(ruler, first_description, ruler, sep="\n")

## 🔨 Step 3: Process Documents & Create Embeddings

In [None]:
# Chunking configuration; the sizes are kept small because the inputs are
# short transaction descriptions.
doc_processor = DocumentProcessor(chunk_size=300, chunk_overlap=50)

# Row fields copied unchanged into each document's metadata, in this order.
metadata_fields = ('transaction_id', 'company', 'is_suspicious', 'action', 'date')

# One document per transaction: the generated text plus its metadata.
documents = [
    {
        'text': record['text_description'],
        'metadata': {field: record[field] for field in metadata_fields},
    }
    for _, record in df.iterrows()
]

# Hand the documents to the processor, which returns the chunk list.
processed_docs = doc_processor.process_documents(documents)

print(f"‚úÖ Processed {len(documents)} transactions into {len(processed_docs)} chunks")

In [None]:
# Embedding manager configured with the sentence-transformers MiniLM model.
embedding_manager = EmbeddingManager(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Directory handed to `create_vectorstore` as `persist_directory`
# (relative, so it resolves against the current working directory).
vectorstore_path = "../../data/processed/embeddings/rag_poc_synthetic"

print("Creating embeddings and vectorstore...")
# Use the monotonic, high-resolution perf_counter() for interval timing:
# time.time() follows the wall clock and can jump if the system clock is
# adjusted while the embeddings are being built.
start_time = time.perf_counter()

vectorstore = embedding_manager.create_vectorstore(
    documents=processed_docs,
    persist_directory=vectorstore_path
)

# Elapsed seconds; `elapsed_time` is read again by the MLflow logging cell.
elapsed_time = time.perf_counter() - start_time
print(f"‚úÖ Vectorstore created in {elapsed_time:.2f} seconds")
print(f"üìç Saved to: {vectorstore_path}")

## 🤖 Step 4: Initialize RAG Pipeline

In [None]:
# Generation and retrieval settings passed to the pipeline as keyword arguments.
rag_settings = {
    "llm_model": "llama3.1:8b-instruct-q4_K_M",
    "temperature": 0.1,
    "top_k": 5,
}

# Build the RAG pipeline on top of the vectorstore created in the previous step.
rag = RAGPipeline(vectorstore=vectorstore, **rag_settings)

print("‚úÖ RAG Pipeline ready!")

## 🎯 Step 5: Test Q&A System

In [None]:
# Test queries (Indonesian), each with an English gloss.
test_queries = [
    "Berapa banyak transaksi suspicious yang ditemukan?",  # How many suspicious transactions were found?
    "Apa saja jenis pelanggaran yang terdeteksi?",  # Which violation types were detected?
    "Perusahaan mana yang paling banyak memiliki transaksi suspicious?",  # Which company has the most suspicious transactions?
    "Apa itu quiet period violation?",  # What is a quiet period violation?
    "Transaksi apa yang dilakukan dekat dengan pengumuman earnings?"  # Which transactions happened close to an earnings announcement?
]

print("üîç Testing RAG Q&A System:")
print("=" * 80)

for i, query in enumerate(test_queries, 1):
    print(f"\n[Query {i}] {query}")

    # Time only the answer generation. perf_counter() is monotonic, so the
    # reported latency cannot be distorted by wall-clock adjustments the way
    # a time.time() difference can.
    start_time = time.perf_counter()
    result = rag.generate_answer(query)
    elapsed = time.perf_counter() - start_time

    # `result` is indexed by 'answer' and 'num_sources'.
    print(f"\n[Answer] {result['answer']}")
    print(f"\nüìö Sources: {result['num_sources']} documents | ‚è±Ô∏è Time: {elapsed:.2f}s")
    print("-" * 80)

## 📈 Step 6: Evaluate Retrieval Quality

In [None]:
# Inspect what the retriever returns for a handful of short search phrases.
# NOTE(review): this cell only prints the retrieved documents; it does not
# compute a precision@k figure.
print("üß™ Evaluating retrieval quality...\n")

# Short phrases used to probe semantic search.
eval_queries = [
    "transaksi mencurigakan",
    "quiet period",
    "volume tidak normal",
    "pelanggaran insider trading"
]

# Number of top results printed per query.
preview_limit = 3

for query in eval_queries:
    docs = rag.retrieve_documents(query)

    print(f"Query: '{query}'")
    print(f"Retrieved {len(docs)} documents:")

    for rank, doc in enumerate(docs[:preview_limit], start=1):
        # Metadata keys may be missing on a chunk, hence the 'N/A' fallbacks.
        company = doc.metadata.get('company', 'N/A')
        suspicious = doc.metadata.get('is_suspicious', 'N/A')
        preview = doc.page_content[:100]
        print(f"  {rank}. Company: {company}, Suspicious: {suspicious}")
        print(f"     Preview: {preview}...")
    print()

## 📊 Step 7: Log Experiment to MLflow

In [None]:
# Record this run's configuration, metrics and data profile in MLflow.
with mlflow.start_run(run_name="rag-poc-synthetic-v1"):

    # Parameters: model names, chunking/retrieval settings and dataset sizes,
    # sent as one batch.
    mlflow.log_params({
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "llm_model": "llama3.1:8b-instruct-q4_K_M",
        "chunk_size": 300,
        "chunk_overlap": 50,
        "top_k": 5,
        "num_transactions": len(df),
        "num_documents": len(processed_docs),
    })

    # Metrics: share of flagged rows and the embedding time measured earlier.
    mlflow.log_metrics({
        "suspicious_ratio": df['is_suspicious'].mean(),
        "embedding_time_sec": elapsed_time,
    })

    # Data profile: violation types among flagged rows, plus the ten most
    # frequent companies, stored as a JSON artifact.
    flagged_rows = df[df['is_suspicious']]
    violation_distribution = flagged_rows['violation_type'].value_counts().to_dict()
    company_distribution = df['company'].value_counts().head(10).to_dict()
    mlflow.log_dict(
        {
            "violation_distribution": violation_distribution,
            "company_distribution": company_distribution,
        },
        "data_distribution.json",
    )

    print("‚úÖ Experiment logged to MLflow")
    print("üìä View at: http://localhost:5000")

## 🎯 Summary & Next Steps

In [None]:
# Closing banner: recap what the notebook produced and list the planned
# follow-up work. The lines are collected first and emitted with one print.
banner = "=" * 80
summary_lines = [
    banner,
    "üéâ RAG POC COMPLETE!",
    banner,
    "",
    "‚úÖ Achievements:",
    f"  - Processed {len(df)} transactions",
    f"  - Created {len(processed_docs)} document chunks",
    "  - Built working RAG pipeline",
    "  - Embeddings stored in ChromaDB",
    "  - Experiment logged to MLflow",
    "",
    "üìà Next Steps:",
    "  1. Collect real POJK PDFs (Week 1)",
    "  2. Scrape news articles (Week 1)",
    "  3. Build RAG with real documents (Week 2)",
    "  4. Improve retrieval accuracy (Week 2)",
    "  5. Deploy as API (Week 3)",
    "",
    "üõ°Ô∏è SENTINEL Foundation is SOLID! üöÄ",
]
print("\n".join(summary_lines))