In [None]:
# Import the OpenEmbeddings library
import os
import sys
import tempfile

sys.path.append('..')

from openembeddings.models.dense_embedder import DenseEmbedder
from openembeddings.models.sparse_embedder import SparseEmbedder
from openembeddings.models.hybrid_retriever import HybridRetriever
from openembeddings.models.reranker import ReRanker

# Reaching this point means the imports above succeeded; print a short
# summary of the four components the rest of the notebook demonstrates.
banner_lines = (
    "OpenEmbeddings library imported successfully!",
    "Available components:",
    "- DenseEmbedder: State-of-the-art sentence transformers",
    "- SparseEmbedder: BM25-based lexical search",
    "- HybridRetriever: Advanced fusion of dense and sparse retrieval",
    "- ReRanker: Cross-encoder re-ranking for precision",
)
print(*banner_lines, sep="\n")


In [None]:
# Small in-memory corpus that the retrieval, re-ranking and save/load cells
# all operate on.
documents = [
    "Machine learning is a subset of artificial intelligence that focuses on algorithms.",
    "Deep learning uses neural networks with multiple layers to learn complex patterns.",
    "Natural language processing enables computers to understand and generate human language.",
    "Computer vision allows machines to interpret and understand visual information.",
    "Reinforcement learning trains agents to make decisions through trial and error.",
    "Python is a popular programming language widely used in data science and AI.",
    "TensorFlow and PyTorch are leading frameworks for deep learning development.",
    "Transformers have revolutionized natural language processing and understanding."
]

corpus_size = len(documents)
print(f"Created corpus with {corpus_size} documents")

# Show a numbered preview (first 60 characters) of every document.
for position, text in enumerate(documents, start=1):
    preview = text[:60]
    print(f"{position}. {preview}...")

# Single query reused by the later cells.
query = "neural networks and deep learning"
print(f"\nQuery: '{query}'")


In [None]:
# Demonstrate Hybrid Retrieval with different fusion strategies
def _run_fusion_demo(heading, **fusion_kwargs):
    """Build, index and query one HybridRetriever, printing its top-3 hits.

    Both strategy demos share the same steps, so they are expressed once
    here and differ only in the keyword arguments passed to the retriever.

    Args:
        heading: Line printed before the retriever is built.
        **fusion_kwargs: Extra keyword arguments forwarded unchanged to
            ``HybridRetriever`` (e.g. ``fusion_strategy``, ``dense_weight``).

    Returns:
        A ``(retriever, results)`` pair so later cells can reuse both.
    """
    print(heading)
    retriever = HybridRetriever(
        dense_model="hashing-encoder",  # Fast model for demo
        use_ann=False,  # Disable ANN for consistent results
        **fusion_kwargs
    )
    retriever.index(documents)
    results = retriever.retrieve(query, top_k=3)

    # Each hit unpacks as a 3-tuple; the first element is not displayed.
    for rank, (_idx, score, doc) in enumerate(results, 1):
        print(f"   {rank}. {doc[:70]}... (score: {score:.4f})")
    return retriever, results


print("=== Hybrid Retrieval Demo ===\n")

# Test Linear Fusion: explicit 0.7 / 0.3 weighting of dense vs. sparse.
retriever_linear, results_linear = _run_fusion_demo(
    "1. Linear Fusion Strategy:",
    fusion_strategy="linear",
    dense_weight=0.7,
    sparse_weight=0.3,
)

# Test RRF: no weights are passed. `retriever_rrf` and `results_rrf` are
# reused by the re-ranking and save/load cells below.
retriever_rrf, results_rrf = _run_fusion_demo(
    "\n2. RRF (Reciprocal Rank Fusion) Strategy:",
    fusion_strategy="rrf",
)


In [None]:
# Cross-encoder re-ranking applied to the RRF hits from the previous cell.
def _show_hits(hits):
    """Print every hit as a numbered 70-character preview with its score."""
    for rank, (_idx, relevance, text) in enumerate(hits, start=1):
        print(f"   {rank}. {text[:70]}... (score: {relevance:.4f})")


print("=== Cross-Encoder Re-ranking Demo ===\n")

# Use a lightweight cross-encoder for demo
reranker = ReRanker(model_name="cross-encoder/ms-marco-TinyBERT-L-2-v2")

# Before: the hits exactly as the RRF retriever returned them.
print("Original RRF results:")
_show_hits(results_rrf)

# After: the same hits passed through the cross-encoder together with the query.
print("\nAfter cross-encoder re-ranking:")
reranked_results = reranker.rerank(query, results_rrf)
_show_hits(reranked_results)

print("\n✓ Re-ranking complete! Notice how the scores and potentially the order have changed.")


In [None]:
# Save the trained retriever, load it back, and query the loaded copy.
# `os` and `tempfile` come from the notebook's top import cell, so this cell
# carries no imports of its own.
# The temporary directory (and everything saved into it) is deleted when the
# `with` block exits, so the round trip leaves nothing behind on disk.
with tempfile.TemporaryDirectory() as tmpdir:
    save_path = os.path.join(tmpdir, "my_retriever")

    print(f"Saving retriever to: {save_path}")
    retriever_rrf.save_pretrained(save_path)

    print("✓ Retriever saved successfully!")

    # Load the retriever
    print("\nLoading retriever...")
    loaded_retriever = HybridRetriever.from_pretrained(save_path)

    print("✓ Retriever loaded successfully!")

    # Test the loaded retriever with the same query used earlier.
    print("\nTesting loaded retriever:")
    test_results = loaded_retriever.retrieve(query, top_k=2)

    # Each hit unpacks as a 3-tuple; the first element is not displayed.
    for i, (_idx, score, doc) in enumerate(test_results, 1):
        print(f"   {i}. {doc[:70]}... (score: {score:.4f})")

    print("\n✓ Loaded retriever works perfectly!")


In [None]:
# Demonstrate CLI usage
# The cheat-sheet is static text, so it is kept as data and printed line by
# line. An empty string produces the same blank line as a bare print().
cli_cheatsheet = (
    "=== Command Line Interface Examples ===\n",
    # --- basic commands ---
    "Basic CLI commands:",
    "1. Encode texts:",
    "   python -m openembeddings encode 'Hello world' 'This is a test'",
    "",
    "2. Build and search an index:",
    "   python -m openembeddings search 'machine learning' \\",
    "     --index-path ./my_index \\",
    "     'ML is great' 'Deep learning rocks' 'Python programming'",
    "",
    "3. Enable re-ranking:",
    "   python -m openembeddings search 'neural networks' \\",
    "     --index-path ./my_index --rerank \\",
    "     'Neural networks are powerful' 'AI is the future'",
    "",
    # --- research tooling ---
    "Advanced research CLI:",
    "4. Launch research interface:",
    "   python -m openembeddings research",
    "",
    "5. Advanced research commands (when dependencies are available):",
    "   python -m openembeddings.advanced_cli benchmark --dataset scifact",
    "   python -m openembeddings.advanced_cli experiment --config experiments.json",
    "   python -m openembeddings.advanced_cli optimize --time-budget 3600",
    "\n✓ Full CLI documentation available via --help on any command",
)

for cheatsheet_line in cli_cheatsheet:
    print(cheatsheet_line)
