# Pinecone Vector Search Experiment

Testing vector similarity search on the Wikipedia dataset using Pinecone.

In [None]:
import os
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

# Load settings from a .env file (if one is present) into the environment.
load_dotenv()

# Connection settings; each is None when its environment variable is unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")

In [None]:
# Initialize clients
# Pinecone client authenticated with the API key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)
# Handle to the index that the search functions in this notebook query.
index = pc.Index(PINECONE_INDEX_NAME)

# Sentence-embedding model used to encode queries before searching.
# NOTE(review): its output dimension must match the index dimension printed
# below — confirm against the stats output.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Check index stats
# Print a short summary of the index as a sanity check before searching.
stats = index.describe_index_stats()
print(f"Index: {PINECONE_INDEX_NAME}")
print(f"Vector count: {stats.total_vector_count:,}")
print(f"Dimension: {stats.dimension}")
print(f"Index fullness: {stats.index_fullness:.2%}")

In [None]:
def vector_search(query: str, top_k: int = 5):
    """Embed ``query`` and return its nearest neighbours from the index.

    Args:
        query: Free-text search string.
        top_k: Maximum number of matches to request.

    Returns:
        The response of ``index.query``, with metadata included per match.
    """
    # Encode with the notebook-level model and convert the result to a plain
    # list before handing it to the Pinecone client.
    embedding = model.encode(query).tolist()
    return index.query(vector=embedding, top_k=top_k, include_metadata=True)

In [None]:
def display_results(results, preview_chars: int = 300):
    """Print search results in a readable format.

    Args:
        results: Query response exposing a ``matches`` sequence. Each match
            must provide ``score`` and ``metadata`` (a mapping or ``None``).
        preview_chars: Maximum number of characters of the ``text`` metadata
            field to print for each match.
    """
    matches = results.matches
    print(f"Found {len(matches)} results\n")
    print("=" * 80)

    for i, match in enumerate(matches, 1):
        # A match may carry no metadata at all; treat that as an empty
        # mapping instead of failing on ``None.get``.
        metadata = match.metadata or {}

        print(f"\n[{i}] Score: {match.score:.4f}")
        print(f"Title: {metadata.get('title', 'N/A')}")
        print(f"Chunk: {metadata.get('chunk_index', 'N/A')}")
        print(f"Text length: {metadata.get('text_length', 'N/A')}")
        print("\nText preview:")

        raw_text = metadata.get('text')
        # Coerce to str so slicing is safe even if the field is not a string.
        text = 'N/A' if raw_text is None else str(raw_text)
        # Only append an ellipsis when the text was actually cut short.
        if len(text) > preview_chars:
            print(text[:preview_chars] + "...")
        else:
            print(text)
        print("-" * 80)

## Test Queries

Let's test various search queries:

In [None]:
# Test 1: General knowledge query
query = "artificial intelligence and machine learning"
print("Query:", query, end="\n\n")

# Fetch the five closest chunks and print them.
results = vector_search(query=query, top_k=5)
display_results(results)

In [None]:
# Test 2: Historical query
query = "ancient civilizations and archaeology"
print("Query:", query, end="\n\n")

# Fetch the five closest chunks and print them.
results = vector_search(query=query, top_k=5)
display_results(results)

In [None]:
# Test 3: Named-entity query (a question about a specific person)
query = "Who was Alexander Obolensky?"
print("Query:", query, end="\n\n")

# Fetch the five closest chunks and print them.
results = vector_search(query=query, top_k=5)
display_results(results)

In [None]:
# Test hybrid reranking
import re
from types import SimpleNamespace


def hybrid_rerank(query: str, top_k: int = 5, vector_weight: float = 0.7,
                  candidate_multiplier: int = 4):
    """Re-rank vector search results by blending in query/text term overlap.

    Defined in this cell because the cell calls it. It follows the
    client-side approach described in this notebook: get vector results,
    then re-rank them based on text overlap.

    Args:
        query: Free-text search string.
        top_k: Number of re-ranked matches to return.
        vector_weight: Weight in [0, 1] given to the vector score; the text
            overlap score receives ``1 - vector_weight``.
        candidate_multiplier: How many times ``top_k`` candidates to fetch
            from the vector search before re-ranking.

    Returns:
        An object with a ``matches`` list, compatible with
        ``display_results``. Each match has ``id``, ``score`` (the blended
        score), ``metadata``, ``vector_score`` and ``text_score``.

    Raises:
        ValueError: If ``vector_weight`` is outside [0, 1].
    """
    if not 0.0 <= vector_weight <= 1.0:
        raise ValueError("vector_weight must be between 0 and 1")

    query_terms = set(re.findall(r"\w+", query.lower()))

    # Over-fetch so that re-ranking can promote matches that the pure vector
    # ranking placed below the top_k cut-off.
    candidates = vector_search(query, top_k=top_k * max(1, candidate_multiplier))

    reranked = []
    for match in candidates.matches:
        metadata = match.metadata or {}
        text_terms = set(re.findall(r"\w+", str(metadata.get('text', '')).lower()))

        # Fraction of distinct query terms that occur in the chunk text.
        text_score = len(query_terms & text_terms) / len(query_terms) if query_terms else 0.0

        # NOTE(review): assumes the vector score is on a scale comparable to
        # [0, 1] (e.g. cosine similarity) — confirm the index metric.
        blended = vector_weight * match.score + (1.0 - vector_weight) * text_score

        # Build fresh objects rather than mutating the client's response.
        reranked.append(SimpleNamespace(
            id=getattr(match, "id", None),
            score=blended,
            metadata=metadata,
            vector_score=match.score,
            text_score=text_score,
        ))

    reranked.sort(key=lambda m: m.score, reverse=True)
    return SimpleNamespace(matches=reranked[:top_k])


query = "Who was Alexander Obolensky?"
print(f"Query: {query}")
print("Using client-side hybrid reranking (70% vector, 30% text overlap)\n")

results = hybrid_rerank(query, top_k=5, vector_weight=0.7)
display_results(results)

### Client-Side Hybrid Approach

Since Pinecone doesn't have BM25, we can simulate hybrid search by:
1. Getting vector results
2. Re-ranking them by a weighted blend of the vector score and the query/text overlap (e.g. 70% vector, 30% text overlap)

In [None]:
def filtered_search(query: str, title_filter: str = None, top_k: int = 5):
    """Run a vector search, optionally restricted to one exact title.

    Args:
        query: Free-text search string.
        title_filter: When truthy, sent as an ``$eq`` filter on the ``title``
            metadata field; otherwise no filter is applied.
        top_k: Maximum number of matches to request.

    Returns:
        The response of ``index.query``, with metadata included per match.
    """
    # A falsy title (None or "") means "no filter".
    metadata_filter = {"title": {"$eq": title_filter}} if title_filter else None

    return index.query(
        vector=model.encode(query).tolist(),
        top_k=top_k,
        include_metadata=True,
        filter=metadata_filter,
    )

In [None]:
# Filtered search: restrict matches to a single article via an exact title filter.
# NOTE(review): "Diferent" in the query looks like a typo — confirm whether it is intentional.
query = "What is a Diferent Light about?"
print("Query:", query, end="\n\n")

results = filtered_search(query, title_filter="A Different Light (bookstore)", top_k=10)
display_results(results)

In [None]:
# Exact-title filtering needs the precise title string, for example:
#   results = filtered_search(query, title_filter="Alexander Obolensky", top_k=5)
#
# This cell runs a plain (unfiltered) vector search for rugby-related content,
# so the titles in its output can be used as filter values afterwards.
query = "rugby player history"
print("\nFiltered search example - Query: " + query)
print("Note: This requires knowing the exact title. Use vector_search first to find titles.\n")

results = vector_search(query=query, top_k=5)
display_results(results)

## Filtered Search (Metadata Filtering)

Pinecone doesn't have native hybrid search like Elasticsearch, but we can filter by metadata, as the `filtered_search` cells above do with an exact-match (`$eq`) filter on `title`.

## Performance Testing

Let's measure query latency:

In [None]:
import time

# Test queries
test_queries = [
    "artificial intelligence",
    "climate change",
    "space exploration",
    "medieval history",
    "modern architecture",
]

# Per-query latency in milliseconds. Each measurement covers the whole
# vector_search call: encoding the query plus the index query.
latencies = []

for query in test_queries:
    # perf_counter is monotonic and high-resolution, so it is the right clock
    # for intervals; time.time() can jump when the system clock is adjusted.
    start = time.perf_counter()
    results = vector_search(query, top_k=10)
    latency = (time.perf_counter() - start) * 1000  # Convert to ms
    latencies.append(latency)
    print(f"Query: '{query}' - Latency: {latency:.2f}ms")

print(f"\nAverage latency: {sum(latencies) / len(latencies):.2f}ms")
print(f"Min latency: {min(latencies):.2f}ms")
print(f"Max latency: {max(latencies):.2f}ms")