## Step 1: Install required libraries

In [None]:
!pip install -qU datasets tqdm llama-index llama-index-llms-openai llama-index-vector-stores-mongodb pymongo arize-phoenix "openai>=1" "openinference-instrumentation-llama-index>=2.0.0"

## Step 2: Set up prerequisites

In [None]:
import os
import getpass
from pymongo import MongoClient

In [None]:
# Prompt for the OpenAI API key (getpass does not echo the input) and store it
# in the process environment under OPENAI_API_KEY.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [None]:
# Prompt for the MongoDB connection string without echoing it to the output.
MONGODB_URI = getpass.getpass("Enter your MongoDB URI: ")
# Client shared by the rest of the notebook; `appname` tags the connection
# with an identifier for this notebook.
mongodb_client = MongoClient(
    MONGODB_URI, appname="devrel.content.retrieval_strategies_llamaindex"
)

## Step 3: Setup tracing

In [None]:
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
import phoenix as px
import nest_asyncio

nest_asyncio.apply()

In [None]:
# Launch the Phoenix app for this notebook session.
px.launch_app()

In [None]:
# Trace endpoint on localhost port 6006
# (presumably the Phoenix app launched above -- confirm the port).
endpoint = "http://127.0.0.1:6006/v1/traces"

# Build the export pipeline step by step: exporter -> processor -> provider.
span_exporter = OTLPSpanExporter(endpoint)
span_processor = SimpleSpanProcessor(span_exporter)
tracer_provider = TracerProvider()
tracer_provider.add_span_processor(span_processor)

# Instrument LlamaIndex so its spans go through the provider configured above.
LlamaIndexInstrumentor().instrument(tracer_provider=tracer_provider)

## Step 4: Load dataset

In [None]:
from datasets import load_dataset
import pandas as pd
from llama_index.core import Document

In [None]:
# Open the train split in streaming mode rather than loading it eagerly.
data = load_dataset("BeIR/scifact-generated-queries", split="train", streaming=True)
# Take the first 1000 rows of the stream
data_head = data.take(1000)
# Materialize those rows as a DataFrame.
df = pd.DataFrame(data_head)

In [None]:
# Extract the first 500 queries (taken before the "query" column is dropped)
queries = df["query"].tolist()[0:500]

In [None]:
# Drop the "_id" and "query" columns, then collapse the rows that became
# identical once those columns were removed.
df = df.drop(columns=["_id", "query"]).drop_duplicates()

In [None]:
# Wrap each remaining row's "text" value in a LlamaIndex Document.
documents = [Document(text=text) for text in df["text"]]

In [None]:
# Display the first Document as a sanity check.
documents[0]

In [None]:
# Number of Documents built from the de-duplicated rows.
len(documents)

## Step 5: Create MongoDB Atlas Vector Store

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.core.settings import Settings
from llama_index.core import VectorStoreIndex, StorageContext
from pymongo.operations import SearchIndexModel

In [None]:
# Global LlamaIndex settings used when the documents are indexed below.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
# Chunking parameters; presumably measured in tokens -- confirm against the
# LlamaIndex Settings documentation.
Settings.chunk_size = 200
Settings.chunk_overlap = 30

In [None]:
# Names of the vector search and full-text search indexes.
VS_INDEX_NAME = "vector_index"
FTS_INDEX_NAME = "fts_index"
# Database and collection that hold the ingested documents.
DB_NAME = "llamaindex"
COLLECTION_NAME = "retrieval_comp"
collection = mongodb_client[DB_NAME][COLLECTION_NAME]
# Delete any existing documents from the collection so re-runs start clean
collection.delete_many({})

In [None]:
# Vector store backed by the collection above; it is given the names of both
# the vector index and the full-text index.
vector_store = MongoDBAtlasVectorSearch(
    mongodb_client,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    vector_index_name=VS_INDEX_NAME,
    fulltext_index_name=FTS_INDEX_NAME,
)

# Route index storage through the MongoDB vector store.
vector_store_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index from the documents, showing a progress bar while it runs.
vector_store_index = VectorStoreIndex.from_documents(
    documents,
    storage_context=vector_store_context,
    show_progress=True,
)

## Step 6: Create Atlas Search Indexes

In [None]:
# Definition of the vector search index: cosine similarity over the
# 1536-dimensional "embedding" field.
# NOTE(review): 1536 must equal the output size of the embedding model set in
# Settings.embed_model, and "embedding" must be the field the vector store
# writes to -- confirm both.
vs_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 1536,
                "similarity": "cosine",
            }
        ]
    },
    name=VS_INDEX_NAME,
    type="vectorSearch"
)

In [None]:
# Definition of the full-text search index. Dynamic mapping is off, so only
# the explicitly listed "text" field is indexed (as a string).
# NOTE(review): assumes the vector store writes chunk text to "text" -- confirm.
fts_model = SearchIndexModel(
    definition={
        "mappings": {
            "dynamic": False,
            "fields": {
                "text": {"type": "string"}
            }
        }
    },
    name=FTS_INDEX_NAME,
    type="search"
)

In [None]:
# Create both search indexes on the collection in one call.
# NOTE(review): index builds may complete asynchronously; confirm both indexes
# are queryable before running the evaluation below.
collection.create_search_indexes(models=[vs_model, fts_model])

## Step 7: Evaluate retrieval strategies

In [None]:
import numpy as np
from sklearn.metrics import ndcg_score
from tqdm.auto import tqdm
from phoenix.trace import using_project
from phoenix.session.evaluation import get_retrieved_documents
from phoenix.evals import OpenAIModel, RelevanceEvaluator, run_evals

In [None]:
# Values passed as `vector_store_query_mode` when building each query engine.
MODES = ["default", "text_search", "hybrid"]
# Number of documents to retrieve per query.
TOP_K = 5
# Relevance evaluator backed by an OpenAI model; reused for every mode.
RELEVANCE_EVALUATOR = RelevanceEvaluator(OpenAIModel(model="gpt-4o-2024-08-06"))

In [None]:
def run_eval(retrieved_documents):
    """Evaluate retrieved documents for relevance and attach their retrieval scores.

    Args:
        retrieved_documents: DataFrame of retrieved documents; it must contain
            a ``document_score`` column.

    Returns:
        DataFrame with the relevance evaluator's output columns prefixed with
        ``eval_`` alongside the ``document_score`` column of the input.
    """
    # Report how many retrieved documents are about to be evaluated.
    print(len(retrieved_documents))
    eval_frames = run_evals(
        dataframe=retrieved_documents,
        evaluators=[RELEVANCE_EVALUATOR],
        provide_explanation=False,
        concurrency=8,
    )
    # Only one evaluator was passed, so only the first result frame is used.
    relevance = eval_frames[0].add_prefix("eval_")
    retrieval_scores = retrieved_documents["document_score"]
    return pd.concat([relevance, retrieval_scores], axis=1)
    

In [None]:
def compute_ndcg(df: pd.DataFrame, k: int) -> float:
    """Compute NDCG@k for one query's retrieved documents.

    Args:
        df: Rows for a single query. ``eval_score`` is used as the true
            relevance and ``document_score`` as the ranking score.
        k: Rank cutoff passed to ``ndcg_score``.

    Returns:
        The NDCG@k value, or ``np.nan`` when ``ndcg_score`` raises
        ``ValueError`` (presumably when a score is missing/NaN -- confirm).
    """
    # Pad to at least two entries: recent scikit-learn versions raise
    # ValueError for a single-document ranking, which would turn every
    # one-document query into NaN. Padded slots keep a relevance and score of 0.
    n = max(2, len(df))
    eval_scores = np.zeros(n)
    doc_scores = np.zeros(n)
    eval_scores[: len(df)] = df.eval_score
    doc_scores[: len(df)] = df.document_score
    try:
        return ndcg_score([eval_scores], [doc_scores], k=k)
    except ValueError:
        return np.nan

In [None]:
# Mean value of each metric per retrieval mode. Every list is appended to once
# per mode, so its entries follow the order of MODES.
overall_metrics = {}
# Rank cutoffs at which NDCG, precision and hit rate are reported.
cutoffs = (2, 5)
for mode in MODES:
    # Use the shared TOP_K constant instead of a second hard-coded 5.
    query_engine = vector_store_index.as_query_engine(
        similarity_top_k=TOP_K, vector_store_query_mode=mode
    )
    # Relaunch Phoenix with delete_data=True so data collected before this
    # point is discarded before this mode's queries run.
    px.close_app(delete_data=True)
    px.launch_app()
    for query in tqdm(queries):
        query_engine.query(query)
    retrieved_documents = get_retrieved_documents(px.active_session())
    evals_df = run_eval(retrieved_documents)

    # Group once and reuse; presumably one "context.span_id" per query -- confirm.
    per_query = evals_df.groupby("context.span_id")
    metric_columns = {}
    for k in cutoffs:
        metric_columns[f"ndcg_at_{k}"] = per_query.apply(compute_ndcg, k=k)
    for k in cutoffs:
        # skipna=False keeps the result NaN when any of the first k scores is
        # missing, so such queries drop out of the mean below.
        metric_columns[f"precision_at_{k}"] = per_query.apply(
            lambda group, k=k: group.eval_score.iloc[:k].sum(skipna=False) / k
        )
    for k in cutoffs:
        # 1 when the summed score of the first k documents is positive, else 0.
        metric_columns[f"hit_rate_at_{k}"] = per_query.apply(
            lambda group, k=k: 1 if group.eval_score.iloc[:k].sum(skipna=False) > 0 else 0
        )
    metrics_df = pd.DataFrame(metric_columns)

    mean_metrics = metrics_df.mean(numeric_only=True).to_dict()
    print(f"-----{mode}-----")
    print(mean_metrics)
    for metric, value in mean_metrics.items():
        overall_metrics.setdefault(metric, []).append(value)