# Setup retrieval

In [None]:
import os

# API credentials.
# setdefault keeps any key that is already exported in the environment;
# unconditionally assigning "" would clobber a real key and break
# authentication on every fresh kernel run. Export the keys in your shell
# (or a secrets manager) rather than pasting them into the notebook.
os.environ.setdefault("OPENAI_API_KEY", "")
os.environ.setdefault("TAVILY_API_KEY", "")

In [2]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from typing import List
from langchain_core.documents import Document
from langchain_core.runnables import chain


# Chroma collection persisted on local disk; presumably holds the embedded
# Yahoo Finance news articles (the collection is populated elsewhere).
vectorstore = Chroma(
    collection_name="yahoo-news",
    persist_directory="./yahoo-finance-chroma",
    # OpenAIEmbeddings() is built with its default settings.
    embedding_function=OpenAIEmbeddings(),
    # Collection metadata selects "cosine" as the HNSW space.
    collection_metadata={"hnsw:space": "cosine"},
)


@chain
def retriever(query: str) -> List[Document]:
    """
    Fetch documents similar to the query and record each one's score.

    Args:
        query (str): The search text passed to the vector store.
    Returns:
        List[Document]: The matched documents, each with metadata["score"]
        set. An empty list is returned when the search yields no results.
    """
    scored_docs = vectorstore.similarity_search_with_relevance_scores(query)

    documents = []
    # Iterate the (document, score) pairs directly: unpacking zip(*pairs)
    # raises ValueError when the search returns nothing.
    for doc, relevance in scored_docs:
        # NOTE(review): the factor of 2 is kept from the original code; it does
        # not change the ranking order. Confirm whether the scaling is needed.
        doc.metadata["score"] = relevance * 2
        documents.append(doc)

    return documents



def retrieve(state):
    """
    Look up documents for the question stored in the agent state.

    Args:
        state (dict): Agent state; the question is read from
            state["keys"]["question"].
    Returns:
        dict: A new state whose "keys" entry holds the retrieved documents
        together with the original question.
    """
    print("---RETRIEVE---")

    user_question = state["keys"]["question"]
    found_docs = retriever.invoke(user_question)

    new_keys = {"documents": found_docs, "question": user_question}
    return {"keys": new_keys}

# Evaluate retrieval performance

In [3]:
from openai import OpenAI

# Module-level OpenAI client shared by eval_qrel. It is built with no
# arguments; presumably it picks up OPENAI_API_KEY from the environment
# configured at the top of the notebook -- TODO confirm.
openai_client = OpenAI()

def eval_qrel(query, docs):
    """
    Use GPT to evaluate the relevance of the retrieved documents.

    Args:
        query: The question the documents were retrieved for.
        docs: Retrieved documents; each must expose `metadata` (a dict, whose
            optional "title" entry is included in the prompt) and
            `page_content`.
    Returns:
        The raw text of the model's reply, expected to be an array of integer
        relevance scores such as "[0, 1, 0, 2]".
    """
    # Imported locally: the notebook only imports json in a later cell.
    import json

    doc_count = len(docs)
    # Serialise as real JSON (the prompt promises JSON); a Python list repr
    # would use single quotes and leave quotes in the content ambiguous.
    doc_payload = json.dumps(
        [
            {"title": doc.metadata.get("title", ""), "content": doc.page_content}
            for doc in docs
        ],
        ensure_ascii=False,
        default=str,
    )

    # The document count is taken from the input instead of being hard-coded,
    # so the prompt stays truthful if the retriever returns more or fewer docs.
    content = f"""
        Evaluate the relevance of the following {doc_count} documents to the query: {query}. Rank each document with an integer relevance score from 0 to 2, where 0 is not relevant and 2 is relevant.

        Here are the {doc_count} documents in json format:
        {doc_payload}

        Return exactly {doc_count} relevance scores, one per document in the given order, in an array; for example, for 4 documents: [0, 1, 0, 2]. No need to include the query in the response or explain the relevance score.
    """

    completion = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": content,
            },
        ],
    )

    return completion.choices[0].message.content


def get_qrel(query, retrieved_docs) -> List[int]:
    """
    Get evaluation from GPT and return the relevance scores array.

    Args:
        query: The question the documents were retrieved for.
        retrieved_docs: The documents to grade.
    Returns:
        List[int]: One score per graded document, each clamped to the 0-2
        range requested in the prompt, and never longer than
        `retrieved_docs`. May be shorter if the model returned fewer scores.
    Raises:
        ValueError: If the reply contains no parsable integer scores.
    """
    # Imported locally so the cell does not depend on other cells' imports.
    import re

    # Nothing to grade: skip the API call entirely.
    if not retrieved_docs:
        return []

    raw_reply = eval_qrel(query, retrieved_docs)
    if raw_reply is None:
        raise ValueError("Model returned no content for the relevance evaluation")

    # Prefer the first bracketed array so code fences or surrounding text are
    # ignored; fall back to the whole reply for a bare "0, 1, 2" answer.
    bracketed = re.search(r"\[([^\[\]]*)\]", raw_reply)
    payload = bracketed.group(1) if bracketed else raw_reply

    # Split on commas only and let int() strip whitespace, so "[0,1,2]" and
    # "[0, 1, 2]" both parse. A non-integer token still raises ValueError.
    scores = [int(token) for token in payload.split(",") if token.strip()]
    if not scores:
        raise ValueError(f"No relevance scores found in model reply: {raw_reply!r}")

    # Keep scores inside the 0-2 scale the prompt asks for.
    scores = [min(max(score, 0), 2) for score in scores]
    return scores[: len(retrieved_docs)]

In [4]:
import ir_measures
from ir_measures import Precision, MRR, nDCG

def form_irmeasure_qrel_arr(query: str, retrieved_docs: List[Document], qrel: List[int]):
    """
    Build the ir_measures qrel entries for one query.

    The query text is used as the query id and each document's page_content
    as its document id. When fewer scores than documents are supplied, the
    remaining documents get a relevance of 0.
    """
    shortfall = len(retrieved_docs) - len(qrel)
    if shortfall > 0:
        # Pad so every retrieved document has a judgement.
        qrel = qrel + [0] * shortfall

    judgements = []
    for position, doc in enumerate(retrieved_docs):
        judgements.append(ir_measures.Qrel(query, doc.page_content, qrel[position]))
    return judgements


def form_irmeasure_run_arr(query: str, retrieved_docs: List[Document]):
    """
    Build the ir_measures run entries for one query.

    The query text is used as the query id, each document's page_content as
    its document id, and metadata["score"] as the retrieval score.
    """
    scored_docs = []
    for doc in retrieved_docs:
        doc_id = doc.page_content
        retrieval_score = doc.metadata["score"]
        scored_docs.append(ir_measures.ScoredDoc(query, doc_id, retrieval_score))
    return scored_docs

# Measure

Load questions

In [5]:
import json

# Read the question file inside a context manager so the handle is closed
# (the bare open() call leaked it), and decode explicitly as UTF-8 rather
# than relying on the platform's locale encoding.
with open("./questions/final_questions.json", encoding="utf-8") as questions_file:
    # Only the first 100 records are evaluated; keep just the question text.
    final_quests = [quest["question"] for quest in json.load(questions_file)[:100]]

Retrieve

In [None]:
# Accumulators across all questions; both are reset to empty lists whenever
# this cell is re-run.
ir_qrels = []
ir_runs = []

for i, quest in enumerate(final_quests):
    print(f"Question {i+1}/{len(final_quests)}")
    
    # retrieve() reads the question from state["keys"]["question"] and
    # returns a new state that also carries the documents.
    state = {"keys": {"question": quest}}
    state = retrieve(state)
    
    retrieved_docs = state["keys"]["documents"]
    # Relevance judgements for these documents come from the GPT grader.
    qrel = get_qrel(quest, retrieved_docs)
    
    # The question text is passed as the query argument to both helpers, so
    # qrels and runs are keyed by the same string.
    ir_qrels.extend(form_irmeasure_qrel_arr(quest, retrieved_docs, qrel))
    ir_runs.extend(form_irmeasure_run_arr(quest, retrieved_docs))
    
print("--- Done ---")

Evaluate

In [7]:
# Metrics to report: Precision and nDCG with a cutoff of 3, plus MRR.
metrics = [Precision@3, MRR, nDCG@3]

# Aggregate over every qrel/run entry collected by the retrieval loop.
result = ir_measures.calc_aggregate(metrics, ir_qrels, ir_runs)
# Bare last expression so the notebook displays the aggregated scores.
result

{P@3: 0.5066666666666667, nDCG@3: 0.5767568351326005, RR: 0.6058333333333334}