# Use Ragas to evaluate the customized RAG pipeline based on milvus

**Please note that running this notebook consumes a large number of OpenAI API tokens. Read it carefully before running it, and pay attention to how many requests you send.**

## 1. Prepare environment and data

Before starting, you must set `OPENAI_API_KEY` in your environment variables.

You also need to install [milvus](https://milvus.io/) and start it. You can refer to the [official introduction](https://milvus.io/docs/install_standalone-docker.md) to start quickly.

Install pip dependencies

In [None]:
# ! python -m pip install openai beir pandas ragas==0.0.17 pymilvus

Download the [Financial Opinion Mining and Question Answering (fiqa) Dataset](https://sites.google.com/view/fiqa/) if it does not already exist locally. We convert it into a Ragas-friendly format that is easier to process, following this [script](https://github.com/explodinggradients/ragas/blob/main/experiments/baselines/fiqa/dataset-exploration-and-baseline.ipynb).

In [1]:
import json
import pandas as pd
import os
from tqdm import tqdm
from datasets import Dataset
from beir import util


def _read_jsonl_frame(path):
    """Load a JSON-lines file into a DataFrame, one row per non-blank line."""
    # Decode as UTF-8 explicitly instead of relying on the platform's
    # default encoding, and stream the file rather than calling readlines().
    with open(path, encoding="utf-8") as f:
        return pd.DataFrame([json.loads(line) for line in f if line.strip()])


def prepare_fiqa_without_answer(knowledge_path):
    """Build per-split question / ground-truth tables from the BEIR FiQA files.

    When ``<knowledge_path>/fiqa.zip`` is missing, the archive is downloaded
    and unzipped into ``knowledge_path`` first. The queries, the qrels of each
    split and the corpus are then joined so that every query is paired with
    the texts of the corpus entries listed for it.

    Args:
        knowledge_path: Directory that holds (or will receive) ``fiqa.zip``
            and the extracted ``fiqa/`` folder.

    Returns:
        A dict keyed by ``"dev"``, ``"test"`` and ``"train"``. Each value is a
        DataFrame with one row per query id and two columns: ``question``
        (the query text) and ``ground_truths`` (list of matching corpus
        texts).
    """
    dataset_name = "fiqa"

    if not os.path.exists(os.path.join(knowledge_path, f"{dataset_name}.zip")):
        url = (
            "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/"
            f"{dataset_name}.zip"
        )
        util.download_and_unzip(url, knowledge_path)

    data_path = os.path.join(knowledge_path, dataset_name)

    corpus_df = (
        _read_jsonl_frame(os.path.join(data_path, "corpus.jsonl"))
        .rename(columns={"_id": "corpus-id", "text": "ground_truth"})
        .drop(columns=["title", "metadata"])
    )
    # Ids are cast to int so they can be joined with the qrels columns.
    corpus_df["corpus-id"] = corpus_df["corpus-id"].astype(int)

    queries_df = (
        _read_jsonl_frame(os.path.join(data_path, "queries.jsonl"))
        .rename(columns={"_id": "query-id", "text": "question"})
        .drop(columns=["metadata"])
    )
    queries_df["query-id"] = queries_df["query-id"].astype(int)

    final_split_df = {}
    for split in ("dev", "test", "train"):
        qrels = pd.read_csv(
            os.path.join(data_path, "qrels", f"{split}.tsv"), sep="\t"
        ).drop(columns=["score"])

        merged = queries_df.merge(qrels, on="query-id").merge(
            corpus_df, on="corpus-id"
        )

        # Every row of a group carries the same question text, so taking the
        # first one is deterministic and does not touch the global RNG the
        # way ``.sample()`` did.
        grouped = merged.groupby("query-id").agg(
            question=("question", "first"),
            ground_truths=("ground_truth", list),
        )
        final_split_df[split] = grouped.reset_index(drop=True)

    return final_split_df


# Working directory for the downloaded dataset and the flattened knowledge file.
knowledge_datas_path = './knowledge_datas'
fiqa_path = os.path.join(knowledge_datas_path, 'fiqa_doc.txt')
if not os.path.exists(knowledge_datas_path):
    os.mkdir(knowledge_datas_path)

# Result accumulators; they are reset again right before the agent loop runs.
contexts_list, answer_list = [], []

final_split_df = prepare_fiqa_without_answer(knowledge_datas_path)

# Only the 'test' split is used, both as knowledge source and as question set.
split = 'test'
test_frame = final_split_df[split]

# Flatten the per-question ground-truth lists into one list of passages.
docs = [passage for truths in test_frame["ground_truths"] for passage in truths]
print(len(docs))

# Write all passages into a single newline-separated text file.
docs_str = '\n'.join(docs)
with open(fiqa_path, 'w') as f:
    f.write(docs_str)

question_list = test_frame["question"].to_list()
ground_truth_list = test_frame["ground_truths"].to_list()

1706


Now we have the question list and the ground truth list, and the knowledge documents have been written to `fiqa_path`.

## 2. Build RAG pipeline based on milvus and langchain
Split the doc using langchain RecursiveCharacterTextSplitter.

In [2]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the knowledge file written to `fiqa_path` as langchain documents.
loader = TextLoader(fiqa_path)
documents = loader.load()

# Split the documents into chunks (chunk_size=1000, chunk_overlap=40).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=40,
)
docs = text_splitter.split_documents(documents)

Prepare embedding model and milvus settings.

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.milvus import Milvus

# Embedding model used to vectorise the document chunks.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en")

# Address of the Milvus server; adjust if yours is not on localhost:19530.
milvus_connection_args = {"host": "127.0.0.1", "port": "19530"}

# Embed the chunks and store them in Milvus.
# NOTE(review): drop_old=True presumably replaces an existing collection - confirm.
vector_db = Milvus.from_documents(
    docs,
    embeddings,
    connection_args=milvus_connection_args,
    drop_old=True,
)

Batches:   0%|          | 0/77 [00:00<?, ?it/s]

Build agent using langchain.

In [4]:
def search_milvus(question, top_k=5):
    """Return at most ``top_k`` documents from ``vector_db`` similar to ``question``.

    The documents themselves (not just their text) are returned; the
    evaluation code later reads ``page_content`` from them.
    """
    hits = vector_db.similarity_search(question, k=top_k)
    # Slice defensively so the caller never receives more than top_k items.
    return hits[:top_k]

In [5]:
from langchain.tools import Tool
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.agents import AgentExecutor, ConversationalChatAgent

# Chat model that drives the agent.
chat_llm = ChatOpenAI(model_name='gpt-4-1106-preview')

# The single tool offered to the agent: similarity search over Milvus.
search_tool = Tool(
    name='Search',
    func=search_milvus,
    description='useful for search professional knowledge and information',
)
tools = [search_tool]

agent = ConversationalChatAgent.from_llm_and_tools(llm=chat_llm, tools=tools)

# Conversation memory; it is cleared before every question in the evaluation loop.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key='output',
)

# return_intermediate_steps=True is required: the retrieved contexts are read
# from agent_result['intermediate_steps'] later on.
# Add verbose=True to the call below to trace the agent's reasoning.
agent_chain = AgentExecutor.from_agent_and_tools(
    agent=agent,
    tools=tools,
    memory=memory,
    return_intermediate_steps=True,
)

In [8]:
import time


def retry_agent_chain(retry_num=4, retry_interval=4, query=None):
    """Ask the agent one question, retrying whenever an attempt fails.

    Args:
        retry_num: Maximum number of attempts.
        retry_interval: Seconds to wait between two attempts.
        query: The question to ask. When omitted, the module-level
            ``question`` variable is used, which keeps existing
            zero-argument callers working.

    Returns:
        A tuple ``(answer, contexts)``. ``answer`` is the agent's ``output``
        and ``contexts`` is the ``page_content`` of every document in the
        observation of the first intermediate step. If every attempt fails,
        both carry the placeholder text ``'failed. please retry.'``.

    Raises:
        NameError: If ``query`` is omitted and no module-level ``question``
            exists.
    """
    if query is None:
        # Backward-compatible fallback to the loop variable of the caller.
        query = question

    for attempt in range(retry_num):
        try:
            agent_result = agent_chain(query)
            # Indexing step 0 raises when no intermediate step was recorded;
            # that is handled like any other failure and retried.
            first_observation = agent_result['intermediate_steps'][0][1]
            contexts = [document.page_content for document in first_observation]
            # Return both values together so a failure can never leave a
            # real context list paired with the placeholder answer.
            return agent_result['output'], contexts
        except Exception as e:
            # Report first, then wait - and do not wait after the last attempt.
            print(e)
            print('failed, retry...')
            if attempt < retry_num - 1:
                time.sleep(retry_interval)

    return 'failed. please retry.', ['failed. please retry.']

## 3. Start Ragas Evaluation

Note that this step consumes a large number of OpenAI API tokens: every question asked and every evaluation sends requests to the OpenAI service. Please keep an eye on your token consumption. If you only want to run a small number of tests, you can modify the code to reduce the test size.

In [None]:
# Start from empty result lists so re-running the cell does not append twice.
answer_list, contexts_list = [], []

# NOTE: the loop variable must be named `question`; retry_agent_chain()
# reads that module-level name when called without arguments.
for question in tqdm(question_list):
    # Clear the conversation memory before each question.
    memory.clear()
    answer, contexts = retry_agent_chain()
    # To inspect a run, print `answer` and `contexts` here.
    contexts_list.append(contexts)
    answer_list.append(answer)

You can choose which metrics you want to evaluate.

In [None]:
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision, answer_similarity

# Column-wise evaluation data: one entry per question in every list.
eval_columns = {
    "question": question_list,
    "contexts": contexts_list,
    "answer": answer_list,
    "ground_truths": ground_truth_list,
}
ds = Dataset.from_dict(eval_columns)

# Uncomment the metrics you want to compute; each enabled metric adds
# further evaluation requests.
selected_metrics = [
    context_precision,
    # context_recall,
    # faithfulness,
    # answer_relevancy,
    # answer_similarity,
    # answer_correctness,  (not imported above - add it to the import first)
]

result = evaluate(ds, metrics=selected_metrics)
print(result)