In [None]:
%pip install langsmith

Evaluation을 위한 Retriever를 생성

In [2]:
from dotenv import load_dotenv
from langchain_upstage import UpstageEmbeddings

# Populate the process environment via python-dotenv before any client is built.
# NOTE(review): presumably the API keys for the services used below are read
# from there -- confirm against the project's .env file.
load_dotenv()

# Embedding model handle; it is handed to the Pinecone vector store below.
embedding = UpstageEmbeddings(
    model="solar-embedding-1-large",
)

In [3]:
from langchain_pinecone import PineconeVectorStore

index_name = "analects-upstage-index"

# Attach to an index that already exists (nothing is created or uploaded here),
# using the same embedding object that was built earlier in the notebook.
database = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding,
)

# Retriever with the vector store's default settings; RagBot calls `.invoke()` on it.
retriever = database.as_retriever()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
### LLM답변 생성을 위한 RAG bot 

import openai
from langsmith import traceable
from langsmith.wrappers import wrap_openai

class RagBot:
    """Small retrieval-augmented chatbot used as the system under evaluation.

    The bot retrieves documents for a question with the supplied retriever,
    passes them to an OpenAI chat model inside the system prompt, and returns
    both the answer and the contexts so that evaluators can inspect them.
    Every public step is decorated with LangSmith's ``traceable``.

    Args:
        retriever: Object exposing ``invoke(question)`` that returns the
            documents to use as context.
        model: Name of the OpenAI chat model used to generate answers.
    """

    # Instructions placed ahead of the retrieved context in the system message.
    # Adjacent string literals are concatenated into a single string.
    _SYSTEM_INSTRUCTIONS = (
        "You are an expert in Eastern Philosophy specializing in answering user questions. "
        "You must always respond in Korean. "
        "Use the following pieces of retrieved context from the Analects to answer the question. "
        "When the answer is based on the Analects text, please begin the response by presenting "
        "the relevant original Chinese text along with the Analects chapter and number. "
        "If the answer is not found in the provided context, you may state your thoughts briefly, "
        "but never fabricate non-existent records or facts as real. "
        "Adjust the length of your answer based on the amount of retrieved context "
        "and keep the answer concise. Limit your response to a maximum of thirty sentences."
    )

    def __init__(self, retriever, model: str = "gpt-4o"):
        self._retriever = retriever
        # Wrapping the client instruments the LLM calls (LangSmith helper).
        self._client = wrap_openai(openai.Client())
        self._model = model

    @traceable()
    def retrieve_docs(self, question):
        """Fetch the documents for ``question`` from the vector store retriever."""
        return self._retriever.invoke(question)

    @traceable()
    def invoke_llm(self, question, docs):
        """Answer ``question`` using ``docs`` as context in the system prompt.

        Args:
            question: The user's question, sent as the user message.
            docs: Iterable of retrieved documents (as returned by
                ``retrieve_docs``).

        Returns:
            dict with ``"answer"`` (the model's message content) and
            ``"contexts"`` (``str()`` of each document).
        """
        # The blank line before the heading matters: previously the heading was
        # glued to the end of the last instruction sentence
        # ("... thirty sentences. ## Retrieved Context"), so it never started a line.
        # `docs` is interpolated with its default string form, as before.
        system_prompt = (
            f"{self._SYSTEM_INSTRUCTIONS}"
            f"\n\n## Retrieved Context\n\n{docs}"
        )

        response = self._client.chat.completions.create(
            model=self._model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question},
            ],
        )

        # Evaluators will expect "answer" and "contexts"
        return {
            "answer": response.choices[0].message.content,
            "contexts": [str(doc) for doc in docs],
        }

    @traceable()
    def get_answer(self, question: str):
        """Run retrieval followed by generation and return ``invoke_llm``'s dict."""
        docs = self.retrieve_docs(question)
        return self.invoke_llm(question, docs)

# Single shared bot instance used by the prediction functions.
rag_bot = RagBot(retriever)

In [5]:
def predict_rag_answer(example: dict):
    """Prediction function for experiments that grade only the answer.

    Args:
        example: Dataset inputs; must contain the key ``"input_question"``.

    Returns:
        dict with the single key ``"answer"`` taken from ``rag_bot.get_answer``.
    """
    question = example["input_question"]
    result = rag_bot.get_answer(question)
    return dict(answer=result["answer"])

def predict_rag_answer_with_context(example: dict):
    """Prediction function for hallucination grading, which also needs the contexts.

    Args:
        example: Dataset inputs; must contain the key ``"input_question"``.

    Returns:
        dict with ``"answer"`` and ``"contexts"`` taken from ``rag_bot.get_answer``.
    """
    question = example["input_question"]
    result = rag_bot.get_answer(question)
    return {field: result[field] for field in ("answer", "contexts")}

In [6]:
from langchain_classic import hub
from langchain_openai import ChatOpenAI

# Grade prompt used to measure how accurate the answer is against the reference.
grade_prompt_answer_accuracy = hub.pull("langchain-ai/rag-answer-vs-reference")
# Generic alias kept for parity with the original cell; it is rebound by later cells.
prompt = grade_prompt_answer_accuracy

def answer_evaluator(run, example) -> dict:
    """Grade the RAG answer against the dataset's reference answer.

    Args:
        run: The traced run; ``run.outputs["answer"]`` is the answer the model
            actually produced.
        example: The dataset example; ``example.inputs["input_question"]`` is
            the question and ``example.outputs["answer"]`` the stored reference.

    Returns:
        dict with ``"key": "answer_v_reference_score"`` and the ``"Score"``
        entry of the grader's result as ``"score"``.
    """
    # Question/reference come from the dataset pair, the prediction from the run.
    grader_inputs = {
        "question": example.inputs["input_question"],
        "correct_answer": example.outputs["answer"],
        "student_answer": run.outputs["answer"],
    }

    # LLM used as the judge.
    judge = ChatOpenAI(model="gpt-4o", temperature=0)

    # LCEL pipeline: the pulled grade prompt feeds the judge model.
    grading = (grade_prompt_answer_accuracy | judge).invoke(grader_inputs)

    return {"key": "answer_v_reference_score", "score": grading["Score"]}

In [7]:
# Grade prompt used to judge how helpful the answer is for the user's question.
grade_prompt_answer_helpfulness = hub.pull("langchain-ai/rag-answer-helpfulness")
# Generic alias kept for parity with the original cell; it is rebound by later cells.
prompt = grade_prompt_answer_helpfulness

def answer_helpfulness_evaluator(run, example) -> dict:
    """Grade how helpful the generated answer is for the dataset question.

    The reference answer is not consulted; only the question and the model's
    answer are sent to the grader.

    Args:
        run: The traced run; ``run.outputs["answer"]`` is the model's answer.
        example: The dataset example; ``example.inputs["input_question"]`` is
            the question.

    Returns:
        dict with ``"key": "answer_helpfulness_score"`` and the ``"Score"``
        entry of the grader's result as ``"score"``.
    """
    grader_inputs = {
        "question": example.inputs["input_question"],
        "student_answer": run.outputs["answer"],
    }

    # LLM used as the judge.
    judge = ChatOpenAI(model="gpt-4o", temperature=0)

    # LCEL pipeline: the pulled grade prompt feeds the judge model.
    grading = (grade_prompt_answer_helpfulness | judge).invoke(grader_inputs)

    return {"key": "answer_helpfulness_score", "score": grading["Score"]}

In [9]:
# Prompt used to judge whether the answer hallucinates relative to the contexts.
grade_prompt_hallucinations = hub.pull("langchain-ai/rag-answer-hallucination")
# Generic alias kept for parity with the original cell.
prompt = grade_prompt_hallucinations

def answer_hallucination_evaluator(run, example) -> dict:
    """Grade whether the generated answer is grounded in the retrieved contexts.

    Only the run's outputs are sent to the grader: the contexts the model was
    given and the answer it produced. ``example`` is accepted because
    evaluators are called with ``(run, example)``, but it is not read here
    (the previous version extracted the question and never used it).

    Args:
        run: The traced run; ``run.outputs`` must contain ``"contexts"`` and
            ``"answer"`` (see ``predict_rag_answer_with_context``).
        example: The dataset example (unused).

    Returns:
        dict with ``"key": "answer_hallucination"`` and the ``"Score"`` entry
        of the grader's result as ``"score"``.
    """
    grader_inputs = {
        # Contexts the LLM used when generating the answer.
        "documents": run.outputs["contexts"],
        # The LLM's answer.
        "student_answer": run.outputs["answer"],
    }

    # LLM used as the judge.
    judge = ChatOpenAI(model="gpt-4o", temperature=0)

    answer_grader = grade_prompt_hallucinations | judge

    # Run the grader and unwrap its score.
    verdict = answer_grader.invoke(grader_inputs)

    return {"key": "answer_hallucination", "score": verdict["Score"]}

In [8]:
from langsmith.evaluation import evaluate

# Name of the dataset the evaluation runs against.
dataset_name = "analects_dataset"

# Answer-only experiment: the prediction function returns just the answer
# (the `_with_context` variant is the one to use when grading hallucination),
# and both the reference-accuracy and helpfulness evaluators are applied.
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[
        answer_evaluator,
        answer_helpfulness_evaluator,
    ],
    experiment_prefix="analects-evaluator-chatbot-evaluator",
    metadata={"version": "analects v1, gpt-4o"},
)

View the evaluation results for experiment: 'analects-evaluator-chatbot-evaluator-2153c1cf' at:
https://smith.langchain.com/o/f7be2484-70ae-4450-9e75-fa20f632109f/datasets/f7642737-7334-4f44-b53c-03bbe3e39825/compare?selectedSessions=e69a61af-db1c-4c90-9538-231240902b91




11it [02:05, 11.43s/it]


In [10]:
from langsmith.evaluation import evaluate

# Name of the dataset the evaluation runs against.
dataset_name = "analects_dataset"

# Hallucination experiment: the evaluator reads `contexts` from the run outputs,
# so the prediction function that also returns the contexts is used here.
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[
        answer_hallucination_evaluator,
    ],
    experiment_prefix="analects-evaluator-chatbot-hallucination",
    metadata={"version": "analects v1, gpt-4o"},
)

View the evaluation results for experiment: 'analects-evaluator-chatbot-hallucination-989faa7f' at:
https://smith.langchain.com/o/f7be2484-70ae-4450-9e75-fa20f632109f/datasets/f7642737-7334-4f44-b53c-03bbe3e39825/compare?selectedSessions=99b9994b-1abd-4e5d-95ee-db67511e9387




11it [02:43, 14.85s/it]
