https://github.com/athina-ai/rag-cookbooks を元に8種類のRAGと5種類のAgenticRAGについて学ぶ

# Environment building

In [1]:
!pip uninstall -q -y torch torchvision torchaudio pydantic tokenizers fsspec

[0m

In [2]:
# 1. fastai に対応する torch のインストール（CUDA 12.4 対応）
!pip install -q torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1 --extra-index-url https://download.pytorch.org/whl/cu124

# 2. Pydantic の適切なバージョン
!pip install  -q "pydantic<3.0,>=2.6.3"

# 3. Tokenizers のバージョンを Transformers に合わせる
!pip install  -q "tokenizers>=0.21,<0.22"

# 4. fsspec のバージョンを datasets に合わせる
!pip install  -q "fsspec==2024.6.1"

# 5. Athina と関連ライブラリのインストール
!pip install -q -U  langchain-openai langchain-community pinecone-client pinecone-plugin-interface datasets athina athina-logger tiktoken retrying timeout-decorator jsonpath_ng litellm colorlog ragas e2b

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ragas 0.2.12 requires pydantic>=2, which is not installed.
sentence-transformers 3.3.1 requires transformers<5.0.0,>=4.41.0, which is not installed.
peft 0.14.0 requires transformers, which is not installed.
datasets 2.21.0 requires fsspec[http]<=2024.6.1,>=2023.1.0, but you have fsspec 2025.2.0 which is incompatible.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2025.2.0 which is incompatible.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
litellm 1.60.0 requires tokenizers, which is not installed.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the sour

# Indexing

In [3]:
import os
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")
os.environ['ATHINA_API_KEY'] = userdata.get('ATHINA_API_KEY')
os.environ['PINECONE_API_KEY'] = userdata.get('PINECONE_API_KEY')

In [4]:
# load embedding model
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

In [5]:
# load data
from langchain.document_loaders import CSVLoader
loader = CSVLoader("/content/drive/MyDrive/Colab Notebooks/rag-cookbooks-copy/data/context.csv")
documents = loader.load()

In [6]:
# split documents
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
documents = text_splitter.split_documents(documents)

# Pipecone Vector Database

In [7]:
# initialize pinecone client
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY"),  # app.pinecone.io
)

In [8]:
# create index if it does not exist
index_name = 'my-index'
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
else:
    print(f"Index '{index_name}' already exists.")

Index 'my-index' already exists.


In [9]:
# create vectorstore
from langchain.vectorstores import Pinecone
vectorstore = Pinecone.from_documents(
    documents=documents,
    embedding=embeddings,
    index_name=index_name
)

# Retriever

In [10]:
# create retriever
retriever = vectorstore.as_retriever()


# Rag Chain

In [11]:
# load llm
from langchain_openai import ChatOpenAI
llm = ChatOpenAI()

In [12]:
# create document chain
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

template = """"
You are a helpful assistant that answers questions based on the provided context.
Use the provided context to answer the question.
Question: {input}
Context: {context}
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)

# setup RAG pipeline
rag_chain = (
    {"context": retriever, "input": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [13]:
# response
response = rag_chain.invoke("when did ww1 end?")
response

'WW1 ended on November 11, 1918.'

# Preparing Data for Evaluation

In [14]:
# create dataset
question = ["when did ww1 end?"]
response = []
contexts = []

# inference
for query in question:
    print(query)
    response.append(rag_chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# retriever.get_relevant_documents(query) で返ってくるのは Document オブジェクトのリスト
#（例）：
# [
#     Document(page_content="World War I or the First World War (28 July 1914 – 11 November 1918)..."),
#     Document(page_content="Some wars name themselves. This is the Great War...")
# ]

# To dict
data = {
    "query": question,
    "response": response,
    "context": contexts,
}

data["context"]

when did ww1 end?


  contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])


[["context: ['World War I or the First World War (28 July 1914 – 11 November 1918), often abbreviated as WWI, was one of the deadliest global conflicts in history. It was fought between two coalitions, the Allies and the Central Powers. Fighting occurred throughout Europe, the Middle East, Africa, the Pacific, and parts of Asia. An estimated 9 million soldiers were killed in combat, plus another 23 million wounded, while 5 million civilians died as a result of military action, hunger, and disease.",
  "context: ['World War I or the First World War (28 July 1914 – 11 November 1918), often abbreviated as WWI, was one of the deadliest global conflicts in history. It was fought between two coalitions, the Allies and the Central Powers. Fighting occurred throughout Europe, the Middle East, Africa, the Pacific, and parts of Asia. An estimated 9 million soldiers were killed in combat, plus another 23 million wounded, while 5 million civilians died as a result of military action, hunger, and d

# Evaluation with Athina AI

In [15]:
# create dataset
from datasets import Dataset
dataset = Dataset.from_dict(data)
# create dataframe
import pandas as pd
df = pd.DataFrame(dataset)
df

Unnamed: 0,query,response,context
0,when did ww1 end?,"World War I ended on November 11, 1918.",[context: ['World War I or the First World War...


In [16]:
# Convert to dictionary
df_dict = df.to_dict(orient='records')

# Convert context to list
for record in df_dict:
    if not isinstance(record.get('context'), list):
        if record.get('context') is None:
            record['context'] = []
        else:
            record['context'] = [record['context']]
df_dict

[{'query': 'when did ww1 end?',
  'response': 'World War I ended on November 11, 1918.',
  'context': ["context: ['World War I or the First World War (28 July 1914 – 11 November 1918), often abbreviated as WWI, was one of the deadliest global conflicts in history. It was fought between two coalitions, the Allies and the Central Powers. Fighting occurred throughout Europe, the Middle East, Africa, the Pacific, and parts of Asia. An estimated 9 million soldiers were killed in combat, plus another 23 million wounded, while 5 million civilians died as a result of military action, hunger, and disease.",
   "context: ['World War I or the First World War (28 July 1914 – 11 November 1918), often abbreviated as WWI, was one of the deadliest global conflicts in history. It was fought between two coalitions, the Allies and the Central Powers. Fighting occurred throughout Europe, the Middle East, Africa, the Pacific, and parts of Asia. An estimated 9 million soldiers were killed in combat, plus an

In [17]:
# set api keys for Athina evals
from athina.keys import AthinaApiKey, OpenAiApiKey
OpenAiApiKey.set_key(os.getenv('OPENAI_API_KEY'))
AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))

from athina.evals import DoesResponseAnswerQuery
DoesResponseAnswerQuery(model="gpt-4").run_batch(data=df_dict)
# DoesResponseAnswerQuery(model="gpt-4").run_batch(data=df) # Object of type DataFrame is not JSON serializable
# DoesResponseAnswerQuery(model="gpt-4").run_batch(data=data) # Data at index 0 is missing required argument: response


{'status': 'success', 'data': {'eval_request': {'id': 'c2fc8d10-76eb-4575-9187-4c050537abe6', 'user_id': '3bc296e7-44e7-4f5b-a5c0-c1a6c9d129bd', 'org_id': 'gmail.com_152803301', 'workspace_slug': 'default', 'status': 'initiated', 'request_label': 'Draq_eval_1738655152.0755057', 'request_data': {'data': [{'query': 'when did ww1 end?', 'response': 'World War I ended on November 11, 1918.', 'context': ["context: ['World War I or the First World War (28 July 1914 – 11 November 1918), often abbreviated as WWI, was one of the deadliest global conflicts in history. It was fought between two coalitions, the Allies and the Central Powers. Fighting occurred throughout Europe, the Middle East, Africa, the Pacific, and parts of Asia. An estimated 9 million soldiers were killed in combat, plus another 23 million wounded, while 5 million civilians died as a result of military action, hunger, and disease.", "context: ['World War I or the First World War (28 July 1914 – 11 November 1918), often abbrev

BatchRunResult(eval_request_id='c2fc8d10-76eb-4575-9187-4c050537abe6', eval_results=[{'name': 'Draq', 'display_name': 'Does Response Answer Query', 'data': {'query': 'when did ww1 end?', 'response': 'World War I ended on November 11, 1918.', 'context': ["context: ['World War I or the First World War (28 July 1914 – 11 November 1918), often abbreviated as WWI, was one of the deadliest global conflicts in history. It was fought between two coalitions, the Allies and the Central Powers. Fighting occurred throughout Europe, the Middle East, Africa, the Pacific, and parts of Asia. An estimated 9 million soldiers were killed in combat, plus another 23 million wounded, while 5 million civilians died as a result of military action, hunger, and disease.", "context: ['World War I or the First World War (28 July 1914 – 11 November 1918), often abbreviated as WWI, was one of the deadliest global conflicts in history. It was fought between two coalitions, the Allies and the Central Powers. Fighting

# Evaluation in another LLM

In [19]:
import openai
# OpenAI クライアントを作成
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# 使用可能なモデル一覧を取得
available_models = client.models.list()

# モデル名を抽出
model_names = [model.id for model in available_models]

# すべての利用可能なモデルを表示
import pprint
pprint.pprint(model_names)

# "o3-mini-high" が含まれているか
print("o3-mini-high" in model_names)

['gpt-4o-mini-audio-preview',
 'gpt-4o-mini-audio-preview-2024-12-17',
 'gpt-4o-mini-realtime-preview',
 'dall-e-2',
 'o1-mini-2024-09-12',
 'o1-preview-2024-09-12',
 'o1-mini',
 'gpt-4o-2024-11-20',
 'o1-preview',
 'whisper-1',
 'dall-e-3',
 'gpt-4-turbo',
 'babbage-002',
 'omni-moderation-latest',
 'omni-moderation-2024-09-26',
 'tts-1-hd-1106',
 'gpt-4o-audio-preview-2024-10-01',
 'gpt-4o-2024-08-06',
 'gpt-4o',
 'gpt-4o-mini-2024-07-18',
 'gpt-4o-2024-05-13',
 'tts-1-hd',
 'gpt-4o-mini',
 'gpt-4-turbo-2024-04-09',
 'gpt-4-0613',
 'tts-1',
 'gpt-3.5-turbo-16k',
 'tts-1-1106',
 'davinci-002',
 'gpt-4-turbo-preview',
 'gpt-3.5-turbo-1106',
 'gpt-4o-mini-realtime-preview-2024-12-17',
 'gpt-4o-audio-preview',
 'gpt-3.5-turbo-instruct',
 'gpt-4',
 'gpt-4o-realtime-preview-2024-10-01',
 'chatgpt-4o-latest',
 'gpt-3.5-turbo-instruct-0914',
 'gpt-3.5-turbo-0125',
 'gpt-4o-audio-preview-2024-12-17',
 'gpt-4o-realtime-preview-2024-12-17',
 'gpt-3.5-turbo',
 'text-embedding-3-large',
 'gpt-4o-

In [20]:
evaluation_prompt = """
You are an expert AI evaluator.
You will evaluate the relevance and accuracy of the given response based on the provided context.

### Evaluation Criteria: ５段階評価
1. **Relevance (1-5):** Does the response directly answer the question?
2. **Accuracy (1-5):** Is the response factually correct given the context?
3. **Justification:** Explain why the response is rated this way.

### Example Format:
Relevance: 4
Accuracy: 5
Justification: The response correctly states the end date of World War I, which matches the context.

### Input:
Query: {query}
Response: {response}
Context: {context}

### Output:
"""

In [21]:
import pandas as pd

# LLM を使って評価を実施する関数
def evaluate_response(record):
    query = record["query"]
    response = record["response"]
    context = "\n".join(record["context"])

    prompt = evaluation_prompt.format(query=query, response=response, context=context)

    completion = client.chat.completions.create(
        model="gpt-4o-mini",  # 必要に応じて変更
        messages=[{"role": "system", "content": "You are an AI assistant evaluating responses."},
                  {"role": "user", "content": prompt}],
        max_tokens=200
    )

    evaluation_text = completion.choices[0].message.content.strip()

    # Relevance と Accuracy のスコアを抽出
    relevance_score = None
    accuracy_score = None
    justification = ""

    for line in evaluation_text.split("\n"):
        if line.startswith("Relevance:"):
            relevance_score = int(line.split(":")[1].strip())
        elif line.startswith("Accuracy:"):
            accuracy_score = int(line.split(":")[1].strip())
        elif line.startswith("Justification:"):
            justification = line.split("Justification:")[1].strip()

    return {
        "Query": query,
        "Response": response,
        "Context": context,
        "Relevance": relevance_score,
        "Accuracy": accuracy_score,
        "Justification": justification
    }

# df_dict のデータを LLM で評価
evaluated_results = [evaluate_response(record) for record in df_dict]

# DataFrame に変換
df_evaluated = pd.DataFrame(evaluated_results)
df_evaluated

Unnamed: 0,Query,Response,Context,Relevance,Accuracy,Justification
0,when did ww1 end?,"World War I ended on November 11, 1918.",context: ['World War I or the First World War ...,5,5,The response directly answers the query by pro...
