In [1]:
!pip install -q langchain
!pip install -q ragas

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.8/48.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m220.8/220.8 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.1/71.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

# GPT evaluation


## Evaluating with GPT4

Ragas uses GPT-3.5 by default, but using GPT-4 for evaluation can improve the results, so let's use it for the `Faithfulness` metric.

To start off, we initialise the GPT-4 `chat_model` from LangChain and the ada embeddings.

In [2]:
# Make sure you have your OpenAI API key ready.
import os
from getpass import getpass

# Keep a key that is already exported in the environment; otherwise prompt for
# it interactively so the secret is never written into the notebook source.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [9]:
from langchain.chat_models import ChatOpenAI

# LangChain chat model bound to the "gpt-4" model name; it is wrapped for Ragas
# in a later cell.
gpt4 = ChatOpenAI(
    model_name="gpt-4",
)

In [10]:
from ragas.llms import LangchainLLM

# Wrap the LangChain chat model in the Ragas LLM adapter so it can be assigned
# to a metric's `llm` attribute.
gpt4_wrapper = LangchainLLM(
    llm=gpt4,
)

Substitute the LLM in the `Metric` instance with the newly created GPT-4 model.

In [11]:
from ragas.metrics import faithfulness

# Replace the LLM held by the imported `faithfulness` metric object with the
# GPT-4 wrapper. The attribute is set on the imported object itself, so the
# change is visible wherever this same object is used later in the notebook.
faithfulness.llm = gpt4_wrapper

In [12]:
# data
from datasets import load_dataset

# Load the "ragas_eval" configuration of the `explodinggradients/fiqa` dataset.
fiqa_eval = load_dataset(
    path="explodinggradients/fiqa",
    name="ragas_eval",
)

# Bare last expression so the notebook displays the dataset summary.
fiqa_eval

DatasetDict({
    baseline: Dataset({
        features: ['question', 'ground_truths', 'answer', 'contexts'],
        num_rows: 30
    })
})

In [13]:
# evaluate
from ragas import evaluate

# Score only the first five rows of the "baseline" split with the
# `faithfulness` metric.
baseline_sample = fiqa_eval["baseline"].select(range(5))
result = evaluate(baseline_sample, metrics=[faithfulness])

# Bare last expression so the notebook displays the scores.
result

evaluating with [faithfulness]


100%|██████████| 1/1 [02:58<00:00, 178.61s/it]


{'faithfulness': 0.9500}

## Evaluating with Open-Source LLMs


In [None]:
# start the vLLM server
# NOTE(review): this launches the server as a foreground shell command, so the
# cell presumably keeps running for as long as the server is up; it may need to
# be started from a separate terminal before the cells below are executed.
# TODO confirm.
# The port (8080) must match `inference_server_url` in the following cell.
!python -m vllm.entrypoints.openai.api_server \
    --model HuggingFaceH4/zephyr-7b-alpha \
    --host 0.0.0.0 \
    --port 8080

In [None]:
from langchain.chat_models import ChatOpenAI
from ragas.llms import LangchainLLM

# Base URL of the locally running vLLM server (OpenAI-style `/v1` prefix).
inference_server_url = "http://localhost:8080/v1"

# create vLLM Langchain instance
# The server is started through vLLM's OpenAI API entrypoint, so the regular
# ChatOpenAI client is used and simply pointed at the local URL; the API key is
# only a placeholder value.
chat = ChatOpenAI(
    model="HuggingFaceH4/zephyr-7b-alpha",
    openai_api_key="no-key",
    openai_api_base=inference_server_url,
    # The evaluator has to produce multi-sentence outputs; a very small limit
    # (such as 5 tokens) truncates them and makes the scores meaningless.
    # 512 is a starting point; raise it if completions are still cut off.
    max_tokens=512,
    # Greedy decoding keeps evaluation runs as repeatable as possible.
    temperature=0,
)

# use the Ragas LangchainLLM wrapper to create a RagasLLM instance
vllm = LangchainLLM(llm=chat)

In [None]:
from ragas.metrics import (
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)
from ragas.metrics.critique import harmfulness

# change the LLM
# Point each imported metric object at the vLLM-backed wrapper, replacing
# whatever LLM it currently holds.
for metric in (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    harmfulness,
):
    metric.llm = vllm

In [None]:
# evaluate
from ragas import evaluate

# showing only 5 for demonstration
demo_rows = fiqa_eval["baseline"].select(range(5))
result = evaluate(demo_rows, metrics=[faithfulness])

# Bare last expression so the notebook displays the scores.
result