In [None]:
# Optional: uncomment the two lines below to put the parent directory first on
# the module search path (e.g. to import a local checkout of `rurage` rather
# than an installed copy).
# import sys

# sys.path.insert(0, "../")

In [11]:
import pandas as pd
from rurage import RAGEvaluator, RAGESetConfig, RAGEModelConfig, report

1. For each model that needs to be evaluated, you need to initialize a config containing:
* the name of the column with the context on which the answer was generated
* the name of the column with the generated model answer

In [3]:
# One config per evaluated model. All of them share the same context column;
# only the column holding the generated answers differs.
models_cfg = [
    RAGEModelConfig(context_col="context_top5", answer_col=answer_col)
    for answer_col in ("GPT4o_top5", "gpt35_top5", "cotype_light_top5")
]

2. Initialize the configuration of the evaluation set:
* Validation set as a `pd.DataFrame`
* Name of the question column
* Name of the golden answer column
* List of model configs

In [4]:
# Load the golden set from a CSV file located in the current working directory.
validation_set = pd.read_csv("golden_set.csv")
# Tell the evaluator which columns hold the questions and the reference
# (golden) answers, and which model configs should be scored against them.
validation_set_cfg = RAGESetConfig(
    golden_set=validation_set,
    question_col="Question",
    golden_answer_col="Golden Answer",
    models_cfg=models_cfg,
)

3. Initialize the evaluator by passing the collected configuration:

In [5]:
# The evaluator receives the whole evaluation-set config (data + model configs).
rager = RAGEvaluator(golden_set_cfg=validation_set_cfg)

4. Run a comprehensive evaluation (Correctness, Faithfulness, Relevance) for each model:

In [16]:
# The result is indexed by report type; later cells read its
# "correctness_report" entry, which holds one report per evaluated model.
comprehensive_report = rager.comprehensive_evaluation()

Starting correctness evaluation
The NLI model has alredy been loaded.
The similarity model has alredy been loaded.


Model #:   0%|          | 0/3 [00:00<?, ?it/s]

Starting faithfulness evaluation
The NLI model has alredy been loaded.


Model #:   0%|          | 0/3 [00:00<?, ?it/s]

Starting relevance evaluation
The similarity model has alredy been loaded.


Model #:   0%|          | 0/3 [00:00<?, ?it/s]

5.a. You can print a specific report using the matching `report` helper (here, `report.correctness_report`):

In [19]:
# Print the correctness metrics of every evaluated model, one report at a time.
for single_report in comprehensive_report["correctness_report"]:
    report.correctness_report(single_report)

[GPT4o_top5]
	Entailment score: 0.3719107896323086
	Neutral score: 0.28933092224231466
	Contradiction score: 0.3387582881253767


	Similarity score: 0.8344792


	Token overlap (1-gram) precision: 0.14285714285714285
	Token overlap (1-gram) recall: 0.11764705882352941
	Token overlap (1-gram) F1: 0.13333333333333333


	Token overlap (2-gram) precision: 0.0
	Token overlap (2-gram) recall: 0.0
	Token overlap (12-gram) F1: 0.0


	ROUGE-L precision: 0.2857142857142857
	ROUGE-L recall: 0.13333333333333333
	ROUGE-L F1: 0.1904761904761905


	BLEU: 0.013349307984130428



[gpt35_top5]
	Entailment score: 0.47860156720916214
	Neutral score: 0.26702833031946954
	Contradiction score: 0.2543701024713683


	Similarity score: 0.90089774


	Token overlap (1-gram) precision: 0.3333333333333333
	Token overlap (1-gram) recall: 0.23809523809523808
	Token overlap (1-gram) F1: 0.2857142857142857


	Token overlap (2-gram) precision: 0.1
	Token overlap (2-gram) recall: 0.058823529411764705
	Token overlap (12-gr

5.b. You can also convert a report to a dict:

In [20]:
from dataclasses import asdict

In [21]:
# Inspect the raw report object of the first model in the correctness list.
comprehensive_report["correctness_report"][0]

RAGEReport(report_name='GPT4o_top5', entailment_score=0.3719107896323086, neutral_score=0.28933092224231466, contradiction_score=0.3387582881253767, similarity_score=0.8344792, unigram_overlap_precision=0.14285714285714285, unigram_overlap_recall=0.11764705882352941, unigram_overlap_f1=0.13333333333333333, bigram_overlap_precision=0.0, bigram_overlap_recall=0.0, bigram_overlap_f1=0.0, rouge_precision=0.2857142857142857, rouge_recall=0.13333333333333333, rouge_f1=0.1904761904761905, bleu_score=0.013349307984130428)

In [22]:
# asdict() turns the report dataclass into a plain dict of field name -> value.
asdict(comprehensive_report["correctness_report"][0])

{'report_name': 'GPT4o_top5',
 'entailment_score': 0.3719107896323086,
 'neutral_score': 0.28933092224231466,
 'contradiction_score': 0.3387582881253767,
 'similarity_score': 0.8344792,
 'unigram_overlap_precision': 0.14285714285714285,
 'unigram_overlap_recall': 0.11764705882352941,
 'unigram_overlap_f1': 0.13333333333333333,
 'bigram_overlap_precision': 0.0,
 'bigram_overlap_recall': 0.0,
 'bigram_overlap_f1': 0.0,
 'rouge_precision': 0.2857142857142857,
 'rouge_recall': 0.13333333333333333,
 'rouge_f1': 0.1904761904761905,
 'bleu_score': 0.013349307984130428}

6. If there is no need to run a comprehensive evaluation, you can use different evaluations separately:

In [None]:
# Each evaluation can be run on its own; the calls are independent of each
# other, so keep only the ones you need.
correctness_report = rager.evaluate_correctness()
faithfulness_report = rager.evaluate_faithfulness()
relevance_report = rager.evaluate_relevance()

7. For the NLI and similarity scores you can specify your own models. The defaults are:

```python
correctness_report = rager.evaluate_correctness(
    nli_model_name="MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7",
    sim_model_name="intfloat/multilingual-e5-large",
)
```