In [1]:
import pandas as pd
import sys

sys.path.insert(0, "../")
from rurage import RAGEModelConfig, RAGESetConfig, RAGEvaluator, RAGEnsemble

In [2]:
# Build one RAGEModelConfig per model under evaluation. Each config names:
# * context_col - the column with the context on which the answer was generated
# * answer_col  - the column with the generated model answer
# All three models here use the same context column and differ only in the
# answer column.
models_cfg = [
    RAGEModelConfig(context_col="context_top5", answer_col=answer_col)
    for answer_col in ("GPT4o_top5", "gpt35_top5", "cotype_light_top5")
]

In [3]:
# Columns holding the label given to each model's answer.
label_cols = ["r_GPT4o_top5", "r_gpt35_top5", "r_cotype_light_top5"]

golden_set_raw = pd.read_csv("golden_set.csv")

# The dataset contains multiclass markup, but only 0/1 labels are wanted:
# drop every row in which any label column holds the "0,5" value, then
# renumber the index from zero.
has_only_binary_labels = (golden_set_raw[label_cols] != "0,5").all(axis=1)
validation_set = golden_set_raw[has_only_binary_labels].reset_index(drop=True)

In [4]:
# Initialize the configuration of the evaluation set:
# * golden_set: the validation set as a pd.DataFrame
# * question_col: the name of the question column
# * golden_answer_col: the name of the golden answer column
# * models_cfg: the list of model configs
validation_set_cfg = RAGESetConfig(
    golden_set=validation_set,
    question_col="Question",
    golden_answer_col="Golden Answer",
    models_cfg=models_cfg,
)

In [5]:
# Initialize the evaluator from the evaluation-set configuration
# (golden set, question/answer columns and the per-model configs).
rager = RAGEvaluator(golden_set_cfg=validation_set_cfg)

In [6]:
# Run the correctness evaluation and unpack its two results: the total report
# and the pointwise reports requested via pointwise_report=True.
# NOTE(review): pointwise_reports is indexed later in the notebook, presumably
# one report per configured model - confirm against the rurage docs.
total_report, pointwise_reports = rager.evaluate_correctness(pointwise_report=True)

Starting correctness evaluation
Initializing the NLI model: MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7
Initializing the similarity model: intfloat/multilingual-e5-large


Model #:   0%|          | 0/3 [00:00<?, ?it/s]

### Ensemble training

In [7]:
# One array of labels per evaluated model, listed in the same order as the
# models in models_cfg (GPT4o, gpt35, cotype_light).
labels = [
    validation_set[label_col].values
    for label_col in ("r_GPT4o_top5", "r_gpt35_top5", "r_cotype_light_top5")
]

In [9]:
# Create an ensemble for the correctness task. No model_path is passed here;
# the ensemble is fitted in the cells below.
ensemble = RAGEnsemble(ensemble_type="correctness")

In [10]:
# Build the train/test features and targets from the pointwise reports, the
# per-model labels and the evaluation-set configuration.
X_train, y_train, X_test, y_test = ensemble.prepare_data_for_study(
    pointwise_reports=pointwise_reports, labels=labels, set_config=validation_set_cfg
)

In [11]:
# Fit the ensemble on the training split, passing the test split as well.
# NOTE(review): optimize=True presumably enables the threshold optimization
# reported in the cell output - confirm against the rurage docs.
ensemble.fit(X_train, y_train, X_test, y_test, optimize=True)

Starting threshold optimization
Threshold for the correctness task: 0.35035035035035034


In [12]:
# Predict labels for the test split with the fitted ensemble.
y_pred = ensemble.predict(X_test)

In [14]:
from sklearn.metrics import classification_report

In [16]:
# Score the predictions against the test labels (cast to int first).
# print() is used because classification_report returns a plain-text table.
print(classification_report(y_test.astype(int), y_pred))

              precision    recall  f1-score   support

           0       0.92      0.81      0.86       333
           1       0.87      0.95      0.91       450

    accuracy                           0.89       783
   macro avg       0.90      0.88      0.89       783
weighted avg       0.89      0.89      0.89       783



### Ensemble inference

The example below shows inference on a single sample, but batch inference is also possible.

In [23]:
# NOTE: this rebinds validation_set to the single-sample frame, replacing the
# golden set loaded earlier in the notebook.
validation_set = pd.read_csv("one_sample.csv")

# Show only the question, context, golden answer and model answer columns.
preview_cols = ["Question", "context_top5", "Golden Answer", "cotype_light_top5"]
validation_set[preview_cols]

Unnamed: 0,Question,context_top5,Golden Answer,cotype_light_top5
0,в каком году была основана компания гугл,Google LLC — транснациональная корпорация из С...,Компания Google была основана 4 сентября 1998 ...,Компания Google была основана в 1998 году.


In [24]:
# Only one model is evaluated at inference time, so the config list has a
# single entry: its context column and its answer column.
models_cfg = [
    RAGEModelConfig(context_col="context_top5", answer_col="cotype_light_top5")
]

# Evaluation-set configuration for the single-sample frame: the data, the
# question column, the golden answer column and the model configs.
validation_set_cfg = RAGESetConfig(
    golden_set=validation_set,
    question_col="Question",
    golden_answer_col="Golden Answer",
    models_cfg=models_cfg,
)

In [25]:
# Re-create the evaluator for the single-sample configuration and run the
# correctness evaluation; the total report is discarded (`_`) and only the
# pointwise reports are kept for the ensemble.
rager = RAGEvaluator(golden_set_cfg=validation_set_cfg)
_, pointwise_reports = rager.evaluate_correctness(pointwise_report=True)

Starting correctness evaluation
Initializing the NLI model: MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7
Initializing the similarity model: intfloat/multilingual-e5-large


Model #:   0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
# Create the correctness ensemble with model_path set to "rurage_ensemble.cbm".
# NOTE(review): presumably this loads a previously saved ensemble instead of
# requiring fit() - confirm against the rurage docs.
ensemble = RAGEnsemble(ensemble_type="correctness", model_path="rurage_ensemble.cbm")

In [27]:
# Build the inference features from the first pointwise report (only one
# model is configured for this run).
X = ensemble.prepare_data_for_inference(data=pointwise_reports[0], set_config=validation_set_cfg)

In [30]:
# Display the prepared feature frame for the sample.
X

Unnamed: 0,nli,sim,unigram_overlap_precision,unigram_overlap_recall,unigram_overlap_f1,bigram_overlap_precision,bigram_overlap_recall,bigram_overlap_f1,rouge_precision,rouge_recall,rouge_f1,bleu
0,0,0.974703,0.714286,0.714286,0.714286,0.5,0.428571,0.461538,1.0,0.714286,0.833333,0.67032


In [28]:
# Predict the correctness label for the sample; the result is displayed as the
# cell output.
ensemble.predict(X)

array(['1'], dtype=object)