In [1]:
import os
import sys
# Put the parent directory of the working directory on sys.path (the later
# `src.*` imports presumably rely on it — confirm against the repo layout).
project_root = os.path.abspath('../')
# Guard against duplicate entries when this cell is re-executed in the same kernel.
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
import src.config
import yaml
import pandas as pd
import torch
from typing import Optional, Dict
from src.text_gen.llm import load_llm_and_qa_tmpl
from src.embedders.embedder import load_embedder
from src.storing.storing import load_retriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.llms.huggingface import HuggingFaceLLM

In [4]:
from ragas.dataset_schema import EvaluationDataset
from ragas.llms import LlamaIndexLLMWrapper
from ragas.integrations.llama_index import evaluate
from ragas.metrics import (
    AnswerRelevancy,
    Faithfulness,
    ContextPrecision,
    ContextRecall
)

In [5]:
# Hyper-parameters: parse ../config.yaml into the `hyps` dict used by every later cell.
# The encoding is pinned to UTF-8 so that non-ASCII values in the config are
# decoded the same way on every platform instead of depending on the locale default.
with open('../config.yaml', 'r', encoding='utf-8') as f:
    hyps = yaml.safe_load(f)

# Value of hyps['paths']['logs_db']; passed to the loader helpers below as `logs_path`.
logs_path: str = hyps['paths']['logs_db']

In [6]:
# Evaluation data: read the JSON dataset, rename its columns to the names used
# for the ragas dataset below, and keep only the first `n` rows.
n = 10

dataset_path = f"../{hyps['ragas']['ragas_dataset']}"
column_map = {'question': 'user_input', 'answer': 'reference'}

df = (
    pd.read_json(dataset_path)
    .rename(columns=column_map)
    .iloc[:n]
)

# Only the question/reference pair is handed to ragas.
eval_columns = ['user_input', 'reference']
ragas_dataset = EvaluationDataset.from_pandas(df[eval_columns])

In [None]:
# Embedding models: one loaded from the query settings, one from the chunk settings.
embed_cfg = hyps['embed_model']

query_embed_model_name: str = embed_cfg['query_model_name']
query_embed_kwargs: Optional[Dict] = embed_cfg['query_kwargs']
chunk_embed_model_name: str = embed_cfg['chunk_model_name']
chunk_embed_kwargs: Optional[Dict] = embed_cfg['chunk_kwargs']

query_embed_model = load_embedder(
    query_embed_model_name,
    logs_path=logs_path,
    model_kwargs=query_embed_kwargs,
)
chunk_embed_model = load_embedder(
    chunk_embed_model_name,
    logs_path=logs_path,
    model_kwargs=chunk_embed_kwargs,
)

In [None]:
# Retriever: built by the project's `load_retriever` from the vector index,
# the BM25 text index and the per-search `k` values read from the config.
paths_cfg = hyps['paths']
retriever_cfg = hyps['retriever']

vector_index_path: str = paths_cfg['vector_index']
k_vector_search: int = retriever_cfg['k_vector_search']
article_path: str = paths_cfg['dataset_w_articles']
bm25_index_path: str = paths_cfg['text_index']
k_text_search: int = retriever_cfg['k_text_search']

# Positional order matters: it mirrors the call signature used by load_retriever.
retriever_args = (
    vector_index_path,
    chunk_embed_model,
    logs_path,
    k_vector_search,
    article_path,
    bm25_index_path,
    k_text_search,
)
retriever = load_retriever(*retriever_args)

In [7]:
# Generator LLM and its QA prompt template, both returned by the project loader.
llm_cfg = hyps['llm_model']

llm_model_name: str = llm_cfg['model_name']
max_new_tokens: int = llm_cfg['max_new_tokens']
cache_dir: str = hyps['paths']['cache_dir']
# Re-read here as well (same config key as in the hyper-parameter cell).
logs_path: str = hyps['paths']['logs_db']

# Arguments are passed positionally, in the order the loader is called with.
llm_loader_args = (llm_model_name, max_new_tokens, cache_dir, logs_path)
llm, qa_prompt_tmpl = load_llm_and_qa_tmpl(*llm_loader_args)

INFO 12-25 01:31:30 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='nvidia/Llama3-ChatQA-2-8B', speculative_config=None, tokenizer='nvidia/Llama3-ChatQA-2-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32000, download_dir='./hfcache', load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=nvidia/Llama3-ChatQA-2-8B, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True

Loading pt checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 12-25 01:31:55 model_runner.py:1067] Loading model weights took 14.9888 GB
INFO 12-25 01:32:05 gpu_executor.py:122] # GPU blocks: 8296, # CPU blocks: 512
INFO 12-25 01:32:05 gpu_executor.py:126] Maximum concurrency for 32000 tokens per request: 4.15x


In [8]:
# RAG query engine: combines the retriever, the generator LLM and the QA
# prompt template into a single RetrieverQueryEngine.
query_engine_kwargs = dict(
    retriever=retriever,
    embed_model=query_embed_model,
    llm=llm,
    text_qa_template=qa_prompt_tmpl,
)
query_engine = RetrieverQueryEngine.from_args(**query_engine_kwargs)

In [None]:
# Evaluator (judge) LLM used by the ragas metrics.
eval_model_name: str = hyps['ragas']['llm_evaluator']

# Greedy, single-beam decoding. Sampling knobs (temperature / top_k / top_p)
# are deliberately omitted: they have no effect when do_sample is False and
# only make transformers warn about generation flags that are set but unused.
generate_config = {
    "do_sample": False,
    "num_beams": 1,
}

eval_llm = HuggingFaceLLM(
    model_name=eval_model_name,
    tokenizer_name=eval_model_name,
    context_window=4096,
    max_new_tokens=512,
    generate_kwargs=generate_config,
    model_kwargs={"torch_dtype": torch.bfloat16},
    # Pinned to the second CUDA device (index 1).
    # NOTE(review): presumably to keep it off the device used by the generator LLM — confirm.
    device_map="cuda:1",
)
# ragas-side wrapper around the LlamaIndex LLM; this is what the metrics receive.
evaluator_llm = LlamaIndexLLMWrapper(eval_llm)

In [10]:
# Metrics to compute; each one is instantiated with the evaluator LLM.
metric_classes = (Faithfulness, AnswerRelevancy)
metrics = [metric_cls(llm=evaluator_llm) for metric_cls in metric_classes]

In [None]:
# Evaluation: run every dataset question through the query engine and score
# the answers with the configured metrics.
# `llm` receives the raw LlamaIndex LLM (`eval_llm`), not `evaluator_llm`:
# the ragas llama_index integration wraps the `llm` argument in
# LlamaIndexLLMWrapper itself, so passing the already-wrapped object would
# produce a wrapper around a wrapper for any metric that has no LLM of its own.
result = evaluate(
    query_engine=query_engine,
    metrics=metrics,
    llm=eval_llm,
    embeddings=query_embed_model,
    dataset=ragas_dataset,
)

In [19]:
# Summary of the run: generator model, retrieval sizes, evaluator model and
# the metric scores returned by the evaluation.
report = f"""
Модель генерации: {llm_model_name}
Кол-во чанков из векторного поиска: {k_vector_search}
Кол-во чанков из текстового поиска: {k_text_search}
Модель оценщик: {hyps['ragas']['llm_evaluator']}
Метрики: {result}
"""
print(report)


Модель генерации: nvidia/Llama3-ChatQA-2-8B
Кол-во чанков из векторного поиска: 3
Кол-во чанков из текстового поиска: 3
Модель оценщик: google/gemma-2-9b-it
Метрики: {'faithfulness': 0.7778, 'answer_relevancy': 0.5426}

