# Basic. Evaluation

* Ingest Pipeline:
    * Simple PDF parsing
    * Page chunking and indexing
* Query Pipeline
    * Basic retriever

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all modules before each cell
# executes, so edits to imported modules (e.g. the local `src` package) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [16]:
import logging
from pathlib import Path
import pickle

from dotenv import load_dotenv
import pandas as pd
from ragas.evaluation import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
    context_precision,
)

from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma

from src import chain, tags, rag_eval

from src.mle import utils as mle_utils

In [3]:
import sys

# Display the path of the Python interpreter backing this kernel, to confirm which
# environment the notebook is running in.
sys.executable

'C:\\Users\\manua\\.conda\\envs\\rag-advanced\\python.exe'

In [4]:
# --- Pipeline configuration (values a reader might tune) ---
# Identifier of this pipeline variant; used in the cached ragas result file name
# and in the trace name.
id_pipeline = "01basic"
# Sub-folder of the raw data directory that holds the source corpus.
folder_src = "ai-papers"
# Name shared by the Chroma collection, the vector-store folder, the evaluation
# question CSV and the evaluation output folder.
index_name = "ai-papers"
# OpenAI embedding model name.
llm_emb = "text-embedding-3-small"
# Value passed as "k" in the retriever's search_kwargs.
retriever_k = 2
# Value passed as "score_threshold" in the retriever's search_kwargs.
# (sic: "treshold" — the name is kept because later cells reference it.)
retriever_treshold = 0.3
# Sampling temperature of the question-answering LLM.
temp = 0.2
# Chat model name used for both the QA LLM and the evaluation LLM.
llm_model = "gpt-4o-mini"

In [35]:
# Load environment variables from a local .env file
# (presumably the API credentials used by the clients below — confirm).
load_dotenv()

# --- Model clients ---
embeddings = OpenAIEmbeddings(model=llm_emb)
llm_qa = ChatOpenAI(model_name=llm_model, temperature=temp)
# The evaluation LLM uses the same model as the QA LLM with a fixed temperature of 0.1.
llm_eval = ChatOpenAI(model_name=llm_model, temperature=0.1)

# --- Input locations ---
path_corpus = mle_utils.path_data_raw / folder_src
path_eval_ds = mle_utils.path_data_raw / "eval-questions"
fln_eval_ds = path_eval_ds / f"{index_name}.csv"

# --- Vector store and evaluation output locations ---
path_db = mle_utils.path_data_interm / index_name / "chroma_langchain_db"
path_eval = mle_utils.path_data_processed / "evals" / index_name
fln_eval_ragas = path_eval / f"ragas-{id_pipeline}.pkl"

# Make sure both writable folders exist; a no-op when they are already there.
for _dir in (path_db, path_eval):
    _dir.mkdir(parents=True, exist_ok=True)

# Query Pipeline

In [6]:
# Open the Chroma collection `index_name` persisted under `path_db`, using
# `embeddings` as its embedding function.
# NOTE(review): nothing in this notebook adds documents to the collection —
# presumably it was populated by a separate ingest step; confirm.
vector_store = Chroma(
    collection_name=index_name,
    embedding_function=embeddings,
    persist_directory=path_db.as_posix()
)


In [8]:

# Retriever over the vector store: similarity search with a score threshold,
# configured with `retriever_k` and `retriever_treshold`.
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": retriever_treshold, "k": retriever_k},
)

# Basic RAG chain; its response exposes the retrieved documents under "context"
# and the generated text under "answer" (both are read below).
chain_basic_rag = chain.rag_basic_with_sources(llm_qa, retriever)

# Smoke test with a single question before running the full evaluation.
response = chain_basic_rag.invoke(
    {"input": "how many time series datasets are used to train lag-llama model?"}
)

print("Context:")
for doc in response["context"]:
    # Fix: the original printed `res.metadata`, but `res` is never defined in this
    # notebook (it only worked through leftover kernel state). The metadata must
    # come from the document currently being iterated.
    print(f"* {doc.page_content[:100]} [{doc.metadata}]")
    print()

print("Answer: ", response["answer"])

Context:
* Lag-Llama
number of series is useful when sampling random windows
from the pretraining corpus. Furth [{'extension': '.pdf', 'page': 4, 'source': 'lagllama'}]

* Lag-Llama
Dtest={xi
Ti+1:Ti+P}D
i=1.
Theunivariate probabilistic time series forecasting problem
inv [{'extension': '.pdf', 'page': 4, 'source': 'lagllama'}]

Answer:  A total of 27 time series datasets are used to train the Lag-Llama model.


# Evaluate

In [7]:
# Evaluation questions: read the full CSV, then keep only the rows whose split
# column (`tags.SPLIT`) equals the TRAIN label.
split_label = mle_utils.Splits.TRAIN.value

df_eval_all = pd.read_csv(fln_eval_ds)
is_selected_split = df_eval_all[tags.SPLIT] == split_label
df_eval_qs = df_eval_all[is_selected_split]
df_eval_qs

Unnamed: 0,paper,question,ground_truth,source,split_
0,Mamba: Linear-Time Sequence Modeling with Sele...,What are the main limitations and risks associ...,While Mamba shows strong performance in variou...,Section 5,1.train
1,PaliGemma,How does PaliGemma perform compared to larger ...,"PaliGemma, with less than 3B parameters, achie...","Introduction, Section 1; Results, Section 4",1.train
2,Llama 3,What ablation studies were conducted on Llama ...,Ablation studies focused on hyperparameters li...,"Ablation Studies, Section 5",1.train
3,Mamba: Linear-Time Sequence Modeling with Sele...,What is the hardware optimization that enables...,Mamba uses a hardware-aware parallel algorithm...,"Abstract, Section 3.3",1.train
4,PaliGemma,How does PaliGemma achieve transferability acr...,PaliGemma uses a flexible fine-tuning approach...,"Transferability, Section 6",1.train
5,Lag-Llama,What is the choice of the distribution head us...,The model uses a Student's t-distribution head...,Section 4.3: Choice of Distribution Head,1.train
6,Lag-Llama,What datasets were used to train and evaluate ...,The model was trained on 27 datasets across do...,Section 5.1: Datasets,1.train
7,ColPali,How does ColPali achieve faster indexing and q...,ColPali directly encodes pages from their imag...,"Section 5.2, Latencies & Memory Footprint",1.train
8,TimesFM,How does TimesFM handle long-horizon forecasts...,TimesFM uses longer output patches during deco...,"Model Architecture, Section 4",1.train
9,TimesFM,What novelties does the TimesFM introduce in t...,The main novelties include a patched-decoder s...,"Abstract, Introduction, Model Architecture",1.train


In [11]:
# Build the evaluation dataset from the question frame and the RAG chain; the
# result is what gets passed to ragas `evaluate` in the next cell.
# NOTE(review): presumably this invokes the chain once per question to collect
# answers and contexts — confirm in src.rag_eval.
dataset = rag_eval.populate_eval_dataset(
    df_eval_qs,
    chain_basic_rag
)

# References:
# https://www.databricks.com/blog/LLM-auto-eval-best-practices-RAG
# https://docs.ragas.io/en/latest/references/evaluation.html

In [44]:
# Metrics computed by ragas. `metrics` is the single source of truth: it is passed
# to `evaluate` here and read again later when the scores are traced.
metrics = [
    context_relevancy,
    context_precision,
    context_recall,
    faithfulness,
    answer_relevancy,
]

# The evaluation result is cached on disk so that re-running the notebook does not
# repeat the evaluation.
# NOTE: the cache file name depends only on `id_pipeline`; delete the file to force
# a fresh evaluation after changing the models, retriever settings or questions.
if fln_eval_ragas.exists():
    # SECURITY: pickle.load can execute arbitrary code — only load files that were
    # written by this notebook, never files from an untrusted source.
    with open(fln_eval_ragas, "rb") as file:
        res_ragas_eval = pickle.load(file)
else:
    res_ragas_eval = evaluate(
        dataset=dataset,
        metrics=metrics,  # reuse the list above instead of duplicating it
        llm=llm_eval,
        embeddings=embeddings,
    )

    # Fix: the original wrote to `fln_ragas_eval`, a name that is never defined
    # (the path variable is `fln_eval_ragas`). On a cold cache this raised a
    # NameError after the evaluation had already run, and the result was not saved.
    with open(fln_eval_ragas, "wb") as file:
        pickle.dump(res_ragas_eval, file)

res_ragas_eval

{'context_relevancy': 0.0325, 'context_precision': 0.7500, 'context_recall': 0.5278, 'faithfulness': 0.5625, 'answer_relevancy': 0.5502}

In [10]:
# Convert the ragas result into a DataFrame for per-question inspection; per the
# rendered output it holds question, answer, contexts, ground truth and one
# column per metric.
df_ragas_eval = res_ragas_eval.to_pandas()
df_ragas_eval

Unnamed: 0,question,answer,contexts,ground_truth,context_relevancy,context_precision,context_recall,faithfulness,answer_relevancy
0,What are the main limitations and risks associ...,No relevant information has been founded relat...,[Model Arch. Layer Acc.\nS4 No gate S4 18.3\n-...,While Mamba shows strong performance in variou...,0.012195,0.0,0.0,0.0,0.0
1,How does PaliGemma perform compared to larger ...,"PaliGemma, despite being a sub-3B vision-langu...",[July 2024\nPaliGemma: A versatile 3B VLM for ...,"PaliGemma, with less than 3B parameters, achie...",0.018405,1.0,1.0,1.0,0.811428
2,What ablation studies were conducted on Llama ...,No relevant information has been founded relat...,[Model Preference\nPM for Llama 3 8B 60.0%\nSt...,Ablation studies focused on hyperparameters li...,0.015385,0.0,0.0,0.0,0.0
3,What is the hardware optimization that enables...,No relevant information has been founded relat...,[/uni00000015/uni00000014/uni0000001a/uni00000...,Mamba uses a hardware-aware parallel algorithm...,0.008065,1.0,0.0,0.0,0.0
4,How does PaliGemma achieve transferability acr...,PaliGemma achieves transferability across diff...,[PaliGemma: A versatile 3B VLM for transfer\nL...,PaliGemma uses a flexible fine-tuning approach...,0.020202,1.0,0.333333,0.5625,1.0
5,What is the choice of the distribution head us...,The choice of the distribution head used in La...,"[Lag-Llama\nas in LLaMA (Touvron et al., 2023)...",The model uses a Student's t-distribution head...,0.016529,1.0,1.0,1.0,0.856607
6,What datasets were used to train and evaluate ...,No relevant information has been founded relat...,[Finetuned Multilingual Long context Tool use ...,The model was trained on 27 datasets across do...,0.005376,0.0,0.0,0.5,0.0
7,How does ColPali achieve faster indexing and q...,ColPali achieves faster indexing and querying ...,[Figure 2: ColPali simplifies document retriev...,ColPali directly encodes pages from their imag...,0.043478,1.0,1.0,1.0,1.0
8,How does TimesFM handle long-horizon forecasts...,TimesFM handles long-horizon forecasts by util...,[A decoder-only foundation model for time-seri...,TimesFM uses longer output patches during deco...,0.027027,1.0,1.0,0.75,0.934344
9,What novelties does the TimesFM introduce in t...,No relevant information has been founded relat...,[A decoder-only foundation model for time-seri...,The main novelties include a patched-decoder s...,0.028409,1.0,1.0,0.0,0.0


# Trace

In [45]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports
# cell at the top so all dependencies are visible in one place.
from langfuse import Langfuse

# Langfuse() is constructed without arguments — presumably it picks up its
# credentials from the environment variables loaded by load_dotenv(); confirm.
langfuse = Langfuse()
# Check the credentials before tracing; the cell displays True on success.
langfuse.auth_check()

True

In [64]:
# Send the per-question ragas scores to Langfuse under the name
# "<id_pipeline>_ragas", passing the metric names taken from `metrics`.
# NOTE(review): `trace_evaluation` is neither defined nor imported anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel. Presumably it
# lives in `src.rag_eval` (i.e. `rag_eval.trace_evaluation`) — confirm and qualify
# or import it.
df_ragas_eval_traces = trace_evaluation(
    langfuse,
    f"{id_pipeline}_ragas",
    df_ragas_eval,
    [metric.name for metric in metrics]
)
