In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
from dotenv import load_dotenv

from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

from src.mle import utils as mle_utils

In [2]:
import sys

sys.executable

'C:\\Users\\manua\\.conda\\envs\\rag-advanced\\python.exe'

In [4]:
folder_src = "ai-papers"
index_name = "ai-papers"
llm_emb = "text-embedding-3-small"

In [7]:
load_dotenv()
embeddings = OpenAIEmbeddings(model=llm_emb)

path_db = mle_utils.path_data_interm / index_name / "chroma_langchain_db"
path_db.mkdir(exist_ok=True, parents=True)

path_corpus = mle_utils.path_data_raw / folder_src 

In [29]:
import logging

lst_docs_corpus = []
uuids = []
for file in path_corpus.iterdir():
    loader = PyPDFLoader(path_corpus / file)
    print(file)
    logging.info("Parsing file: %s", file)
    docs = loader.load()
    for doc in docs:
        doc.metadata['file_name'] = file.stem
        doc.metadata['extension'] = file.suffix
        doc.metadata['page'] += 1
        uuids.append(file.stem + "-" + str(doc.metadata['page']))
    logging.info("SUCCESS: Parsed file: %s, obtained %s documents", file, len(docs))
    lst_docs_corpus.extend(docs)
    

c:\users\manua\documents\repos\dslabs\rag-advanced\data\raw\ai-papers\colpali.pdf
c:\users\manua\documents\repos\dslabs\rag-advanced\data\raw\ai-papers\lagllama.pdf
c:\users\manua\documents\repos\dslabs\rag-advanced\data\raw\ai-papers\llama3herd.pdf
c:\users\manua\documents\repos\dslabs\rag-advanced\data\raw\ai-papers\mamba.pdf
c:\users\manua\documents\repos\dslabs\rag-advanced\data\raw\ai-papers\paligemma.pdf
c:\users\manua\documents\repos\dslabs\rag-advanced\data\raw\ai-papers\timesfm.pdf


In [30]:
len(uuids)

242

In [31]:
vector_store = Chroma(
    collection_name=index_name,
    embedding_function=embeddings,
    persist_directory=path_db.as_posix()
)


vector_store.add_documents(documents=lst_docs_corpus, ids=uuids)

['colpali-1',
 'colpali-2',
 'colpali-3',
 'colpali-4',
 'colpali-5',
 'colpali-6',
 'colpali-7',
 'colpali-8',
 'colpali-9',
 'colpali-10',
 'colpali-11',
 'colpali-12',
 'colpali-13',
 'colpali-14',
 'colpali-15',
 'colpali-16',
 'colpali-17',
 'colpali-18',
 'colpali-19',
 'colpali-20',
 'lagllama-1',
 'lagllama-2',
 'lagllama-3',
 'lagllama-4',
 'lagllama-5',
 'lagllama-6',
 'lagllama-7',
 'lagllama-8',
 'lagllama-9',
 'lagllama-10',
 'lagllama-11',
 'lagllama-12',
 'lagllama-13',
 'lagllama-14',
 'lagllama-15',
 'lagllama-16',
 'lagllama-17',
 'lagllama-18',
 'lagllama-19',
 'lagllama-20',
 'lagllama-21',
 'lagllama-22',
 'lagllama-23',
 'llama3herd-1',
 'llama3herd-2',
 'llama3herd-3',
 'llama3herd-4',
 'llama3herd-5',
 'llama3herd-6',
 'llama3herd-7',
 'llama3herd-8',
 'llama3herd-9',
 'llama3herd-10',
 'llama3herd-11',
 'llama3herd-12',
 'llama3herd-13',
 'llama3herd-14',
 'llama3herd-15',
 'llama3herd-16',
 'llama3herd-17',
 'llama3herd-18',
 'llama3herd-19',
 'llama3herd-20',

In [32]:
results = vector_store.similarity_search(
    "how many datasets have been bundled to train lag llama",
    k=2,
)
for res in results:
    print(f"* {res.page_content[:100]} [{res.metadata}]")
    print()

* Lag-Llama
number of series is useful when sampling random windows
from the pretraining corpus. Furth [{'extension': '.pdf', 'page': 5, 'source': 'lagllama'}]

* Lag-Llama
Figure 11: Lag-Llama fine-tuned forecasting examples on the downstream Requests Minute dat [{'extension': '.pdf', 'page': 20, 'source': 'lagllama'}]



In [44]:
from src import chain

chain_basic_rag = chain.rag_basic_with_sources(ChatOpenAI() , vector_store.as_retriever())
response = chain_basic_rag.invoke({'input': "how many datasets are bundle to train lag llama?"})

In [45]:
response.keys()

dict_keys(['input', 'context', 'answer'])

In [54]:
import pandas as pd

df_eval_qs = pd.read_csv(mle_utils.path_data_raw / "eval-questions" / "ai-papers.csv")
df_eval_qs.head(2)

Unnamed: 0,paper,question,ground_truth,source
0,LagLLama,What is the main goal of the LagLLama project?,The LagLLama project aims to optimize large-sc...,"Introduction, Abstract"
1,LagLLama,How does LagLLama achieve improved performance...,LagLLama achieves this by applying data augmen...,"Section 2.3, Section 4"


In [59]:
dataset eval.populate_eval_dataset(df_eval_qs, chain_basic_rag)

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 25
})

In [61]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
    context_precision,
)

df_eval_res = evaluate(
    dataset=dataset,
    metrics=[
        context_relevancy,
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
    llm=llm,
    embeddings=embeddings
)

Evaluating:  81%|██████████████████████████████▋       | 101/125 [02:01<00:28,  1.20s/it]
Exception in thread Thread-6:
Traceback (most recent call last):
  File "C:\Users\manua\.conda\envs\rag-advanced\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\manua\.conda\envs\rag-advanced\lib\site-packages\ragas\executor.py", line 95, in run
    results = self.loop.run_until_complete(self._aresults())
  File "C:\Users\manua\.conda\envs\rag-advanced\lib\asyncio\base_events.py", line 649, in run_until_complete
    return future.result()
  File "C:\Users\manua\.conda\envs\rag-advanced\lib\site-packages\ragas\executor.py", line 83, in _aresults
    raise e
  File "C:\Users\manua\.conda\envs\rag-advanced\lib\site-packages\ragas\executor.py", line 78, in _aresults
    r = await future
  File "C:\Users\manua\.conda\envs\rag-advanced\lib\asyncio\tasks.py", line 571, in _wait_for_one
    return f.result()  # May raise f.exception().
  File "C:\Users\manua\.conda\envs\r

ExceptionInRunner: The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exceptions=False` incase you want to show only a warning message instead.

In [62]:
df_eval_res

NameError: name 'df_eval_res' is not defined