In [1]:
from llama_index.core import (
    Document,
    Settings,
    SimpleDirectoryReader,
    VectorStoreIndex
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
import pandas as pd
from trulens_eval import Tru, TruLlama
from utils import (
    build_automerging_index,
    build_sentence_window_index,
    evaluate_engine,
    get_automerging_query_engine,
    get_feedback_func,
    get_sentence_window_query_engine,
    setup 
)
import warnings

In [2]:
warnings.filterwarnings('ignore')
pd.set_option("display.max_colwidth", None)
setup() 

In [3]:
!mkdir -p '../data'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O '../data/paul_graham_essay.txt'

--2024-05-08 13:35:38--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8003::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75042 (73K) [text/plain]
Saving to: ‘../data/paul_graham_essay.txt’


2024-05-08 13:35:38 (3.94 MB/s) - ‘../data/paul_graham_essay.txt’ saved [75042/75042]



In [4]:
documents = SimpleDirectoryReader(
    input_files=["../data/paul_graham_essay.txt"]
).load_data()

In [5]:
len(documents), type(documents)

(1, list)

In [6]:
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

In [7]:
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

In [8]:
response = query_engine.query("What is Paul Graham known for?")
str(response)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'Paul Graham is known for his work in programming, particularly for his involvement in creating a new dialect of Lisp called Arc. Additionally, he is known for his essays, which he started publishing online and eventually compiled into a book titled "Hackers & Painters."'

In [9]:
tru = Tru()
tru.reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of Tru` to prevent this.


In [10]:
with open('eval_questions.txt', 'r') as file:
    eval_questions = file.read().split("\n")

In [11]:
feedbacks = get_feedback_func()

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charlescamp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
evaluate_engine(
    query_engine=query_engine,
    feedbacks=feedbacks,
    app_id="Direct Query Engine",
    eval_questions=eval_questions,
    verbose=False
)

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

TruLlama(tru_class_info=trulens_eval.tru_llama.TruLlama, app_id='Direct Query Engine', tags='-', metadata={}, feedback_definitions=[], feedback_mode=<FeedbackMode.WITH_APP_THREAD: 'with_app_thread'>, root_class=llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine, app=<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine object at 0x37553b6d0>, initial_app_loader_dump=None, app_extra_json={}, feedbacks=[FeedbackDefinition(Answer Relevance,
	selectors={'prompt': Lens().__record__.main_input, 'response': Lens().__record__.main_output},
	if_exists=None
), FeedbackDefinition(Context Relevance,
	selectors={'prompt': Lens().__record__.main_input, 'response': Lens().__record__.app.query.rets.source_nodes[:].node.text},
	if_exists=None
), FeedbackDefinition(Groundedness,
	selectors={'source': Lens().__record__.app.query.rets.source_nodes[:].node.text, 'statement': Lens().__record__.main_output},
	if_exists=None

In [13]:
records, feedback = tru.get_records_and_feedback(app_ids=[])

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
records[["input", "output"] + feedback]

Unnamed: 0,input,output,Context Relevance,Answer Relevance,Groundedness
0,"""Can you name the first book of the Bible?""","""Genesis""",0.0,1.0,1.0
1,"""Who is considered the author of the Book of Genesis?""","""The author of the Book of Genesis is traditionally believed to be Moses.""",0.0,0.9,0.0
2,"""In which book of the Bible can the Ten Commandments be found?""","""Exodus""",0.0,1.0,1.0
3,"""What is the name of the city where Jesus was born according to the New Testament?""","""Bethlehem.""",0.0,1.0,0.0
4,"""Can you name the four Gospels in the New Testament?""","""Matthew, Mark, Luke, and John""",0.0,1.0,0.0
5,"""Which prophet is known for being swallowed by a big fish?""","""Jonah""",0.0,1.0,0.0
6,"""What is the shortest verse in the Bible?""","""Jesus wept.""",0.0,1.0,0.0
7,"""Who betrayed Jesus according to the New Testament?""","""Judas Iscariot betrayed Jesus according to the New Testament.""",0.0,1.0,0.0
8,"""In which book of the Bible can the story of Noah's Ark be found?""","""Genesis""",,1.0,
9,"""Can you name one of the major prophets in the Old Testament?""","""Isaiah""",,,


In [15]:
window_index = build_sentence_window_index(documents, Settings.llm)
window_engine = get_sentence_window_query_engine(window_index)

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
evaluate_engine(
    query_engine=window_engine,
    feedbacks=feedbacks,
    app_id="Window Engine",
    eval_questions=eval_questions,
    verbose=False
)

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

TruLlama(tru_class_info=trulens_eval.tru_llama.TruLlama, app_id='Window Engine', tags='-', metadata={}, feedback_definitions=[], feedback_mode=<FeedbackMode.WITH_APP_THREAD: 'with_app_thread'>, root_class=llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine, app=<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine object at 0x32378f290>, initial_app_loader_dump=None, app_extra_json={}, feedbacks=[FeedbackDefinition(Answer Relevance,
	selectors={'prompt': Lens().__record__.main_input, 'response': Lens().__record__.main_output},
	if_exists=None
), FeedbackDefinition(Context Relevance,
	selectors={'prompt': Lens().__record__.main_input, 'response': Lens().__record__.app.query.rets.source_nodes[:].node.text},
	if_exists=None
), FeedbackDefinition(Groundedness,
	selectors={'source': Lens().__record__.app.query.rets.source_nodes[:].node.text, 'statement': Lens().__record__.main_output},
	if_exists=None

In [17]:
merging_index = build_automerging_index(documents, Settings.llm, chunk_sizes=[1024, 256, 64])
merging_engine = get_automerging_query_engine(merging_index)

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
evaluate_engine(
    query_engine=merging_engine,
    feedbacks=feedbacks,
    app_id="Merging Engine",
    eval_questions=eval_questions,
    verbose=False
)

> Merging 3 nodes into parent node.
> Parent node id: 0b1be9dd-e5fa-43f0-9f3c-1dd6bacae030.
> Parent node text: rather than letting others define if for you.  Incorpora te dramatic devices into your public 
ge...

> Merging 1 nodes into parent node.
> Parent node id: 90c41ad9-37a1-4c23-9640-14768c414c00.
> Parent node text: rather than letting others define if for you.  Incorpora te dramatic devices into your public 
ge...



Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

> Merging 1 nodes into parent node.
> Parent node id: 86cae63a-5ca0-4279-b238-aa2c78341bc0.
> Parent node text: They are wolves in lambs’ clothing.  Choose your 
victims and opponents carefully, then – never o...



Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

> Merging 3 nodes into parent node.
> Parent node id: 0b1be9dd-e5fa-43f0-9f3c-1dd6bacae030.
> Parent node text: rather than letting others define if for you.  Incorpora te dramatic devices into your public 
ge...

> Merging 1 nodes into parent node.
> Parent node id: 86cae63a-5ca0-4279-b238-aa2c78341bc0.
> Parent node text: They are wolves in lambs’ clothing.  Choose your 
victims and opponents carefully, then – never o...

> Merging 1 nodes into parent node.
> Parent node id: 90c41ad9-37a1-4c23-9640-14768c414c00.
> Parent node text: rather than letting others define if for you.  Incorpora te dramatic devices into your public 
ge...



Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

> Merging 1 nodes into parent node.
> Parent node id: 86cae63a-5ca0-4279-b238-aa2c78341bc0.
> Parent node text: They are wolves in lambs’ clothing.  Choose your 
victims and opponents carefully, then – never o...



Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

> Merging 3 nodes into parent node.
> Parent node id: 0b1be9dd-e5fa-43f0-9f3c-1dd6bacae030.
> Parent node text: rather than letting others define if for you.  Incorpora te dramatic devices into your public 
ge...

> Merging 1 nodes into parent node.
> Parent node id: 86cae63a-5ca0-4279-b238-aa2c78341bc0.
> Parent node text: They are wolves in lambs’ clothing.  Choose your 
victims and opponents carefully, then – never o...

> Merging 1 nodes into parent node.
> Parent node id: 90c41ad9-37a1-4c23-9640-14768c414c00.
> Parent node text: rather than letting others define if for you.  Incorpora te dramatic devices into your public 
ge...



Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

> Merging 1 nodes into parent node.
> Parent node id: 86cae63a-5ca0-4279-b238-aa2c78341bc0.
> Parent node text: They are wolves in lambs’ clothing.  Choose your 
victims and opponents carefully, then – never o...



Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

> Merging 1 nodes into parent node.
> Parent node id: 86cae63a-5ca0-4279-b238-aa2c78341bc0.
> Parent node text: They are wolves in lambs’ clothing.  Choose your 
victims and opponents carefully, then – never o...



Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

> Merging 1 nodes into parent node.
> Parent node id: 86cae63a-5ca0-4279-b238-aa2c78341bc0.
> Parent node text: They are wolves in lambs’ clothing.  Choose your 
victims and opponents carefully, then – never o...



Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

> Merging 3 nodes into parent node.
> Parent node id: 0b1be9dd-e5fa-43f0-9f3c-1dd6bacae030.
> Parent node text: rather than letting others define if for you.  Incorpora te dramatic devices into your public 
ge...

> Merging 1 nodes into parent node.
> Parent node id: 86cae63a-5ca0-4279-b238-aa2c78341bc0.
> Parent node text: They are wolves in lambs’ clothing.  Choose your 
victims and opponents carefully, then – never o...

> Merging 1 nodes into parent node.
> Parent node id: 90c41ad9-37a1-4c23-9640-14768c414c00.
> Parent node text: rather than letting others define if for you.  Incorpora te dramatic devices into your public 
ge...



Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

TruLlama(tru_class_info=trulens_eval.tru_llama.TruLlama, app_id='Merging Engine', tags='-', metadata={}, feedback_definitions=[], feedback_mode=<FeedbackMode.WITH_APP_THREAD: 'with_app_thread'>, root_class=llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine, app=<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine object at 0x3b5f9e450>, initial_app_loader_dump=None, app_extra_json={}, feedbacks=[FeedbackDefinition(Answer Relevance,
	selectors={'prompt': Lens().__record__.main_input, 'response': Lens().__record__.main_output},
	if_exists=None
), FeedbackDefinition(Context Relevance,
	selectors={'prompt': Lens().__record__.main_input, 'response': Lens().__record__.app.query.rets.source_nodes[:].node.text},
	if_exists=None
), FeedbackDefinition(Groundedness,
	selectors={'source': Lens().__record__.app.query.rets.source_nodes[:].node.text, 'statement': Lens().__record__.main_output},
	if_exists=None

In [19]:
import nest_asyncio
nest_asyncio.apply()

tru.get_leaderboard(app_ids=[])
tru.run_dashboard()

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.1.29:8501 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>