# Lesson 1: Advanced RAG Pipeline

## Import Libraries

In [1]:
import os
import dotenv
import openai
import numpy as np
import nest_asyncio

nest_asyncio.apply()

dotenv.load_dotenv("/Users/michaelmateju/Documents/Data Science/PycharmProjects/.env")
openai.api_key = os.environ["OPENAI_API_KEY"]

llm_model_name = "gpt-4o-mini" #jmeno modelu, ktery se bude pouzivat napric celym notebookem.

In [2]:
from trulens.core import Feedback
from trulens.core import Select
from trulens.providers.openai import OpenAI as truOpenAI
from trulens.apps.llamaindex import TruLlama
from trulens.core import TruSession


In [3]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import Document
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.core import load_index_from_storage, SimpleDirectoryReader
from llama_index.core.storage import StorageContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.node_parser import HierarchicalNodeParser
from llama_index.core.node_parser import get_leaf_nodes
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.postprocessor import SentenceTransformerRerank


from llama_index.llms.openai import OpenAI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [4]:
llm = OpenAI(model=llm_model_name, temperature=0.1)

## Load Data

In [5]:
documents = SimpleDirectoryReader(
    input_files=["./source_notebooks/building_and_evalutation_advanced_rags/eBook-How-to-Build-a-Career-in-AI.pdf"]
).load_data()

In [6]:
print(type(documents), "\n")
print(len(documents), "\n")
print(type(documents[0]))
print(documents[0])

<class 'list'> 

41 

<class 'llama_index.core.schema.Document'>
Doc ID: 559d7a9f-4d2c-4e3e-a9ba-d1964a70fd0a
Text: PAGE 1 Founder, DeepLearning.AI Collected Insights from Andrew
Ng How to  Build Your Career in AI A Simple Guide


In [7]:
document = Document(text="\n\n".join([doc.text for doc in documents]))

## TruLens settings

In [8]:
embedding_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

In [9]:
Settings.llm = llm
Settings.embed_model = embedding_model

## Basic RAG pipeline

In [10]:
index = VectorStoreIndex.from_documents([document])

In [11]:
query_engine = index.as_query_engine()

In [12]:
response = query_engine.query(
    "What are steps to take when finding projects to build your experience?"
)
print(str(response))

When looking for projects to build your experience, consider the following steps:

1. **Join Existing Projects**: Collaborate with others who have ideas and seek opportunities to contribute to their projects.

2. **Engage with Learning**: Spend time reading, taking courses, and discussing with domain experts to generate new project ideas.

3. **Focus on Application Areas**: Identify specific applications of machine learning that are underexplored and align with your interests or your organization’s goals.

4. **Develop a Side Hustle**: Work on personal projects outside of your main job to foster creativity and potentially lead to significant opportunities.

5. **Evaluate Technical Growth**: Choose projects that challenge your skills but are achievable, helping you progress technically.

6. **Collaborate with Good Teammates**: Work with individuals who can provide support and enhance your learning experience.

7. **Consider Stepping Stones**: Select projects that can serve as meaningful

### Evaluation setup using TruLens

In [13]:
eval_questions = []
with open('./source_notebooks/building_and_evalutation_advanced_rags/lecture_1/eval_questions.txt', 'r') as file:
    for line in file:
        # Remove newline character and convert to integer
        item = line.strip()
        print(item)
        eval_questions.append(item)

What are the keys to building a career in AI?
How can teamwork contribute to success in AI?
What is the importance of networking in AI?
What are some good habits to develop for a successful career?
How can altruism be beneficial in building a career?
What is imposter syndrome and how does it relate to AI?
Who are some accomplished individuals who have experienced imposter syndrome?
What is the first step to becoming good at AI?
What are some common challenges in AI?
Is it normal to find parts of AI challenging?


In [14]:
# You can try your own question:
new_question = "What is the right AI job for me?"
eval_questions.append(new_question)

In [15]:
print(eval_questions)

['What are the keys to building a career in AI?', 'How can teamwork contribute to success in AI?', 'What is the importance of networking in AI?', 'What are some good habits to develop for a successful career?', 'How can altruism be beneficial in building a career?', 'What is imposter syndrome and how does it relate to AI?', 'Who are some accomplished individuals who have experienced imposter syndrome?', 'What is the first step to becoming good at AI?', 'What are some common challenges in AI?', 'Is it normal to find parts of AI challenging?', 'What is the right AI job for me?']


In [16]:
tru = TruSession()
tru.reset_database()

🦑 Initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `TruSession` to prevent this.


Updating app_name and app_version in apps table: 0it [00:00, ?it/s]
Updating app_id in records table: 0it [00:00, ?it/s]
Updating app_json in apps table: 0it [00:00, ?it/s]


For the classroom, we've written some of the code in helper functions inside a utils.py file.  
- You can view the utils.py file in the file directory by clicking on the "Jupyter" logo at the top of the notebook.
- In later lessons, you'll get to work directly with the code that's currently wrapped inside these helper functions, to give you more options to customize your RAG pipeline.

In [17]:
provider = truOpenAI(model_engine=llm_model_name)

In [18]:
qa_relevance = (
    Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance")
    .on_input()
    .on_output()
)

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .


In [19]:
qs_relevance = (
    Feedback(provider.relevance_with_cot_reasons, name = "Context Relevance")
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.calls[-1].rets.source_nodes[:].node.text .


In [20]:
f_groundedness = (
    Feedback(
        provider.groundedness_measure_with_cot_reasons, name="Groundedness"
    )
    .on(Select.RecordCalls.retrieve.rets.collect())
    .on_output()
)

✅ In Groundedness, input source will be set to __record__.app.retrieve.rets.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


In [21]:
feedbacks = [qa_relevance, qs_relevance, f_groundedness]

In [None]:
tru_recorder = TruLlama(
    query_engine,
    app_id="Direct Query Engine",
    feedbacks=feedbacks
    )

In [None]:
with tru_recorder as recording:
    for question in eval_questions:
        response = query_engine.query(question)

In [24]:
records, feedback = tru.get_records_and_feedback(app_ids=[])

In [25]:
records.head()

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,...,Groundedness feedback cost in USD,Context Relevance,Context Relevance_calls,Context Relevance feedback cost in USD,app_name,app_version,latency,total_tokens,total_cost,cost_currency
0,app_hash_6e8221fde876d15698298cea8c0d1bd6,"{'tru_class_info': {'name': 'TruLlama', 'modul...",RetrieverQueryEngine(llama_index.core.query_en...,record_hash_4653f8910513cf27b4346249fe5eef08,What is the right AI job for me?,Finding the right AI job for you involves unde...,-,{'record_id': 'record_hash_4653f8910513cf27b43...,"{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2025-01-31T15:07:34.809679"", ""...",...,,,,,Direct Query Engine,base,3.392069,2024,0.000348,USD
1,app_hash_6e8221fde876d15698298cea8c0d1bd6,"{'tru_class_info': {'name': 'TruLlama', 'modul...",RetrieverQueryEngine(llama_index.core.query_en...,record_hash_917c95a1f3402c2d695a5903d5c234a0,Is it normal to find parts of AI challenging?,"Yes, it is normal to find parts of AI challeng...",-,{'record_id': 'record_hash_917c95a1f3402c2d695...,"{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2025-01-31T15:07:33.481111"", ""...",...,,,,,Direct Query Engine,base,1.228528,1956,0.000315,USD
2,app_hash_6e8221fde876d15698298cea8c0d1bd6,"{'tru_class_info': {'name': 'TruLlama', 'modul...",RetrieverQueryEngine(llama_index.core.query_en...,record_hash_bf73e6febbe0175196c15d38f042a9b9,What are some common challenges in AI?,Common challenges in AI include understanding ...,-,{'record_id': 'record_hash_bf73e6febbe0175196c...,"{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2025-01-31T15:07:31.619736"", ""...",...,,,,,Direct Query Engine,base,1.755121,1960,0.000329,USD
3,app_hash_6e8221fde876d15698298cea8c0d1bd6,"{'tru_class_info': {'name': 'TruLlama', 'modul...",RetrieverQueryEngine(llama_index.core.query_en...,record_hash_e323266d66377068dbf095dbeddd6865,What is the first step to becoming good at AI?,The first step to becoming good at AI is to ac...,-,{'record_id': 'record_hash_e323266d66377068dbf...,"{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2025-01-31T15:07:30.351648"", ""...",...,0.0017,,,,Direct Query Engine,base,1.158006,1972,0.000316,USD
4,app_hash_6e8221fde876d15698298cea8c0d1bd6,"{'tru_class_info': {'name': 'TruLlama', 'modul...",RetrieverQueryEngine(llama_index.core.query_en...,record_hash_24fc5fb33299305bd51686f0d56cbbed,Who are some accomplished individuals who have...,Many accomplished individuals in the AI commun...,-,{'record_id': 'record_hash_24fc5fb33299305bd51...,"{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2025-01-31T15:07:28.384136"", ""...",...,0.001688,0.0,[{'args': {'prompt': 'Who are some accomplishe...,0.000492,Direct Query Engine,base,1.859945,1960,0.00033,USD


In [26]:
# launches on http://localhost:8501/
tru.run_dashboard()

Starting dashboard ...



  tru.run_dashboard()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://localhost:62031 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

## Advanced RAG pipeline

### 1. Sentence Window retrieval

In [27]:
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

In [28]:
Settings.node_parser = node_parser

In [29]:
def build_sentence_window_index(
    document, embed_model, save_dir="./sentence_index"
):

    if not os.path.exists(save_dir):
        sentence_index = VectorStoreIndex.from_documents([document], embed_model=embedding_model)
        sentence_index.storage_context.persist(persist_dir=save_dir)
    else:
        sentence_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir))

    return sentence_index

In [30]:
persist_dir="./source_notebooks/building_and_evalutation_advanced_rags/lecture_1/sentence_index"

In [31]:
sentence_index = build_sentence_window_index(
    document,
    embed_model=embedding_model,
    save_dir=persist_dir
)

In [32]:
def get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k=6,
    rerank_top_n=2,
):
    # define postprocessors
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
    rerank = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )

    sentence_window_engine = sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]
    )
    return sentence_window_engine

In [33]:
sentence_window_engine = get_sentence_window_query_engine(sentence_index)

In [34]:
window_response = sentence_window_engine.query(
    "how do I get started on a personal project in AI?"
)
print(str(window_response))

To get started on a personal project in AI, begin by selecting a topic that interests you and aligns with your current skill level. It’s important to start small; even simple projects can provide valuable learning experiences. For instance, you might consider training a neural network on a basic function or dataset to understand the fundamentals of machine learning.

As you work on your project, focus on building foundational skills in machine learning and software development. Familiarize yourself with key concepts such as different models, optimization algorithms, and programming fundamentals. This knowledge will help you tackle more complex projects in the future.

Document your progress and be prepared to communicate the value of your work. This will not only help you articulate your accomplishments but also attract potential collaborators or mentors who can provide guidance and support.

Finally, remember that each project is a stepping stone in your journey. Use the insights gain

In [None]:
# tru.reset_database()

tru_recorder_sentence_window = TruLlama(
    sentence_window_engine,
    app_id="Sentence Window Query Engine",
    feedbacks=feedbacks
    )

In [None]:
for question in eval_questions:
    with tru_recorder_sentence_window as recording:
        response = sentence_window_engine.query(question)
        print(question)
        print(str(response))

In [37]:
tru.get_leaderboard(app_ids=[])

Unnamed: 0_level_0,Unnamed: 1_level_0,Answer Relevance,Context Relevance,Groundedness,latency,total_cost
app_name,app_version,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Direct Query Engine,base,0.866667,0.395833,0.866667,2.216288,0.000353
Sentence Window Query Engine,base,,,,2.834291,0.00037


In [38]:
# launches on http://localhost:8501/
tru.run_dashboard()

Starting dashboard ...
Dashboard already running at path:   Local URL: http://localhost:62031



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

### 2. Auto-merging retrieval

In [50]:
def build_automerging_index(
    documents,
    save_dir="merging_index",
    chunk_sizes=None,
):
    chunk_sizes = chunk_sizes or [2048, 512, 128]
    node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
    nodes = node_parser.get_nodes_from_documents(documents)
    leaf_nodes = get_leaf_nodes(nodes)

    Settings.node_parser = node_parser

    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    if not os.path.exists(save_dir):
        automerging_index = VectorStoreIndex(
            leaf_nodes, storage_context=storage_context)
        automerging_index.storage_context.persist(persist_dir=save_dir)
    else:
        automerging_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir))
    return automerging_index

In [51]:
persist_dir="./source_notebooks/building_and_evalutation_advanced_rags/lecture_1/merging_index"

In [52]:
automerging_index = build_automerging_index(
    documents,
    save_dir=persist_dir
)

In [53]:
def get_automerging_query_engine(
    automerging_index,
    similarity_top_k=12,
    rerank_top_n=2,
):
    base_retriever = automerging_index.as_retriever(similarity_top_k=similarity_top_k)
    retriever = AutoMergingRetriever(
        base_retriever, automerging_index.storage_context, verbose=True
    )
    rerank = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )
    auto_merging_engine = RetrieverQueryEngine.from_args(
        retriever, node_postprocessors=[rerank]
    )
    return auto_merging_engine

In [54]:
automerging_query_engine = get_automerging_query_engine(
    automerging_index,
)

In [55]:
auto_merging_response = automerging_query_engine.query(
    "How do I build a portfolio of AI projects?"
)
print(str(auto_merging_response))

> Merging 1 nodes into parent node.
> Parent node id: fd34c388-e5d0-4e76-99a5-14bc58d34f59.
> Parent node text: PAGE 21
Building a Portfolio of 
Projects that Shows 
Skill Progression 
CHAPTER 6
PROJECTS

> Merging 1 nodes into parent node.
> Parent node id: 862488c3-9b98-4cca-b2a5-2bcf1c971365.
> Parent node text: PAGE 21
Building a Portfolio of 
Projects that Shows 
Skill Progression 
CHAPTER 6
PROJECTS

To build a portfolio of AI projects, start by selecting a range of projects that demonstrate your skills and knowledge. Focus on showcasing a progression from simpler projects to more complex ones, highlighting your growth and learning over time. It's also essential to communicate your thought process clearly, as this will help others understand the value of your work and trust you with more significant opportunities. Consider applying machine learning to various industries, as diverse applications can enrich your portfolio and provide valuable experience.


In [None]:
# tru.reset_database()

tru_recorder_automerging = TruLlama(
    automerging_query_engine,
    app_id="Automerging Query Engine",
    feedbacks=feedbacks
    )

In [None]:
for question in eval_questions:
    with tru_recorder_automerging as recording:
        response = automerging_query_engine.query(question)
        print(question)
        print(response)

In [58]:
tru.get_leaderboard(app_ids=[])

Unnamed: 0_level_0,Unnamed: 1_level_0,Answer Relevance,Context Relevance,Groundedness,latency,total_cost
app_name,app_version,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Automerging Query Engine,base,1.0,0.472222,0.909722,2.144787,0.000113
Direct Query Engine,base,0.878788,0.378788,0.872727,2.216288,0.000353
Sentence Window Query Engine,base,0.848485,0.484848,0.890332,2.834291,0.00037


In [48]:
# launches on http://localhost:8501/
tru.run_dashboard()

Starting dashboard ...
Dashboard already running at path:   Local URL: http://localhost:62031



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>