# RAG pipeline: 

Indexing portion of our RAG consists of:
* PDF to markdown file with LlamaParse + instructions
* Chunking with MarkdownElementNodeParser (gives chunks of texts and embedded tables and table summaries)
* Metadata Extrator: Summary, Title 
* Embedding Model: Text-embeddings-3-large

In [1]:
import glob
import pickle
import itertools
import nest_asyncio
from dotenv import load_dotenv
import os

from llama_parse import LlamaParse
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.extractors import (SummaryExtractor,TitleExtractor)
from llama_index.core.node_parser import (SentenceSplitter,SemanticSplitterNodeParser,MarkdownElementNodeParser)
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import RetryQueryEngine

from llama_index.core.evaluation import RelevancyEvaluator,FaithfulnessEvaluator
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.llama_pack import download_llama_pack


## 1. Load Documents

In [2]:
# Load environment variables from .env file
load_dotenv()

nest_asyncio.apply()

openai_api_key = os.getenv("OPENAI_API_KEY")
llama_cloud_api_key = os.getenv("LLAMA_CLOUD_API_KEY")

In [3]:
md_lec_slides = glob.glob("GSU - CIS 3260 - Fall 2023/Lecture Slides Parsed/*.pkl") 
md_lec_slides
# List to store loaded documents
documents = []

# Loop through file numbers 1 through 8
for i in md_lec_slides:  # 9 is exclusive, so it goes from 1 to 8
    doc = pickle.load(open(i, "rb"))
    documents.append(doc)

md_qe = glob.glob("GSU - CIS 3260 - Fall 2023/Quizzes and Exams/*.pkl") 
# Loop through file numbers 1 through 8
for i in md_qe:  # 9 is exclusive, so it goes from 1 to 8
    doc = pickle.load(open(i, "rb"))
    documents.append(doc)
    
# Load Syllabus 
doc1 = pickle.load(open("GSU - CIS 3260 - Fall 2023/Syllabus Parsed/demo.pkl", "rb"))
documents.append(doc1)

## 2. Chunking, Indexing, and Storing

### a) Semantic Chunk

In [6]:
# Semantic splitter
splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=95, 
    embed_model=OpenAIEmbedding(model = "text-embedding-3-large")
)

In [7]:
nodes_semantic = []

for doc in documents:
    pdoc = splitter.get_nodes_from_documents(doc)
    nodes_semantic.append(pdoc)

In [8]:
nodes_semantic1 = list(itertools.chain(*nodes_semantic))

In [9]:
# Index and store our chunks using VectorStoreIndex
recursive_index_semantic = VectorStoreIndex(nodes= nodes_semantic1)

# Set up query enginge
recursive_query_engine_semantic = recursive_index_semantic.as_query_engine(
    similarity_top_k=5, verbose=True)

### b) MarkdownElementNodeParser

In [4]:
node_parser_OPENAI = MarkdownElementNodeParser(
    llm=OpenAI(model="gpt-4"), num_workers=8, 
)

In [5]:
nodes_OA = []
base_nodes_OA  = []
objects_OA  = []

for doc in documents:
    pdoc_OA = node_parser_OPENAI.get_nodes_from_documents(doc)
    nodes_OA.append(pdoc_OA)
    base, obj = node_parser_OPENAI.get_nodes_and_objects(pdoc_OA)
    base_nodes_OA.append(base)
    objects_OA.append(obj)

2it [00:00, 19972.88it/s]
100%|██████████| 2/2 [00:11<00:00,  5.72s/it]
4it [00:00, 47127.01it/s]
100%|██████████| 4/4 [00:15<00:00,  3.81s/it]
1it [00:00, 12787.51it/s]
100%|██████████| 1/1 [00:13<00:00, 13.13s/it]
4it [00:00, 49636.73it/s]
100%|██████████| 4/4 [00:13<00:00,  3.39s/it]
2it [00:00, 26886.56it/s]
100%|██████████| 2/2 [00:12<00:00,  6.38s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
5it [00:00, 63358.07it/s]
100%|██████████| 5/5 [00:06<00:00,  1.22s/it]
2it [00:00, 34521.02it/s]
100%|██████████| 2/2 [00:09<00:00,  4.58s/it]
5it [00:00, 13426.07it/s]
100%|██████████| 5/5 [00:12<00:00,  2.53s/it]
4it [00:00, 60787.01it/s]
100%|██████████| 4/4 [00:05<00:00,  1.31s/it]
6it [00:00, 64693.63it/s]
100%|██████████| 6/6 [00:05<00:00,  1.02it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
4it [00:00, 38216.89it/s]
100%|██████████| 4/4 [00:06<00:00,  1.54s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 6413.31it/s]
100%|██████████| 1/1 [00:06<00:

In [11]:
base_nodes_OA_list = list(itertools.chain(*base_nodes_OA))
objects_OA_list = list(itertools.chain(*objects_OA))

await SummaryExtractor().acall(base_nodes_OA_list)
await TitleExtractor().acall(base_nodes_OA_list+objects_OA_list)

100%|██████████| 83/83 [00:36<00:00,  2.27it/s]
100%|██████████| 5/5 [00:00<00:00,  5.28it/s]
100%|██████████| 5/5 [00:00<00:00,  6.22it/s]
100%|██████████| 5/5 [00:01<00:00,  4.74it/s]
100%|██████████| 5/5 [00:00<00:00,  6.10it/s]
100%|██████████| 5/5 [00:00<00:00,  6.61it/s]
100%|██████████| 2/2 [00:00<00:00,  3.61it/s]
100%|██████████| 5/5 [00:00<00:00,  6.49it/s]
100%|██████████| 5/5 [00:00<00:00,  5.75it/s]
100%|██████████| 5/5 [00:00<00:00,  6.98it/s]
100%|██████████| 5/5 [00:00<00:00,  5.36it/s]
100%|██████████| 5/5 [00:00<00:00,  6.98it/s]
100%|██████████| 2/2 [00:00<00:00,  5.05it/s]
100%|██████████| 5/5 [00:00<00:00,  5.18it/s]
100%|██████████| 1/1 [00:00<00:00,  2.31it/s]
100%|██████████| 2/2 [00:06<00:00,  3.32s/it]
100%|██████████| 3/3 [00:00<00:00,  5.95it/s]
100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
100%|██████████| 1/1 [00:00<00:00,  2.27it/s]
100%|██████████| 1/1 [00:01<00:00,  1.42s/it]
100%|██████████| 5/5 [00:00<00:0

[TextNode(id_='189acc19-73e7-4282-96ed-dd20bc9f61cd', embedding=None, metadata={'section_summary': 'In this section, the key topics covered include reviewing previous material such as IA4-IA7 and Quiz 3-Quiz 5, as well as discussing the final exam and group project schedules. \n\nThe individual assignments in this section focus on different programming tasks. \n\n- Individual Assignment 4-2 involves converting letter grades to their corresponding numeric values using if-elif statements in Python.\n- Individual Assignment 4-3 calculates the number of days in a given month and year, taking into account leap years.\n- Individual Assignment 5-1 prompts the user to enter a string and displays its length and last character.\n- Individual Assignment 5-2 involves determining a major and year based on user input, using lists for efficiency.\n- Individual Assignment 5-3 is an ISBN-9 to ISBN-10 converter that calculates the checksum and handles special cases like X for a checksum of 10.\n\nOveral

In [20]:
# Default Embedding 
query_engine_embed_default_OpenAI = VectorStoreIndex(nodes=base_nodes_OA_list + objects_OA_list, 
                                   embed_model = OpenAIEmbedding())
# Set up query enginge
recursive_embed_default_OpenAI = query_engine_embed_default_OpenAI.as_query_engine(
    similarity_top_k=5, verbose=True)

##############################################################################################
# Embedding with "text-embedding-3-large"
query_engine_embed_3_large = VectorStoreIndex(nodes=base_nodes_OA_list + objects_OA_list, 
                                   embed_model = OpenAIEmbedding(model="text-embedding-3-large"))
# Set up query enginge
recursive_query_engine_embed_3_large = query_engine_embed_3_large.as_query_engine(
    similarity_top_k=5, verbose=True)

---
### Store and load chunks

#### i. Store

In [61]:
chunks = base_nodes_OA_list + objects_OA_list

In [70]:
import json

# Example function to serialize nodes to JSON and save to a file
def save_nodes_to_file(nodes, filename):
    with open(filename, 'w') as f:
        json_nodes = [node.to_dict() for node in nodes]
        json.dump(json_nodes, f)

In [79]:
import os
import pickle

# Iterate over the PDFs starting from the fourth element in QE
for index, pdf_file in enumerate(chunks, start=0):
    # Generate a unique name for each .pkl file based on the index
    pkl_filename = f"chunk_{index}.json"
    pkl_filepath = os.path.join("GSU - CIS 3260 - Fall 2023/Chunks", pkl_filename)

    save_nodes_to_file([chunks[index]], pkl_filepath)


#### ii. Load

In [120]:
from llama_index.core.schema import IndexNode, TextNode

# Example function to read nodes from a file and deserialize from JSON
def load_nodes_from_file(filename):
    with open(filename, 'r') as f:
        json_nodes = json.load(f)
        nodes = [IndexNode.from_dict(node) if node['class_name'] == 'index' else TextNode.from_dict(node) for node in json_nodes]
    return nodes

In [107]:
# Example function to read nodes from a file and deserialize from JSON
def load_nodes_from_file(filename):
    with open(filename, 'r') as f:
        json_nodes = json.load(f)
    return json_nodes

In [84]:
chunk_files = glob.glob("GSU - CIS 3260 - Fall 2023/Chunks/*.json")  

In [126]:
loaded_chunks = []
for i in chunk_files:
    loaded_chunks.append(load_nodes_from_file(i)[0])


---

## Query

### a) Self Correcting Query Enginge

In [21]:
llm_eval=OpenAI(model="gpt-4")
faithfulness_evaluator = FaithfulnessEvaluator(llm=llm_eval)
rel_evaluator = RelevancyEvaluator(llm=llm_eval)

In [22]:
## Self-correction ##

# 1) Semantic Chunk
retry_query_engine_semantic = RetryQueryEngine(
    recursive_query_engine_semantic, rel_evaluator
)
retry_query_engine_semantic_1 = RetryQueryEngine(
    retry_query_engine_semantic, faithfulness_evaluator
)

# 2) MarkdownElementNodeParser w Default OpenAI Embedding model 
retry_query_engine_openai = RetryQueryEngine(
    recursive_embed_default_OpenAI, rel_evaluator
)
retry_query_engine_openai_1 = RetryQueryEngine(
    retry_query_engine_openai, faithfulness_evaluator
)

# 3) MarkdownElementNodeParser w Embedding model = "text-embedding-3-large"
retry_query_engine_3l = RetryQueryEngine(
    recursive_query_engine_embed_3_large, rel_evaluator
)
retry_query_engine_3l_1 = RetryQueryEngine(
    retry_query_engine_3l, faithfulness_evaluator
)

### b) Query

In [25]:
def ask_our_llm(question):
    # 1) Semantic Chunk
    # Query enginge with embeddings from semantic chunker + (Embedding model = default)
    res1 = recursive_query_engine_semantic.query(question)
    print("Semantic: ", res1)
    eval_result = faithfulness_evaluator.evaluate_response(response=res1)
    print(str(eval_result.passing))
    print(str(eval_result.score))
    eval_result = rel_evaluator.evaluate_response(query=question,response=res1)
    print(str(eval_result.passing))
    print(str(eval_result.score))
    # Retry engine
    retry_response = retry_query_engine_semantic_1.query(question)
    print("Semantic: (RETRY)", retry_response)

    # 2) Query enginge with embeddings from MarkdownElementNodeParser chunker + summary extraction + Title extraction + (Embedding model = default)
    res = recursive_embed_default_OpenAI.query(question)
    print("Markdownelementnode + Summary + 3-small embedding model: ", res)
    eval_result = faithfulness_evaluator.evaluate_response(response=res)
    print(str(eval_result.passing))
    print(str(eval_result.score))
    eval_result = rel_evaluator.evaluate_response(query=question,response=res)
    print(str(eval_result.passing))
    print(str(eval_result.score))
    # Retry engine
    retry_response = retry_query_engine_openai_1.query(question)
    print("Markdownelementnode + Summary + 3-small embedding model: (RETRY)", retry_response)

    # 3) Query enginge with embeddings from MarkdownElementNodeParser chunker + summary extraction + Title extraction + (Embedding model = text-embeddgins-3-large)
    res = recursive_query_engine_embed_3_large.query(question)
    print("Markdownelementnode + Summary + Title + 3-small embedding model: ", res)
    eval_result = faithfulness_evaluator.evaluate_response(response=res)
    print(str(eval_result.passing))
    print(str(eval_result.score))
    eval_result = rel_evaluator.evaluate_response(query=question,response=res)
    print(str(eval_result.passing))
    print(str(eval_result.score))
    # Retry engine
    # Recurisive retrial: 
    retry_response = retry_query_engine_3l_1.query(question)
    print("Markdownelementnode + Summary + Title + 3-small embedding model: (Recursive RETRY)", retry_response)



In [26]:
question = "who is the instructor for the course"

ask_our_llm(question)

Semantic:  Dr. Yuan Long
True
1.0
True
1.0
Semantic: (RETRY) Dr. Yuan Long
[1;3;38;2;11;159;203mRetrieval entering 18df97c9-c6c0-4fa2-848c-ec929a0ddf04: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query who is the instructor for the course
[0m[1;3;38;2;11;159;203mRetrieval entering aa01ed5d-a482-43e0-bc63-a3cded6af6b9: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query who is the instructor for the course
[0m[1;3;38;2;11;159;203mRetrieval entering 1d21519e-7ffd-415c-97b0-162831e0a5ba: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query who is the instructor for the course
[0m[1;3;38;2;11;159;203mRetrieval entering 835f3cf4-43b8-4345-bf5a-99fc35dcefbc: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query who is the instructor for the course
[0mMarkdownelementnode + Summary + 3-small embedding model:  Dr. Yuan Long
True
1.0
True
1.0
[1;3;38;2;11;159;203mRetrieval entering 18

In [27]:
question = "i want to review the final exam. What did the exam cover. Which lecture or class do we have to focus on"

ask_our_llm(question)

Semantic:  The final exam will cover all the course material taught throughout the term. It is important to focus on all the lectures, exercises, assignments, quizzes, and group projects to prepare for the final exam effectively.
True
1.0
True
1.0
Semantic: (RETRY) The final exam will cover all the course material taught throughout the term. It is important to focus on all the lectures, exercises, assignments, quizzes, and group projects to prepare for the final exam effectively.
[1;3;38;2;11;159;203mRetrieval entering 1d21519e-7ffd-415c-97b0-162831e0a5ba: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query i want to review the final exam. What did the exam cover. Which lecture or class do we have to focus on
[0mMarkdownelementnode + Summary + 3-small embedding model:  The final exam covered content from Chapters 4 to 9 of the textbook. To prepare for the final exam, focus on the lecture or class that covers Objects and Classes in Week 14.
True
1.0
True
1.0


In [28]:
question = 'Are you aware of question 6 in quiz 5? What is the answer to this question and why is the answer correct?'

ask_our_llm(question)

Semantic:  I am aware of question 6 in quiz 5. The answer to question 6 is C, which is "True". This answer is correct because when the number is 8, the code will output True based on the logic or condition specified in the code.
True
1.0
True
1.0
Semantic: (RETRY) I am aware of question 6 in quiz 5. The answer to question 6 is "True." This answer is correct because when the number is 8, the code in question 6 evaluates to True based on the conditions specified in the code.
[1;3;38;2;11;159;203mRetrieval entering 0925930f-eb59-478e-9774-60f956f019f5: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query Are you aware of question 6 in quiz 5? What is the answer to this question and why is the answer correct?
[0mMarkdownelementnode + Summary + 3-small embedding model:  I am aware of question 6 in quiz 5. The answer to this question is A, which corresponds to a runtime error. This answer is correct because if a program needs to read data from a file that does not 

In [29]:
question = "Why is the answer to question 1 of quiz 2 A?"

ask_our_llm(question)

Semantic:  The answer to question 1 of quiz 2 is A because combinations of zeros and ones can represent any numbers and characters.
True
1.0
False
0.0
Semantic: (RETRY) The answer to question 1 of quiz 2 is A because combinations of zeros and ones can represent any numbers and characters.
[1;3;38;2;11;159;203mRetrieval entering a92ba7cb-192c-4376-a79d-3146ff5f9e80: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query Why is the answer to question 1 of quiz 2 A?
[0mMarkdownelementnode + Summary + 3-small embedding model:  The answer to question 1 of quiz 2 is A because combinations of zeros and ones can represent any numbers and characters.
False
0.0
False
0.0
[1;3;38;2;11;159;203mRetrieval entering a92ba7cb-192c-4376-a79d-3146ff5f9e80: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query Why is the answer to question 1 of quiz 2 A?
[0m[1;3;38;2;11;159;203mRetrieval entering a92ba7cb-192c-4376-a79d-3146ff5f9e80: TextNode
[0m[1;3;3

## Evaluation

In [31]:
documents1 = list(itertools.chain(*documents))

In [32]:
# define generator, generate questions
dataset_generator = RagDatasetGenerator.from_documents(
    documents=documents1,
    llm=llm_eval,
    num_questions_per_chunk=3,  # set the number of questions per nodes
)

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
rag_dataset = dataset_generator.generate_questions_from_nodes()
questions = [e.query for e in rag_dataset.examples]

In [52]:
rag_dataset

LabelledRagDataset(examples=[LabelledRagDataExample(query='Write a Python program that prompts the user to enter a letter grade (A, B, C, D) and then displays its corresponding numeric value (90, 80, 70, 60). If the user enters an invalid grade, the program should display an error message.', query_by=CreatedBy(model_name='gpt-4', type=<CreatedByType.AI: 'ai'>), reference_contexts=['# Objectives for class 14\n\n- Review IA4-IA7\n- Review Quiz 3 – Quiz 5\n- Final Exam Schedule\n- Group Project Schedule\n---\n# Individual Assignment 4 - 2\n\n- Enter a letter grade A/a, B/b, C/c, D/d, and then displays its corresponding numeric value 90, 80, 70, 60.\n\n## Sample Run\n```\nEnter a letter grade: B\nThe numeric value for grade B is 80\n```\n\n```python\nletter = input("Enter a letter grade: ")\n\nif letter in \'Aa\':\n    print("The numeric value for grade A is 90")\nelif letter in \'Bb\':\n    print("The numeric value for grade B is 80")\nelif letter in \'Cc\':\n    print("The numeric value 

In [36]:
question_eval = questions[:15]

In [35]:
RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack")

Defaulting to user installation because normal site-packages is not writeable
Processing /home/ewp/MIT Media Lab_Linux/Knowledge Graph RAG/pack
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: llama-index-packs-rag-evaluator
  Building wheel for llama-index-packs-rag-evaluator (pyproject.toml): started
  Building wheel for llama-index-packs-rag-evaluator (pyproject.toml): finished with status 'done'
  Created wheel for llama-index-packs-rag-evaluator: filename=llama_index_packs_rag_evaluator-0.1.6-py3-none-any.whl size=5002 sha256=ace33b7e47ad2e8fa3e8ca52aef8df7a27b4e947a64acfea1599bc108a783378
  Stored in directory: /tmp/pip-ephem-wheel-cache-aw2rkol6/wheels/14/94/ae/9f

In [45]:
rag_evaluator = RagEvaluatorPack(
    query_engine=recursive_query_engine_embed_3_large,  # built with the same source Documents as the rag_dataset
    rag_dataset=rag_dataset[:10],
)
benchmark_df = await rag_evaluator.run()

AttributeError: 'list' object has no attribute 'examples'