# Iteration 4.0
Introduces strategies to address issues encountered in previous iterations with large documents. These strategies include:
1. Iterative retrieval
2. Context pairing
3. Context cleaning


In [1]:
import os
from google.colab import userdata

# Read the LlamaCloud and Groq API keys from Google Colab Secrets and
# export both as environment variables.
LLAMA_CLOUD_API_KEY = userdata.get('LLAMA_CLOUD_API_KEY')
GROQ_API_KEY = userdata.get('GROQ_API_KEY')

os.environ.update(
    LLAMA_CLOUD_API_KEY=LLAMA_CLOUD_API_KEY,
    GROQ_API_KEY=GROQ_API_KEY,
)

In [2]:
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq
import nest_asyncio
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser, LangchainNodeParser
from llama_index.core import VectorStoreIndex

# Patch asyncio so event loops can be nested. Presumably needed because the
# notebook kernel already runs a loop while LlamaParse drives async code — confirm.
nest_asyncio.apply()

# Settings
LlamaIndex global configurations.

In [3]:
# Global LLM for LlamaIndex: Groq-hosted model, temperature 0, key read from the environment.
# NOTE(review): confirm this model id is still served by Groq before re-running.
Settings.llm = Groq(
    model="llama-3.1-70b-versatile",
    api_key=os.environ.get("GROQ_API_KEY"),
    temperature=0,
)
# Global embedding model for LlamaIndex.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)

# Preprocessing
Preprocessing documents using LlamaParse to extract text from files.\
https://docs.cloud.llamaindex.ai/llamaparse/features/python_usage

In [4]:
# Parse both .docx files to plain text with LlamaParse.
# 'split_by_page=0' removes the default splitting by page.
outdated_parser = LlamaParse(num_workers=8, split_by_page=0, result_type="text")
outdated_docs = outdated_parser.load_data("demo_guide.docx")

reference_parser = LlamaParse(split_by_page=0, result_type="text")
reference_docs = reference_parser.load_data("demo_reference.docx")

# Chunking

In [7]:
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter

# Outdated material: LlamaIndex sentence splitter with chunk_size=1024 and chunk_overlap=512.
outdated_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=512)
outdated_chunks = outdated_splitter.get_nodes_from_documents(outdated_docs)

# Reference material: Haystack splitter, one passage per chunk, no overlap.
# Only the text of the first parsed reference document is wrapped and split.
reference_splitter = DocumentSplitter(split_by="passage", split_length=1, split_overlap=0)
reference_document = Document(content=reference_docs[0].text)
reference_chunks = reference_splitter.run(documents=[reference_document])

In [None]:
# Print every reference chunk under a 1-based label.
for number, passage in enumerate(reference_chunks['documents'], start=1):
  print(f"Chunk #{number}\n{passage.content}")

In [10]:
from llama_index.core.schema import TextNode

In [11]:
# Wrap each outdated chunk's content in a new TextNode (only the text is carried over),
# then build the vector index over those nodes.
nodes = [TextNode(text=chunk.get_content()) for chunk in outdated_chunks]

outdated_index = VectorStoreIndex(nodes)

In [None]:
# demo: render a heading for each indexed chunk, followed by its raw text
from IPython.display import Markdown

for number, node in enumerate(nodes, start=1):
  display(Markdown(f"# 🔴Chunk: {number}"))
  print(node.get_content())

# Retrieval & Context Pairing
A context pair consists of the reference material text used for retrieval and the corresponding retrieved chunk.

In [13]:
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import QueryBundle

# Create a retriever over the outdated material index.
# NOTE: up to 6 results are requested, but only the first one returned is paired below.
retriever = VectorIndexRetriever(index=outdated_index, similarity_top_k=6)
context_pairs = []

# Iterate through each section (update) of the reference material
for reference_material in reference_chunks['documents']:
  # Use the section's text as the query against the outdated material index
  query_bundle = QueryBundle(reference_material.content)
  retrieved = retriever.retrieve(query_bundle)

  # Nothing retrieved (e.g. empty index): skip this section instead of
  # failing with IndexError on retrieved[0]
  if not retrieved:
    print(f"No outdated chunk retrieved for reference section: {reference_material.content!r}")
    continue

  # Pair the reference section with the first retrieved chunk: (reference, outdated)
  reference, outdated = reference_material.content, retrieved[0].text
  context_pairs.append((reference, outdated))


In [None]:
# demo: display the context pairs
# (Markdown comes from the IPython.display import in the earlier chunk-display cell)
for reference, outdated in context_pairs:
  display(Markdown("## 🔴Context Pair"))

  labelled_texts = (
    ("**Reference Material:**", reference),
    ("**Outdated Material:**", outdated),
  )
  for label, text in labelled_texts:
    display(Markdown(label))
    print(text)

# Prepare Text for Replacement

# Method 1: Chunk Cleaning
Clean the chunk by removing irrelevant text. The cleaned text will serve as a marker for string replacement.

In [15]:
# System prompt for Method 1 (chunk cleaning): tells the model to return only the
# part of the document chunk that the reference material replaces, without
# modifying it or adding any explanation.
sys_prompt="""
You are tasked with identifying and retaining only the sections of the document that need to be replaced based on the provided reference material. Your goal is to extract the content that is directly relevant for replacement without adding or modifying the existing text.

Instructions:
1. Review both the provided document chunk and reference material to determine which sections are directly relevant for replacement.
2. Extract the content that will be replaced, including any references or context that should be retained as part of the replacement.
3. Ensure that the extracted section remains as close as possible in length and context to the original content that will be replaced.
4. Do not include any additional content or modifications outside of what is necessary for the update.
5. Do not include any explanations, introductions, or additional text.
"""

In [16]:
# Prompt template for Method 1.
# The f-string is filled in immediately, using only the first context pair:
# context_pairs[0][1] is the retrieved outdated chunk and
# context_pairs[0][0] is the reference material it was retrieved for.
prompt = f"""
Please extract and retain only the section of the document that is directly relevant to the content being updated. Here is the specific section to keep:

Here is a chunk from a report that needs refinement:
{context_pairs[0][1]}

Here is the reference material to use for determining relevance:
{context_pairs[0][0]}

Ensure that only this section is extracted and preserved for the update.
"""

In [17]:
# demo: uncomment to see the final prompt
# print(prompt)

In [None]:
# NOTE: this import rebinds `Groq` to the Groq SDK client, shadowing the
# `llama_index.llms.groq.Groq` class imported at the top of the notebook.
# Re-running the Settings cell after this one would resolve `Groq` to the SDK client.
# The Method 2 request cell further down relies on this rebinding.
from groq import Groq

# No key is passed explicitly; presumably the client picks up the GROQ_API_KEY
# environment variable exported in the first cell — confirm.
client = Groq()

# Send the Method 1 system prompt and the filled-in user prompt as one
# non-streamed chat request with temperature 0.
chat_completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": prompt}],
    model="llama-3.1-70b-versatile",
    temperature=0,
    max_tokens=8000,
    top_p=1,
    stop=None,
    stream=False,
)

# Print the message text of the first returned choice.
print(chat_completion.choices[0].message.content)

# Method 2: Update the outdated text chunk and use it as the replacement text
This method will not work when there are context pairs that share the same outdated text chunk.

When multiple context pairs reference the same outdated text chunk, this method breaks down: once the first update is applied, the original text is altered, so subsequent updates that rely on the same outdated text can no longer be applied.


In [19]:
# System prompt for Method 2 (selective in-place update of the document section).
# Reuses the name `sys_prompt`, replacing the Method 1 prompt defined earlier.
sys_prompt = """
Your task is to selectively update the provided document section using the given reference material.

Instructions:
1. Read the document section and reference material carefully.
2. Identify specific information in the document section that is directly addressed by the reference material.
3. Update ONLY the information that is explicitly contradicted or updated by the reference material.
4. Keep all text from the document that is still relevant and not specifically addressed by the reference material.
5. Maintain the original style, format, and structure of the section.
6. Do not add any new information that is not explicitly provided in the reference material.
7. Provide ONLY the updated section text.
8. Do not include any explanations, introductions, or additional text.
"""

In [20]:
# Prompt template for Method 2, filled in from the first context pair only:
# context_pairs[0][1] is the retrieved outdated chunk and
# context_pairs[0][0] is the reference material. Replaces the Method 1 `prompt`.
# NOTE(review): the closing instruction asks the model to remove sections that
# were not updated, while rule 4 of the Method 2 system prompt asks it to keep
# text not addressed by the reference material — confirm which is intended.
prompt = f"""
Here is some context for information to update a section from a chunk that was chunked from a report:
{context_pairs[0][1]}

Here is the reference material:
{context_pairs[0][0]}

Update the section of the chunk with the context, remove all other sections of the chunk that have not been updated.
"""

In [21]:
# demo: uncomment to see the final prompt
# print(prompt)

In [None]:
# Method 2 request: same chat settings as the Method 1 request, sent with the
# Method 2 system and user prompts. `Groq` here is the SDK client imported in
# the Method 1 request cell.
client = Groq()

request_messages = [
    {"role": "system", "content": sys_prompt},
    {"role": "user", "content": prompt},
]
chat_completion = client.chat.completions.create(
    model="llama-3.1-70b-versatile",
    messages=request_messages,
    temperature=0,
    top_p=1,
    max_tokens=8000,
    stop=None,
    stream=False,
)

# Print the message text of the first returned choice.
updated_text = chat_completion.choices[0].message.content
print(updated_text)