# RAG

In [None]:
!pip install llama-index
!pip install llama-parse
!pip install torch transformers python-pptx Pillow
!pip install llama-index-llms-groq
!pip install llama-index-embeddings-huggingface
!pip install python-docx
!pip install llama-index-retrievers
!pip install pyautogen
!pip install groq
!pip install llama-index-retrievers-bm25
!pip install python-docx

In [None]:
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq
import nest_asyncio
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser, LangchainNodeParser
from llama_index.core import VectorStoreIndex

from IPython.display import Markdown

nest_asyncio.apply()

In [None]:
import os
from google.colab import userdata

# Read both API keys from the Colab secret store, keep them as module-level
# names, and export them as environment variables under the same names.
LLAMA_CLOUD_API_KEY, GROQ_API_KEY = (
    userdata.get(secret_name)
    for secret_name in ('LLAMA_CLOUD_API_KEY', 'GROQ_API_KEY')
)

os.environ.update(
    LLAMA_CLOUD_API_KEY=LLAMA_CLOUD_API_KEY,
    GROQ_API_KEY=GROQ_API_KEY,
)

# Settings

In [None]:
# Default LLM and embedding model used by LlamaIndex components that are not
# given an explicit one. temperature=0 is passed to the Groq LLM.
Settings.llm = Groq(
    model="llama-3.2-90b-text-preview",
    api_key=os.getenv("GROQ_API_KEY"),
    temperature=0,
)
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)

# Preprocessing

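The training/guide .docx file is not preprocessed with LlamaParse; it is read directly with python-docx further below. Only the reference documents are parsed with LlamaParse.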

In [None]:
def split_into_paragraphs(text):
  """Split `text` on blank lines and return the non-empty paragraphs.

  Paragraphs are separated by a double newline. Each paragraph is stripped of
  surrounding whitespace, and paragraphs that are empty after stripping are
  dropped.
  """
  trimmed = (chunk.strip() for chunk in text.split('\n\n'))
  return [chunk for chunk in trimmed if chunk]

In [None]:
# demo_guide.docx is not parsed with LlamaParse
# (previously: LlamaParse(...).load_data("demo_guide.docx")).
# Every reference file is parsed with the same options, using a fresh
# LlamaParse instance per file.
_parse_kwargs = dict(num_workers=8, split_by_page=0, result_type="text")

direct = LlamaParse(**_parse_kwargs).load_data("direct.docx")
direct_header = LlamaParse(**_parse_kwargs).load_data("direct_header.docx")
instruction = LlamaParse(**_parse_kwargs).load_data("instruction.docx")
instruction_header = LlamaParse(**_parse_kwargs).load_data("instruction_header.docx")

In [None]:
# Replace each parsed reference (a list whose first item carries `.text`) with
# the list of paragraphs of that first item.
# Note: afterwards the names hold lists of strings, so this cell cannot be
# re-run without first re-running the LlamaParse cell above.
direct, direct_header, instruction, instruction_header = (
    split_into_paragraphs(parsed[0].text)
    for parsed in (direct, direct_header, instruction, instruction_header)
)

In [None]:
from typing import List, Dict
import re
from docx import Document

def process_document(file_path: str):
    """Open a .docx file and return its numbered sections.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The list produced by `split_into_sections`: one dict per section with
        the keys "title" and "content".
    """
    # Import under a local alias instead of relying on the global `Document`:
    # a later cell rebinds that global name to `llama_index.core.Document`,
    # which would make this function build the wrong kind of object.
    from docx import Document as DocxDocument

    return split_into_sections(DocxDocument(file_path))

def split_into_sections(document):
    """Group the paragraphs of `document` into numbered sections.

    A paragraph whose stripped text starts with digits followed by a dot opens
    a new section and becomes its title. Every other paragraph is appended,
    unstripped, to the content of the section currently being built.
    Paragraphs that appear before the first numbered title are not returned.

    Args:
        document: Object exposing `paragraphs`, an iterable of items with a
            `text` attribute.

    Returns:
        A list of dicts of the form {"title": str, "content": list[str]}.
    """
    heading_pattern = re.compile(r'^\d+\.')
    collected = []
    active = {"title": "", "content": []}

    for para in document.paragraphs:
        heading = para.text.strip()

        # Not a numbered heading: body text of the section being built.
        if not heading_pattern.match(heading):
            active["content"].append(para.text)
            continue

        # A new heading closes the previous section, if it had a title.
        if active["title"]:
            collected.append(active)
        active = {"title": heading, "content": []}

    # Flush the last section.
    if active["title"]:
        collected.append(active)

    return collected

# Process the document
sections = process_document('demo_guide.docx')

In [None]:
# demo: Print each section
for section in sections:
    print(section)

In [None]:
from llama_index.core import Document

def create_documents(sections):
  """Turn section dicts into LlamaIndex `Document` objects.

  Args:
    sections: Iterable of dicts with the keys 'title' and 'content'. When
      'content' is a list its items are joined with newlines; any other value
      is converted with `str`.

  Returns:
    A list with one document per section, whose text is the title, a blank
    line, and the content.
  """
  # Import under a local alias: the global name `Document` is also bound to
  # python-docx's `Document` by an earlier cell, so which one is active depends
  # on cell execution order.
  from llama_index.core import Document as LlamaDocument

  documents = []
  for section in sections:
    content = section['content']
    body = "\n".join(content) if isinstance(content, list) else str(content)
    documents.append(LlamaDocument(text=f"{section['title']}\n\n{body}"))

  return documents

documents = create_documents(sections)

In [None]:
from llama_index.core import VectorStoreIndex

# Build the vector index from all section documents in a single call, rather
# than creating an empty index and inserting the documents one at a time.
vector_index = VectorStoreIndex.from_documents(documents)

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Split the section documents into nodes with a default SentenceSplitter;
# these nodes are the input of the BM25 retriever built in a later cell.
# NOTE(review): `vector_index` chunks `documents` on its own, so these nodes
# are presumably separate objects (with different node ids) from the ones
# stored in the index — confirm whether duplicate hits across the two
# retrievers matter.
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(documents)

# Hybrid Retrieval

In [None]:
from llama_index.core.tools import RetrieverTool
from llama_index.core.retrievers import  VectorIndexRetriever
from llama_index.retrievers.bm25 import BM25Retriever

# Two retrievers over the same material: embedding similarity on the vector
# index and BM25 over the parsed nodes (6 hits per query).
vector_retriever = VectorIndexRetriever(vector_index)
bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=6)

# Wrap each retriever as a tool; the descriptions are what a router sees when
# choosing between them.
retriever_tools = [
    RetrieverTool.from_defaults(retriever=retriever, description=description)
    for retriever, description in (
        (vector_retriever, "Useful in most cases"),
        (bm25_retriever, "Useful if searching about specific information"),
    )
]

In [None]:
from llama_index.core.retrievers import RouterRetriever

# Router over both retriever tools; select_multi=True allows more than one
# tool to be selected for a query.
hybrid_retriever = RouterRetriever.from_defaults(retriever_tools=retriever_tools, select_multi=True)

In [None]:
from llama_index.core.response.notebook_utils import display_source_node

def hybrid_result(reference_material):
  """Show, for every reference paragraph, the first section retrieved for it.

  Args:
    reference_material: Iterable of query strings (reference paragraphs).

  Each paragraph is sent to `hybrid_retriever`; the paragraph and the text of
  the first returned node are displayed. If nothing is retrieved, a
  placeholder line is printed instead of raising an IndexError.
  """
  for i, update in enumerate(reference_material):
    result = hybrid_retriever.retrieve(update)

    display(Markdown(f"#🔴 Reference {i+1}"))
    print(update)

    display(Markdown(f"#🔵 Retrieved Section"))
    # NOTE(review): only the first returned node is shown; confirm that the
    # router returns nodes ordered by relevance.
    if result:
      print(result[0].text)
    else:
      print("(no section retrieved)")

# Hybrid Retrieval Results
No reranker

In [None]:
hybrid_result(direct)

In [None]:
hybrid_result(direct_header)

In [None]:
hybrid_result(instruction)

In [None]:
hybrid_result(instruction_header)

# Rerank (LLM)

In [None]:
from llama_index.core.postprocessor.rankGPT_rerank import RankGPTRerank

# LLM-based reranker that keeps the 3 best nodes.
# Alternative model tried earlier: Groq(model="llama3-70b-8192").
llm_reranker = RankGPTRerank(
    top_n=3,
    llm=Groq(model="llama-3.2-90b-text-preview", temperature=0, max_tokens=8000),
)

In [None]:
from llama_index.core import QueryBundle

def reranked_result(reference_material):
  """Show, for every reference paragraph, the best section after LLM reranking.

  Args:
    reference_material: Iterable of query strings (reference paragraphs).

  Each paragraph is retrieved with `hybrid_retriever`, the retrieved nodes are
  reranked with `llm_reranker`, and the text of the first reranked node is
  displayed. If no node survives, a placeholder line is printed instead of
  raising an IndexError.
  """
  for i, update in enumerate(reference_material):
    query_bundle = QueryBundle(update)
    retrieved_nodes = hybrid_retriever.retrieve(update)

    # Skip the reranker call entirely when there is nothing to rerank.
    result = llm_reranker.postprocess_nodes(retrieved_nodes, query_bundle) if retrieved_nodes else []

    display(Markdown(f"#🔴 Reference {i+1}"))
    print(update)

    display(Markdown(f"#🔵 Retrieved Section"))
    if result:
      print(result[0].text)
    else:
      print("(no section retrieved)")

In [None]:
reranked_result(direct)

In [None]:
reranked_result(direct_header)

In [None]:
reranked_result(instruction)

In [None]:
reranked_result(instruction_header)

# Rerank (Sentence Transformer: BAAI/bge-reranker-v2-m3)

In [None]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# SentenceTransformer-based reranker (model BAAI/bge-reranker-v2-m3) that
# keeps the 3 best nodes.
reranker = SentenceTransformerRerank(model="BAAI/bge-reranker-v2-m3", top_n=3)

In [None]:
from llama_index.core import QueryBundle

def reranked_result(reference_material):
  """Show, for every reference paragraph, the best section after reranking.

  Args:
    reference_material: Iterable of query strings (reference paragraphs).

  Each paragraph is retrieved with `hybrid_retriever` and the retrieved nodes
  are reranked with the module-level `reranker`. The name is looked up at call
  time, so rebinding `reranker` to another model changes which reranker is
  used. The text of the first reranked node is displayed; if no node survives,
  a placeholder line is printed instead of raising an IndexError.
  """
  for i, update in enumerate(reference_material):
    query_bundle = QueryBundle(update)
    retrieved_nodes = hybrid_retriever.retrieve(update)

    # Use the Sentence Transformer reranker of this section, not the LLM
    # reranker defined earlier in the notebook.
    result = reranker.postprocess_nodes(retrieved_nodes, query_bundle) if retrieved_nodes else []

    display(Markdown(f"#🔴 Reference {i+1}"))
    print(update)

    display(Markdown(f"#🔵 Retrieved Section"))
    if result:
      print(result[0].text)
    else:
      print("(no section retrieved)")

In [None]:
reranked_result(direct)

In [None]:
reranked_result(direct_header)

In [None]:
reranked_result(instruction)

In [None]:
reranked_result(instruction_header)

# Rerank (Sentence Transformer: cross-encoder/ms-marco-MiniLM-L-2-v2)

In [None]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# Rebind `reranker` to a SentenceTransformer-based reranker using the
# cross-encoder/ms-marco-MiniLM-L-2-v2 model, again keeping the 3 best nodes.
reranker = SentenceTransformerRerank(model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=3)

In [None]:
reranked_result(direct)

In [None]:
reranked_result(direct_header)

In [None]:
reranked_result(instruction)

In [None]:
reranked_result(instruction_header)