<a href="https://colab.research.google.com/github/mrozenva/ChatGPT-PromptEngineering/blob/main/Advanced_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Advanced RAG Using Window Sentence Retrieval


In [1]:
%%capture
!pip install llama-index >> null
!pip install openai >> null
!pip install pypdf >> null   # for reading PDF files
!pip install docx2txt > null # for reading MS doc files

In [2]:
import os
import openai

import logging
import sys
from pprint import pprint

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    load_index_from_storage,
    StorageContext,
    ServiceContext,
    Document
)

from llama_index.llms import OpenAI, Anthropic
from llama_index.node_parser import SentenceWindowNodeParser, HierarchicalNodeParser, get_leaf_nodes
from llama_index.text_splitter import SentenceSplitter
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
from llama_index.schema import MetadataMode
from llama_index.postprocessor import MetadataReplacementPostProcessor

# from IPython.display import Markdown, display
# from transformers import AutoTokenizer, T5ForConditionalGeneration

# Step 0:  Authentication with API Key

In [3]:
openai_key = "##########################################" #<--- Your API KEY

In [4]:
openai.api_key = openai_key

# Step 1:  Fetch Data and Store into local directory

In [5]:
# create local directory and retrieve file from external source
!mkdir -p 'my_data'

In [6]:
from google.colab import files
uploaded = files.upload()

Saving Manufacturing Agreement 5.19.10 SHBV Waste2Energy.txt to Manufacturing Agreement 5.19.10 SHBV Waste2Energy.txt


In [8]:
!mv *.txt my_data/

# Step 2:  Load into files into "Document" Object

In [12]:
documents = SimpleDirectoryReader("./my_data/").load_data()

# Step 2B (Optional):  Inspect the documents obect

In [13]:
# Inspect the documents
print("length of doc: "+ str(len(documents)))
print("----")
pprint(documents)


length of doc: 1
----
[Document(id_='2658be82-bd35-499c-aba0-ead2ef7a7236', embedding=None, metadata={'file_path': 'my_data/Manufacturing Agreement 5.19.10 SHBV Waste2Energy.txt', 'file_name': 'Manufacturing Agreement 5.19.10 SHBV Waste2Energy.txt', 'file_type': 'text/plain', 'file_size': 56552, 'creation_date': '2023-12-19', 'last_modified_date': '2023-12-19', 'last_accessed_date': '2023-12-19'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, hash='46c93b7b5c63cc9da38848df4d9e1a8eeca7514e03b0199b99ce55d177febad6', text='Exhibit 10.1\n\nManufacturing Agreement\n\n       AGREEMENT made this 19th day of May, 2010 by and between SHBV (Hong Kong) Ltd (SHBV), a Company with its principal place of business at Unit 3208, 32/F Office Tower, Convention Plaza No. 1 Har

In [15]:
documents[0].metadata

{'file_path': 'my_data/Manufacturing Agreement 5.19.10 SHBV Waste2Energy.txt',
 'file_name': 'Manufacturing Agreement 5.19.10 SHBV Waste2Energy.txt',
 'file_type': 'text/plain',
 'file_size': 56552,
 'creation_date': '2023-12-19',
 'last_modified_date': '2023-12-19',
 'last_accessed_date': '2023-12-19'}

# Step 3:  Node Parsing & Indexing (Base & Sentence Window Method)

In [16]:
# create the sentence window node parser w/ default settings
sentence_node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text"
)

base_node_parser = SentenceSplitter()

llm = OpenAI(model="gpt-4", temperature=0.1)


[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [17]:
nodes = sentence_node_parser.get_nodes_from_documents(documents)
base_nodes = base_node_parser.get_nodes_from_documents(documents)

In [None]:
ctx_sentence = ServiceContext.from_defaults(llm=llm, embed_model=OpenAIEmbedding(embed_batch_size=50), node_parser=sentence_node_parser)
ctx_base = ServiceContext.from_defaults(llm=llm, embed_model=OpenAIEmbedding(embed_batch_size=50), node_parser=base_node_parser)

sentence_index = VectorStoreIndex(nodes, service_context=ctx_sentence)
base_index = VectorStoreIndex(base_nodes, service_context=ctx_base)

# Step 4:  Save to Persistent Storage

In [None]:
sentence_index.storage_context.persist(persist_dir="./sentence_index")
base_index.storage_context.persist(persist_dir="./base_index")


In [None]:
# Download to own computer for backup

!zip -r ./indexes.zip ./*_index

from google.colab import files
files.download("./indexes.zip")

  adding: base_index/ (stored 0%)
  adding: base_index/image__vector_store.json (deflated 19%)
  adding: base_index/graph_store.json (stored 0%)
  adding: base_index/index_store.json (deflated 68%)
  adding: base_index/docstore.json (deflated 76%)
  adding: base_index/default__vector_store.json (deflated 62%)
  adding: sentence_index/ (stored 0%)
  adding: sentence_index/image__vector_store.json (deflated 19%)
  adding: sentence_index/graph_store.json (stored 0%)
  adding: sentence_index/index_store.json (deflated 68%)
  adding: sentence_index/docstore.json (deflated 94%)
  adding: sentence_index/default__vector_store.json (deflated 63%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Step 5:  Retrieve from Storage

In [None]:
# rebuild storage context
SC_retrieved_sentence = StorageContext.from_defaults(persist_dir="./sentence_index")
SC_retrieved_base = StorageContext.from_defaults(persist_dir="./base_index")

In [None]:
# load index
retrieved_sentence_index = load_index_from_storage(SC_retrieved_sentence)
retrieved_base_index = load_index_from_storage(SC_retrieved_base)

# Step 6: Create query engine

In [None]:
from llama_index.postprocessor import MetadataReplacementPostProcessor

sentence_query_engine = retrieved_sentence_index.as_query_engine(
    similarity_top_k=5,
    verbose=True,
    # the target key defaults to `window` to match the node_parser's default
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)

base_query_engine = retrieved_base_index.as_query_engine(
    similarity_top_k=5,
    verbose=True
)

# Step 7:  Inference

In [None]:
question = ""

In [None]:
base_response = base_query_engine.query(
    question
)
print(base_response)

In [None]:
sentence_response = sentence_query_engine.query(
    question
)
print(sentence_response)