# Showcase Recency Node Postprocessor

Showcase capabilities of recency-weighted node postprocessor

In [1]:
from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, ServiceContext
from gpt_index.indices.postprocessor import (
    FixedRecencyPostprocessor,
    EmbeddingRecencyPostprocessor
)
from gpt_index.node_parser import SimpleNodeParser
from gpt_index.docstore import DocumentStore
from gpt_index.response.notebook_utils import display_response

  from .autonotebook import tqdm as notebook_tqdm


### Parse Documents into Nodes, add to Docstore

In this example, there are 3 different versions of PG's essay. They are largely identical **except** 
for one specific section, which details the amount of funding they raised for Viaweb. 

V1: 50k, V2: 30k, V3: 10K

V1: 2020-01-01, V2: 2020-02-03, V3: 2022-04-12

The idea is to encourage index to fetch the most recent info (which is V3)

In [2]:
# load documents
def get_file_metadata(file_name: str):
    """Get file metadata."""
    if "v1" in file_name:
        return {"date": "2020-01-01"}
    elif "v2" in file_name:
        return {"date": "2020-02-03"}
    elif "v3" in file_name:
        return {"date": "2022-04-12"}
    else:
        raise ValueError("invalid file")

documents = SimpleDirectoryReader(
    input_files=[
        'test_versioned_data/paul_graham_essay_v1.txt',
        'test_versioned_data/paul_graham_essay_v2.txt',
        'test_versioned_data/paul_graham_essay_v3.txt'
    ],
    file_metadata=get_file_metadata
).load_data()

# define service context (wrapper container around current classes)
service_context = ServiceContext.from_defaults(chunk_size_limit=512)

# use node parser in service context to parse into nodes
nodes = service_context.node_parser.get_nodes_from_documents(documents)

# add to docstore
docstore = DocumentStore()
docstore.add_documents(nodes)

In [None]:
print(documents[2].get_text())

### Build Index

In [3]:
# build index 
index = GPTSimpleVectorIndex(nodes, docstore=docstore)

INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 84471 tokens


In [4]:
# [optional] save to disk
index.save_to_disk("index_recency_test.json")

In [5]:
# [optional] load from disk
index = GPTSimpleVectorIndex.load_from_disk('index_recency_test.json')

### Define Recency Postprocessors

In [6]:
node_postprocessor = FixedRecencyPostprocessor(service_context=service_context)

In [7]:
node_postprocessor_emb = EmbeddingRecencyPostprocessor(service_context=service_context)

### Query Index

In [8]:
# naive query

response = index.query(
    "How much did the author raise in seed funding from Idelle's husband (Julian) for Viaweb?", 
    similarity_top_k=3,
)

INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 1813 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens


In [9]:
display_response(response)

**`Final Response:`** The author raised $50,000 in seed funding from Idelle's husband (Julian) for Viaweb, in exchange for 10% of the company. This funding allowed the author to live on while they worked on the software, which eventually became a WYSIWYG site builder.

---

**`Source Node 1/3`**

**Document ID:** bcfa42d1-1a8c-454e-98ac-274c82df74d7<br>**Similarity:** 0.8236323661774008<br>**Text:** date: 2020-02-03

never have to write anything to run on users' computers. We could generate the ...<br>

---

**`Source Node 2/3`**

**Document ID:** 4f7bace2-33d5-45e1-9688-c5a08b2a22d1<br>**Similarity:** 0.8235177967096099<br>**Text:** date: 2020-01-01

never have to write anything to run on users' computers. We could generate the ...<br>

---

**`Source Node 3/3`**

**Document ID:** 63e5a213-5ea0-4634-86b7-b321e1e8e4d0<br>**Similarity:** 0.8225144936272283<br>**Text:** date: 2022-04-12

never have to write anything to run on users' computers. We could generate the ...<br>

In [None]:
# query using fixed recency node postprocessor

response = index.query(
    "How much did the author raise in seed funding from Idelle's husband (Julian) for Viaweb?", 
    similarity_top_k=3,
    node_postprocessors=[node_postprocessor]
)

In [11]:
display_response(response)

**`Final Response:`** $10,000

---

**`Source Node 1/1`**

**Document ID:** 63e5a213-5ea0-4634-86b7-b321e1e8e4d0<br>**Similarity:** 0.8224122808034401<br>**Text:** date: 2022-04-12

never have to write anything to run on users' computers. We could generate the ...<br>

In [12]:
# query using embedding-based node postprocessor

response = index.query(
    "How much did the author raise in seed funding from Idelle's husband (Julian) for Viaweb?", 
    similarity_top_k=3,
    node_postprocessors=[node_postprocessor]
)

INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 541 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens


In [13]:
display_response(response)

**`Final Response:`** The author raised $10,000 in seed funding from Idelle's husband (Julian) for Viaweb.

---

**`Source Node 1/1`**

**Document ID:** 63e5a213-5ea0-4634-86b7-b321e1e8e4d0<br>**Similarity:** 0.8225337778989594<br>**Text:** date: 2022-04-12

never have to write anything to run on users' computers. We could generate the ...<br>

### Query Index (Lower-Level Usage)

In this example we first get the full set of nodes from a query call, and then send to node postprocessor, and then
finally synthesize response through a list index.

In [18]:
from gpt_index import GPTListIndex

In [19]:
query_str = "How much did the author raise in seed funding from Idelle's husband (Julian) for Viaweb?"

In [20]:
init_response = index.query(
    query_str, 
    similarity_top_k=3,
    response_mode="no_text"
)
resp_nodes = [n.node for n in init_response.source_nodes]

INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens


In [22]:
list_index = GPTListIndex(resp_nodes)
response = list_index.query(query_str, node_postprocessors=[node_postprocessor])

INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 541 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens
