In [39]:
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage.index_store import SimpleIndexStore
from llama_index.core.vector_stores.simple import SimpleVectorStore
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler, EventPayload
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.response.notebook_utils import display_response
from llama_index.core.response.pprint_utils import pprint_response, pprint_metadata, pprint_source_node
from llama_index.llms.openai import OpenAI
from llama_index.llms.openai.utils import ALL_AVAILABLE_MODELS, CHAT_MODELS
from llama_index.core import Settings

# Directory the docstore, index store and vector store are loaded from.
PERSIST_DB_DIR = "../db/db_storage/"

# Register "gpt-4o-mini" in both OpenAI model tables exposed by llama-index.
# NOTE(review): presumably required because the installed llama-index release
# does not list this model yet, and 128000 looks like its context window
# size in tokens -- confirm against the installed version.
for _model_table in (ALL_AVAILABLE_MODELS, CHAT_MODELS):
    _model_table["gpt-4o-mini"] = 128000

# Set the embedding model on the global llama-index Settings object.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")


In [34]:
def load_index(
        docstore=None,
        index_store=None,
        vector_store=None,
        embedding_model=None,
        persist_db_dir=PERSIST_DB_DIR
    ):
    """Load the persisted vector index stored under ``persist_db_dir``.

    Args:
        docstore: Class or instance providing ``from_persist_dir`` for the
            document store. Defaults to ``SimpleDocumentStore``.
        index_store: Class or instance providing ``from_persist_dir`` for the
            index store. Defaults to ``SimpleIndexStore``.
        vector_store: Class or instance providing ``from_persist_dir`` for the
            vector store. Defaults to ``SimpleVectorStore``.
        embedding_model: Optional embedding model forwarded to the loaded
            index as ``embed_model``. When ``None`` nothing is forwarded and
            the library's own default applies (presumably
            ``Settings.embed_model`` -- confirm).
        persist_db_dir: Directory the three stores were persisted to.

    Returns:
        The index stored under the id ``"vector_index"``.
    """
    # Resolve defaults at call time. The previous signature instantiated the
    # stores (and read Settings.embed_model) once at definition time, which
    # shares objects between calls and freezes whatever was configured then.
    docstore = SimpleDocumentStore if docstore is None else docstore
    index_store = SimpleIndexStore if index_store is None else index_store
    vector_store = SimpleVectorStore if vector_store is None else vector_store

    storage_context = StorageContext.from_defaults(
        docstore=docstore.from_persist_dir(persist_db_dir),
        vector_store=vector_store.from_persist_dir(persist_db_dir, namespace="default"),
        index_store=index_store.from_persist_dir(persist_db_dir),
    )

    # Only forward the embedding model when the caller supplied one, so the
    # default call path behaves exactly as before.
    index_kwargs = {}
    if embedding_model is not None:
        index_kwargs["embed_model"] = embedding_model

    return load_index_from_storage(
        storage_context, index_id="vector_index", **index_kwargs
    )

# Load the index persisted under PERSIST_DB_DIR and bind it at notebook level.
vector_index = load_index(persist_db_dir=PERSIST_DB_DIR)

In [35]:
# Build a query engine on the loaded index, answering with gpt-4o-mini.
llm = OpenAI(model="gpt-4o-mini")
query_engine = vector_index.as_query_engine(llm=llm)

In [37]:
# Run a sample question through the query engine and keep the response.
question = "What are the highlights of the lastest llamaindex newsletter??"
res = query_engine.query(question)

In [40]:
# Render the answer along with its source nodes and their metadata.
display_response(
    res,
    show_source=True,
    show_source_metadata=True,
)

**`Final Response:`** The latest LlamaIndex newsletter features exciting updates, in-depth guides, demos, educational tutorials, and webinars aimed at enhancing user experience and understanding of the platforms and tools.

---

**`Source Node 1/2`**

**Node ID:** 151c9156-9ba0-47f2-bf33-f6df229b9747<br>**Similarity:** 0.7101944796118811<br>**Text:** Hello Llama Fans🦙
Step into this week's edition of the LlamaIndex newsletter, where we bring you ...<br>**Metadata:** {'title': 'LlamaIndex Newsletter 2024-06-11', 'link': 'https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-06-11', 'date': 'Jun 11, 2024', 'tags': '[]'}<br>

---

**`Source Node 2/2`**

**Node ID:** c848514e-c711-4bd5-9d62-9cc82d5169f9<br>**Similarity:** 0.6910661516602047<br>**Text:** Hello to All Llama Lovers!🦙
Welcome to this week’s issue of the LlamaIndex newsletter! This editi...<br>**Metadata:** {'title': 'LlamaIndex Newsletter 2024-06-25', 'link': 'https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-06-25', 'date': 'Jun 25, 2024', 'tags': '[]'}<br>

In [41]:
from llama_index.core.vector_stores import MetadataInfo, VectorStoreInfo
import datetime

# Today's date rendered in the same "MMM dd, yyyy" shape that the `date`
# metadata field is declared to use, instead of a microsecond-precision
# timestamp that contradicts that format and changes on every run.
today_str = datetime.date.today().strftime("%b %d, %Y")

# Description of the indexed content and its filterable metadata, handed to
# the auto-retriever below.
vector_store_info = VectorStoreInfo(
    # The index holds crawled LlamaIndex blog posts / newsletters (see the
    # node metadata: title, link, date, tags), not receipts.
    content_info="LlamaIndex blog posts and newsletters",
    metadata_info=[
        MetadataInfo(
            name="date",
            description="The date the blog post was published",
            type=f"date in MMM dd, yyyy format. Today is {today_str}",
        ),
    ],
)

In [42]:
from llama_index.core.retrievers import VectorIndexAutoRetriever

# Auto-retriever over the loaded index, configured with `vector_store_info`.
# The LLM is passed explicitly so this step uses the same gpt-4o-mini model
# as the query engine instead of whatever `Settings.llm` happens to be.
retriever = VectorIndexAutoRetriever(
    vector_index,
    vector_store_info=vector_store_info,
    llm=OpenAI(model="gpt-4o-mini"),
    similarity_top_k=2,
    empty_query_top_k=10,  # if only metadata filters are specified, this is the limit
    verbose=True,  # prints the inferred query string and filters
)

In [48]:

# Date-specific question; with verbose=True the retriever prints the query
# string and metadata filters it inferred.
date_question = "The newsletter in 16 Jul 2024?"
nodes = retriever.retrieve(date_question)

Using query str: newsletter
Using filters: [('date', '==', 'Jul 16, 2024')]


In [49]:
# Bare expression: the notebook renders the repr of the retrieved nodes.
nodes

[NodeWithScore(node=TextNode(id_='2fd697ef-aec5-42a6-b2fe-356f99b19203', embedding=None, metadata={'title': 'LlamaIndex Newsletter 2024-07-16', 'link': 'https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-07-16', 'date': 'Jul 16, 2024', 'tags': '[]'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=['file_name'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='fef0c07c-6811-464d-8784-db468bef5097', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'title': 'LlamaIndex Newsletter 2024-07-16', 'link': 'https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-07-16', 'date': 'Jul 16, 2024', 'tags': '[]'}, hash='20b71507f27148a1973ab69881603705bfade99570303d5e500237eef61912f5'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='ce3296ae-cb81-456c-8272-4eb1ad3f3783', node_type=<ObjectType.TEXT: '1'>, metadata={'Header_2': '🤩\xa0The highlights:'}, hash='f75093d8a4531286f507cc040ff1f86232320011a68285eaef4903c93fd906ea')}, text='Hello, Llama Family

# Llama Index Blog Crawler and Query Engine

This project provides a tool to crawl blog posts, initialize and load an index, and query the index using a command-line interface. It supports re-crawling the blog, re-indexing, and running evaluations with optional retry support.

## Features

- Crawl blog posts and store them in a local database.
- Initialize and load an index from local storage.
- Query the index using a command-line interface.
- Optional evaluation mode with retry support for generating answers.

## Requirements

- Python 3.9
- Required Python packages (install via `requirements.txt`)

## Installation

Install the required packages:
```sh
pip install -r requirements.txt
```

## Usage

### API Key
This project uses OpenAI. You must store your `OPENAI_API_KEY` in a `.env` file:

```
OPENAI_API_KEY=
```

### Command-Line Arguments

- `--re-crawl`: Re-crawl the blog before running queries.
- `--eval`: Run evaluation on the test set.

### Running the Script

To run the script, use the following command:
```sh
python main.py [--re-crawl] [--eval]
```

### Example
1. Interact with the RAG via Q&A
```sh
python main.py
```

2. Re-crawl the blog and re-index
```sh
python main.py --re-crawl
```

3. Run evaluation using RAGAS
```sh
python main.py --eval
```

