In [1]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.readers.web import SimpleWebPageReader
from rich import print as rprint
from llama_index.core.schema import MetadataMode
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.schema import Document

In [2]:
# Patch asyncio so event loops can be nested.
# NOTE(review): presumably required because the notebook kernel already runs
# an event loop while later LlamaIndex calls start their own — confirm.
import nest_asyncio
nest_asyncio.apply()

In [None]:
import os
from getpass import getpass

from llama_index.llms.openai import OpenAI

# Do not hard-code the API key in the notebook, and do not clobber a key that
# is already exported in the environment (assigning "" unconditionally would
# replace a valid key with an empty one and break every OpenAI call below).
# Prompt for it only when it is missing or empty; getpass keeps the value out
# of the saved notebook source and outputs.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# LLM handed to the KeywordExtractor in the ingestion pipeline: low
# temperature, answers capped at 512 tokens.
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo", max_tokens=512)

### 1. Identify the dataset to analyze

In [4]:
# Download the node-parser module guide and load it as documents; the reader
# is configured with html_to_text=True.
source_urls = [
    "https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/"
]
web_reader = SimpleWebPageReader(html_to_text=True)
documents = web_reader.load_data(source_urls)

### 2. Identify a pipeline

In [5]:
def clean_metadata(doc):
    """Set the metadata keys that are excluded from the LLM view of ``doc``.

    Assigns a fresh list of key names to ``doc.excluded_llm_metadata_keys``
    (the three header levels plus the extracted keywords) and returns the
    same object, mutated in place, so the function can be used with ``map``.
    """
    hidden_from_llm = ["Header_1", "Header_2", "Header_3", "excerpt_keywords"]
    doc.excluded_llm_metadata_keys = hidden_from_llm
    return doc

In [6]:
# Apply clean_metadata to every loaded document (each one is updated in place
# and the list is rebuilt from the returned objects).
documents = [clean_metadata(doc) for doc in documents]

In [7]:
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.extractors import (
    KeywordExtractor,
)
from llama_index.core.node_parser import SentenceSplitter

# The transformations are passed to the pipeline in this order: markdown
# parsing, sentence splitting into 512-sized chunks with an overlap of 20,
# then keyword extraction (keywords=3) using the `llm` configured earlier.
markdown_parser = MarkdownNodeParser()
sentence_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=20)
keyword_extractor = KeywordExtractor(keywords=3, llm=llm)

pipeline = IngestionPipeline(
    transformations=[markdown_parser, sentence_splitter, keyword_extractor]
)
nodes = pipeline.run(documents=documents)

# Initialise the cursor `i` that the inspection cell uses to index `nodes`.
i = 0

100%|██████████| 87/87 [00:13<00:00,  6.45it/s]


### 3. Filter nodes

In [13]:
# Position the browsing cursor: the inspection cell below shows nodes[i].
i = 20

In [70]:
# Show node `i` rendered with MetadataMode.LLM, then advance the cursor so
# that re-running this cell steps to the next node.
llm_view = nodes[i].get_content(metadata_mode=MetadataMode.LLM)
rprint(llm_view)
i += 1

In [9]:
# Keep only the nodes whose metadata contains a "Header_3" entry, reporting
# the node count before and after the filter.
print(f"Start with {len(nodes)} nodes")
nodes_filtered = [node for node in nodes if "Header_3" in node.metadata]
print(f"Filtered to {len(nodes_filtered)} nodes")

Start with 87 nodes
Filtered to 11 nodes


### 4. Create the index

In [10]:
# Build the vector index from the filtered nodes only, so the full `nodes`
# list is not what gets indexed.
index = VectorStoreIndex(nodes=nodes_filtered)
# Query engine created with default arguments.
# NOTE(review): the `llm` configured earlier is not passed here, so the
# engine presumably falls back to the library default — confirm this is intended.
engine = index.as_query_engine()

In [None]:
# Ask the query engine a question. Despite its name, `query` holds the
# object returned by the engine, whose `.response` attribute is read later.
question = "How can i manage a html page with llama index?"
query = engine.query(question)

In [12]:
# Pretty-print the `.response` attribute of the object returned by engine.query.
rprint(query.response)