https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/metadata_extraction/EntityExtractionClimate.ipynb

In [1]:
import nest_asyncio
# Patch asyncio so an event loop can be re-entered while another is already
# running (the notebook kernel runs its own loop).
# NOTE(review): presumably required by async code paths in llama_index — confirm.
nest_asyncio.apply()

In [None]:
import os
from getpass import getpass

from llama_index.llms.openai import OpenAI

# Do not hardcode the key and do not clobber one that is already exported:
# only prompt (without echoing) when the environment has no usable value.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Low temperature keeps answers close to deterministic; max_tokens caps the
# length of each completion.
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo", max_tokens=512)

In [3]:
import re
from llama_index.core.schema import TransformComponent


class TextCleaner(TransformComponent):
    """Pipeline transformation that reduces node text to ASCII letters, digits and spaces.

    Every node in ``nodes`` is modified in place and the same list is returned.
    Newlines, tabs and other non-space whitespace are converted to a single
    space *before* the filter runs, so words separated only by a line break
    stay separate instead of being fused together.
    """

    def __call__(self, nodes, **kwargs):
        """Clean ``node.text`` for each node and return ``nodes``.

        Args:
            nodes: Iterable of objects exposing a writable ``text`` attribute.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The same ``nodes`` object that was passed in, with cleaned text.
        """
        for node in nodes:
            # `[^\S ]` matches any whitespace character except a plain space.
            spaced = re.sub(r"[^\S ]+", " ", node.text)
            # Drop everything that is not an ASCII letter, digit or space.
            node.text = re.sub(r"[^0-9A-Za-z ]", "", spaced)
        return nodes

In [4]:
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
)
from llama_index.core.node_parser import SentenceSplitter

# Ingestion pipeline: clean the raw text, then split it into chunks of 512.
# Optional stages that are currently switched off (append them to the list
# to enable):
#   TokenTextSplitter(chunk_size=512)
#   QuestionsAnsweredExtractor(llm=llm, questions=3)
# NOTE(review): TextCleaner runs first and strips punctuation, so
# SentenceSplitter receives text without sentence-ending punctuation —
# confirm this ordering is intended.
pipeline = IngestionPipeline(
    transformations=[TextCleaner(), SentenceSplitter(chunk_size=512)],
)

In [5]:
from llama_index.core.schema import MetadataMode
from llama_index.core import SimpleDirectoryReader
from rich import print as rprint

# Load the files found in ./data/ (relative to the notebook's working
# directory) as llama_index documents.
documents = SimpleDirectoryReader("./data/").load_data()

In [6]:
# Push the loaded documents through the pipeline and collect the resulting
# nodes. A progress bar is shown while the transformations run.
# NOTE(review): in_place=True presumably lets the pipeline modify the given
# documents rather than copies — confirm against the IngestionPipeline docs.
nodes = pipeline.run(
    documents=documents,
    in_place=True,
    show_progress=True,
)

# Cursor into `nodes`, read and advanced by the node-preview cell.
i = 0

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 81/81 [00:00<00:00, 165.76it/s]


In [7]:
# Print node `i` using the LLM metadata mode.
rprint(nodes[i].get_content(metadata_mode=MetadataMode.LLM))
# Advance the cursor: re-running this cell shows the next node each time
# (and raises IndexError once `i` runs past the last node).
i += 1


In [8]:
from llama_index.core import VectorStoreIndex

# Build a vector index over the cleaned, chunked nodes.
index = VectorStoreIndex(nodes=nodes)
# Query engine with default settings; used by every question below.
engine = index.as_query_engine()

In [None]:
# Print only the answer text (`.response`), matching the other query cells in
# this notebook, rather than the whole response object.
rprint(engine.query("Who is the author of the book?").response)

Paul Graham


In [None]:
# Print only the answer text (`.response`), matching the other query cells in
# this notebook, rather than the whole response object.
rprint(
    engine.query(
        "What inspired the author to switch from studying philosophy to studying AI in college?"
    ).response
)

The author was inspired to switch from studying philosophy to studying AI in college after being drawn into the world of AI through a novel by Heinlein called "The Moon is a Harsh Mistress" and a PBS documentary showing Terry Winograd using SHRDLU.


In [11]:
# Ask the query engine and print just the answer text.
rprint(
    engine.query(
        "What would the author say about art vs. engineering?"
    ).response
)

In [12]:
# Ask the query engine and print just the answer text.
rprint(
    engine.query(
        "Why did the author have to learn italian?"
    ).response
)

In [13]:
# Ask the query engine and print just the answer text.
rprint(
    engine.query(
        "Why the author was in Florence?"
    ).response
)