In [2]:
# %pip install llama-index-llms-openai
# %pip install llama-index-extractors-entity

In [4]:
# %pip install llama-index

In [5]:
import nest_asyncio

# Patch asyncio so the notebook's already-running event loop can be re-entered;
# the sub-question query engines below are created with use_async=True.
nest_asyncio.apply()

import os
import openai

# Keep a key that is already exported in the environment; only fall back to
# the "sk-" placeholder (replace it with a real key) when none is set, so
# re-running this cell never clobbers working credentials.
os.environ.setdefault("OPENAI_API_KEY", "sk-")

In [6]:
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import MetadataMode

In [7]:
# Shared model handed to the metadata extractors and the sub-question
# generator further down.
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0.1,
    max_tokens=512,
)

In [9]:
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    BaseExtractor,
)
from llama_index.extractors.entity import EntityExtractor
from llama_index.core.node_parser import TokenTextSplitter

In [10]:
# First pipeline stage: token-based chunking (512-token chunks, 12-token
# overlap, splitting on single spaces).
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=12, separator=" ")

In [11]:
class CustomExtractor(BaseExtractor):
    """Merge already-extracted title and keywords into a single ``custom`` field.

    This extractor makes no LLM call. It reads ``document_title`` and
    ``excerpt_keywords`` from each node's metadata, so it has to run after
    extractors that populate those keys (for example ``TitleExtractor`` and
    ``KeywordExtractor``). A node missing either key raises ``KeyError``.
    """

    def extract(self, nodes):
        """Return one ``{"custom": "<title>\\n<keywords>"}`` dict per node.

        Args:
            nodes: Nodes whose metadata already holds ``document_title`` and
                ``excerpt_keywords``.

        Returns:
            A list of metadata dicts, in the same order as ``nodes``.

        Raises:
            KeyError: If a node lacks one of the two required metadata keys.
        """
        return [
            {
                "custom": (
                    node.metadata["document_title"]
                    + "\n"
                    + node.metadata["excerpt_keywords"]
                )
            }
            for node in nodes
        ]

    async def aextract(self, nodes):
        """Async entry point required by ``BaseExtractor``.

        ``BaseExtractor`` declares ``aextract`` abstract, so without this
        override the class cannot be instantiated. The work is pure dict
        manipulation, so it simply delegates to :meth:`extract`.
        """
        return self.extract(nodes)


In [13]:
# Extractors that are switched on for this run. Other options that can be
# added to the list:
#   EntityExtractor(prediction_threshold=0.5)
#   SummaryExtractor(summaries=["prev", "self"], llm=llm)
#   KeywordExtractor(keywords=10, llm=llm)
#   CustomExtractor()
extractors = [
    TitleExtractor(llm=llm, nodes=5),
    QuestionsAnsweredExtractor(llm=llm, questions=3),
]

# Chunk first, then run every extractor over the resulting nodes.
transformations = [text_splitter, *extractors]

In [14]:
from llama_index.core import SimpleDirectoryReader

In [17]:
# Load the Uber filing, then keep only the first three documents plus the
# documents at indices 63-68 to limit how much text is processed.
uber_docs = SimpleDirectoryReader(
    input_files=["data/uber_2021.pdf"],
).load_data()
uber_front_pages = uber_docs[:3]
uber_content = uber_docs[63:69]
uber_docs = [*uber_front_pages, *uber_content]

In [18]:
# Inspect the selected front-page and content documents.
uber_docs

[Document(id_='835a8b73-38f1-454a-8c7a-ccd81aa0ed96', embedding=None, metadata={'page_label': '1', 'file_name': 'uber_2021.pdf', 'file_path': 'data\\uber_2021.pdf', 'file_type': 'application/pdf', 'file_size': 1880483, 'creation_date': '2024-06-29', 'last_modified_date': '2024-06-29'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='UNITED STATESSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n____________________________________________ \nFORM\n 10-K____________________________________________ \n(Mark One)\n☒\n ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the fiscal year ended\n December 31, 2021OR\n☐\n TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the tr

In [19]:
from llama_index.core.ingestion import IngestionPipeline

# Chunk the selected Uber documents and run the configured extractors on them.
pipeline = IngestionPipeline(
    transformations=transformations,
)
uber_nodes = pipeline.run(
    documents=uber_docs,
)

100%|██████████| 2/2 [00:01<00:00,  1.27it/s]
100%|██████████| 1/1 [00:01<00:00,  1.01s/it]
100%|██████████| 1/1 [00:01<00:00,  1.12s/it]
100%|██████████| 3/3 [00:00<00:00,  3.18it/s]
100%|██████████| 2/2 [00:00<00:00,  2.09it/s]
100%|██████████| 3/3 [00:01<00:00,  2.91it/s]
100%|██████████| 3/3 [00:00<00:00,  3.49it/s]
100%|██████████| 3/3 [00:01<00:00,  2.80it/s]
100%|██████████| 3/3 [00:01<00:00,  2.26it/s]
100%|██████████| 21/21 [00:12<00:00,  1.69it/s]


In [23]:
# Sample node: the loader's file fields plus the extracted document_title and
# questions_this_excerpt_can_answer entries.
uber_nodes[5].metadata

{'page_label': '64',
 'file_name': 'uber_2021.pdf',
 'file_path': 'data\\uber_2021.pdf',
 'file_type': 'application/pdf',
 'file_size': 1880483,
 'creation_date': '2024-06-29',
 'last_modified_date': '2024-06-29',
 'document_title': 'The Impact of COVID-19 Response Initiatives on Adjusted EBITDA and Financial Reporting: Understanding Exclusions and Reconciliation',
 'questions_this_excerpt_can_answer': '1. How does Uber account for the costs associated with COVID-19 response initiatives, such as financial assistance to drivers and distribution of personal protective equipment?\n2. What are the limitations of using Adjusted EBITDA as a financial measure, according to the document on the impact of COVID-19 response initiatives on financial reporting?\n3. How does Uber exclude certain expenses, such as stock-based compensation and COVID-19 response initiative costs, when calculating Adjusted EBITDA and financial reporting?'}

In [24]:
# Same treatment for the Lyft filing: keep the first three documents plus the
# documents at indices 68-72.
lyft_docs = SimpleDirectoryReader(
    input_files=["data/lyft_2021.pdf"],
).load_data()
lyft_front_pages = lyft_docs[:3]
lyft_content = lyft_docs[68:73]
lyft_docs = [*lyft_front_pages, *lyft_content]

In [25]:
from llama_index.core.ingestion import IngestionPipeline

# Fresh pipeline with the same transformations, this time over the Lyft pages.
pipeline = IngestionPipeline(
    transformations=transformations,
)
lyft_nodes = pipeline.run(
    documents=lyft_docs,
)

100%|██████████| 3/3 [00:01<00:00,  2.61it/s]
100%|██████████| 1/1 [00:00<00:00,  1.11it/s]
100%|██████████| 2/2 [00:00<00:00,  2.08it/s]
100%|██████████| 3/3 [00:00<00:00,  3.15it/s]
100%|██████████| 3/3 [00:01<00:00,  2.09it/s]
100%|██████████| 3/3 [00:01<00:00,  2.46it/s]
100%|██████████| 3/3 [00:00<00:00,  3.10it/s]
100%|██████████| 1/1 [00:00<00:00,  1.39it/s]
100%|██████████| 19/19 [00:10<00:00,  1.80it/s]


In [26]:
# Sample Lyft node, for comparison with the Uber node metadata shown earlier.
lyft_nodes[5].metadata

{'page_label': '3',
 'file_name': 'lyft_2021.pdf',
 'file_path': 'data\\lyft_2021.pdf',
 'file_type': 'application/pdf',
 'file_size': 1440303,
 'creation_date': '2024-06-29',
 'last_modified_date': '2024-06-29',
 'document_title': 'Navigating Risks and Opportunities in the Transportation Industry: A Forward-Looking Perspective',
 'questions_this_excerpt_can_answer': "1. How does Lyft plan to address new and evolving markets, such as Lyft Autonomous, Light Vehicles, Driver Centers, and Lyft Mobile Services?\n2. What are Lyft's strategies for managing risks associated with their Transportation-as-a-Service network, including auto-related and operations-related risks?\n3. How does Lyft plan to maintain, protect, and enhance their intellectual property in the competitive and rapidly changing environment of the transportation industry?"}

In [27]:
from llama_index.core.question_gen import LLMQuestionGenerator
from llama_index.core.question_gen.prompts import (
    DEFAULT_SUB_QUESTION_PROMPT_TMPL,
)


# Sub-question generator whose prompt is the library's default template
# preceded by one extra instruction: every generated sub-question should start
# with 'By first identifying and quoting the most relevant sources, '.
# The literal's leading newline, indentation and trailing spaces are part of
# the prompt text sent to the model.
question_gen = LLMQuestionGenerator.from_defaults(
    llm=llm,
    prompt_template_str="""
        Follow the example, but instead of giving a question, always prefix the question 
        with: 'By first identifying and quoting the most relevant sources, '. 
        """
    + DEFAULT_SUB_QUESTION_PROMPT_TMPL,
)

## Querying an Index With No Extra Metadata

In [36]:
from copy import deepcopy

# Baseline node set: deep copies (so the enriched originals stay untouched)
# whose metadata is cut down to page_label and file_name only.
nodes_no_metadata = deepcopy(uber_nodes) + deepcopy(lyft_nodes)
for node in nodes_no_metadata:
    node.metadata = {
        key: value
        for key, value in node.metadata.items()
        if key in ("page_label", "file_name")
    }


In [39]:
# Show the text the LLM receives for one baseline node (metadata + excerpt).
print(
    "LLM sees:\n",
    nodes_no_metadata[8].get_content(metadata_mode=MetadataMode.LLM),
)

LLM sees:
 [Excerpt from document]
page_label: 65
Excerpt:
-----
millions)
2020 2021 Net cash used in operating ac
tivities$ (2,745) $ (445) Net cash used in investing act
ivities(2,869) (1,201) Net cash provided by financing a
ctivities1,379 1,780 Operating Activities
Net
 cash used in operating activities was $445 million for the year ended December 31, 2021, primarily consisting of $570 million of net loss, adjusted forcertain
 non-cash items, which primarily included $1.7 billion in gain on business divestitures, $1.2 billion of stock-based compensation expense, $1.1 billion ofunrealized
 gain on debt and equity securities, $413 million of gain from sale of investments, depreciation and amortization expense of $902 million, as well as a$477 million decrease
 in cash consumed by working capital. The decrease in cash consumed by working capital and other operating activities was primarily drivenby
 an increase in accrued expenses and other liabilities, an increase in our insurance re

In [30]:
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.tools import QueryEngineTool, ToolMetadata

In [31]:
# Vector index over the baseline nodes, queried with GPT-4 over the ten most
# similar chunks.
index_no_metadata = VectorStoreIndex(nodes=nodes_no_metadata)
engine_no_metadata = index_no_metadata.as_query_engine(
    llm=OpenAI(model="gpt-4"),
    similarity_top_k=10,
)

In [32]:
# Expose the baseline engine as the single tool of a sub-question engine that
# uses the custom question generator defined above.
final_engine_no_metadata = SubQuestionQueryEngine.from_defaults(
    question_gen=question_gen,
    use_async=True,
    query_engine_tools=[
        QueryEngineTool(
            metadata=ToolMetadata(
                description="financial information on companies",
                name="sec_filing_documents",
            ),
            query_engine=engine_no_metadata,
        )
    ],
)

In [34]:
# Baseline run: ask the engine built on nodes that keep only page_label and
# file_name metadata. The reference figures are listed after the print so the
# generated answer can be checked by eye.
response_no_metadata = final_engine_no_metadata.query(
    """
    What was the cost due to research and development v.s. sales and marketing for uber and lyft in 2019 in millions of USD?
    Give your answer as a JSON.
    """
)
print(response_no_metadata.response)
# Correct answer:
# {"Uber": {"Research and Development": 4836, "Sales and Marketing": 4626},
#  "Lyft": {"Research and Development": 1505.6, "Sales and Marketing": 814 }}

Generated 4 sub questions.
[1;3;38;2;237;90;200m[sec_filing_documents] Q: What was the cost due to research and development for Uber in 2019 in millions of USD
[0m[1;3;38;2;90;149;237m[sec_filing_documents] Q: What was the cost due to sales and marketing for Uber in 2019 in millions of USD
[0m[1;3;38;2;11;159;203m[sec_filing_documents] Q: What was the cost due to research and development for Lyft in 2019 in millions of USD
[0m[1;3;38;2;155;135;227m[sec_filing_documents] Q: What was the cost due to sales and marketing for Lyft in 2019 in millions of USD
[0m[1;3;38;2;11;159;203m[sec_filing_documents] A: The context does not provide information on the cost of research and development for Lyft in 2019.
[0m[1;3;38;2;237;90;200m[sec_filing_documents] A: The context does not provide information on the cost of research and development for Uber in 2019.
[0m[1;3;38;2;155;135;227m[sec_filing_documents] A: The context does not provide information on the cost due to sales and marketing

## Querying an Index With Extracted Metadata

In [40]:
# Same node position as in the baseline preview, now with the extracted
# title and questions included in what the LLM receives.
print(
    "LLM sees:\n",
    [*uber_nodes, *lyft_nodes][8].get_content(metadata_mode=MetadataMode.LLM),
)

LLM sees:
 [Excerpt from document]
page_label: 65
file_path: data\uber_2021.pdf
document_title: Financial Performance and Currency Analysis for Uber Technologies, Inc., Summary of Cash Flow Activities for the Year Ended December 31, 2021.
questions_this_excerpt_can_answer: 1. How much net cash was used in operating activities for Uber Technologies, Inc. in the year ended December 31, 2021, and what were the main contributing factors to this amount?
2. What were the key non-cash items that impacted Uber's net cash used in operating activities for the year ended December 31, 2021, and how did they affect the overall cash flow?
3. Can you provide details on the $1.0 billion cash inflow related to a legacy auto insurance transfer mentioned in the excerpt, including how it impacted Uber's operating activities and financial performance for the year?
Excerpt:
-----
millions)
2020 2021 Net cash used in operating ac
tivities$ (2,745) $ (445) Net cash used in investing act
ivities(2,869) (1,201)

In [41]:
# Vector index over the metadata-enriched nodes, configured like the baseline
# engine (GPT-4, ten most similar chunks).
index = VectorStoreIndex(nodes=uber_nodes + lyft_nodes)
engine = index.as_query_engine(
    llm=OpenAI(model="gpt-4"),
    similarity_top_k=10,
)

In [42]:
# Sub-question engine over the metadata-enriched index. The tool name and
# description are kept identical to the baseline engine's, so the extracted
# metadata is the only difference between the two runs being compared.
final_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=[
        QueryEngineTool(
            query_engine=engine,
            metadata=ToolMetadata(
                name="sec_filing_documents",
                # No trailing period: must match the baseline tool description.
                description="financial information on companies",
            ),
        )
    ],
    question_gen=question_gen,
    use_async=True,
)

In [None]:
# Same question as the baseline run, now against the engine built on nodes
# that keep the extracted metadata. The reference figures are listed after
# the print so the generated answer can be checked by eye.
response = final_engine.query(
    """
    What was the cost due to research and development v.s. sales and marketing for uber and lyft in 2019 in millions of USD?
    Give your answer as a JSON.
    """
)
print(response.response)
# Correct answer:
# {"Uber": {"Research and Development": 4836, "Sales and Marketing": 4626},
#  "Lyft": {"Research and Development": 1505.6, "Sales and Marketing": 814 }}

Generated 4 sub questions.
[1;3;38;2;237;90;200m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, What was the cost due to research and development for Uber in 2019 in millions of USD?
[0m[1;3;38;2;90;149;237m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, What was the cost due to sales and marketing for Uber in 2019 in millions of USD?
[0m[1;3;38;2;11;159;203m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, What was the cost due to research and development for Lyft in 2019 in millions of USD?
[0m[1;3;38;2;155;135;227m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, What was the cost due to sales and marketing for Lyft in 2019 in millions of USD?
[0m[1;3;38;2;237;90;200m[sec_filing_documents] A: The provided context does not contain information about the cost due to research and development for Uber in 2019.
[0m[1;3;38;2;11;159;2