In [1]:
'''
Llama Index - used to bind the data and LLM 
Tool kit for summarization and question-answering 

We use data agents for this 

'''

'\nLlama Index - used to bind the data and LLM \nTool kit for summarization and question-answering \n\nWe use data agents for this \n\n'

In [2]:
# 10-K filings example chatbot

import os
from getpass import getpass

# Never store the key in the notebook. Reuse a key that is already exported in
# the environment; only when none is set, prompt for it without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Avoids asynchronous (event-loop) errors when async code is run from inside
# the notebook: nest_asyncio patches asyncio to allow nested event loops.
import nest_asyncio
nest_asyncio.apply()

In [3]:
# set text wrapping
from IPython.display import HTML, display

def set_css(self):
    """Display a small stylesheet that makes ``<pre>`` blocks wrap long lines.

    The single positional argument is accepted but never used; the function
    only renders the ``<style>`` snippet and returns ``None``.
    """
    wrap_pre_css = """
      <style>
        pre {
            white-space: pre-wrap;
        }
      </style>
  """
    display(HTML(wrap_pre_css))

# Hook set_css into IPython's "pre_run_cell" event so the wrapping CSS is
# displayed before each cell runs.
# NOTE(review): re-running this cell likely registers an additional callback
# (set_css is a new function object each time) — confirm and restart the kernel
# if duplicate <style> outputs appear.
get_ipython().events.register("pre_run_cell", set_css)

In [4]:
# download files
!mkdir data_10k
!wget "https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1" -O data_10k/UBER.zip
!unzip data_10k/UBER.zip -d data_10k

mkdir: cannot create directory ‘data_10k’: File exists
--2024-05-27 22:21:31--  https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.6.18, 2620:100:601c:18::a27d:612
Connecting to www.dropbox.com (www.dropbox.com)|162.125.6.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /scl/fi/vetj6tgcux8e309swquxs/UBER.zip?rlkey=oy4vx60rplorounkh9wh2waux&dl=1 [following]
--2024-05-27 22:21:31--  https://www.dropbox.com/scl/fi/vetj6tgcux8e309swquxs/UBER.zip?rlkey=oy4vx60rplorounkh9wh2waux&dl=1
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc649e00428177d70c2680aa5b94.dl.dropboxusercontent.com/cd/0/inline/CTs7HsjS7munveggoORfwzPYDRvZMlCQAH9LHvZnYN4-Y-xez-NoMhSESuitOwlk4qVa28IP8MtLO42ncvMIPAjScBx6dcmdE7NET8Nd7sDmai-70Qprph3HocNn6Ly5__o/file?dl=1# [following]
--2024-05-27 22:21:31--  https://uc649e00428177d70c2680aa5b94.dl.dropboxuserconten

In [5]:
import nltk
# Download the 'averaged_perceptron_tagger' NLTK resource into the local
# nltk_data directory; download() returns True on success.
# NOTE(review): presumably required by the HTML parsing stack used below — confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [46]:
# Load the Uber 10-K filings (one HTML file per fiscal year).
# UnstructuredReader requires the Unstructured library
# (https://github.com/Unstructured-IO/unstructured); this cell reads the files
# with HTMLTagReader instead.

from llama_index.readers.file import UnstructuredReader, HTMLTagReader
from llama_index.core import SimpleDirectoryReader
from pathlib import Path

years = [2022, 2021, 2020, 2019]

# Reader used for the filings below, configured for <div> tags.
loader = HTMLTagReader(tag="div")
# NOTE(review): parser / file_extractor are not referenced by any cell visible
# here; they are kept in case later cells rely on them.
parser = HTMLTagReader()
file_extractor = {".html": parser}

doc_set = {}   # year -> list of documents loaded from that year's filing
all_docs = []  # documents of every year, in the order of `years`
for year in years:
    file = Path(f"data_10k/UBER/UBER_{year}.html")
    doc = loader.load_data(file)
    # Record the filing year on every document so that retrieved chunks can be
    # attributed to the right 10-K; existing reader metadata is preserved.
    for d in doc:
        d.metadata["year"] = year
    doc_set[year] = doc
    all_docs.extend(doc)

In [48]:
# Build one vector index per filing year and persist each one to disk.
from llama_index.core import Settings, StorageContext, VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Global LlamaIndex settings: chunking parameters, chat model, embedding model.
Settings.chunk_size = 512
Settings.chunk_overlap = 64
Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

index_set = {}  # year -> VectorStoreIndex
for year in years:
    # A fresh storage context per year keeps each index in its own directory.
    storage_context = StorageContext.from_defaults()
    cur_index = VectorStoreIndex.from_documents(doc_set[year], storage_context=storage_context)
    index_set[year] = cur_index
    storage_context.persist(persist_dir=f"./storage_10k/{year}")

In [50]:
# Reload the per-year indices from the directories persisted under ./storage_10k
from llama_index.core import load_index_from_storage

index_set = {}  # year -> index restored from disk
for year in years:
    storage_context = StorageContext.from_defaults(persist_dir=f"./storage_10k/{year}")
    cur_index = load_index_from_storage(storage_context)
    index_set[year] = cur_index

In [51]:
# Sub-question querying: a query is decomposed into sub-queries, each of which
# is answered by one of the per-year vector indices wrapped as a tool below.

from llama_index.core.tools import QueryEngineTool, ToolMetadata

individual_query_engine_tools = []
for year in years:
    # One tool per filing year, named after the year so it can be selected.
    individual_query_engine_tools.append(
        QueryEngineTool(
            query_engine=index_set[year].as_query_engine(),
            metadata=ToolMetadata(
                name=f"vector_index_{year}",
                description=f"useful for when you want to answer queries about the {year} SEC 10-K for Uber",
            ),
        )
    )

In [52]:
from llama_index.core.query_engine import SubQuestionQueryEngine

# Query engine built on top of all per-year tools.
query_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools=individual_query_engine_tools)

In [53]:
# Expose the sub-question engine itself as a tool for cross-document queries.
query_engine_tool = QueryEngineTool(
    query_engine=query_engine,
    metadata=ToolMetadata(
        name="sub_question_query_engine",
        description="useful for when you want to answer queries that require analyzing multiple SEC 10-K documents for Uber",
    ),
)

In [54]:
# Full tool list: the per-year tools followed by the sub-question tool.
tools = [*individual_query_engine_tools, query_engine_tool]

In [55]:
from llama_index.agent.openai import OpenAIAgent

# Create the chat agent over the combined tool list.
# NOTE(review): verbose=True appears to be what produces the
# "=== Calling Function ===" trace seen in the recorded outputs — confirm.
agent = OpenAIAgent.from_tools(tools, verbose=True)

In [56]:
# Smoke test: a plain greeting.
response = agent.chat("hi, i am bob")
print(response)

Added user message to memory: hi, i am bob
Hello Bob! How can I assist you today?


In [57]:
# Single-year question about the 2020 filing.
response = agent.chat("What were some of the biggest risk factors in 2020 for Uber?")
print(response)

Added user message to memory: What were some of the biggest risk factors in 2020 for Uber?
=== Calling Function ===
Calling function: vector_index_2020 with args: {"input":"biggest risk factors"}
Got output: The biggest risk factors for the company are typically outlined in the "Risk Factors" section of its annual report or 10-K filing. These factors can vary depending on the company and its industry, but they generally highlight the key challenges and uncertainties that could negatively impact the business, financial condition, or future prospects of the company.

For specific details on the biggest risk factors for Uber in 2020, you may refer to the "Risk Factors" section of their 2020 SEC 10-K filing. This section typically outlines the key challenges and uncertainties that could negatively impact Uber's business, financial condition, or future prospects.


In [59]:
# Cross-year question: compares the risk factors across the filings.
cross_query_str = "Compare/contrast in detail the risk factors described in the Uber 10-K across years. Give answer in bullet points."

response = agent.chat(cross_query_str)
print(response)

Added user message to memory: Compare/contrast in detail the risk factors described in the Uber 10-K across years. Give answer in bullet points.
=== Calling Function ===
Calling function: sub_question_query_engine with args: {"input":"compare and contrast risk factors in Uber 10-K across years"}
Generated 4 sub questions.
[1;3;38;2;237;90;200m[vector_index_2022] Q: What are the risk factors mentioned in the 2022 SEC 10-K for Uber?
[0m[1;3;38;2;90;149;237m[vector_index_2021] Q: What are the risk factors mentioned in the 2021 SEC 10-K for Uber?
[0m[1;3;38;2;11;159;203m[vector_index_2020] Q: What are the risk factors mentioned in the 2020 SEC 10-K for Uber?
[0m[1;3;38;2;155;135;227m[vector_index_2019] Q: What are the risk factors mentioned in the 2019 SEC 10-K for Uber?
[0m[1;3;38;2;11;159;203m[vector_index_2020] A: The risk factors mentioned in the 2020 SEC 10-K for Uber include uncertainties and complexities arising from the regulatory environment, which pose challenges to the 