In [1]:
import os
import openai
from dotenv import load_dotenv
import nest_asyncio

In [2]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Patch asyncio so code that starts its own event loop can run inside the
# notebook kernel.
nest_asyncio.apply()

# Read the OpenAI key from the environment; the value is None when unset.
OPENAI_API_KEY=os.environ.get('OPENAI_API_KEY')

# Ingest Data

In [6]:
# NOTE: the code examples assume you're operating within a Jupyter notebook.
# download files
!mkdir data
!wget "https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1" -O data/UBER.zip
!unzip data/UBER.zip -d data

mkdir: cannot create directory ‘data’: File exists
--2024-08-15 13:34:44--  https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 2620:100:6031:18::a27d:5112, 162.125.81.18
Connecting to www.dropbox.com (www.dropbox.com)|2620:100:6031:18::a27d:5112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.dropbox.com/scl/fi/vetj6tgcux8e309swquxs/UBER.zip?rlkey=oy4vx60rplorounkh9wh2waux&dl=1 [following]
--2024-08-15 13:34:45--  https://www.dropbox.com/scl/fi/vetj6tgcux8e309swquxs/UBER.zip?rlkey=oy4vx60rplorounkh9wh2waux&dl=1
Reusing existing connection to [www.dropbox.com]:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc3cf7b63d84829ebba363145277.dl.dropboxusercontent.com/cd/0/inline/CYoDbjFTacz1N_xOuoQ9Lfk3VEIVFM9eW9cBJhsM-DCaNz1TJ4V-UodKOCSou5hfDfubrQPAHg-Eq-y3A4102bNeI4IX7fsinm18rf--CjB69pT7zeHe6SThkOLA7RPoPzc/file?dl=1# [following]
--2024-08-15 13:34:46--  https://uc3cf7b63d8

In [3]:
!pip install llama-hub unstructured --upgrade

Collecting unstructured
  Downloading unstructured-0.15.3-py3-none-any.whl.metadata (29 kB)
Collecting llama-index-core<0.11.0,>=0.10.65 (from llama-index>=0.9.41->llama-hub)
  Using cached llama_index_core-0.10.65-py3-none-any.whl.metadata (2.4 kB)
INFO: pip is looking at multiple versions of llama-index-core to determine which version is compatible with other requirements. This could take a while.
Collecting llama-index-cli<0.2.0,>=0.1.2 (from llama-index>=0.9.41->llama-hub)
  Using cached llama_index_cli-0.1.13-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-agent-openai<0.3.0,>=0.1.4 (from llama-index>=0.9.41->llama-hub)
  Using cached llama_index_agent_openai-0.2.9-py3-none-any.whl.metadata (729 bytes)
Collecting llama-index>=0.9.41 (from llama-hub)
  Using cached llama_index-0.10.65-py3-none-any.whl.metadata (11 kB)
  Downloading llama_index-0.10.64-py3-none-any.whl.metadata (11 kB)
Downloading unstructured-0.15.3-py3-none-any.whl (2.1 MB)
[2K   [38;2;114;156;31m━━━━━

In [3]:
from llama_index.readers.file import UnstructuredReader
from pathlib import Path
import nltk

In [5]:
# Filing years to ingest; one HTML file per year is expected under ./data/UBER.
years=[2022,2021,2020,2019]

# Add the current user's NLTK data directory to the search path instead of a
# machine-specific absolute path. The membership check keeps the list free of
# duplicates when this cell is re-run.
nltk_data_dir=str(Path.home()/'nltk_data')
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

loader=UnstructuredReader()
doc_set={}    # year -> list of documents loaded for that year
all_docs=[]   # every document across all years, in `years` order

for year in years:
    year_docs=loader.load_data(file=Path(f'./data/UBER/UBER_{year}.html'),split_documents=False)
    # Tag each document with its filing year. Note this *replaces* whatever
    # metadata the reader attached, leaving only the 'year' key.
    for d in year_docs:
        d.metadata={'year':year}
    doc_set[year]=year_docs
    all_docs.extend(year_docs)

# Setting up Vector Indices for each year

In [7]:
from llama_index.core import VectorStoreIndex,StorageContext
from llama_index.core import Settings
from llama_index.core import load_index_from_storage

Settings.chunk_size=512

# year -> vector index over that year's filing
index_set={}

# Building an index embeds the whole filing, which is slow and costs API calls.
# Reuse a previously persisted index when one exists and only build (and
# persist) when it is missing. Delete ./storage/<year> to force a rebuild,
# e.g. after changing the documents or Settings.chunk_size.
for year in years:
    persist_dir=f'./storage/{year}'
    persisted=Path(persist_dir)
    if persisted.is_dir() and any(persisted.iterdir()):
        storage_context=StorageContext.from_defaults(persist_dir=persist_dir)
        cur_index=load_index_from_storage(storage_context)
    else:
        storage_context=StorageContext.from_defaults()
        cur_index=VectorStoreIndex.from_documents(
            doc_set[year],
            storage_context=storage_context
        )
        storage_context.persist(persist_dir=persist_dir)
    index_set[year]=cur_index

In [9]:
from llama_index.core import load_index_from_storage


# Rebuild the year -> index mapping from the copies persisted under
# ./storage/<year>, one storage context per year.
index_set = {
    year: load_index_from_storage(
        StorageContext.from_defaults(persist_dir=f'./storage/{year}')
    )
    for year in years
}

In [10]:
from llama_index.core.tools import QueryEngineTool,ToolMetadata

def _query_tool_for_year(year):
    """Wrap the vector index of one filing year as a named query-engine tool."""
    year_engine=index_set[year].as_query_engine()
    year_metadata=ToolMetadata(
        name=f'vector_index_{year}',
        description=f'useful for when you want to answer queries about the {year} SEC 10-K for Uber'
    )
    return QueryEngineTool(query_engine=year_engine,metadata=year_metadata)


# One tool per filing year, in the same order as `years`.
individual_query_engine_tools=[_query_tool_for_year(year) for year in years]

In [11]:
from llama_index.llms.openai import OpenAI
from llama_index.core.query_engine import SubQuestionQueryEngine

# LLM handed to the sub-question engine.
sub_question_llm=OpenAI(model='gpt-3.5-turbo')

# Engine that answers a query by fanning out to the per-year tools.
query_engine=SubQuestionQueryEngine.from_defaults(
    query_engine_tools=individual_query_engine_tools,
    llm=sub_question_llm,
)

# Setting up the Chatbot Agent

In [12]:
# Name and description under which the sub-question engine is exposed as a tool.
sub_question_tool_metadata=ToolMetadata(
    name='sub_question_query_engine',
    description='useful for when you want to answer queries that require analyzing multiple SEC 10-k documents for Uber',
)

# Wrap the sub-question engine as a tool, alongside the per-year ones.
query_engine_tool=QueryEngineTool(
    query_engine=query_engine,
    metadata=sub_question_tool_metadata,
)

In [13]:
# Full toolbox: the per-year tools first, then the cross-year sub-question tool.
tools=[*individual_query_engine_tools,query_engine_tool]

In [14]:
from llama_index.agent.openai import OpenAIAgent

# Build the chat agent over all tools; verbose=True prints each function call
# the agent makes.
agent=OpenAIAgent.from_tools(
    tools=tools,
    verbose=True,
)

In [15]:
# Plain greeting: a message that should not need any tool call.
greeting='Hi, i am Nitish'
response=agent.chat(greeting)
print(response)

Added user message to memory: Hi, i am Nitish
Hello Nitish! How can I assist you today?


In [16]:
# Single-year question about the 2020 filing.
risk_query = "What were some of the biggest risk factors in 2020 for Uber?"
response = agent.chat(risk_query)
print(response)

Added user message to memory: What were some of the biggest risk factors in 2020 for Uber?
=== Calling Function ===
Calling function: vector_index_2020 with args: {"input":"biggest risk factors"}
Got output: The biggest risk factors include the adverse effects of the COVID-19 pandemic on the business, potential reclassification of Drivers, intense competition in the mobility, delivery, and logistics industries, significant losses incurred since inception with uncertain profitability, the need to lower fares and offer incentives to remain competitive, and the reliance on large metropolitan areas for a significant percentage of Gross Bookings which are susceptible to economic, social, and regulatory conditions.

In 2020, some of the biggest risk factors for Uber included the adverse effects of the COVID-19 pandemic on the business, potential reclassification of Drivers, intense competition in the mobility, delivery, and logistics industries, significant losses incurred since inception wi

In [18]:
# Cross-year question: needs information from several filings at once.
cross_query_str = (
    "Compare/contrast the risk factors described in the Uber 10-K across years. "
    "Give answer in bullet points."
)

response = agent.chat(cross_query_str)
print(response)

Added user message to memory: Compare/contrast the risk factors described in the Uber 10-K across years. Give answer in bullet points.
=== Calling Function ===
Calling function: sub_question_query_engine with args: {"input":"Compare and contrast the risk factors described in the Uber 10-K across years"}
Generated 4 sub questions.
[1;3;38;2;237;90;200m[vector_index_2022] Q: What are the risk factors described in the 2022 SEC 10-K for Uber?
[0m[1;3;38;2;90;149;237m[vector_index_2021] Q: What are the risk factors described in the 2021 SEC 10-K for Uber?
[0m[1;3;38;2;11;159;203m[vector_index_2020] Q: What are the risk factors described in the 2020 SEC 10-K for Uber?
[0m[1;3;38;2;155;135;227m[vector_index_2019] Q: What are the risk factors described in the 2019 SEC 10-K for Uber?
[0m[1;3;38;2;237;90;200m[vector_index_2022] A: The risk factors described in the 2022 SEC 10-K for Uber include the potential adverse impact on their business if Drivers were classified as employees, worke