<a href="https://colab.research.google.com/github/miaomiaozhang20/ec970_spring2024/blob/main/sec_filing_reader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import getpass
import os

# Set your OpenAI API key.
# getpass reads the key without echoing it, so it never appears in the notebook
# source or in saved cell output. An explicit prompt replaces getpass's default
# "Password: " label, which does not say which secret is being requested.
_openai_api_key = getpass.getpass("OpenAI API key: ").strip()
if not _openai_api_key:
    # Fail in this cell instead of storing an empty key that only breaks later.
    raise ValueError("No OpenAI API key entered; re-run this cell and paste your key.")
os.environ["OPENAI_API_KEY"] = _openai_api_key

In [None]:
!pip install -U -q langchain openai chromadb unstructured==0.12.5 instructor tiktoken

# Download 10-K from SEC

In [None]:
from langchain_community.document_loaders import UnstructuredURLLoader

# Location of the filing to analyse (Airbnb's annual report for the period ending 2023-12-31).
url = "https://www.sec.gov/Archives/edgar/data/1559720/000155972024000006/abnb-20231231.htm"

# The User-Agent value is a placeholder: replace it with your own organisation
# and contact e-mail before running.
loader = UnstructuredURLLoader(
    urls=[url],
    headers={'User-Agent': 'your-org your@org.com'},
)
documents = loader.load()

# Stop here if nothing came back, rather than failing later when an empty
# filing is chunked and embedded.
# NOTE(review): the loader presumably logs fetch errors and returns an empty
# list instead of raising — confirm against the installed langchain version.
if not documents:
    raise RuntimeError(f"No content could be loaded from {url}")

# Chunk and store 10-K in vector DB

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter

# Cut the filing into fixed-size token windows; the split ignores section and
# table boundaries.
token_splitter = TokenTextSplitter(
    chunk_size=512,    # tokens per chunk
    chunk_overlap=20,  # tokens repeated between neighbouring chunks
)
docs = token_splitter.split_documents(documents)

In [None]:
# Embed every chunk with OpenAI's embedding model and index the result in Chroma.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
vectorstore = Chroma.from_documents(docs, embedding_model)

# Download Mistral-7B from HuggingFace

In [None]:
!pip install -U -q llama-cpp-python huggingface-hub

In [None]:
import llama_cpp
from llama_cpp import Llama
from llama_cpp.llama_speculative import LlamaPromptLookupDecoding

import instructor

from pydantic import BaseModel
from typing import List
from rich.console import Console
from huggingface_hub import hf_hub_download

# Hugging Face repository and file name of the Q4_K_M GGUF build of
# Mistral-7B-Instruct v0.2.
mistral_path = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
mistral_q4_basename = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"

# Fetch the model file and get its local path.
model_path = hf_hub_download(repo_id=mistral_path, filename=mistral_q4_basename)

llm = Llama(
    model_path=model_path,
    # The number of layers to put on the GPU. The rest will be on the CPU.
    # If you don't know how many layers there are, use -1 to move all of them.
    # This used to read `--1`, a double negation that evaluates to +1 and
    # therefore offloaded a single layer only.
    n_gpu_layers=-1,
    n_batch=2048,  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    n_ctx=2048,
    logits_all=False,
)
llm.verbose = False

# Extract income statements from 10-K

In [None]:
import instructor
from openai import OpenAI
from pydantic import BaseModel
from pydantic import Field
from enum import Enum
from typing import Optional, Union, List

class UnitSuffix(str, Enum):
    # Scale word that accompanies a reported amount. Members are str-valued
    # because of the `str` mixin.
    billion = "Billion"
    million = "Million"
    thousand = "Thousand"
    unknown = ""  # the `unknown` member carries an empty string

class FiscalPeriod(str, Enum):
    # Fiscal-year labels a statement can be tagged with. Members are
    # str-valued because of the `str` mixin.
    fy_2023 = "FY2023"
    fy_2022 = "FY2022"
    fy_2021 = "FY2021"
    fy_2020 = "FY2020"
    unknown = ""  # the `unknown` member carries an empty string

# Schema for one period of the income statement.
# Every line item is a pair of fields: the amount (a float or a string) and an
# optional `<item>_unit` giving its scale as a UnitSuffix.
class IncomeStatement(BaseModel):
    # NOTE(review): documented with comments rather than a docstring, because
    # pydantic is expected to copy a class docstring into the generated JSON
    # schema — confirm before converting.
    period: Optional[FiscalPeriod]

    # --- Revenue and cost of revenue ---
    revenue: Union[float, str] = Field(description="Revenue")
    revenue_unit: Optional[UnitSuffix]

    cost_of_revenue: Union[float, str] = Field(description="Cost of revenue")
    cost_of_revenue_unit: Optional[UnitSuffix]

    income_from_operations: Union[float, str] = Field(description="Income from operations")
    income_from_operations_unit: Optional[UnitSuffix]

    # --- Operating expense lines ---
    operations_and_support: Union[float, str] = Field(description="Operations and support")
    operations_and_support_unit: Optional[UnitSuffix]

    product_development: Union[float, str] = Field(description="Product development")
    product_development_unit: Optional[UnitSuffix]

    sales_and_marketing: Union[float, str] = Field(description="Sales and marketing")
    sales_and_marketing_unit: Optional[UnitSuffix]

    general_and_administrative: Union[float, str] = Field(description="General and administrative")
    general_and_administrative_unit: Optional[UnitSuffix]

    # --- Interest and other items ---
    interest_income: Union[float, str] = Field(description="Interest income")
    interest_income_unit: Optional[UnitSuffix]

    interest_expense: Union[float, str] = Field(description="Interest expense")
    interest_expense_unit: Optional[UnitSuffix]

    other_income: Union[float, str] = Field(description="Other income")
    other_income_unit: Optional[UnitSuffix]

    # --- Bottom line ---
    net_income: Union[float, str] = Field(description="Net income")
    net_income_unit: Optional[UnitSuffix]

class IncomeStatements(BaseModel):
    # List wrapper so that one response can carry several IncomeStatement entries.
    income_statements: List[IncomeStatement]

In [None]:
# Look up the chunk(s) most similar to the income-statement heading and join
# their text into the context string handed to the LLM.
query = "Consolidated Statements of Operations (in millions)"
k = 1  # number of chunks to retrieve from the vector DB
top_k_docs = vectorstore.similarity_search(query, k=k)
context = "\n".join(doc.page_content for doc in top_k_docs)

In [None]:
import time

# Patch llama.cpp's OpenAI-style chat-completion call with instructor so that
# `create` accepts a `response_model` and returns an instance of it.
create = instructor.patch(
    create=llm.create_chat_completion_openai_v1,
    mode=instructor.Mode.JSON_SCHEMA,
)

# perf_counter() is a monotonic clock meant for measuring intervals, so the
# reported duration cannot be skewed by wall-clock adjustments during the run.
start = time.perf_counter()

income_statements = create(
    response_model=instructor.Partial[IncomeStatements],
    messages=[
        {
            "role": "user",
            "content": f"Extract Airbnb's income statement from 2023, 2022, and 2021 from following context: {context}",
        },
    ],
)

print(f"Took {time.perf_counter() - start} seconds to complete!")
print(income_statements.model_dump_json(indent=2))

Took 202.3725185394287 seconds to complete!
{
  "income_statements": [
    {
      "period": "FY2021",
      "revenue": 5992.0,
      "revenue_unit": "Million",
      "cost_of_revenue": 1156.0,
      "cost_of_revenue_unit": "Million",
      "income_from_operations": 429.0,
      "income_from_operations_unit": "Million",
      "operations_and_support": 847.0,
      "operations_and_support_unit": "Million",
      "product_development": 1425.0,
      "product_development_unit": "Million",
      "sales_and_marketing": 1186.0,
      "sales_and_marketing_unit": "Million",
      "general_and_administrative": 836.0,
      "general_and_administrative_unit": "Million",
      "interest_income": 13.0,
      "interest_income_unit": "Million",
      "interest_expense": -438.0,
      "interest_expense_unit": "Million",
      "other_income": -304.0,
      "other_income_unit": "Million",
      "net_income": -352.0,
      "net_income_unit": "Million"
    },
    {
      "period": "FY2022",
      "revenue

# Extract balance sheets from 10-K

In [None]:
# Schema for one period of the balance sheet.
# Every line item is a pair of fields: the amount (a float or a string) and an
# optional `<item>_unit` giving its scale as a UnitSuffix.
class BalanceSheet(BaseModel):
    # NOTE(review): documented with comments rather than a docstring, because
    # pydantic is expected to copy a class docstring into the generated JSON
    # schema — confirm before converting.
    period: Optional[FiscalPeriod]

    # --- Assets ---
    cash_and_cash_equivalents: Union[float, str] = Field(description="Cash and cash equivalents")
    cash_and_cash_equivalents_unit: Optional[UnitSuffix]

    short_term_investments: Union[float, str] = Field(description="Short-term investments")
    short_term_investments_unit: Optional[UnitSuffix]

    total_current_assets: Union[float, str] = Field(description="Total current assets")
    total_current_assets_unit: Optional[UnitSuffix]

    goodwill: Union[float, str] = Field(description="Goodwill")
    goodwill_unit: Optional[UnitSuffix]

    total_assets: Union[float, str] = Field(description="Total assets")
    total_assets_unit: Optional[UnitSuffix]

    # --- Liabilities ---
    accrued_expenses: Union[float, str] = Field(description="Accrued expenses")
    accrued_expenses_unit: Optional[UnitSuffix]

    funds_payable: Union[float, str] = Field(description="Funds payable")
    funds_payable_unit: Optional[UnitSuffix]

    unearned_fees: Union[float, str] = Field(description="Unearned fees")
    unearned_fees_unit: Optional[UnitSuffix]

    total_current_liabilities: Union[float, str] = Field(description="Total current liabilities")
    total_current_liabilities_unit: Optional[UnitSuffix]

    long_term_debt: Union[float, str] = Field(description="Long-term debt")
    long_term_debt_unit: Optional[UnitSuffix]

    operating_lease_liabilities: Union[float, str] = Field(description="Operating lease liabilities")
    operating_lease_liabilities_unit: Optional[UnitSuffix]

    other_liabilities: Union[float, str] = Field(description="Other liabilities")
    other_liabilities_unit: Optional[UnitSuffix]

    total_liabilities: Union[float, str] = Field(description="Total liabilities")
    total_liabilities_unit: Optional[UnitSuffix]

    # --- Equity ---
    stockholders_equity: Union[float, str] = Field(description="Stockholders' equity")
    stockholders_equity_unit: Optional[UnitSuffix]

class BalanceSheets(BaseModel):
    # List wrapper so that one response can carry several BalanceSheet entries.
    balance_sheets: List[BalanceSheet]

In [None]:
# Look up the chunk(s) most similar to the balance-sheet query and join their
# text into the context string handed to the LLM.
query = "Extract Consolidated Balance Sheets with total debt, current liabilities"
k = 1  # number of chunks to retrieve from the vector DB
top_k_docs = vectorstore.similarity_search(query, k=k)
context = "\n".join(doc.page_content for doc in top_k_docs)

In [None]:
import time

# Patch llama.cpp's OpenAI-style chat-completion call with instructor so that
# `create` accepts a `response_model` and returns an instance of it.
create = instructor.patch(
    create=llm.create_chat_completion_openai_v1,
    mode=instructor.Mode.JSON_SCHEMA,
)

# perf_counter() is a monotonic clock meant for measuring intervals, so the
# reported duration cannot be skewed by wall-clock adjustments during the run.
start = time.perf_counter()

balance_sheets = create(
    response_model=instructor.Partial[BalanceSheets],
    messages=[
        {
            "role": "user",
            "content": f"Extract Airbnb's balance sheet from 2023, 2022, and 2021 from following context: {context}",
        },
    ],
)
print(f"Took {time.perf_counter() - start} seconds to complete!")
print(balance_sheets.model_dump_json(indent=2))

Took 238.2173011302948 seconds to complete!
{
  "balance_sheets": [
    {
      "period": "FY2021",
      "cash_and_cash_equivalents": 7378.0,
      "cash_and_cash_equivalents_unit": null,
      "short_term_investments": 2244.0,
      "short_term_investments_unit": null,
      "total_current_assets": 14861.0,
      "total_current_assets_unit": null,
      "goodwill": 684.0,
      "goodwill_unit": null,
      "total_assets": 16038.0,
      "total_assets_unit": null,
      "accrued_expenses": 2013.0,
      "accrued_expenses_unit": null,
      "funds_payable": 4783.0,
      "funds_payable_unit": null,
      "unearned_fees": 1182.0,
      "unearned_fees_unit": null,
      "total_current_liabilities": 7978.0,
      "total_current_liabilities_unit": null,
      "long_term_debt": 1987.0,
      "long_term_debt_unit": null,
      "operating_lease_liabilities": 295.0,
      "operating_lease_liabilities_unit": null,
      "other_liabilities": 218.0,
      "other_liabilities_unit": null,
      "to

# Extract cash flow statements from 10-K

In [None]:
# Schema for one period of the cash flow statement.
# Every line item is a pair of fields: the amount (a float or a string) and an
# optional `<item>_unit` giving its scale as a UnitSuffix.
class CashFlowStatement(BaseModel):
    # NOTE(review): documented with comments rather than a docstring, because
    # pydantic is expected to copy a class docstring into the generated JSON
    # schema — confirm before converting.
    period: Optional[FiscalPeriod]

    # --- Net income and non-cash adjustments ---
    net_income: Union[float, str] = Field(description="Net income")
    net_income_unit: Optional[UnitSuffix]

    depreciation_and_amortization: Union[float, str] = Field(description="Depreciation and amortization")
    depreciation_and_amortization_unit: Optional[UnitSuffix]

    stock_based_compensation: Union[float, str] = Field(description="Stock-based compensation")
    stock_based_compensation_unit: Optional[UnitSuffix]

    # --- Net cash by activity ---
    net_cash_from_operating_activities: Union[float, str] = Field(description="Net cash from operating activities")
    net_cash_from_operating_activities_unit: Optional[UnitSuffix]

    net_cash_from_investing_activities: Union[float, str] = Field(description="Net cash from investing activities")
    net_cash_from_investing_activities_unit: Optional[UnitSuffix]

    net_cash_from_financing_activities: Union[float, str] = Field(description="Net cash from financing activities")
    net_cash_from_financing_activities_unit: Optional[UnitSuffix]

class CashFlowStatements(BaseModel):
    # List wrapper so that one response can carry several CashFlowStatement entries.
    cash_flow_statements: List[CashFlowStatement]

In [None]:
# Look up the chunk(s) most similar to the cash-flow statement heading and join
# their text into the context string handed to the LLM.
query = "Airbnb, Inc.\nConsolidated Statements of Cash Flows\n(in millions)"
k = 1  # number of chunks to retrieve from the vector DB
top_k_docs = vectorstore.similarity_search(query, k=k)
context = "\n".join(doc.page_content for doc in top_k_docs)

In [None]:
import time

# Patch llama.cpp's OpenAI-style chat-completion call with instructor so that
# `create` accepts a `response_model` and returns an instance of it.
create = instructor.patch(
    create=llm.create_chat_completion_openai_v1,
    mode=instructor.Mode.JSON_SCHEMA,
)

# perf_counter() is a monotonic clock meant for measuring intervals, so the
# reported duration cannot be skewed by wall-clock adjustments during the run.
start = time.perf_counter()

cash_flow_statements = create(
    response_model=instructor.Partial[CashFlowStatements],
    messages=[
        {
            "role": "user",
            "content": f"Extract Airbnb's cash flow statement from 2023, 2022, and 2021 from following context: {context}",
        },
    ],
)
print(f"Took {time.perf_counter() - start} seconds to complete!")
print(cash_flow_statements.model_dump_json(indent=2))