# Creating contextual data for RAG

##### Set up environment

In [1]:
import os

import requests
from dotenv import load_dotenv

load_dotenv()

In [5]:
# sec-api.io token, read from the environment after load_dotenv() has run.
# Evaluates to None when SEC_API_IO_API_KEY is not set.
sec_api_key = os.getenv("SEC_API_IO_API_KEY")

##### API usage example and response sample

In [None]:
# from sec_api import ExtractorApi

# extractor_api = ExtractorApi(api_key=sec_api_key)
# response = extractor_api.get_section("10-K", "AAPL", "0000320193", "2019-01-01", "2020-01-01")

In [21]:
# Filing document to extract from, and the item code of the section to pull.
url = (
    "https://www.sec.gov/Archives/edgar/data/1321655/000132165524000209/pltr-20240930.htm"
)
retrieval_code = "part1item1"

# Built from adjacent single-line literals so the request URL carries no
# newline or indentation (a triple-quoted f-string would embed both).
api_url = (
    "https://api.sec-api.io/extractor"
    f"?url={url}&item={retrieval_code}&type=text&token={sec_api_key}"
)

In [28]:
# Fetch the extracted section. requests applies no timeout by default, so one
# is set explicitly to keep a stalled connection from hanging the kernel.
pltr_finstmt = requests.get(api_url, timeout=60)
# Fail loudly on an HTTP error rather than printing the error payload as if it
# were filing text.
pltr_finstmt.raise_for_status()
print(pltr_finstmt.content.decode())

 PART I - FINANCIAL INFORMATION 

ITEM 1. FINANCIAL STATEMENTS (UNAUDITED) 

Palantir Technologies Inc. 

Condensed Consolidated Balance Sheets 

(in thousands, except per share amounts) 

(unaudited) 

As of September 30, As of December 31, 2024 2023 Assets Current assets: Cash and cash equivalents $ 768,710 &#160; $ 831,047 &#160; Marketable securities 3,795,949 &#160; 2,843,132 &#160; Accounts receivable, net 668,110 &#160; 364,784 &#160; Prepaid expenses and other current assets 119,193 &#160; 99,655 &#160; Total current assets 5,351,962 &#160; 4,138,618 &#160; Property and equipment, net 40,345 &#160; 47,758 &#160; Operating lease right-of-use assets 211,570 &#160; 182,863 &#160; Other assets 164,220 &#160; 153,186 &#160; Total assets $ 5,768,097 &#160; $ 4,522,425 &#160; Liabilities and Equity Current liabilities: Accounts payable $ 27,021 &#160; $ 12,122 &#160; Accrued liabilities 265,244 &#160; 222,991 &#160; Deferred revenue 236,608 &#160; 246,901 &#160; Customer deposits 366,

Note:
- HTML entities (e.g. `&#160;`) are still present in the text
- Table data is flattened into a single run of text

---

### Download (semi-parsed) data for LLM context

This section extracts selected sections from the 10-K and 10-Q filings of Palantir (NASDAQ: PLTR) and JPMorgan Chase (NYSE: JPM) using the Extractor API endpoint from sec-api.io.

Steps to be taken:
- Extract sections from SEC filings
- Decode the remaining HTML entities to get clean text
- Organize the text by company name, filing type, reporting period, and section


In [66]:
# Palantir 10-Q filing documents on SEC EDGAR, listed most recent first.
pltr_10q_urls = [
    "https://www.sec.gov/Archives/edgar/data/1321655/000132165524000135/pltr-20240630.htm",
    "https://www.sec.gov/Archives/edgar/data/1321655/000132165524000071/pltr-20240331.htm",
    "https://www.sec.gov/Archives/edgar/data/1321655/000132165523000118/pltr-20230930.htm",
    "https://www.sec.gov/Archives/edgar/data/1321655/000132165523000090/pltr-20230630.htm",
    "https://www.sec.gov/Archives/edgar/data/1321655/000132165523000044/pltr-20230331.htm",
    "https://www.sec.gov/Archives/edgar/data/1321655/000132165522000032/pltr-20220930.htm",
    "https://www.sec.gov/Archives/edgar/data/1321655/000132165522000006/pltr-20220331.htm",
]

# Palantir 10-K filing documents, most recent first.
pltr_10k_urls = [
    "https://www.sec.gov/Archives/edgar/data/1321655/000132165524000022/pltr-20231231.htm",
    "https://www.sec.gov/Archives/edgar/data/1321655/000132165523000011/pltr-20221231.htm",
]

# JPM 10-Q filing documents, most recent first.
jpm_10q_urls = [
    "https://www.sec.gov/Archives/edgar/data/19617/000001961724000611/jpm-20240930.htm",
    "https://www.sec.gov/Archives/edgar/data/19617/000001961724000453/jpm-20240630.htm",
    "https://www.sec.gov/Archives/edgar/data/19617/000001961724000326/jpm-20240331.htm",
    "https://www.sec.gov/Archives/edgar/data/19617/000001961723000524/jpm-20230930.htm",
    "https://www.sec.gov/Archives/edgar/data/19617/000001961723000432/jpm-20230630.htm",
    "https://www.sec.gov/Archives/edgar/data/19617/000001961723000310/jpm-20230331.htm",
]

# Filing URLs grouped by filing type; the keys match those of `extraction_codes`.
# No JPM 10-K URLs are listed, so the "10k" group contains only Palantir filings.
urls = {"10k": pltr_10k_urls, "10q": pltr_10q_urls + jpm_10q_urls}

Specify the items to extract from each filing type

In [36]:
# 10-Q item codes to request, mapped to the section label stored with each
# downloaded document. The risk-factors item is currently disabled.
extract_item_10q = {
    "part1item1": "financial_statements",
    "part1item2": "md_a",
    # "part2item1a": "risk_factors",
}

# 10-K item codes to request, mapped to the same section labels.
# The risk-factors item is currently disabled here as well.
extract_item_10k = {
    # "1A": "risk_factors",
    "7": "md_a",
    "8": "financial_statements",
}

# Item-code mapping per filing type; keyed like `urls` ("10k" / "10q").
extraction_codes = {"10k": extract_item_10k, "10q": extract_item_10q}

Create helper functions to organize the API response

In [None]:
import hashlib
import html
import re
from datetime import datetime


def parse_sec_url(url: str) -> tuple[str, str] | None:
    co_match = re.search(r"/([^/]+)-", url)
    date_match = re.search(r"(\d{8})(?=\.htm)", url)

    try:
        company_name = co_match.group(1)
        date_str = date_match.group(1)
        date_obj = datetime.strptime(date_str, "%Y%m%d")
        formatted_date = date_obj.strftime("%Y-%m-%d")
        return company_name, formatted_date
    except AttributeError:
        print("SEC filing URL could not be parsed for company name and/or date.")
        return None


def generate_document_id(text: str):
    """Return a short identifier for ``text``.

    The identifier is the first 8 hexadecimal characters of the MD5 digest of
    the encoded text, so equal inputs always map to the same id.
    """
    return hashlib.md5(text.encode()).hexdigest()[:8]

Download and parse into contextual dataset

In [69]:
# Maps each extractor request URL to
# [company, reporting_period, filing_type, section_name, response].
# NOTE(review): the request URL embeds the API token, so avoid displaying the
# keys of this dict in saved outputs.
responses = {}
base_uri = (
    """https://api.sec-api.io/extractor?url={url}&item={code}&type=text&token={api_key}"""
)

for filing_type, filing_urls in urls.items():
    for url in filing_urls:
        # Company and date depend only on the filing URL, so parse once per
        # filing rather than once per extracted item.
        parsed = parse_sec_url(url)
        if parsed is None:
            # parse_sec_url returns None for unparseable URLs; skip the filing
            # instead of failing while unpacking None.
            print(f"Skipping {url}")
            continue
        company, reporting_period = parsed

        for code, name in extraction_codes[filing_type].items():
            api_url = base_uri.format(url=url, code=code, api_key=sec_api_key)
            # requests has no default timeout; set one so a stalled
            # connection cannot hang the whole download loop.
            res = requests.get(api_url, timeout=60)
            if not res.ok:
                # Do not store an error payload as if it were filing text.
                # The message omits api_url because it contains the token.
                print(f"Skipping item {code} of {url}: HTTP {res.status_code}")
                continue
            responses[api_url] = [company, reporting_period, filing_type, name, res]

In [117]:
# Inspect the decoded body of the third stored response.
sample_url = list(responses)[2]

responses[sample_url][-1].content.decode()

" ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA \n\n##TABLE_START Page \n\nReports of Independent Registered Public Accounting Firm (PCAOB ID: 42 ) \n\nConsolidated Balance Sheets \n\nConsolidated Statements of Operations \n\nConsolidated Statements of Comprehensive Income ( Loss ) \n\nConsolidated Statements of Stockholders' Equity \n\nConsolidated Statements of Cash Flows \n\nNotes to Consolidated Financial Statements \n\n##TABLE_END\n\nReport of Independent Registered Public Accounting Firm \n\nTo the Stockholders and the Board of Directors of Palantir Technologies Inc. \n\nOpinion on the Financial Statements \n\nWe have audited the accompanying consolidated balance sheets of Palantir Technologies Inc. (the Company) as of December 31, 2023 and 2022, the related consolidated statements of operations, comprehensive income (loss), stockholders&#8217; equity and cash flows for each of the three years in the period ended December 31, 2023, and the related notes (collectively referr

In [None]:
# One record per downloaded section: metadata, entity-decoded text, and a short
# id derived from the request URL.
# NOTE(review): the request URL includes the API token, so the ids appear to
# change whenever a different token is used — confirm whether that is intended.
documents = [
    {
        "company": company,
        "reporting_period": reporting_period,
        "filing_type": filing_type,
        "section": section,
        "text": html.unescape(response.content.decode()),
        "id": generate_document_id(request_url),
    }
    for request_url, (
        company,
        reporting_period,
        filing_type,
        section,
        response,
    ) in responses.items()
]

In [104]:
documents

[{'company': 'pltr',
  'reporting_period': '2023-12-31',
  'filing_type': '10k',
  'section': 'risk_factors',
  'text': ' ITEM 1A. RISK FACTORS \n\nInvesting in our Class A common stock involves a high degree of risk. You should carefully consider the risks and uncertainties described below, together with all of the other information in this Annual Report on Form 10-K, including the section titled “Management’s Discussion and Analysis of Financial Condition and Results of Operations” and our consolidated financial statements and accompanying notes, before making a decision to invest in our Class A common stock. Our business, financial condition, results of operations, or prospects could also be harmed by risks and uncertainties not currently known to us or that we currently do not believe are material. If any of the risks actually occur, our business, financial condition, results of operations, and prospects could be adversely affected. In that event, the trading price of our Class A c

In [108]:
import json

# Create the output directory if needed so the write also succeeds on a fresh
# checkout where data/ does not exist yet.
os.makedirs("data", exist_ok=True)

# The encoding is set explicitly so the file does not depend on the platform's
# default; json.dump still escapes non-ASCII characters by default.
with open("data/documents_with_ids.json", "w", encoding="utf-8") as f:
    json.dump(documents, f, indent=2)

In [109]:
!head data/documents_with_ids.json

[
  {
    "company": "pltr",
    "reporting_period": "2023-12-31",
    "filing_type": "10k",
    "section": "risk_factors",
    "text": " ITEM 1A. RISK FACTORS \n\nInvesting in our Class A common stock involves a high degree of risk. You should carefully consider the risks and uncertainties described below, together with all of the other information in this Annual Report on Form 10-K, including the section titled \u201cManagement\u2019s Discussion and Analysis of Financial Condition and Results of Operations\u201d and our consolidated financial statements and accompanying notes, before making a decision to invest in our Class A common stock. Our business, financial condition, results of operations, or prospects could also be harmed by risks and uncertainties not currently known to us or that we currently do not believe are material. If any of the risks actually occur, our business, financial condition, results of operations, and prospects could be adversely affected. In that event, the