app/config.py

In [1]:
from config import *

✅ Config loaded
  BASE_DIR: /workspace/notebooks
  VECTOR_DB_PATH: /workspace/data/chroma


In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

SEC_HEADERS = {
    "User-Agent": os.getenv("SEC_USER_AGENT")
}

CHROMA_DIR = os.getenv("CHROMA_PERSIST_DIR", "/workspace/data/chroma")
OLLAMA_HOST = "http://ollama:11434/"
BATCH_SIZE = 128

In [3]:
print(OLLAMA_HOST)

http://ollama:11434/


## app/sec_client.py

In [4]:
import json
import requests
from pathlib import Path

BASE_DATA = "https://data.sec.gov"
BASE_ARCHIVE = "https://www.sec.gov"
SUBMISSIONS_DIR = Path("/workspace/data/submissions")
IMAGES_DIR = Path("/workspace/data/sec_images")

def get_submissions(
    cik: str,
    use_remote_fallback: bool = True,
    save_if_downloaded: bool = True,
):
    """
    Load SEC submissions JSON from local cache if available,
    otherwise optionally fetch from SEC and cache locally.
    """
    cik = cik.zfill(10)
    SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)

    local_path = SUBMISSIONS_DIR / f"CIK{cik}.json"

    # 1️⃣ Try local file
    if local_path.exists():
        with open(local_path, "r", encoding="utf-8") as f:
            return json.load(f)

    # 2️⃣ Fallback to SEC
    if not use_remote_fallback:
        raise FileNotFoundError(f"Local submissions file not found: {local_path}")

    url = f"{BASE_DATA}/submissions/CIK{cik}.json"
    resp = requests.get(url, headers=SEC_HEADERS, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    # 3️⃣ Save locally
    if save_if_downloaded:
        SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)
        with open(local_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    return data

def get_filing_index(cik: str, accession: str):
    acc_no_dash = accession.replace("-", "")
    url = f"{BASE_ARCHIVE}/Archives/edgar/data/{int(cik)}/{acc_no_dash}/index.json"
    resp = requests.get(url, headers=SEC_HEADERS)

    if resp.status_code == 200:
        return resp.json()

    raise FileNotFoundError(
        f"Filing index not found for accession {accession}"
    )

def download_filing(cik: str, accession: str, filename: str):
    acc = accession.replace("-", "")
    url = f"{BASE_ARCHIVE}/Archives/edgar/data/{int(cik)}/{acc}/{filename}"
    return requests.get(url, headers=SEC_HEADERS).text


## app/parser.py

In [5]:
SEC_ITEM_MAP = {
    # ───────────── Part I ─────────────
    "Item 1": "Business",
    "Item 1A": "Risk Factors",
    "Item 1B": "Unresolved Staff Comments",
    "Item 2": "Properties",
    "Item 3": "Legal Proceedings",
    "Item 4": "Mine Safety Disclosures",

    # ───────────── Part II ─────────────
    "Item 5": "Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "Item 6": "Reserved",
    "Item 7": "Management’s Discussion and Analysis of Financial Condition and Results of Operations",
    "Item 7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "Item 8": "Financial Statements and Supplementary Data",
    "Item 9": "Changes in and Disagreements with Accountants on Accounting and Financial Disclosure",
    "Item 9A": "Controls and Procedures",
    "Item 9B": "Other Information",

    # ───────────── Part III ─────────────
    "Item 10": "Directors, Executive Officers and Corporate Governance",
    "Item 11": "Executive Compensation",
    "Item 12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "Item 13": "Certain Relationships and Related Transactions, and Director Independence",
    "Item 14": "Principal Accountant Fees and Services",

    # ───────────── Part IV ─────────────
    "Item 15": "Exhibits, Financial Statement Schedules"
}


In [6]:
def html_table_to_json(table):
    """
    Converts an HTML <table> into a structured JSON object
    suitable for LLM understanding or structured querying.
    """
    rows = []
    headers = []

    for i, row in enumerate(table.find_all("tr")):
        cells = [c.get_text(strip=True) for c in row.find_all(["th", "td"])]

        if not cells:
            continue

        if i == 0:
            headers = cells
        else:
            if headers and len(cells) == len(headers):
                rows.append(dict(zip(headers, cells)))
            else:
                rows.append({"row": cells})

    return {
        "type": "table",
        "headers": headers,
        "rows": rows,
    }


In [7]:
import re

FINANCIAL_KEYWORDS = {
    "revenue", "net sales", "income", "profit", "loss",
    "assets", "liabilities", "equity", "cash flow",
    "operating", "gross margin", "cost of sales",
    "usd", "dollars", "millions"
}

YEAR_PATTERN = re.compile(r"\b(19|20)\d{2}\b")
    
def classify_table(table_json):
    """
    Classifies SEC tables into semantic categories.
    """

    headers_text = table_json.get("headers", [])
    rows_text = table_json.get("rows", [])

    text = f"{headers_text} {rows_text}"

    has_years = bool(YEAR_PATTERN.search(text))
    has_numbers = sum(char.isdigit() for char in text) > 20
    has_financial_terms = any(k in text for k in FINANCIAL_KEYWORDS)

    # --- Classification rules ---
    if has_financial_terms and has_years and has_numbers:
        return "financial"

    if has_numbers and any(t in text for t in ["salary", "bonus", "stock", "option"]):
        return "compensation"

    if has_numbers and not has_financial_terms:
        return "entity"

    if has_numbers:
        return "entity2"

    if not has_numbers:
        return "policy"

    return "unknown"


In [8]:
ITEM_PATTERN = re.compile(
    r"\n\s*(ITEM\s+\d+[A-Z]?)\.\s+",
    re.IGNORECASE
)

def contains_item_code(text: str) -> bool:
    return bool(ITEM_PATTERN.search(text))

def normalize_text(text: str) -> str:
    re.sub(r"\n{3,}", "\n\n", text)
    return text.replace("\xa0", " ").strip()

In [None]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
import re
import hashlib
import requests

os.makedirs(IMAGES_DIR, exist_ok=True)
   
def extract_text_tables_images(
    html: str,
    base_url: str,
):
    """
    Extracts:
    - narrative text (tables + images removed)
    - tables as structured JSON
    - images with persistent storage + metadata

    Returns:
        text: str
        tables_json: list[dict]
        images: list[dict]
    """

    soup = BeautifulSoup(html, "lxml")

    # -------------------------------------------------
    # 1. Remove scripts & styles
    # -------------------------------------------------
    for tag in soup(["script", "style"]):
        tag.decompose()

    # -------------------------------------------------
    # 2. Extract images
    # -------------------------------------------------
    images = []

    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            continue

        img_url = urljoin(base_url, src)
        img_id = hashlib.md5(img_url.encode()).hexdigest()
        ext = os.path.splitext(img_url)[1].split("?")[0] or ".png"
        img_path = os.path.join(IMAGES_DIR, f"{img_id}{ext}")

        '''if not os.path.exists(img_path):
            try:
                r = requests.get(img_url, timeout=15)
                if r.ok:
                    with open(img_path, "wb") as f:
                        f.write(r.content)
            except Exception:
                pass'''

        images.append({
            "image_id": img_id,
            "image_url": img_url,
            "image_path": img_path,
            "alt_text": img.get("alt", "").strip()
        })

        img.decompose()  # remove image from text flow

    # -------------------------------------------------
    # 3. Extract tables (STRUCTURED)
    # -------------------------------------------------
    tables_json = []

    for table in soup.find_all("table"):

        # ✅ Skip parent/layout tables
        if table.find("table"):
            continue

        table_text = table.get_text(" ", strip=True)

        # ✅ If table contains ITEM code → treat as narrative
        if contains_item_code(table_text):
            table.replace_with(table_text)
            continue
            
        # ✅ Very low-signal → flatten into text
        if len(table_text) < 50:
            table.replace_with(table_text)
            continue

        # ✅ Extract first, classify later
        table_json = html_table_to_json(table)

        if not table_json:
            table.replace_with(table_text)
            continue

        table_type = classify_table(
            table_json=table_json
        )

        if table_type in {"financial", "compensation", "entity", "entity2"}:
            tables_json.append(table_json)
            table.decompose()
        elif table_type == "policy":
            table.replace_with(table_text)
        else:
            # unknown → flatten
            table.replace_with(table_text)
        
    # -------------------------------------------------
    # 4. Extract narrative text
    # -------------------------------------------------
    text = soup.get_text(separator="\n")

    # Normalize whitespace
    text = normalize_text(text)

    # -------------------------------------------------
    # 5. Remove meaningless footers
    # -------------------------------------------------
    #text = FOOTER_PATTERN.sub("", text)

    return text, tables_json, images


In [10]:
'''from bs4 import BeautifulSoup

def clean_html(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style"]):
        tag.decompose()
    return soup.get_text(separator="\n")'''

'from bs4 import BeautifulSoup\n\ndef clean_html(html: str) -> str:\n    soup = BeautifulSoup(html, "lxml")\n    for tag in soup(["script", "style"]):\n        tag.decompose()\n    return soup.get_text(separator="\n")'

## app/chunker.py

In [11]:
import re

def split_sections_with_items(text: str):
    """
    Returns list of tuples:
    (item_code, section_text)
    """
    pattern = re.compile(
        r"\n\s*(ITEM\s+\d+[A-Z]?)\.\s+",
        re.IGNORECASE
    )

    parts = pattern.split(text)

    sections = []
    for i in range(1, len(parts), 2):
        item = parts[i].upper().replace("ITEM ", "Item ")
        content = parts[i + 1].strip()
        sections.append((item, content))

    return sections

from langchain_text_splitters import RecursiveCharacterTextSplitter

def chunk_text(
    text: str,
    header: str | None = None,
    max_size: int = 800,
    overlap: int = 160,
) -> list[str]:
    """
    SEC-optimized semantic chunker.

    - Preserves paragraph & sentence boundaries
    - Adds section header context
    - Produces embedding-friendly chunks
    """

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_size,
        chunk_overlap=overlap,
        separators=[
            "\n\n",     # paragraphs
            "\n",       # lines
            ". ",       # sentences
            "; ",       # legal clauses
            ", ",       # fallback
            " ",        # words
            ""          # characters (last resort)
        ],
    )

    chunks = splitter.split_text(text)
    chunks = [c for c in chunks if len(c) > 200]

    # Prepend section header to every chunk (VERY IMPORTANT)
    if header:
        chunks = [
            f"[{header}]\n{chunk}".strip()
            for chunk in chunks
        ]

    return chunks

def get_fiscal_year(filing_date: str):
    # filing_date format: YYYY-MM-DD
    return int(filing_date[:4])

app/vectorstore.py

In [12]:
from src.chroma_manager import ChromaManager
#from src.embedding_manager import EmbeddingManager
from src.embedding_manager_transformer import TransformerEmbeddingManager

In [13]:
# Initialize ChromaDB manager
chroma_manager = ChromaManager(
    persist_directory=VECTOR_DB_PATH,          # From config.py
    embedding_model="nomic-embed-text",        # Should match embedding_manager
    collection_name="sec_filings",         # Collection name for this project
    base_url= OLLAMA_HOST,
    verbose=0
)

# Initialize our embedding manager
'''embedding_manager = EmbeddingManager(
    text_embedding_model="nomic-embed-text",  # Specialized embedding model
    vision_model="qwen2.5vl:3b",              # Vision model for image descriptions
    base_url=OLLAMA_HOST
)'''

embedding_manager = TransformerEmbeddingManager(model_name='nomic-ai/nomic-embed-text-v1', batch_size=BATCH_SIZE)

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO:src.embedding_manager_transformer:Loading SentenceTransformer model 'nomic-ai/nomic-embed-text-v1' on device 'cuda'
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: nomic-ai/nomic-embed-text-v1
INFO:src.embedding_manager_transformer:TransformerEmbeddingManager ready (dim=768)


In [14]:
chroma_manager.reset_collection()

True

In [15]:
chroma_manager.get_collection_stats()

{'name': 'sec_filings',
 'count': 0,
 'embedding_model': 'nomic-embed-text',
 'embedding_dimension': 'unknown',
 'persist_directory': '/workspace/data/chroma'}

In [16]:
CIK = "0000320193"  # Apple example

data = get_submissions(CIK, use_remote_fallback=False)
filings = data["filings"]
recent = data["filings"]["recent"]
files = data["filings"]["files"]
print(files)
print(recent.keys())

forms = data["filings"]["recent"]["form"]
accessions = data["filings"]["recent"]["accessionNumber"]
dates = data["filings"]["recent"]["filingDate"]
primary_documents = data["filings"]["recent"]["primaryDocument"]

[{'name': 'CIK0000320193-submissions-001.json', 'filingCount': 1187, 'filingFrom': '1994-01-26', 'filingTo': '2015-02-01'}]
dict_keys(['accessionNumber', 'filingDate', 'reportDate', 'acceptanceDateTime', 'act', 'form', 'fileNumber', 'filmNumber', 'items', 'core_type', 'size', 'isXBRL', 'isInlineXBRL', 'primaryDocument', 'primaryDocDescription'])


In [17]:
import time

REQUESTS_PER_SECOND = 2.0   # SEC-safe
INTERVAL = 1.0 / REQUESTS_PER_SECOND
_last_request = 0.0

def wait():
    global _last_request
    now = time.time()
    elapsed = now - _last_request
    if elapsed < INTERVAL:
        time.sleep(INTERVAL - elapsed)
    _last_request = time.time()

## app/main.py

In [18]:
import hashlib

def make_doc_id(cik, year, acc, item, chunk_idx, text):
    #h = hashlib.md5(text.encode("utf-8")).hexdigest()[:8]
    return f"{cik}_10K_{year}_{acc.replace('-', '')}_{item.replace(' ', '')}_{chunk_idx:03d}"

In [19]:
import time
from statistics import mean

def chunk_stats(chunks: list[str]) -> dict:
    lengths = [len(c) for c in chunks]
    if not lengths:
        return {
            "count": 0,
            "avg_len": 0,
            "min_len": 0,
            "max_len": 0,
        }

    return {
        "count": len(lengths),
        "avg_len": round(mean(lengths), 1),
        "min_len": min(lengths),
        "max_len": max(lengths),
    }

In [20]:
from collections import Counter

def list_repeated_lines(
    text: str,
    min_repeats: int = 3,
    min_length: int = 10,
    max_length: int = 200,
    top_k: int = 50,
):
    """
    Prints the most frequently repeated lines for inspection.

    Args:
        text: full extracted text
        min_repeats: minimum repetition count
        min_length: ignore very short lines
        max_length: ignore very long lines
        top_k: number of lines to display
    """
    lines = [
        line.strip()
        for line in text.splitlines()
        if min_length <= len(line.strip()) <= max_length
    ]

    counts = Counter(lines)

    repeated = [
        (line, cnt)
        for line, cnt in counts.items()
        if cnt >= min_repeats
    ]

    repeated.sort(key=lambda x: x[1], reverse=True)

    print(f"\n=== Repeated lines (>= {min_repeats} times) ===\n")
    for line, cnt in repeated[:top_k]:
        print(f"[{cnt:>3}x] {line}")


In [21]:
from time import perf_counter
from tqdm.auto import tqdm

ten_ks = [
    (form, acc, date, primary_doc)
    for form, acc, date, primary_doc in zip(forms, accessions, dates, primary_documents)
    if form == "10-K"
]

all_tables = []

for form, acc, date, primary_doc in tqdm(ten_ks, desc="Indexing 10-K filings"):
    print(f"\nProcessing {acc} ({date})")

    #if acc !='0001193125-15-356351':
    #    continue

    base_url = f"{BASE_ARCHIVE}/Archives/edgar/data/{CIK}/{acc.replace('-', '')}/"
    
    wait()
    html = download_filing(CIK, acc, primary_doc)
    text, tables_json, images = extract_text_tables_images(html, base_url)

    list_repeated_lines(
        text,
        min_repeats=3,
        min_length=15,
        max_length=120,
        top_k=30
    )
    
    sections = split_sections_with_items(text)
    if not sections:
        continue

    fiscal_year = get_fiscal_year(date)

    for idx, table in enumerate(tables_json):
        all_tables.append({
            "cik": CIK,
            "company": data["name"],
            "filing_type": "10-K",
            "filing_date": date,
            "fiscal_year": fiscal_year,
            "accession": acc,
            "table_index": idx,
            "table": table,
            "source": "SEC EDGAR"
        })

    batch_texts, batch_metadatas, batch_ids = [], [], []
    indexed_chunks = 0
    all_chunk_lengths = []

    # ⏱ start timing
    t0 = perf_counter()

    for item_code, section_text in sections:
        section_title = SEC_ITEM_MAP.get(item_code)
        if section_title is None:
            continue

        chunks = chunk_text(
            section_text,
            max_size=800,
            overlap=800*0.2,
            # header=f"{item_code} – {section_title}",
        )
        
        # ---- stats ----
        all_chunk_lengths.extend(len(c) for c in chunks)
        
        for chunk_idx, ch in enumerate(chunks):
            doc_id = make_doc_id(
                cik=CIK,
                year=fiscal_year,
                acc=acc,
                item=item_code,
                chunk_idx=chunk_idx,
                text=ch
            )

            metadata = {
                "cik": CIK,
                "company": data["name"],
                "filing_type": "10-K",
                "filing_date": date,
                "fiscal_year": fiscal_year,
                "accession": acc,
                "section": item_code,
                "section_title": section_title,
                "chunk_index": chunk_idx,
                "content_type": "narrative",
                "source": "SEC EDGAR"
            }

            batch_texts.append(ch)
            batch_metadatas.append(metadata)
            batch_ids.append(doc_id)

            # ⏱ Flush batch
            if len(batch_texts) >= BATCH_SIZE:
                batch_embeddings = embedding_manager.generate_text_embeddings(batch_texts)
                
                chroma_manager.add_with_embeddings(
                    texts=batch_texts,
                    embeddings=batch_embeddings,
                    metadatas=batch_metadatas,
                    ids=batch_ids,
                )

                indexed_chunks += len(batch_texts)

                batch_texts.clear()
                batch_embeddings.clear()
                batch_metadatas.clear()
                batch_ids.clear()

    # Flush remainder
    if batch_texts:
        batch_embeddings = embedding_manager.generate_text_embeddings(batch_texts)
        
        chroma_manager.add_with_embeddings(
            texts=batch_texts,
            embeddings=batch_embeddings,
            metadatas=batch_metadatas,
            ids=batch_ids,
        )
    
        indexed_chunks += len(batch_texts)

    # ⏱ end timing
    elapsed = perf_counter() - t0

    # ---- final stats ----
    if all_chunk_lengths:
        avg_len = round(sum(all_chunk_lengths) / len(all_chunk_lengths), 1)
        min_len = min(all_chunk_lengths)
        max_len = max(all_chunk_lengths)
    else:
        avg_len = min_len = max_len = 0

    print(
        f"  Indexed {indexed_chunks} chunks | "
        f"Elapsed: {elapsed:.2f}s | "
        f"Avg chunk length: {avg_len} chars | "
        f"Min/Max: {min_len}/{max_len}"
    )

Indexing 10-K filings:   0%|          | 0/11 [00:00<?, ?it/s]


Processing 0000320193-25-000079 (2025-10-31)



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html, "lxml")



=== Repeated lines (>= 3 times) ===

[ 18x] us-gaap:FairValueInputsLevel2Member
[ 15x] us-gaap:OperatingSegmentsMember
[ 11x] aapl:FixedRateNotesMember
[ 10x] us-gaap:CreditConcentrationRiskMember
[  8x] us-gaap:CommonStockMember
[  7x] us-gaap:CommonStockIncludingAdditionalPaidInCapitalMember
[  7x] us-gaap:RetainedEarningsMember
[  7x] us-gaap:AccumulatedOtherComprehensiveIncomeMember
[  7x] us-gaap:RestrictedStockUnitsRSUMember
[  7x] aapl:A20132023DebtIssuancesMember
[  6x] us-gaap:FairValueInputsLevel1Member
[  6x] us-gaap:CommercialPaperMember
[  6x] aapl:NonTradeReceivableMember
[  5x] us-gaap:ForeignExchangeContractMember
[  5x] aapl:OtherCountriesMember
[  5x] See accompanying Notes to Consolidated Financial Statements.
[  4x] http://fasb.org/us-gaap/2025#OtherLiabilitiesCurrent
[  4x] http://fasb.org/us-gaap/2025#OtherLiabilitiesNoncurrent
[  4x] us-gaap:DesignatedAsHedgingInstrumentMember
[  4x] us-gaap:TradeAccountsReceivableMember
[  4x] srt:MinimumMember
[  4x] srt:Maxim

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Indexed 284 chunks | Elapsed: 6.19s | Avg chunk length: 592.9 chars | Min/Max: 202/799

Processing 0000320193-24-000123 (2024-11-01)

=== Repeated lines (>= 3 times) ===

[ 18x] us-gaap:FairValueInputsLevel2Member
[  9x] us-gaap:RestrictedStockUnitsRSUMember
[  8x] us-gaap:CommonStockMember
[  8x] us-gaap:CreditConcentrationRiskMember
[  7x] us-gaap:CommonStockIncludingAdditionalPaidInCapitalMember
[  7x] us-gaap:RetainedEarningsMember
[  7x] us-gaap:AccumulatedOtherComprehensiveIncomeMember
[  7x] aapl:A20132023DebtIssuancesMember
[  7x] aapl:FixedRateNotesMember
[  6x] us-gaap:FairValueInputsLevel1Member
[  6x] us-gaap:CommercialPaperMember
[  6x] aapl:NonTradeReceivableMember
[  5x] us-gaap:ForeignExchangeContractMember
[  5x] aapl:OtherCountriesMember
[  5x] See accompanying Notes to Consolidated Financial Statements.
[  4x] http://fasb.org/us-gaap/2024#OtherLiabilitiesCurrent
[  4x] http://fasb.org/us-gaap/2024#OtherLiabilitiesNoncurrent
[  4x] aapl:UnfavorableInvestigationOutco

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Indexed 279 chunks | Elapsed: 6.59s | Avg chunk length: 593.4 chars | Min/Max: 204/799

Processing 0000320193-23-000106 (2023-11-03)

=== Repeated lines (>= 3 times) ===

[ 27x] us-gaap:FairValueInputsLevel2Member
[ 12x] aapl:FixedRateNotesMember
[ 11x] us-gaap:ForeignExchangeContractMember
[ 11x] us-gaap:CreditConcentrationRiskMember
[  9x] us-gaap:RestrictedStockUnitsRSUMember
[  8x] us-gaap:CommonStockMember
[  7x] us-gaap:CommonStockIncludingAdditionalPaidInCapitalMember
[  7x] us-gaap:RetainedEarningsMember
[  7x] us-gaap:AccumulatedOtherComprehensiveIncomeMember
[  7x] us-gaap:DesignatedAsHedgingInstrumentMember
[  7x] aapl:A20132022DebtIssuancesMember
[  6x] us-gaap:FairValueInputsLevel1Member
[  6x] us-gaap:CommercialPaperMember
[  6x] us-gaap:OtherLiabilitiesMember
[  6x] aapl:NonTradeReceivableMember
[  5x] us-gaap:InterestRateContractMember
[  5x] us-gaap:NondesignatedMember
[  5x] aapl:ThirdQuarter2023DebtIssuanceMember
[  5x] aapl:OtherCountriesMember
[  5x] See accompan

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Indexed 274 chunks | Elapsed: 7.78s | Avg chunk length: 592.7 chars | Min/Max: 204/799

Processing 0000320193-22-000108 (2022-10-28)

=== Repeated lines (>= 3 times) ===

[ 28x] us-gaap:FairValueInputsLevel2Member
[ 12x] us-gaap:CreditConcentrationRiskMember
[ 11x] us-gaap:ForeignExchangeContractMember
[ 11x] aapl:A20132021DebtIssuancesMember
[ 11x] aapl:FixedRateNotesMember
[ 10x] us-gaap:RetainedEarningsMember
[ 10x] us-gaap:AccumulatedOtherComprehensiveIncomeMember
[  9x] srt:MaximumMember
[  9x] us-gaap:RestrictedStockUnitsRSUMember
[  8x] us-gaap:CommonStockMember
[  8x] srt:MinimumMember
[  7x] us-gaap:CommonStockIncludingAdditionalPaidInCapitalMember
[  7x] us-gaap:DesignatedAsHedgingInstrumentMember
[  7x] aapl:NonTradeReceivableMember
[  6x] srt:CumulativeEffectPeriodOfAdoptionAdjustmentMember
[  6x] us-gaap:FairValueInputsLevel1Member
[  6x] us-gaap:OtherLiabilitiesMember
[  5x] us-gaap:CommercialPaperMember
[  5x] us-gaap:InterestRateContractMember
[  5x] us-gaap:Nondesign

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Indexed 303 chunks | Elapsed: 8.35s | Avg chunk length: 589.9 chars | Min/Max: 201/796

Processing 0000320193-21-000105 (2021-10-29)

=== Repeated lines (>= 3 times) ===

[ 19x] us-gaap:FairValueInputsLevel2Member
[ 17x] aapl:FixedRateNotesMember
[ 14x] aapl:A20132020DebtIssuancesMember
[ 11x] srt:MaximumMember
[ 10x] us-gaap:RetainedEarningsMember
[ 10x] us-gaap:AccumulatedOtherComprehensiveIncomeMember
[ 10x] srt:MinimumMember
[ 10x] us-gaap:CreditConcentrationRiskMember
[  9x] us-gaap:RestrictedStockUnitsRSUMember
[  8x] us-gaap:CommonStockMember
[  7x] us-gaap:CommonStockIncludingAdditionalPaidInCapitalMember
[  7x] us-gaap:ForeignExchangeContractMember
[  7x] aapl:NonTradeReceivableMember
[  7x] aapl:FloatingRateNotesMember
[  6x] srt:CumulativeEffectPeriodOfAdoptionAdjustmentMember
[  5x] us-gaap:CommercialPaperMember
[  5x] aapl:SecondQuarter2021DebtIssuanceMember
[  5x] aapl:FourthQuarter2021DebtIssuanceMember
[  5x] aapl:OtherCountriesMember
[  5x] See accompanying Notes to 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Indexed 321 chunks | Elapsed: 7.20s | Avg chunk length: 585.4 chars | Min/Max: 205/799

Processing 0000320193-20-000096 (2020-10-30)

=== Repeated lines (>= 3 times) ===

[ 39x] us-gaap:FairValueInputsLevel2Member
[ 26x] us-gaap:ForeignExchangeContractMember
[ 22x] aapl:FixedRateNotesMember
[ 21x] us-gaap:InterestRateContractMember
[ 14x] srt:CumulativeEffectPeriodOfAdoptionAdjustmentMember
[ 14x] aapl:A20132019DebtIssuancesMember
[ 13x] srt:MaximumMember
[ 12x] srt:MinimumMember
[ 12x] us-gaap:OtherAssetsMember
[ 11x] us-gaap:RetainedEarningsMember
[ 11x] us-gaap:AccumulatedOtherComprehensiveIncomeMember
[ 11x] us-gaap:DesignatedAsHedgingInstrumentMember
[ 10x] us-gaap:ReclassificationOutOfAccumulatedOtherComprehensiveIncomeMember
[  9x] us-gaap:NondesignatedMember
[  9x] us-gaap:OtherLiabilitiesMember
[  9x] us-gaap:NonoperatingIncomeExpenseMember
[  9x] us-gaap:CreditConcentrationRiskMember
[  9x] us-gaap:AccumulatedNetUnrealizedInvestmentGainLossMember
[  9x] us-gaap:RestrictedSt

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Indexed 342 chunks | Elapsed: 8.71s | Avg chunk length: 595.1 chars | Min/Max: 205/798

Processing 0000320193-19-000119 (2019-10-31)

=== Repeated lines (>= 3 times) ===

[ 75x] September 28, 2019
[ 64x] Apple Inc. | 2019 Form 10-K |
[ 39x] us-gaap:FairValueInputsLevel2Member
[ 29x] September 29, 2018
[ 25x] us-gaap:ForeignExchangeContractMember
[ 19x] us-gaap:InterestRateContractMember
[ 14x] aapl:A20132018DebtIssuancesMember
[ 12x] us-gaap:OtherLiabilitiesMember
[ 12x] us-gaap:CreditConcentrationRiskMember
[ 12x] aapl:FixedRateNotesMember
[ 12x] us-gaap:AccumulatedNetGainLossFromDesignatedOrQualifyingCashFlowHedgesMember
[ 11x] us-gaap:RetainedEarningsMember
[ 11x] us-gaap:AccumulatedOtherComprehensiveIncomeMember
[ 11x] us-gaap:DesignatedAsHedgingInstrumentMember
[ 11x] , respectively.
[ 10x] us-gaap:ReclassificationOutOfAccumulatedOtherComprehensiveIncomeMember
[  9x] srt:MaximumMember
[  9x] us-gaap:NondesignatedMember
[  9x] us-gaap:CashFlowHedgingMember
[  9x] us-gaap:FairValu

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Indexed 329 chunks | Elapsed: 10.23s | Avg chunk length: 614.3 chars | Min/Max: 205/799

Processing 0000320193-18-000145 (2018-11-05)

=== Repeated lines (>= 3 times) ===

[ 72x] September 29, 2018
[ 72x] Apple Inc. | 2018 Form 10-K |
[ 27x] September 30, 2017
[  8x] , respectively.
[  6x] (dollars in millions):
[  6x] , the Company had
[  5x] See accompanying Notes to Consolidated Financial Statements.
[  5x] November 5, 2018
[  5x] Proxy Statement to be filed with the SEC within 120 days after
[  4x] (dollars in millions and units in thousands):
[  4x] and is incorporated herein by reference.
[  3x] Revenue Recognition
[  3x] , approximately


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Indexed 403 chunks | Elapsed: 10.62s | Avg chunk length: 609.2 chars | Min/Max: 201/799

Processing 0000320193-17-000070 (2017-11-03)

=== Repeated lines (>= 3 times) ===

[ 81x] September 30, 2017
[ 77x] Apple Inc. | 2017 Form 10-K |
[ 31x] September 24, 2016
[  9x] , respectively.
[  6x] (dollars in millions):
[  6x] , the Company had
[  5x] See accompanying Notes to Consolidated Financial Statements.
[  5x] November 3, 2017
[  5x] Proxy Statement to be filed with the SEC within 120 days after
[  4x] (dollars in millions and units in thousands):
[  4x] and is incorporated herein by reference.
[  3x] September 26, 2015
[  3x] , plus interest of
[  3x] Revenue Recognition
[  3x] , approximately


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Indexed 429 chunks | Elapsed: 12.05s | Avg chunk length: 600.2 chars | Min/Max: 201/799

Processing 0001628280-16-020309 (2016-10-26)

=== Repeated lines (>= 3 times) ===

[ 81x] September 24, 2016
[ 77x] Apple Inc. | 2016 Form 10-K |
[ 28x] September 26, 2015
[  7x] , respectively.
[  6x] (dollars in millions):
[  5x] See accompanying Notes to Consolidated Financial Statements.
[  5x] October 26, 2016
[  5x] Proxy Statement to be filed with the SEC within 120 days after
[  4x] (dollars in millions and units in thousands):
[  4x] are as follows (in millions):
[  4x] and is incorporated herein by reference.
[  3x] are as follows (dollars in millions):
[  3x] Revenue Recognition
[  3x] , approximately


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Indexed 430 chunks | Elapsed: 12.78s | Avg chunk length: 597.1 chars | Min/Max: 202/799

Processing 0001193125-15-356351 (2015-10-28)

=== Repeated lines (>= 3 times) ===

[ 78x] Table of Contents
[  5x] See accompanying Notes to Consolidated Financial Statements.
[  4x] forward-looking


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Indexed 483 chunks | Elapsed: 46.51s | Avg chunk length: 590.0 chars | Min/Max: 201/799


In [22]:
chroma_manager.get_collection_stats()

{'name': 'sec_filings',
 'count': 3877,
 'embedding_model': 'nomic-embed-text',
 'embedding_dimension': 'unknown',
 'persist_directory': '/workspace/data/chroma'}

In [26]:
chroma_manager.query(
    query_text="what are the risk factor of apple in year 2015?",
    n_results=5,
    where={
        "$and": [
            {"company": "Apple Inc."},
            {"filing_type": "10-K"},
            {"fiscal_year": 2015},
        ]
    }
)


INFO:httpx:HTTP Request: POST http://ollama:11434/api/embed "HTTP/1.1 200 OK"


{'ids': [['0000320193_10K_2015_000119312515356351_Item1A_055',
   '0000320193_10K_2015_000119312515356351_Item1A_083',
   '0000320193_10K_2015_000119312515356351_Item8_080',
   '0000320193_10K_2015_000119312515356351_Item7_005',
   '0000320193_10K_2015_000119312515356351_Item3_002']],
 'embeddings': None,
 'documents': [['The Company also could be significantly affected by other risks associated with international activities including, but not limited to, economic and\nlabor conditions, increased duties, taxes and other costs and political instability. Margins on sales of the Company’s products in foreign countries, and on sales of products that include components obtained from foreign suppliers, could be\nmaterially adversely affected by international trade regulations, including duties, tariffs and antidumping penalties. The Company is also exposed to credit and collectability risk on its trade receivables with customers in certain international\nmarkets. There can be no assurance th