In [13]:
# Download SEC Inline-XBRL 10-K
import re
import warnings
from pathlib import Path

import requests
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning

In [15]:
# CONFIG
# Direct EDGAR link to the filing's primary Inline-XBRL (.htm) document.
HTML_URL = "https://www.sec.gov/Archives/edgar/data/320193/000032019325000079/aapl-20250927.htm"
# Local cache of the download; the main cell skips the request when this file exists.
SAVE_HTML = Path("apple.htm")

# Silence bs4's XMLParsedAsHTMLWarning for the whole session.
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

In [17]:
# Helper functions
def download_htm(
    url: str,
    dest: Path,
    user_agent: str = "Mozilla/5.0 (compatible; RAGBot/1.0)",
    timeout: float = 30,
) -> None:
    """Download ``url`` and write the response body to ``dest`` byte-for-byte.

    Args:
        url: Address of the document to fetch.
        dest: File that receives the raw response bytes (overwritten if present).
        user_agent: Value sent in the ``User-Agent`` header. SEC EDGAR asks
            automated clients to identify themselves (name plus contact
            e-mail) in this header, so pass such a value when fetching from
            sec.gov; the default keeps the previous generic string.
        timeout: Seconds handed to ``requests.get`` as its timeout.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    print(f"Downloading from {url} ...")
    response = requests.get(url, headers={"User-Agent": user_agent}, timeout=timeout)
    # Fail before touching the destination so an error page is never cached.
    response.raise_for_status()
    dest.write_bytes(response.content)
    print(f"Saved → {dest} ({dest.stat().st_size/1024:.1f} KB)")


def clean_xbrl_xml(soup: BeautifulSoup) -> str:
    """Return the visible text of an Inline-XBRL document.

    Inline XBRL wraps displayed facts (amounts, dates, names) in ``ix:*``
    elements. Only the *tags* are stripped here; the text they enclose is
    kept. The one exception is ``ix:header``, which is removed together with
    its contents. ``script``/``style``/``noscript`` elements are removed too,
    after which a few regexes drop SEC boilerplate lines.

    Note: ``soup`` is modified in place.

    Args:
        soup: Parsed filing.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """

    def _ix_name(tag) -> str:
        # Lower-cased tag name if it carries the "ix:" prefix, else "".
        # Lower-casing makes the check independent of how the parser
        # normalises tag-name case.
        name = (tag.name or "").lower()
        return name if name.startswith("ix:") else ""

    # 1a. <ix:header> is metadata, not document text: drop it entirely.
    for header in soup.find_all(lambda t: _ix_name(t) == "ix:header"):
        header.decompose()

    # 1b. Every other <ix:...> element encloses text that belongs to the
    #     document. unwrap() removes the tag but leaves its children in place;
    #     decompose() here would delete the reported values themselves.
    for tag in soup.find_all(lambda t: bool(_ix_name(t))):
        tag.unwrap()

    # 2. Remove <script>, <style>, <noscript>
    for hidden in soup(["script", "style", "noscript"]):
        hidden.decompose()

    # 3. Extract visible text
    text = soup.get_text(separator="\n")

    # 4. Drop SEC headers/footers
    # Rest of any line starting at "Table of Contents" (the lazy match stops
    # at the first newline, so no DOTALL flag is needed).
    text = re.sub(r"Table of Contents.*?\n", "", text)
    text = re.sub(r"\n\s*Page \d+.*?\n", "\n", text)  # page numbers
    # Everything from the start of the text through the first
    # "UNITED STATES SECURITIES ..." line. "^" is not in MULTILINE mode, so
    # this can match at most once.
    text = re.sub(
        r"^.*?UNITED STATES\s+SECURITIES.*?\n",
        "",
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # 5. Collapse blank lines last, so gaps opened by the removals above are
    #    tidied as well.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def split_into_pages(text: str, words_per_page: int = 800) -> list[str]:
    """Cut ``text`` into consecutive pseudo-pages of ``words_per_page`` words.

    Words are whitespace-separated tokens; each page is its words re-joined
    with single spaces, so the original line breaks and spacing are lost.
    The final page holds whatever is left over and may be shorter.
    """
    tokens = text.split()
    chunks: list[str] = []
    for start in range(0, len(tokens), words_per_page):
        window = tokens[start : start + words_per_page]
        chunks.append(" ".join(window))
    return chunks

In [19]:
# MAIN
if __name__ == "__main__":
    # Download (skipped when a cached copy is already on disk)
    if not SAVE_HTML.exists():
        download_htm(HTML_URL, SAVE_HTML)
    else:
        print(f"{SAVE_HTML.name} already present – skipping download")

    # Parse as XML (the correct way)
    raw = SAVE_HTML.read_bytes()
    soup = BeautifulSoup(raw, features="xml")

    clean_text = clean_xbrl_xml(soup)
    pages = split_into_pages(clean_text, words_per_page=800)

    # Stop here with a clear message instead of an IndexError on pages[0]
    # below (or a harder-to-read failure in a later cell that reads `pages`).
    if not pages:
        raise ValueError(
            f"No text could be extracted from {SAVE_HTML}; "
            "delete the file to force a fresh download and check its contents."
        )

    print(f"\nExtracted {len(pages)} pseudo‑pages")
    print(f"First 500 characters of page 1:\n{pages[0][:500]}")

apple.htm already present – skipping download

Extracted 40 pseudo‑pages
First 500 characters of page 1:
Washington, D.C. 20549 FORM 10-K (Mark One) ☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year ended September 27 , 2025 or ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the transition period from to . Commission File Number: 001-36743 Apple Inc. (Exact name of Registrant as specified in its charter) California 94-2404110 (State or other jurisdiction of incorporation or organization) (I.R.S. 


In [20]:
# Load the sentence-embedding model; `embedder` encodes both the pages and the queries below.
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
# Encode the text: one embedding vector per pseudo-page
corpus = pages  # your cleaned list of page texts
embeddings = embedder.encode(
    corpus, batch_size=32, convert_to_numpy=True, show_progress_bar=True
)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
# Normalizing embeddings for cosine similarity
import numpy as np

# Scale every row to unit L2 norm; the inner product of two unit vectors is their cosine similarity.
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

In [23]:
# FAISS (by Meta) is a library for fast similarity search over large sets of vectors; many production RAG systems use it.
import faiss

dim = embeddings.shape[1]  # 384 for MiniLM
index = faiss.IndexFlatIP(dim)  # IP = inner product; equals cosine similarity here because the rows are unit-normalized

In [24]:
# Add the page vectors to the index; FAISS expects float32 arrays, hence the cast.
# Reset first so re-running this cell does not append the same vectors again,
# which would produce ids beyond len(pages).
index.reset()
index.add(embeddings.astype("float32"))
print("Number of entries:", index.ntotal)

Number of entries: 40


In [26]:
# Retriever: embed the query and look up the most similar pages in the FAISS index
def retrieve(query: str, k: int = 5):
    """Return up to ``k`` pages most similar to ``query``.

    The query is embedded with ``embedder``, scaled to unit length and
    searched against the module-level FAISS ``index``.

    Args:
        query: Natural-language question.
        k: Maximum number of pages to return.

    Returns:
        A list of ``(page_text, score)`` tuples in the order FAISS returns
        them. The list is shorter than ``k`` when the index holds fewer
        vectors, and empty when ``k`` is not positive or the index is empty.
    """
    # FAISS pads its result with id -1 when asked for more neighbours than it
    # stores; pages[-1] would then silently return the last page. Clamp the
    # request and, as a second line of defence, skip negative ids.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    q_vec = embedder.encode([query], convert_to_numpy=True)
    q_vec = q_vec / np.linalg.norm(q_vec, axis=1, keepdims=True)
    scores, ids = index.search(q_vec.astype("float32"), k)
    return [
        (pages[page_id], float(score))
        for score, page_id in zip(scores[0], ids[0])
        if page_id >= 0
    ]

In [27]:
# Smoke-test the retriever: print each hit's score and the first 200 characters of its page.
for p, s in retrieve("What are Apple's main sources of revenue?"):
    print(f"score={s:.3f}\n{p[:200]}...\n")

score=0.548
liabilities 285,508 308,030 Commitments and contingencies Shareholders’ equity: Common stock and additional paid-in capital, $ 0.00001 par value: 50,400,000 shares authorized; 14,773,260 and 15,116,78...

score=0.506
of Services, iPhone and Mac. Greater China Greater China net sales decreased during 2025 compared to 2024 primarily due to lower net sales of iPhone, partially offset by higher net sales of Mac. Japan...

score=0.475
companies that have significant technical, marketing, distribution and other resources, as well as established hardware, software and service offerings. In addition, the Company faces significant comp...

score=0.463
estimated period they are expected to be provided. For certain long-term service arrangements, the Company has performance obligations for services it has not yet delivered. For these arrangements, th...

score=0.449
Accessories includes Apple-branded and third-party accessories. Apple Inc. | 2025 Form 10-K | 1 Services Advertising The

In [28]:
# Adding an LLM for synthesizing the answer (Ollama)
import ollama


def ask_ollama(prompt, model="llama3", temperature=0.0):
    """Send ``prompt`` as a single user message to an Ollama chat model.

    Args:
        prompt: Full prompt text.
        model: Name of the Ollama model to query.
        temperature: Sampling temperature forwarded in the request options.

    Returns:
        The ``content`` field of the reply's ``message``.
    """
    user_turn = {"role": "user", "content": prompt}
    sampling = {"temperature": temperature}
    reply = ollama.chat(model=model, messages=[user_turn], options=sampling)
    return reply["message"]["content"]

In [29]:
# Building the RAG prompt
def build_prompt(question, contexts):
    """Assemble the RAG prompt: instruction, labelled context chunks, question.

    Each context is labelled ``[chunk N]`` (1-based, in the order given) and
    chunks are separated by a blank line.
    """
    labelled = []
    for i, ctx in enumerate(contexts):
        labelled.append(f"[chunk {i+1}]\n{ctx}")

    sections = (
        "You are a financial analyst. Use only the provided context.",
        "\n\nContext:\n",
        "\n\n".join(labelled),
        f"\n\nQuestion: {question}\nAnswer concisely with citations.",
    )
    return "".join(sections)

In [30]:
# Asking a question end-to-end: retrieve -> build prompt -> LLM
query = "What supply chain risks does Apple mention?"
top = retrieve(query, k=5)  # (page_text, score) tuples; kept for the source check below
contexts = [t[0] for t in top]  # only the page text goes into the prompt
prompt = build_prompt(query, contexts)
answer = ask_ollama(prompt)
print(answer)

Apple mentions the following supply chain risks:

1. **Tariffs and trade restrictions**: "the imposition of new or increased tariffs and other trade restrictions, their overall magnitude and duration, and retaliatory actions in response" (Part I, Item 1A, Risk Factors).
2. **Component shortages**: "supply shortages" (Part I, Item 1A, Risk Factors).
3. **Currency fluctuations**: "the weakening of foreign currencies relative to the U.S. dollar... could cause the Company to reduce international pricing or incur losses on its foreign currency derivative instruments" (Part II, Item 7A, Quantitative and Qualitative Disclosures About Market Risk).
4. **Credit risk**: "credit risk and fluctuations in the values of its investment portfolio" (Part II, Item 7A, Quantitative and Qualitative Disclosures About Market Risk).

These risks can have a material adverse impact on Apple's results of operations, financial condition, and stock price.


In [31]:
# Checking the sources of the answer: similarity and first 250 characters of each retrieved chunk
for i, (chunk, score) in enumerate(top, 1):
    print(f"Chunk {i} | similarity={score:.3f}\n{chunk[:250]}...\n")

Chunk 1 | similarity=0.634
demand for the Company’s products and services. In addition, consumer confidence and spending can be materially adversely affected in response to changes in fiscal and monetary policy, financial market volatility, declines in income or asset values, ...

Chunk 2 | similarity=0.604
companies that have significant technical, marketing, distribution and other resources, as well as established hardware, software and service offerings. In addition, the Company faces significant competition as competitors imitate the Company’s produ...

Chunk 3 | similarity=0.565
locations that are prone to earthquakes and other natural disasters. Global climate change is resulting in certain types of natural disasters and extreme weather occurring more frequently or with more intense effects. In addition, the Company’s and i...

Chunk 4 | similarity=0.529
will be filed with the U.S. Securities and Exchange Commission within 120 days after the end of the fiscal year to which this r

In [32]:
# Wrap retrieval, prompt building and the LLM call in a single function for convenience
def rag_answer(question, k=5, model="llama3"):
    """Answer ``question`` from the indexed pages in one call.

    Retrieves ``k`` pages, builds the prompt from their text and returns the
    reply of the Ollama model ``model``.
    """
    passages = [hit[0] for hit in retrieve(question, k)]
    return ask_ollama(build_prompt(question, passages), model=model)

In [33]:
# Broad summarization query through the one-call helper (defaults: k=5, model "llama3").
print(rag_answer("Summarize Apple's main business risks."))

Apple's main business risks include:

1. **Global economic downturn**: A recession or decline in consumer spending can negatively impact demand for Apple's products (Note 2).
2. **Supply chain disruptions**: Disruptions to the supply chain, such as natural disasters or industrial accidents, can impact production and delivery of products (Note 3).
3. **Cybersecurity threats**: Ransomware attacks and other cybersecurity threats can compromise customer data and disrupt business operations (Note 4).
4. **Competition from low-cost competitors**: Aggressive pricing by competitors can erode Apple's market share and profitability (Note 5).
5. **Failure to innovate**: If Apple fails to develop innovative new products, it may struggle to maintain its competitive advantage (Note 6).

These risks are summarized in the "Risk Factors" section of Apple's Form 10-K filing for 2025.


In [34]:
# Same topic phrased without the company name.
# NOTE(review): the saved answer cites "Chunk 6" although only 5 chunks are supplied by default — treat citations as unverified.
print(rag_answer("What are the main risks the company faces?"))

The company faces various risks, including:

1. **Economic Risks**: Uncertainty about global or regional economic conditions can impact consumer spending and demand for the company's products and services (Chunk 5).
2. **Trade Disputes**: Restrictions on international trade, such as tariffs and other controls on imports or exports of goods, technology, or data, can affect the company's business and supply chain (Chunk 5).
3. **Cybersecurity Risks**: The company is vulnerable to hacking, ransomware attacks, employee error, malfeasance, system error, faulty password management, or other irregularities that could compromise customer data (Chunk 4).
4. **Compliance Risks**: Unfavorable results of legal proceedings or government investigations can impact the company's business, results of operations, and financial condition (Chunk 6).
5. **Supply Chain Disruptions**: Global supply chains can be highly concentrated, and an escalation of geopolitical tensions or conflict could result in signi

In [37]:
# Topic-specific query: international sales and currency risk (defaults: k=5, model "llama3").
print(
    rag_answer(
        "How does Apple describe its strategy for managing international sales and currency risks?"
    )
)

Apple describes its strategy for managing international sales and currency risks as follows:

* The company uses foreign exchange contracts to hedge against fluctuations in foreign currencies (Note 4, "Derivatives").
* Apple designates these instruments as either cash flow or fair value hedges.
* The company also uses interest rate swaps, options, or other instruments to protect its term debt or marketable securities from fluctuations in interest rates (Note 5, "Interest Rate Risk").

These strategies aim to mitigate the impact of foreign currency and interest rate fluctuations on Apple's international sales and financial performance.


In [38]:
# Topic-specific query: supply chain resilience and component availability (defaults: k=5, model "llama3").
print(
    rag_answer(
        "What steps does Apple mention for ensuring supply chain resilience and component availability?"
    )
)

Apple mentions the following steps to ensure supply chain resilience and component availability:

* Diversifying suppliers (Part I, Item 1A, Risk Factor 5)
* Maintaining a significant number of patents, trademarks, and copyrights to protect intellectual property (Part IV, Item 15, Exhibit and Financial Statement Schedules)

These measures aim to mitigate risks associated with supply chain disruptions and component availability issues.


In [39]:
# Topic-specific query: legal proceedings and regulatory risk (defaults: k=5, model "llama3").
print(
    rag_answer(
        "What legal proceedings or regulatory risks are discussed in the filing?"
    )
)

The following legal proceedings and regulatory risks are discussed in the filing:

* Unresolved Staff Comments (Item 1B): None mentioned.
* Legal Proceedings (Item 3): The Company is involved in various legal proceedings, including patent infringement lawsuits, employment-related disputes, and other matters. (Page 18)
* Cybersecurity (Item 1C): The Company faces cybersecurity risks, including the potential for unauthorized access to its systems or data. (Page 17)
* Uncertain Tax Positions (Note 7): The Company has uncertain tax positions that could impact its effective tax rate if recognized. (Page 49-50)
* Audit Matters: The critical audit matter discussed is related to the evaluation of uncertain tax positions, which involves significant judgment and complex domestic and international tax laws. (Page 49-50)

Note: These are just some of the legal proceedings and regulatory risks mentioned in the filing.


In [40]:
# Topic-specific query: innovation vs. environmental sustainability.
# NOTE(review): the saved answer cites "[chunk 6]" with only 5 chunks in the prompt — verify its claims against the retrieved text.
print(
    rag_answer(
        "Summarize how Apple balances innovation with environmental sustainability."
    )
)

Apple balances innovation with environmental sustainability by incorporating eco-friendly practices into its product design, manufacturing, and supply chain management. Here are some key initiatives:

1. **Sustainable Materials**: Apple uses recycled materials in its products, such as aluminum from recycled cans (Source: [chunk 2]).
2. **Energy Efficiency**: The company designs products to consume less energy, reducing carbon emissions and waste (Source: [chunk 3]).
3. **Supply Chain Transparency**: Apple works with suppliers to ensure responsible sourcing of materials, minimizing environmental impact (Source: [chunk 4]).
4. **Recycling Programs**: Apple offers take-back programs for old devices, promoting recycling and reducing electronic waste (Source: [chunk 5]).
5. **Renewable Energy**: The company invests in renewable energy sources, such as solar power, to reduce its carbon footprint (Source: [chunk 6]).

By integrating these sustainable practices into its operations, Apple aims 

In [41]:
# Quantitative evaluation: relevance scoring with cosine similarity
from numpy import dot
from numpy.linalg import norm


def relevance_score(query, chunks, top_k=5):
    """Cosine similarity between ``query`` and each of the first ``top_k`` chunks.

    Scores keep the order of ``chunks``; nothing is re-sorted, so score *i*
    belongs to chunk *i*.

    Args:
        query: Question text.
        chunks: Sequence of chunk texts.
        top_k: Number of leading chunks to score (slice semantics, so
            ``None`` scores all of them).

    Returns:
        A list of floats, one per scored chunk; empty when there is nothing
        to score.
    """
    # Slice before encoding: chunks past top_k were previously embedded and
    # then thrown away.
    chunks = list(chunks)[:top_k]
    if not chunks:
        # Avoids the axis=1 norm below on an encode([]) result, which is
        # presumably not 2-D — TODO confirm against the installed version.
        return []

    q_vec = embedder.encode([query], convert_to_numpy=True)[0]
    q_vec = q_vec / norm(q_vec)
    chunk_vecs = embedder.encode(chunks, convert_to_numpy=True)
    chunk_vecs = chunk_vecs / norm(chunk_vecs, axis=1, keepdims=True)
    # One matrix-vector product yields the dot product of every row with q_vec.
    return [float(sim) for sim in dot(chunk_vecs, q_vec)]

In [42]:
# Retrieve 5 chunks for one query and re-score them; scores print in retrieval order.
query = "What legal proceedings or regulatory risks are discussed in the filing?"
chunks = [t[0] for t in retrieve(query, k=5)]
scores = relevance_score(query, chunks)
for i, s in enumerate(scores, 1):
    print(f"Chunk {i}: similarity={s:.3f}")

Chunk 1: similarity=0.482
Chunk 2: similarity=0.479
Chunk 3: similarity=0.468
Chunk 4: similarity=0.466
Chunk 5: similarity=0.434
