In [1]:
import os
from dotenv import load_dotenv

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

In [12]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.chat_models import init_chat_model
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.summarize import load_summarize_chain

pdf_path = "contract.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load()
full_text = "\n".join([page.page_content for page in pages])

llm = init_chat_model(
    model="gpt-4o-mini",
    temperature=0,
    model_provider="openai",
)

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a lawyer and helpful assistant."),
    ("human", "Answer the {question} based on {text}\n\nSummary:")
])

parser = StrOutputParser()

chain = prompt | llm | parser

response = chain.invoke({"question": input("ask"), "text": full_text})

print("\nü§ñ Bot-reply:\n", response)


ü§ñ Bot-reply:
 **Summary of Contract Agreement**

**Parties Involved:**
- **Company:** ABC Corporation, a Delaware corporation with its principal office in Wilmington, DE.
- **Contractor:** John Doe, an individual residing in Springfield, IL.

**Key Terms:**

1. **Engagement:** The Company engages the Contractor to perform specific services outlined in Exhibit A, which the Contractor accepts.

2. **Term:** The Agreement is effective from July 1, 2025, to December 31, 2025, unless terminated earlier.

3. **Compensation:** The Contractor will receive $5,000 per month, payable on the last business day of each month.

4. **Confidentiality:** The Contractor must keep all proprietary information confidential and cannot disclose it without the Company's written consent.

5. **Intellectual Property:** Any intellectual property created by the Contractor while performing the Services will belong exclusively to the Company.

6. **Termination:** Either party can terminate the Agreement with thi

In [13]:
# 1. Loading a PDF doc
pdf_path = "contract.pdf"  # doc name in same directory
loader = PyPDFLoader(pdf_path)
pages = loader.load()
# no need to join texts

# 2. LLM
llm = init_chat_model(
    model="gpt-4o-mini",
    temperature=0,
    model_provider="openai", 
)

# 3. Prompt with roles
stuff_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a lawyer and helpful assistant always making some jokes."),
    ("human", "Summarize the following text:\n\n{text}\n\nSummary:")
])

# 4 . Building a chain (simple stuff-mode)
chain = load_summarize_chain(
    llm,
    chain_type="stuff",
    prompt=stuff_prompt
)

# 5. invoke chain
summary = chain.invoke({"input_documents": pages}) 

# 6. Output
print("\nüìÑ Summary:\n", summary["output_text"])


üìÑ Summary:
 This Contract Agreement, effective July 1, 2025, is between ABC Corporation and Contractor John Doe. The Contractor will provide specified services until December 31, 2025, for a monthly fee of $5,000. The Contractor must maintain confidentiality regarding proprietary information and all intellectual property created will belong to the Company. Either party can terminate the agreement with 30 days' notice. The Contractor is classified as an independent contractor, not an employee. The agreement is governed by Delaware law. 

And remember, if you ever need to terminate a contract, just give 30 days' notice‚Äîunless it's a bad haircut, then you might want to act faster!


In [20]:
summary.keys()

dict_keys(['input_documents', 'output_text'])

In [21]:
loader = PyPDFLoader(pdf_path, 
                     mode = "single",
                    pages_delimiter = "\n-------THIS IS A CUSTOM END OF PAGE-------\n",
                    )

In [26]:
page = loader.load()[0]
print(page.page_content)

CONTRACT AGREEMENT
This Contract Agreement ("Agreement") is made and entered into on this 1st day of July, 2025, by
and between:
ABC Corporation, a company incorporated under the laws of Delaware, with its principal office
located at 123 Business Road, Wilmington, DE 19801 ("Company"),
AND
John Doe, an individual residing at 456 Residential Street, Springfield, IL 62704 ("Contractor").
1. ENGAGEMENT
The Company hereby engages the Contractor to perform the services described in Exhibit A
attached hereto ("Services"), and the Contractor hereby accepts such engagement on the terms and
conditions set forth herein.
2. TERM
This Agreement shall commence on July 1, 2025, and shall continue until December 31, 2025,
unless earlier terminated in accordance with this Agreement.
3. COMPENSATION
As compensation for the Services, the Company shall pay the Contractor $5,000 per month,
payable on the last business day of each month.
4. CONFIDENTIALITY
-------THIS IS A CUSTOM END OF PAGE-------
The Con

In [27]:
path = "apple_10k.pdf"
loader = PyPDFLoader(path, mode = "page")
pages = loader.load()

In [32]:
for doc in loader.lazy_load():
    print(doc.page_content)

UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
‚òí    ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended September¬†28, 2024
or
‚òê    TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the transition period from ¬†¬†¬†¬†¬†¬†¬†¬†¬†¬†¬†¬† to ¬†¬†¬†¬†¬†¬†¬†¬†¬†¬†¬†¬†.
Commission File Number: 001-36743
Apple Inc.
(Exact name of Registrant as specified in its charter)
California 94-2404110
(State or other jurisdiction
of incorporation or organization)
(I.R.S. Employer Identification No.)
One Apple Park Way
Cupertino, California 95014
(Address of principal executive offices) (Zip Code)
(408) 996-1010
(Registrant‚Äôs telephone number, including area code)
Securities registered pursuant to Section 12(b) of the Act:
Title of each class
Trading 
symbol(s) Name of each exchange on which registered
Common Stock, $0.00001 par value per share AAPL The N

In [36]:
import tiktoken                        
enc = tiktoken.encoding_for_model("gpt-4o-mini")  
n_tokens = len(enc.encode(pages[3].page_content))      
print(n_tokens)

805


In [37]:
approx_tokens = len(pages[3].page_content) // 4.5 # approx 4-5 characters per token
approx_tokens

804.0

In [38]:
from langchain_community.document_loaders import TextLoader

path = "Report.txt"
loader = TextLoader(path, autodetect_encoding=True)
doc = loader.load()

In [42]:
doc[0].page_content

'EXECUTIVE SUMMARY\n\nInnovexa\u202fHoldings concluded Fiscal\u202fYear\u202f2024 in a solid strategic and financial position, despite the overhang of inflationary pressure, supply‚Äëchain dislocation, and uneven enterprise‚ÄëIT sentiment. Full‚Äëyear revenue climbed to 1\u202fbillion\u202f260\u202fmillion US\u202fdollars, an increase of roughly twelve percent compared with the prior year, while net profit improved to 147\u202fmillion dollars. Operating cash flow reached 184\u202fmillion dollars, supporting a free‚Äëcash‚Äëflow margin above thirteen percent. Management attributes the advance to a disciplined shift toward higher‚Äëmargin recurring subscriptions, a moderation in component costs during the second half, and early benefits from the Horizon\u202f2030 transformation program.\nThe company has three reportable segments. Core\u202fSolutions, which delivers application‚Äëmodernisation and hybrid‚Äëcloud migration services, produced 43\u202fpercent of revenue and grew seventeen pe

In [46]:
from langchain_text_splitters import RecursiveCharacterTextSplitter  

splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500, separators=["\n\n", "\n", " ", ""])
chunks = splitter.split_documents(doc)
print(f"Number of chunks: {len(chunks)}")


Number of chunks: 6


In [47]:
type(chunks[0])

langchain_core.documents.base.Document

In [48]:
chunks[0].metadata

{'source': 'Report.txt'}

In [49]:
[len(chunk.page_content) for chunk in chunks]

[1786, 1075, 1789, 1846, 1536, 1767]

In [51]:
print(chunks[0].page_content)

EXECUTIVE SUMMARY

Innovexa‚ÄØHoldings concluded Fiscal‚ÄØYear‚ÄØ2024 in a solid strategic and financial position, despite the overhang of inflationary pressure, supply‚Äëchain dislocation, and uneven enterprise‚ÄëIT sentiment. Full‚Äëyear revenue climbed to 1‚ÄØbillion‚ÄØ260‚ÄØmillion US‚ÄØdollars, an increase of roughly twelve percent compared with the prior year, while net profit improved to 147‚ÄØmillion dollars. Operating cash flow reached 184‚ÄØmillion dollars, supporting a free‚Äëcash‚Äëflow margin above thirteen percent. Management attributes the advance to a disciplined shift toward higher‚Äëmargin recurring subscriptions, a moderation in component costs during the second half, and early benefits from the Horizon‚ÄØ2030 transformation program.
The company has three reportable segments. Core‚ÄØSolutions, which delivers application‚Äëmodernisation and hybrid‚Äëcloud migration services, produced 43‚ÄØpercent of revenue and grew seventeen percent year over year. Cloud‚ÄØIntegratio

In [52]:
print(chunks[1].page_content)

MARKET OVERVIEW

Industry analysts continue to forecast high‚Äësingle‚Äëdigit compound annual growth for cloud‚Äënative middleware and digital‚Äëtransformation consulting through 2028. The total addressable market for Innovexa‚Äôs portfolio is estimated at sixty‚Äëseven‚ÄØbillion dollars. North‚ÄØAmerica and the EMEA region together generated seventy‚Äëfour percent of company revenue during 2024. Asia‚ÄëPacific contributed eighteen percent, powered by strong demand in Australia, India, and Japan, while Latin‚ÄØAmerica accounted for the remaining eight percent.
Competition intensified as tier‚Äëone global system integrators pressed further into the mid‚Äëmarket; nevertheless, Innovexa maintained a mid‚Äëteens market share thanks to deep domain expertise in regulated verticals such as financial services and life sciences. Gartner‚Äôs latest Magic‚ÄØQuadrant placed the firm in the Leaders section for Hybrid‚ÄëCloud Orchestration, citing its robust reference architecture, a service‚Äëlevel

In [53]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain.chat_models import init_chat_model
from langchain_core.output_parsers import StrOutputParser

# 1. Loading PDF Doc 
pdf_path = "apple_10k.pdf" 
loader = PyPDFLoader(pdf_path, mode = "single")
doc = loader.load()
#full_text = doc[0].page_content

# 2. split into coherent chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000, # characters! ~ 2000-2500 Tokens 
    chunk_overlap=2000, # 10-20%
    separators=["\n\n", "\n", ".", "!", "?", " "],  # broad -> narrow
)
chunks = splitter.split_documents(doc)

# 3. Preparing Inputs for Batch-API: ADDED!!!
inputs = [{"text": chunk.page_content} for chunk in chunks]

# 4. Prompt for chunk-summaries (MAP)
map_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a business anylyst and helpful assistant."),
    ("human", "Summarize the following text briefly in 3 bullet points:\n\n{text}\n\nSummary:")
])

# 5. Prompt for final summary (REDUCE)
reduce_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a business anylyst and helpful assistant."),
    ("human", "Summarize the following texts in a consistent final summary {reduce_style}:\n\n{text}\n\nFinal Summary:")
])

# 6. LLM
llm = init_chat_model(
    model="gpt-4o-mini",
    temperature=0,
    model_provider="openai", 
)
    
# 7. Parser
parser = StrOutputParser()

# 8. Two separate chains
map_chain = map_prompt | llm | parser
reduce_chain = reduce_prompt | llm | parser

# 9. MAP Step: Summarizing each chunk *in parallel (batch)* NEW!!!
summaries = map_chain.batch(
    inputs, # list of chunks
    config={"max_concurrency": 4},   # ‚Üê adjust to your OpenAI-Rate-Limits
)

# 10. REDUCE Step: Final Summary from chunk summaries
final_summary = reduce_chain.invoke({"reduce_style":"in a very detailed, structured and comprehensive manner while dropping duplicated info",
                                     "text": "\n\n".join(summaries)})

# 11. Final Output
print("\nüìÑ Summary:\n", final_summary)


üìÑ Summary:
 ### Comprehensive Summary of Apple Inc.'s Annual Report (Form 10-K) for Fiscal Year Ending September 28, 2024

#### 1. **Business Overview**
- **Product Lines**: Apple Inc. designs, manufactures, and markets a diverse range of products, including:
  - **Smartphones**: iPhone
  - **Personal Computers**: Mac
  - **Tablets**: iPad
  - **Wearables**: Various accessories
  - **Services**: Advertising, AppleCare, cloud services, and digital content platforms.
- **Geographic Segmentation**: The company operates across multiple regions, including the Americas, Europe, Greater China, Japan, and the Rest of Asia Pacific, utilizing both direct and indirect distribution channels.

#### 2. **Financial Performance**
- **Net Sales**: For fiscal year 2024, Apple reported total net sales of **$391.035 billion**, a **2% increase** from 2023, primarily driven by higher sales in Services, while iPhone sales remained flat.
- **Operating Expenses**: Increased by **5%** to **$57.467 billion**