<a href="https://colab.research.google.com/github/nirupamgpta/Assignments/blob/main/pdfSummarization-L4_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q \
langchain \
  langchain-community \
  langchain-text-splitters \
  chromadb \
  sentence-transformers \
  transformers \
  pypdf \
  accelerate \
  reportlab

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m90.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.7/21.7 MB[0m [31m91.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.2/328.2 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m22.4 MB/s[0m eta [36m0:00:0

In [9]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from google.colab import files
import os

# Ask the user for a PDF through Colab's upload widget.
uploaded = files.upload()

# Only the first uploaded file is used; any further uploads are ignored.
file_path = None
for fn in list(uploaded)[:1]:
    file_path = os.path.join("/content", fn)
    print(f"User uploaded file \"{fn}\" to {file_path}")

# Parse the PDF into documents, or report that nothing was uploaded.
if not file_path:
    print("No file was uploaded.")
else:
    loader = PyPDFLoader(file_path)
    documents = loader.load()
    print(f"Successfully loaded {len(documents)} pages from {fn}.")




Saving abstract.pdf to abstract (1).pdf
User uploaded file "abstract (1).pdf" to /content/abstract (1).pdf
Successfully loaded 1 pages from abstract (1).pdf.


In [10]:
# ===========================
# Text Splitting
# ===========================
# Break the loaded pages into overlapping chunks (size 800, overlap 150 as
# measured by the splitter's length function) so neighbouring chunks share
# some context.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=800)
docs = splitter.split_documents(documents)

print("Chunks:", len(docs))


# ===========================
# Embeddings + ChromaDB
# ===========================
# Fail early with a clear message when there is nothing to embed or retrieve
# (e.g. a PDF with no extractable text).
if not docs:
    raise ValueError(
        "No text chunks to index; the PDF may contain no extractable text."
    )

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Reset the persisted collection before indexing. Without this, every re-run
# of the cell appends the same chunks again to "chroma_db" (and chunks from
# previously uploaded PDFs stay in it), so retrieval would return duplicates
# or text from another document.
Chroma(
    persist_directory="chroma_db",
    embedding_function=embeddings
).delete_collection()

vectordb = Chroma.from_documents(
    docs,
    embedding=embeddings,
    persist_directory="chroma_db"
)

# No explicit vectordb.persist() call: it is deprecated and emitted a warning
# when this cell ran.
# NOTE(review): per the LangChain deprecation notice, Chroma >= 0.4 writes to
# persist_directory automatically — confirm against the installed version.


# ===========================
# Retrieval
# ===========================
query = "Summarize this document"

# Never ask for more neighbours than there are chunks in the fresh index.
retrieved_docs = vectordb.similarity_search(
    query,
    k=min(6, len(docs))
)


# ===========================
# Summarization Model
# ===========================
# facebook/bart-large-cnn is a checkpoint built for summarization; it is used
# here instead of FLAN-T5.
model_name = "facebook/bart-large-cnn"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Summarization pipeline with sampling disabled and the summary length
# bounds (min_length / max_length) set once for every call.
pipe = pipeline(
    "summarization",
    tokenizer=tokenizer,
    model=model,
    do_sample=False,
    min_length=60,
    max_length=200
)


# ===========================
# TOKEN-SAFE TRUNCATION
# ===========================
def truncate_to_max_tokens(text, tokenizer, max_tokens=900):
    """Clip ``text`` so that it tokenizes to at most ``max_tokens`` tokens.

    The text is encoded with truncation enabled and the surviving token ids
    are decoded back into a string with special tokens removed.

    Args:
        text: The string to clip.
        tokenizer: Callable tokenizer that returns a mapping containing
            ``"input_ids"`` and provides a ``decode`` method.
        max_tokens: Upper bound handed to the tokenizer as ``max_length``.

    Returns:
        The decoded, possibly shortened, text.
    """
    encoded = tokenizer(
        text,
        max_length=max_tokens,
        truncation=True,
        return_tensors="pt",
    )
    # A single string was encoded, so the batch holds exactly one sequence.
    first_sequence = encoded["input_ids"][0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


# ===========================
# MAP STEP — summarize chunks
# ===========================
def summarize_chunks(docs, pipe, tokenizer, max_tokens=900):
    """Map step: produce one summary per document chunk.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
        pipe: Summarization callable; its first result must carry a
            ``"summary_text"`` entry.
        tokenizer: Tokenizer used to clip each chunk before summarizing.
        max_tokens: Token budget applied to every chunk.

    Returns:
        A list of summary strings, in the same order as ``docs``.
    """
    def _summarize_one(doc):
        # Clip first so the text handed to the model stays within the budget.
        clipped = truncate_to_max_tokens(doc.page_content, tokenizer, max_tokens)
        return pipe(clipped)[0]["summary_text"]

    return [_summarize_one(doc) for doc in docs]


# ===========================
# Deduplicate helper
# ===========================
def deduplicate_summaries(summaries):
    """Return the distinct, non-blank summaries in first-seen order.

    Each summary is stripped of surrounding whitespace; entries that end up
    empty are dropped and exact repeats of an earlier entry are skipped.

    Args:
        summaries: Iterable of summary strings.

    Returns:
        A list of stripped, unique summaries.
    """
    stripped = (entry.strip() for entry in summaries)
    # dict keys keep insertion order, so this removes repeats without
    # reordering the survivors.
    return list(dict.fromkeys(text for text in stripped if text))


# ===========================
# REDUCE STEP — final synthesis
# ===========================
def reduce_summaries(chunk_summaries, pipe, tokenizer, max_tokens=900):
    """Reduce step: merge per-chunk summaries into one final summary.

    The chunk summaries are de-duplicated, joined with newlines, clipped to
    ``max_tokens`` tokens and summarized once more.

    Args:
        chunk_summaries: Summary strings produced by the map step.
        pipe: Summarization callable; its first result must carry a
            ``"summary_text"`` entry.
        tokenizer: Tokenizer used to clip the combined text.
        max_tokens: Token budget for the combined text.

    Returns:
        The final summary string, or ``""`` when there is nothing to
        summarize (no summaries, or only blank ones).
    """
    unique_summaries = deduplicate_summaries(chunk_summaries) if chunk_summaries else []

    # With no usable input there is nothing to condense; return early rather
    # than calling the model on an empty string.
    if not unique_summaries:
        return ""

    combined = "\n".join(unique_summaries)
    safe_combined = truncate_to_max_tokens(combined, tokenizer, max_tokens)

    return pipe(safe_combined)[0]["summary_text"]


# ===========================
# RUN MAP → REDUCE
# ===========================
# Map: one summary per retrieved chunk.
chunk_summaries = summarize_chunks(docs=retrieved_docs, pipe=pipe, tokenizer=tokenizer)

# Reduce: fold the per-chunk summaries into a single summary.
final_summary = reduce_summaries(chunk_summaries=chunk_summaries, pipe=pipe, tokenizer=tokenizer)


# ===========================
# OUTPUT
# ===========================
print("\n📄 FINAL SUMMARY:\n")
print(final_summary)

Chunks: 2


  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  vectordb.persist()


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 142, but your input_length is only 77. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)
Your max_length is set to 142, but your input_length is only 138. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)



📄 FINAL SUMMARY:

ISB Online’s Leadership with AI programme is your gateway to unlocking the true potential of AI and Generative AI. harnessing its capabilities to drive strategic decision-making, foster innovation and lead with confidence. The programme is designed to equip executives and professionals with the knowledge, skills, and strategies required to thrive in the ever-evolving world of Artificial Intelligence.


In [11]:
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet

def save_summary_to_pdf(summary_text, filename="summary.pdf"):
    """Write the summary to a PDF with a "Document Summary" title.

    Args:
        summary_text: Plain-text summary; newlines become line breaks.
        filename: Path of the PDF to create.

    Returns:
        ``filename``, unchanged, so callers can report or download it.
    """
    # Standard-library helper, imported locally so the function is
    # self-contained.
    from xml.sax.saxutils import escape

    styles = getSampleStyleSheet()

    # Paragraph interprets its text as XML-like markup. Escape &, < and > in
    # the model output first so they are rendered literally instead of being
    # parsed as tags, then add the <br/> tags for line breaks.
    body_markup = escape(summary_text).replace("\n", "<br/>")

    story = [
        Paragraph("<b>Document Summary</b>", styles["Title"]),
        Paragraph(body_markup, styles["Normal"]),
    ]

    SimpleDocTemplate(filename).build(story)

    return filename


In [12]:
# Render the final summary into a PDF and report where it was written.
file_path = save_summary_to_pdf(summary_text=final_summary)
print(f"✅ Summary saved as {file_path}")

✅ Summary saved as summary.pdf
