<a href="https://colab.research.google.com/github/nirupamgpta/Assignments/blob/main/pdfSummarization_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q \
langchain \
  langchain-community \
  langchain-text-splitters \
  chromadb \
  sentence-transformers \
  transformers \
  pypdf \
  accelerate \
  reportlab

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from google.colab import files
import os

# Upload file from hard drive. `uploaded` is keyed by the uploaded file names.
uploaded = files.upload()

# Only the first uploaded file is used; any additional uploads are ignored.
fn = next(iter(uploaded), None)
if fn is None:
    # Stop here with a clear message: continuing would leave `documents`
    # undefined and the following cells would fail with a NameError instead.
    raise RuntimeError("No file was uploaded.")

# Resolve against the current working directory rather than hard-coding
# "/content", so the path stays valid if the working directory was changed.
# NOTE(review): assumes files.upload() saves into the working directory — confirm.
file_path = os.path.abspath(fn)
print(f"User uploaded file \"{fn}\" to {file_path}")

# PyPDFLoader.load() returns the list that the success message counts as pages.
loader = PyPDFLoader(file_path)
documents = loader.load()
print(f"Successfully loaded {len(documents)} pages from {fn}.")


In [None]:
# ===========================
# Text Splitting
# ===========================
# Cut the loaded pages into overlapping chunks (size 800, overlap 150) so that
# each chunk can be embedded and retrieved on its own.
chunking_options = {"chunk_size": 800, "chunk_overlap": 150}
splitter = RecursiveCharacterTextSplitter(**chunking_options)

docs = splitter.split_documents(documents)
print(f"Chunks: {len(docs)}")


# ===========================
# Embeddings + ChromaDB
# ===========================
# Embedding model used for every chunk and, later, for the retrieval query.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

persist_dir = "chroma_db"

# The store lives on disk, so re-running this cell (or running it for another
# PDF) would otherwise append to the collection left by the previous run and
# retrieval would return duplicated or stale chunks. Drop the existing
# collection before indexing the current document.
Chroma(
    persist_directory=persist_dir,
    embedding_function=embeddings
).delete_collection()

vectordb = Chroma.from_documents(
    docs,
    embedding=embeddings,
    persist_directory=persist_dir
)

# NOTE(review): recent Chroma releases are reported to persist automatically
# and LangChain marks persist() as deprecated — confirm against the installed
# versions before removing this call.
vectordb.persist()


# ===========================
# Retrieval
# ===========================
# Ask the vector store for the 6 chunks most similar to a generic
# summarisation prompt; these are the chunks that get summarised below.
query = "Summarize this document"
retrieved_docs = vectordb.similarity_search(query, k=6)


# ===========================
# Summarization Model (IMPORTANT CHANGE)
# ===========================
# BART is built for summarization, unlike FLAN-T5.
model_name = "facebook/bart-large-cnn"

# Model and tokenizer are loaded from the same checkpoint name.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Settings applied to every pipeline call: length bounds for the generated
# summary and sampling switched off.
summarizer_settings = dict(max_length=200, min_length=60, do_sample=False)

pipe = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    **summarizer_settings
)


# ===========================
# TOKEN-SAFE TRUNCATION
# ===========================
def truncate_to_max_tokens(text, tokenizer, max_tokens=900):
    """Return ``text`` cut down to at most ``max_tokens`` tokens.

    The text is encoded with truncation enabled and the resulting ids are
    decoded back to a string, with special tokens left out.

    Args:
        text: The string to shorten.
        tokenizer: Callable tokenizer that also provides ``decode``.
        max_tokens: Value passed to the tokenizer as ``max_length``.

    Returns:
        The decoded, possibly shortened, text.
    """
    encoded = tokenizer(
        text, truncation=True, max_length=max_tokens, return_tensors="pt"
    )
    # A single string was encoded, so only the first row of ids is relevant.
    first_row_ids = encoded["input_ids"][0]
    return tokenizer.decode(first_row_ids, skip_special_tokens=True)


# ===========================
# MAP STEP — summarize chunks
# ===========================
def summarize_chunks(docs, pipe, tokenizer, max_tokens=900):
    """Summarise every document on its own (the "map" step).

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
        pipe: Callable returning a list whose first item has a
            ``"summary_text"`` entry.
        tokenizer: Tokenizer forwarded to ``truncate_to_max_tokens``.
        max_tokens: Token limit forwarded to ``truncate_to_max_tokens``.

    Returns:
        A list with one summary string per input document, in input order.
    """
    def _summarize_one(document):
        # Clip the text first, then let the pipeline summarise the clipped text.
        clipped = truncate_to_max_tokens(
            document.page_content, tokenizer, max_tokens
        )
        return pipe(clipped)[0]["summary_text"]

    return [_summarize_one(document) for document in docs]


# ===========================
# Deduplicate helper
# ===========================
def deduplicate_summaries(summaries):
    """Strip each summary and drop blanks and repeats.

    Args:
        summaries: Iterable of summary strings.

    Returns:
        A list of the stripped, non-empty summaries with duplicates removed,
        ordered by first appearance.
    """
    stripped = (candidate.strip() for candidate in summaries)
    # dict keys are unique and keep insertion order, which gives
    # first-occurrence de-duplication in one pass.
    return list(dict.fromkeys(text for text in stripped if text))


# ===========================
# REDUCE STEP — final synthesis
# ===========================
def reduce_summaries(chunk_summaries, pipe, tokenizer, max_tokens=900):
    """Combine the per-chunk summaries into one final summary (the "reduce" step).

    The summaries are de-duplicated, joined with newlines, truncated to the
    token limit and passed through the summarisation pipeline once more.

    Args:
        chunk_summaries: Iterable of summary strings from the map step.
        pipe: Callable returning a list whose first item has a
            ``"summary_text"`` entry.
        tokenizer: Tokenizer forwarded to ``truncate_to_max_tokens``.
        max_tokens: Token limit forwarded to ``truncate_to_max_tokens``.

    Returns:
        The final summary string, or ``""`` when there is nothing to
        summarise (no summaries, or only blank ones).
    """
    unique_summaries = deduplicate_summaries(chunk_summaries)
    if not unique_summaries:
        # With no content left there is nothing for the model to condense,
        # so skip the pipeline call instead of sending it an empty prompt.
        return ""

    combined = "\n".join(unique_summaries)
    safe_combined = truncate_to_max_tokens(combined, tokenizer, max_tokens)

    return pipe(safe_combined)[0]["summary_text"]


# ===========================
# RUN MAP → REDUCE
# ===========================
# Map: one summary per retrieved chunk.
chunk_summaries = summarize_chunks(retrieved_docs, pipe, tokenizer)

# Reduce: fold the chunk summaries into a single final summary.
final_summary = reduce_summaries(chunk_summaries, pipe, tokenizer)


# ===========================
# OUTPUT
# ===========================
# The heading's emoji was mis-encoded in the source; restored to the intended character.
print("\n📄 FINAL SUMMARY:\n")
print(final_summary)

In [11]:
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet

def save_summary_to_pdf(summary_text, filename="summary.pdf"):
    """Write the summary to a PDF under a "Document Summary" title.

    Args:
        summary_text: Plain-text summary; each newline becomes a line break.
        filename: Path of the PDF to create.

    Returns:
        The ``filename`` that was passed in.
    """
    # Standard-library helper, imported locally so the function carries its
    # own dependency.
    from xml.sax.saxutils import escape

    styles = getSampleStyleSheet()

    # Paragraph text is treated as markup (the title relies on <b>), so
    # literal "&", "<" and ">" in the summary must be escaped first. The
    # <br/> tags are inserted after escaping so they remain real markup.
    body_markup = escape(summary_text).replace("\n", "<br/>")

    story = [
        Paragraph("<b>Document Summary</b>", styles["Title"]),
        Paragraph(body_markup, styles["Normal"]),
    ]

    SimpleDocTemplate(filename).build(story)
    return filename


In [12]:
# Write the final summary to a PDF (default name "summary.pdf") and report
# where it went. Note that this rebinds `file_path`, which earlier held the
# path of the uploaded PDF.
file_path = save_summary_to_pdf(final_summary)
# The status emoji was mis-encoded in the source; restored to the intended character.
print(f"✅ Summary saved as {file_path}")

✅ Summary saved as summary.pdf
