In [None]:
# Summarizing Large Books with LLMs

# ✅ Goal:
# Efficiently summarize a long book that exceeds the token limit of LLMs (e.g., 4096 tokens for GPT-3.5).

# ✅ Approach:
# Use a "Chunking and Hierarchical Summarization" strategy:

#   1. Split the book into sentences and pack them into smaller text chunks (here: up to a fixed character limit per chunk).
#   2. Summarize each chunk individually using the LLM.
#   3. Concatenate all chunk summaries.
#   4. Optionally, summarize the combined summaries for a concise final result.

In [None]:
import os

import nltk
import PyPDF2
import torch
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

import config_loader

# === Load Config ===
# The config module must provide CACHE_DIR and MODEL_NAME; the model is loaded
# from os.path.join(CACHE_DIR, MODEL_NAME) and NLTK data is kept under
# CACHE_DIR/nltk_data. TRANSFORMERS_OFFLINE is optional and defaults to "1".
config = config_loader.load_config('config.py')
CACHE_DIR = getattr(config, "CACHE_DIR", None)
MODEL_NAME = getattr(config, "MODEL_NAME", None)
TRANSFORMERS_OFFLINE = getattr(config, "TRANSFORMERS_OFFLINE", "1")

if CACHE_DIR is None:
    raise AttributeError("The configuration file does not contain 'CACHE_DIR'.")
if MODEL_NAME is None:
    raise AttributeError("The configuration file does not contain 'MODEL_NAME'.")
print("Static Paths:", CACHE_DIR, MODEL_NAME)

# === Set Offline Mode ===
# Environment values must be strings; coerce so that a config value such as
# 1 or True does not raise TypeError on assignment.
# NOTE(review): transformers is already imported at this point and may read
# this flag at import time -- confirm the flag is honoured, or export it
# before the import. Loading from a local directory does not depend on it.
os.environ["TRANSFORMERS_OFFLINE"] = str(TRANSFORMERS_OFFLINE)

# === Load Model ===
print("Loading model from local directory...")
_model_dir = os.path.join(CACHE_DIR, MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(_model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(_model_dir)
# Use the first CUDA device when one is available, otherwise run on CPU (-1)
# instead of failing on machines without a GPU.
_device = 0 if torch.cuda.is_available() else -1
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=_device)

# === Setup NLTK ===
NLTK_DATA_DIR = os.path.join(CACHE_DIR, "nltk_data")
os.makedirs(NLTK_DATA_DIR, exist_ok=True)
# Register the local data directory first so already-downloaded resources are
# found, and avoid adding a duplicate entry when this cell is re-run.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        # Only reach for the network when the resource is genuinely missing.
        nltk.download(_resource, download_dir=NLTK_DATA_DIR)

# === PDF Reader ===
def read_pdf(file_path):
    """Return the extracted text of every page of a PDF as a single string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined by newlines. Pages for which no text is
        extracted contribute an empty string.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next, which would corrupt sentence splitting.
    return "\n".join(page_texts)

# === Summarization Function ===
def _chunk_sentences(sentences, max_chunk_length):
    """Greedily pack sentences into chunks of fewer than ``max_chunk_length`` characters.

    A single sentence that is itself longer than the limit becomes its own
    (oversized) chunk; empty chunks are never produced.
    """
    chunks = []
    pending = []
    pending_len = 0  # length of " ".join(pending) plus one trailing space
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        # Flush only when something is pending, so an oversized first sentence
        # does not emit an empty chunk.
        if pending and pending_len + len(sentence) >= max_chunk_length:
            chunks.append(" ".join(pending))
            pending, pending_len = [], 0
        pending.append(sentence)
        pending_len += len(sentence) + 1
    if pending:
        chunks.append(" ".join(pending))
    return chunks


def _summary_length_bounds(input_length):
    """Return ``(max_len, min_len)`` for a chunk of ``input_length`` tokens, with ``min_len < max_len``."""
    # Target roughly half the input, capped at 120 tokens and never below 2.
    max_len = max(min(input_length // 2, 120), 2)
    min_len = 25 if max_len > 25 else max(5, max_len // 2)
    # For very short chunks the floor of 5 could exceed max_len; clamp it.
    return max_len, min(min_len, max_len - 1)


def summarize_text(text, max_chunk_length=512):
    """Summarize ``text`` chunk by chunk and return the concatenated summaries.

    The text is split into sentences, the sentences are packed into chunks of
    fewer than ``max_chunk_length`` characters, each chunk is summarized with
    the module-level ``summarizer`` pipeline, and the chunk summaries are
    joined with single spaces.

    Args:
        text: The text to summarize.
        max_chunk_length: Character limit used when packing sentences into
            chunks. Defaults to 512.

    Returns:
        The joined chunk summaries, or an empty string when ``text`` is empty
        or whitespace only. Chunks whose summarization raises ``RuntimeError``
        are skipped with a warning.
    """
    if not text or not text.strip():
        return ""

    chunks = _chunk_sentences(sent_tokenize(text), max_chunk_length)

    summaries = []
    for chunk in chunks:
        input_length = len(tokenizer.encode(chunk, truncation=True))
        max_len, min_len = _summary_length_bounds(input_length)

        print(f"[INFO] Summarizing chunk with max_len={max_len}, min_len={min_len}")
        try:
            summary = summarizer(
                chunk,
                max_length=max_len,
                min_length=min_len,
                do_sample=False,
                # An oversized single-sentence chunk must not overflow the
                # model's maximum input length.
                truncation=True,
            )
        except RuntimeError as e:
            print(f"[WARNING] Skipped chunk due to error: {e}")
            continue
        summaries.append(summary[0]["summary_text"])

    return " ".join(summaries)

# === Main ===
# Script entry point: summarize one hard-coded PDF and print the result.
if __name__ == "__main__":
    pdf_path = "./The McKinsey Way Using the Techniques of the World.pdf"
    if os.path.exists(pdf_path):
        # Extract the raw text first, then run chunked summarization on it.
        text = read_pdf(pdf_path)
        summary = summarize_text(text)
        print("Summary:\n", summary)
    else:
        raise FileNotFoundError(f"File not found: {pdf_path}")
