In [1]:
# Question 1: Summarizing a 500-page book using LLMs
# ✅ Goal:
# The book is much longer than the maximum token length of LLM models (e.g., 4096 tokens in GPT-3.5). This issue needs to be managed.
# ✅ Solution:
# Technique: "Chunking + Step-by-Step Summarization"

#     Divide the book into smaller chunks (e.g., 1000 words each).

#     Summarize each chunk separately.

#     Combine the summaries.

#     Summarize the combined summaries again (summarize the summaries).

In [8]:
import config_loader
import os

# Load the configuration module/object via the project's config_loader helper.
config = config_loader.load_config('config.py')

# Read the settings this notebook needs. getattr with a default is used so a
# missing required setting can be reported below with a specific message.
CACHE_DIR = getattr(config, "CACHE_DIR", None)
MODEL_NAME = getattr(config, "MODEL_NAME", None)
TRANSFORMERS_OFFLINE = getattr(config, "TRANSFORMERS_OFFLINE", "1")  # defaults to offline
if CACHE_DIR is None:
    raise AttributeError("The configuration file does not contain 'CACHE_DIR'.")
if MODEL_NAME is None:
    raise AttributeError("The configuration file does not contain 'MODEL_NAME'.")
print("Static Paths:", CACHE_DIR,MODEL_NAME)


# Set environment variable for offline mode. This runs before `transformers`
# is imported in a later cell. os.environ only accepts str values, so a config
# that stores the flag as an int or bool (e.g. TRANSFORMERS_OFFLINE = 1) would
# otherwise raise TypeError here; booleans are mapped to "1"/"0".
if isinstance(TRANSFORMERS_OFFLINE, bool):
    TRANSFORMERS_OFFLINE = "1" if TRANSFORMERS_OFFLINE else "0"
TRANSFORMERS_OFFLINE = str(TRANSFORMERS_OFFLINE)
os.environ["TRANSFORMERS_OFFLINE"] = TRANSFORMERS_OFFLINE

Static Paths: /media/marzieh/3656C28656C24679/llm_models/ t5-small


In [9]:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
# Load the model and tokenizer from the local directory
import torch  # imported here so this cell stays self-contained; used only to detect CUDA

print("Loading model from local directory...")
model_path = os.path.join(CACHE_DIR, MODEL_NAME)  # computed once, shared by both loaders
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Initialize the summarization pipeline with the local model.
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)


  from .autonotebook import tqdm as notebook_tqdm


Loading model from local directory...


In [10]:
import nltk

# Set your custom cache directory
NLTK_DATA_DIR = os.path.join(CACHE_DIR, "nltk_data")

# Create nltk_data directory if it doesn't exist
os.makedirs(NLTK_DATA_DIR, exist_ok=True)

# Tell NLTK to use that directory to find tokenizers and corpora.
# Guarded so that re-running this cell does not append duplicate entries.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Download the sentence-tokenizer data into the custom directory, but only
# when NLTK cannot already locate it. This keeps the cell quiet and usable
# without network access once the data has been fetched.
for package in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{package}")
    except LookupError:
        nltk.download(package, download_dir=NLTK_DATA_DIR)

[nltk_data] Downloading package punkt to
[nltk_data]     /media/marzieh/3656C28656C24679/llm_models/nltk_data..
[nltk_data]     .
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /media/marzieh/3656C28656C24679/llm_models/nltk_data..
[nltk_data]     .
[nltk_data]   Package punkt_tab is already up-to-date!


In [17]:
from nltk.tokenize import sent_tokenize
# Split text into sentence-based chunks and summarize each chunk
def _chunk_sentences(sentences, max_chunk_length):
    """Greedily group consecutive sentences into space-joined chunks.

    A sentence is added to the current chunk while the combined character
    count stays below ``max_chunk_length``; otherwise the current chunk is
    closed and the sentence starts a new one. A single sentence longer than
    the limit becomes a chunk of its own. Empty chunks are never produced.
    """
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if not current_chunk:
            # Nothing accumulated yet: start the chunk with this sentence
            # instead of flushing an empty string.
            current_chunk = sentence
        elif len(current_chunk) + len(sentence) < max_chunk_length:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return [chunk.strip() for chunk in chunks if chunk.strip()]


def summarize_text(text, max_chunk_length=512, min_length=25, length_ratio=0.5):
    """Summarize ``text`` chunk by chunk and return the joined summaries.

    The text is split into sentences with ``sent_tokenize``, the sentences are
    packed into chunks, and each chunk is passed to the module-level
    ``summarizer`` pipeline. Token counts come from the module-level
    ``tokenizer``.

    Args:
        text: The text to summarize.
        max_chunk_length: Chunk size limit, measured in characters (not tokens).
        min_length: Requested minimum summary length in tokens. It is lowered
            for chunks whose computed maximum is smaller than this value.
        length_ratio: Maximum summary length as a fraction of the chunk's
            token count.

    Returns:
        The per-chunk summaries joined with single spaces; ``""`` when the
        text contains no sentences.
    """
    chunks = _chunk_sentences(sent_tokenize(text), max_chunk_length)

    # Summarize each chunk
    summaries = []
    for chunk in chunks:
        # Count tokens with truncation so the count matches what the model is
        # actually given when a chunk exceeds the tokenizer's length limit.
        token_count = len(tokenizer.encode(chunk, truncation=True))
        # Never request a zero-length summary, and never ask for a minimum
        # length greater than the maximum (short chunks would otherwise get
        # max_length < min_length).
        max_length = max(int(token_count * length_ratio), 1)
        result = summarizer(
            chunk,
            max_length=max_length,
            min_length=min(min_length, max_length),
            do_sample=False,
            truncation=True,  # guard against single over-long "sentences"
        )
        summaries.append(result[0]["summary_text"])
    return " ".join(summaries)




In [None]:
import PyPDF2
# Read the full text of a PDF file
def read_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines. Pages for which
        ``extract_text()`` yields nothing are skipped.
    """
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Extract while the file is still open; `or ""` covers pages whose
        # extraction returns None.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next, which would corrupt sentence splitting later.
    return "\n".join(page_text for page_text in page_texts if page_text)

# Driver: extract the book's text from the PDF, summarize it, and show the result.
if __name__ == "__main__":
    # Replace with your PDF path
    pdf_path = "./The McKinsey Way Using the Techniques of the World.pdf"
    text = read_pdf(pdf_path)
    summary = summarize_text(text)
    print(f"Summary: {summary}")