In [1]:
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import os

def epub_to_text(epub_path, output_dir):
    """Extract every document item of an EPUB into numbered plain-text files.

    Each item whose type is ``ebooklib.ITEM_DOCUMENT`` is parsed with
    BeautifulSoup, reduced to its text content, and written as UTF-8 to
    ``<output_dir>/chapter_NNN.txt``. Numbering starts at 001 and follows the
    order in which ``book.get_items()`` yields the items.

    Args:
        epub_path: Path of the EPUB file to read.
        output_dir: Directory that receives the text files; created if it
            does not exist yet.

    Returns:
        list[str]: Paths of the files written, in chapter order.
    """
    # Read the book before touching the filesystem: if the EPUB is missing or
    # unreadable, the error surfaces without leaving an empty output directory.
    book = epub.read_epub(epub_path)
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): numbering follows get_items() order, which may differ from
    # the book's reading order — confirm against the EPUB if order matters.
    documents = (
        item for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    )

    written_paths = []
    for chapter_count, item in enumerate(documents, start=1):
        soup = BeautifulSoup(item.get_content(), 'html.parser')

        # Trim every line and drop the ones that end up empty, so the output
        # has no indentation residue or runs of blank lines.
        stripped_lines = (line.strip() for line in soup.get_text().split('\n'))
        text = '\n'.join(line for line in stripped_lines if line)

        output_path = os.path.join(output_dir, f'chapter_{chapter_count:03d}.txt')
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)

        print(f"Saved chapter {chapter_count} to {output_path}")
        written_paths.append(output_path)

    return written_paths

# Entry point: split "nexus.epub" into text files under "output_chapters".
if __name__ == "__main__":
    epub_file, output_directory = "nexus.epub", "output_chapters"
    epub_to_text(epub_path=epub_file, output_dir=output_directory)

  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):


Saved chapter 1 to output_chapters/chapter_001.txt
Saved chapter 2 to output_chapters/chapter_002.txt
Saved chapter 3 to output_chapters/chapter_003.txt
Saved chapter 4 to output_chapters/chapter_004.txt
Saved chapter 5 to output_chapters/chapter_005.txt
Saved chapter 6 to output_chapters/chapter_006.txt
Saved chapter 7 to output_chapters/chapter_007.txt
Saved chapter 8 to output_chapters/chapter_008.txt
Saved chapter 9 to output_chapters/chapter_009.txt
Saved chapter 10 to output_chapters/chapter_010.txt
Saved chapter 11 to output_chapters/chapter_011.txt
Saved chapter 12 to output_chapters/chapter_012.txt
Saved chapter 13 to output_chapters/chapter_013.txt
Saved chapter 14 to output_chapters/chapter_014.txt
Saved chapter 15 to output_chapters/chapter_015.txt
Saved chapter 16 to output_chapters/chapter_016.txt
Saved chapter 17 to output_chapters/chapter_017.txt
Saved chapter 18 to output_chapters/chapter_018.txt
Saved chapter 19 to output_chapters/chapter_019.txt
Saved chapter 20 to o

In [9]:
import os
import openai
from tqdm import tqdm

from openai import OpenAI

# Shared OpenAI client used by summarize_chapter(). The key is read from the
# OPENAI_API_KEY environment variable instead of being embedded in the
# notebook: a hard-coded (here: empty) key both fails authentication and risks
# leaking a real credential when the notebook is shared.
client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)



def get_chapter_content(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding='utf-8', mode='r') as chapter_file:
        contents = chapter_file.read()
    return contents

def summarize_chapter(previous_chapter, current_chapter, next_chapter):
    """Ask the chat model for a roughly 1000-word rewrite of one chapter.

    The prompt embeds the current chapter together with its neighbours, which
    are offered to the model as optional context. The request is sent through
    the module-level ``client``.

    Args:
        previous_chapter: Text of the preceding chapter ("" if there is none).
        current_chapter: Text of the chapter to condense.
        next_chapter: Text of the following chapter ("" if there is none).

    Returns:
        str: The model's reply with surrounding whitespace removed, or "" when
        the reply carries no text content.
    """
    prompt = f"""Write a condensed version of the following chapter in approximately 1000 words from the third person but in a similar style to the author. Use markdown syntax. 
    Highlight the key words. Be thoughtful, concise, coherent, easy to understand. 
    Be sure to select what ideas from the source to present and what not to carefully. 
    For the main ideas you selected, be sure to provide enough context from the source for the readers to understand. Also be sure to make smooth transitions and connections between them so that to maintain a nice flow to the summary.
    Be like Richard Feynman.
    Write in plain text and paragraphs, no bullet points.
    The chapter is provided between triple quotes, along with the previous and next chapters for context if you think it is neccessary:

Previous chapter:
\"\"\"
{previous_chapter}
\"\"\"

Current chapter to rewrite:
\"\"\"
{current_chapter}
\"\"\"

Next chapter:
\"\"\"
{next_chapter}
\"\"\"



Please provide a concise 1000-word summary of the current chapter:"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that summarizes book chapters."},
            {"role": "user", "content": prompt}
        ],
        # Output cap for the ~1000-word target requested in the prompt; raise
        # it if summaries come back truncated.
        max_tokens=1500,
        n=1,
        temperature=0.3,
    )

    # The message content can be None (no text in the reply); treat that as an
    # empty summary instead of failing on None.strip().
    content = response.choices[0].message.content
    return (content or "").strip()

def summarize_chapters(input_dir, output_dir, overwrite=True):
    """Summarize every ``chapter_*.txt`` file in a directory.

    For each chapter, ``summarize_chapter`` is called with the texts of the
    previous, current and next chapters (empty strings at either end of the
    book) and the result is written as UTF-8 to
    ``<output_dir>/summary_<chapter file stem>.md``.

    Args:
        input_dir: Directory containing the chapter text files.
        output_dir: Directory that receives the summaries; created if missing.
        overwrite: When True (the default) every chapter is summarized and any
            existing summary file is replaced. When False, chapters whose
            summary file already exists are skipped, so an interrupted run can
            be resumed without repeating model calls.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Plain lexicographic sort; it matches chapter order for zero-padded names
    # such as chapter_010.txt.
    chapter_files = sorted(
        name for name in os.listdir(input_dir)
        if name.startswith('chapter_') and name.endswith('.txt')
    )

    # Read each file once up front: every text is needed up to three times
    # (as the previous, current and next chapter).
    chapter_texts = [
        get_chapter_content(os.path.join(input_dir, name))
        for name in chapter_files
    ]
    last_index = len(chapter_files) - 1

    for i, current_file in enumerate(tqdm(chapter_files, desc="Summarizing chapters")):
        summary_file = f"summary_{os.path.splitext(current_file)[0]}.md"
        summary_path = os.path.join(output_dir, summary_file)

        if not overwrite and os.path.exists(summary_path):
            print(f"Skipping {current_file}: {summary_path} already exists")
            continue

        # Neighbouring chapters are context only; the ends of the book get "".
        previous_chapter = chapter_texts[i - 1] if i > 0 else ""
        next_chapter = chapter_texts[i + 1] if i < last_index else ""

        summary = summarize_chapter(previous_chapter, chapter_texts[i], next_chapter)

        with open(summary_path, 'w', encoding='utf-8') as f:
            f.write(summary)

        print(f"Saved summary for {current_file} to {summary_path}")

# Entry point: summarize the chapter text files in "test_chapters" and save
# the resulting summaries under "test_summaries".
if __name__ == "__main__":
    input_directory, output_directory = "test_chapters", "test_summaries"
    summarize_chapters(input_dir=input_directory, output_dir=output_directory)

Summarizing chapters: 100%|██████████| 1/1 [00:14<00:00, 14.58s/it]

Saved summary for chapter_010.txt to test_summaries/summary_chapter_010.md



