Ebook processing script courtesy of Prax (@maldv)  
This notebook is licensed under CC-BY-NC 4.0 and the original can be found here: https://huggingface.co/datasets/maldv/cyberpunk/blob/main/epub-processing.ipynb

In [1]:
import re
import pandas as pd
import shutil
from bs4 import BeautifulSoup, NavigableString, Tag
import ebooklib
from ebooklib import epub
import os
import re
from typing import Generator, List

In [1]:
def parse_ebook_html(ebook_path: str, try_chapter: bool = False) -> Generator[tuple, None, None]:
    """
    Walks every document item of an EPUB file and yields one row per non-empty
    paragraph produced by html_tokenizer.

    Parameters:
    - ebook_path (str): The path to the EPUB file.
    - try_chapter (bool): Forwarded unchanged to html_tokenizer, which uses it to decide
      whether the first paragraph of a document is treated as the chapter title.

    Yields:
    - tuple: (book_name, paragraph_idx, chapter_idx, chapter_title, text, char_count,
      cumulative_char_count). book_name is the file name without its extension,
      both indices are 1-based, and the two counts are measured in characters of text.
    """
    book = epub.read_epub(ebook_path)
    book_name, _ext = os.path.splitext(os.path.basename(ebook_path))

    chapter_no = 0
    paragraph_no = 0
    running_chars = 0

    for document in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        html = document.get_content().decode('utf-8')
        # Materialise the rows first: a document only counts as a chapter
        # if the tokenizer produced at least one row for it.
        rows = list(html_tokenizer(html, try_chapter))
        if not rows:
            continue
        chapter_no += 1

        for row in rows:
            text = row[1]
            if not text:
                continue
            paragraph_no += 1
            n_chars = len(text)  # length of the string, i.e. characters, not words
            running_chars += n_chars
            yield (book_name, paragraph_no, chapter_no, *row, n_chars, running_chars)

def html_tokenizer(html_content: str, try_chapter) -> Generator[tuple, None, None]:
    """
    Generator function that tokenizes HTML content into cleaned paragraph text,
    one item per <p> block.

    Parameters:
    - html_content (str): The HTML content to be tokenized.
    - try_chapter (bool): If True, the first <p> of the document is not emitted as text;
      instead, if it contains at least two <span class="bold"> elements, their texts are
      joined with a space and used as the chapter title for all following paragraphs.
      A first <p> carrying the class 'calibre14' is exempt from this and is processed
      as ordinary text.

    Yields:
    - tuple: (chapter, text) where chapter is the detected title (or None) and text is
      the normalised paragraph text.
    """
    quote_re = re.compile(r'“|”|»|«')
    ellipsis_re = re.compile(r'…')
    bars_re = re.compile(r'\|\s*\|')
    whitespace_re = re.compile(r'\s+')

    soup = BeautifulSoup(html_content, 'html.parser')

    def collect_text(node, pieces: List[str]) -> None:
        # Depth-first walk that appends raw text fragments to `pieces`.
        if isinstance(node, NavigableString):
            pieces.append(str(node))
            return
        if not isinstance(node, Tag):
            return
        classes = node.get('class', [])
        if node.name == 'a' and 'calibre3' in classes:
            # Drop the <a class="calibre3"> subtree; its siblings are still visited
            # by the caller's loop.
            return
        if node.name == 'span' and 'italic' in classes:
            # Italic spans are taken as a single fragment rather than walked.
            pieces.append(node.get_text())
            return
        for child in node.children:
            collect_text(child, pieces)

    chapter = None
    for index, paragraph in enumerate(soup.find_all('p')):
        # Only a first paragraph that is NOT 'calibre14' is consumed as the
        # chapter/title marker; 'calibre14' paragraphs fall through to normal processing.
        if index == 0 and try_chapter and 'calibre14' not in paragraph.get('class', []):
            bold_parts = [span.get_text() for span in paragraph.find_all('span', class_='bold')]
            if len(bold_parts) >= 2:
                chapter = ' '.join(bold_parts)
            continue

        pieces = []
        collect_text(paragraph, pieces)

        # Paragraphs containing a '| |' separator are dropped entirely.
        if '| |' in ' '.join(pieces):
            continue

        text = ' '.join(part for part in map(str.strip, pieces) if part)
        text = text.replace('\n', ' ').replace(u'\xa0', u' ')
        text = quote_re.sub(u'"', text)
        text = ellipsis_re.sub(u'...', text)
        text = bars_re.sub(u'', text)
        text = whitespace_re.sub(u' ', text)
        text = text.strip()

        # Skip empty paragraphs and any paragraph mentioning 'Oceano'.
        if not text or 'Oceano' in text:
            continue
        # Skip short purely numeric paragraphs (fewer than four characters).
        if len(text) < 4 and text.isnumeric():
            continue

        yield chapter, text


In [33]:
# Parse the book into one row per paragraph.
lines = parse_ebook_html("./LarsKepler-TheHypnotist.epub", try_chapter=False)
df = pd.DataFrame(lines, columns=['book_name', 'paragraph_ix', 'chapter_ix', 'chapter_title', 'text', 'char_count', 'cumsum_char_count'])

# Discard unwanted documents by chapter index (presumably front/back matter of
# this particular EPUB -- verify against the book) and "Chapter N" heading lines.
df = df[~df['chapter_ix'].isin([1, 2, 3, 4, 116, 117])]
df = df[~df['text'].str.startswith("Chapter ")]

# Normalise curly apostrophes and spaced ellipses.
# regex=False is required: on pandas < 2.0 the default is regex=True, where each
# '.' in " . . ." would match ANY character and corrupt ordinary text.
df["text"] = (
    df['text']
    .str.replace("’", "'", regex=False)
    .str.replace(" . . .", "...", regex=False)
)

# Recreate the output directory from scratch and write one text file per chapter.
shutil.rmtree("out", ignore_errors=True)
os.mkdir("out")
for idx, group in df.groupby("chapter_ix"):
    # Explicit UTF-8 so non-ASCII book text is written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(f"out/chapter_{idx}.txt", 'w', encoding='utf-8') as f:
        f.write('\n'.join(group['text']))