In [7]:
# Imports

import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import re

In [8]:
# Functions 

# Function to load and read the content of an EPUB file
def epub_loader(filepath):
    """Read an EPUB from the raw-resources folder and return its text.

    Args:
        filepath: Name (or relative path) of the EPUB file, appended to
            ``../../Resources/Raw/``.

    Returns:
        The plain text of every document item in the book, in the order
        ``book.get_items()`` yields them, separated by three newlines.
        An empty string if the book contains no document items.
    """
    # Read the EPUB file
    book = epub.read_epub(r"../../Resources/Raw/" + filepath)

    # Gather the text of *every* document item. Assigning to a single
    # variable inside the loop would keep only the last document and leave
    # the name unbound when the book has no document items at all.
    # NOTE(review): assumes get_items() follows reading order - confirm
    # against the book's spine if ordering matters.
    document_texts = []
    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            # Parse the HTML content and keep only its visible text
            soup = BeautifulSoup(item.content, 'html.parser')
            document_texts.append(soup.get_text())

    # Three newlines is the paragraph delimiter clean_text splits on, so the
    # end of one document never fuses with the start of the next.
    return '\n\n\n'.join(document_texts)

# Function to clean the extracted text from an EPUB file
def clean_text(text):
    """Normalise whitespace inside each paragraph of the extracted text.

    Paragraphs are the chunks separated by three consecutive newlines.
    Each chunk is stripped, every run of whitespace in it is collapsed to
    a single space, and the chunks are re-joined one per line.

    Args:
        text: Raw text extracted from the EPUB.

    Returns:
        The text with one whitespace-normalised paragraph per line.
    """
    normalised_chunks = (
        re.sub(r'\s+', ' ', chunk.strip())
        for chunk in text.split('\n\n\n')
    )
    return '\n'.join(normalised_chunks)

# Function to remove page and chapter numbers from the cleaned text
def remove_page_and_chapter_num(text):
    """Strip a leading run of digits (and whitespace after it) from each line.

    Only digits at the very start of a line are removed; numbers that
    appear later in a line, or after leading spaces, are left untouched.

    Args:
        text: Text with one paragraph per line.

    Returns:
        The text with leading numbers removed from every line.
    """
    leading_number = re.compile(r'^\d+\s*')
    # Work line by line so the trailing \s* can never swallow a newline
    # and merge two lines together.
    return '\n'.join(
        leading_number.sub('', row) for row in text.split('\n')
    )

# Function to remove initial text from the cleaned text
def remove_initial_text(text, removal_text="Rhilato the Marvellous Foreword"):
    """Drop everything up to and including the first occurrence of a marker.

    Args:
        text: The cleaned text.
        removal_text: Marker ending the front matter. Everything before it,
            and the marker itself, is removed.
            NOTE(review): the default is spelled "Rhilato"; the title is
            presumably "Rhialto the Marvellous" - confirm against the EPUB
            text, otherwise the marker is never found.

    Returns:
        The text following the marker, stripped of surrounding whitespace,
        or the original text unchanged when the marker is not present.
    """
    # Test the result of find() itself: it returns -1 on a miss, and adding
    # the marker length first would hide that and chop the start of the text.
    marker_start = text.find(removal_text)
    if marker_start == -1:
        return text

    return text[marker_start + len(removal_text):].strip()

# Function to remove blank lines from the cleaned text
def remove_blank_lines(text):
    """Discard lines that are empty or contain only whitespace.

    Surviving lines are kept exactly as they were, including any leading
    or trailing spaces.

    Args:
        text: Text to filter.

    Returns:
        The text with blank lines removed.
    """
    kept_rows = [row for row in text.split('\n') if row.strip()]
    return '\n'.join(kept_rows)

# Function to save the cleaned text as a text file
def save_as_txt(filename, text):
    """Write the text to a UTF-8 ``.txt`` file in the cleaned-resources folder.

    Args:
        filename: Base name of the output file, without extension. The file
            is written to ``../../Resources/Cleaned/<filename>.txt``.
        text: Text to save; surrounding whitespace is stripped before writing.
    """
    destination = "../../Resources/Cleaned/" + filename + ".txt"
    # Mode 'w' replaces any existing file at the destination.
    with open(destination, 'w', encoding='utf-8') as handle:
        handle.write(text.strip())

In [None]:
# Load the text of dying_earth4.epub from the raw-resources folder.
de_4_text = epub_loader("dying_earth4.epub")

In [10]:
# Cleaning pipeline, applied innermost call first:
#   1. clean_text                  - normalise whitespace per paragraph
#   2. remove_page_and_chapter_num - strip leading numbers from each line
#   3. remove_initial_text         - drop the front matter up to the marker
#   4. remove_blank_lines          - discard empty lines
# The result is bound back to de_4_text, so running this cell again
# re-applies every step to the already-cleaned text.
de_4_text = remove_blank_lines(
    remove_initial_text(
        remove_page_and_chapter_num(
            clean_text(de_4_text)
        )
    )
)

In [None]:
# Show the cleaned text for a visual check.
# NOTE(review): this prints the whole text; a slice such as
# de_4_text[:2000] would keep the notebook output small.
print(de_4_text)

In [12]:
# Write the cleaned text to ../../Resources/Cleaned/dying_earth4_cleaned.txt
save_as_txt("dying_earth4_cleaned", de_4_text)