In [36]:
import os

import ebooklib
import fitz  # PyMuPDF
from bs4 import BeautifulSoup
from ebooklib import epub

def extract_text_from_document(file_path, skip_items=10, strict=False):
    """
    Extracts literary content from a PDF or EPUB file, with options to apply different levels of filtering.

    The format is chosen from the file extension (case-insensitive). ``strict`` is
    forwarded by keyword to the PDF extractor only, so whichever
    ``extract_text_from_pdf`` definition is active must accept a ``strict`` argument.

    :param file_path: Path to the PDF or EPUB file, as a string or ``os.PathLike`` object.
    :param skip_items: Number of initial pages/items to skip.
    :param strict: Forwarded to the PDF extractor; intended to request stricter
        filtering criteria (useful for non-fiction works).
    :return: Extracted literary text.
    :raises ValueError: If the path ends in neither ``.pdf`` nor ``.epub``.
    """
    # Normalise pathlib.Path and other path-like objects to a plain string so the
    # extension check below works for them as well as for str paths.
    path = os.fspath(file_path)
    lowered = path.lower()

    if lowered.endswith('.pdf'):
        return extract_text_from_pdf(path, skip_pages=skip_items, strict=strict)
    if lowered.endswith('.epub'):
        return extract_text_from_epub(path, skip_items=skip_items)
    raise ValueError("Unsupported file format. Please provide a PDF or EPUB file.")

def extract_text_from_pdf(pdf_path, skip_pages=5, strict=False):
    """
    Placeholder PDF extractor: opens the document, extracts nothing, and closes it.

    :param pdf_path: Path to the PDF file.
    :param skip_pages: Number of initial pages to skip (not used by this placeholder).
    :param strict: Stricter-filtering flag (not used by this placeholder).
    :return: An empty string, because the extraction logic is not filled in here.
    """
    extracted = ""
    pdf_document = fitz.open(pdf_path)
    # PDF-specific extraction logic
    # ...
    pdf_document.close()
    return extracted

def extract_text_from_epub(epub_path, skip_items=10):
    """
    Placeholder EPUB extractor: reads the book but extracts nothing from it.

    :param epub_path: Path to the EPUB file.
    :param skip_items: Number of initial items to skip (not used by this placeholder).
    :return: An empty string, because the extraction logic is not filled in here.
    """
    extracted = ""
    parsed_book = epub.read_epub(epub_path)
    # EPUB-specific extraction logic
    # ...
    return extracted


In [37]:


def extract_text_from_pdf(pdf_path, skip_pages=0, skip_lines=0, strict=False):
    """
    Extracts text from a PDF document, with options to skip initial pages and lines.

    The lines of each page are re-joined into paragraphs:

    * a blank or whitespace-only line ends the current paragraph;
    * a line ending in a hyphen is treated as a word split across a line break,
      so the hyphen is removed and the next line is appended directly;
    * a line ending in an em dash keeps the dash (it is punctuation, not a split
      word) and the next line is appended directly;
    * any other line is appended followed by a single space.

    Trailing whitespace on a line is ignored. A paragraph is also ended at the end
    of every page.

    :param pdf_path: Path to the PDF document.
    :param skip_pages: Number of initial pages to skip.
    :param skip_lines: Number of initial lines to skip on each page.
    :param strict: Accepted so that extract_text_from_document, which forwards
        ``strict=`` by keyword, can call this function; it has no effect here.
    :return: Extracted text, each paragraph followed by a blank line.
    """
    doc = fitz.open(pdf_path)
    chunks = []

    try:
        for page_number in range(skip_pages, len(doc)):
            page_text = doc[page_number].get_text("text")
            if not page_text:
                continue

            paragraph = ''
            for raw_line in page_text.split('\n')[skip_lines:]:
                # Trailing whitespace would otherwise hide a line-final hyphen and
                # keep whitespace-only lines from acting as paragraph breaks.
                line = raw_line.rstrip()

                if not line:
                    if paragraph.strip():
                        chunks.append(paragraph.strip() + '\n\n')
                    paragraph = ''
                elif line.endswith('-'):
                    # Word hyphenated at the line break: drop the hyphen and join.
                    paragraph += line.rstrip('-')
                elif line.endswith('—'):
                    # Em dash is real punctuation: keep it, join without a space.
                    paragraph += line
                else:
                    paragraph += line + ' '

            # Flush whatever is left when the page ends.
            if paragraph.strip():
                chunks.append(paragraph.strip() + '\n\n')
    finally:
        # Release the document even if reading a page fails.
        doc.close()

    return ''.join(chunks)

def open_book(filename):
    """
    Reads a raw book text file and returns its full contents.

    :param filename: File name without the ``.txt`` extension; it is looked up under
        ``../../Resources/Raw/`` relative to the current working directory.
    :return: The file's contents, decoded as UTF-8.
    """
    raw_path = "../../Resources/Raw/" + filename + ".txt"
    with open(raw_path, 'r', encoding='utf-8') as source:
        return source.read()



def extract_narrative_content_from_epub(epub_path, skip_items=10):
    """
    Extracts paragraph text from an EPUB file, skipping its first document items.

    Only items of type ``ebooklib.ITEM_DOCUMENT`` are counted and read. From each
    remaining document, the text of every ``<p>`` element is kept unless it is
    empty, consists only of digits, or has three words or fewer.

    :param epub_path: Path to the EPUB file.
    :param skip_items: Number of initial document items to skip.
    :return: The kept paragraphs as one string, each followed by a blank line.
    """
    book = epub.read_epub(epub_path)
    kept_paragraphs = []
    item_count = 0

    for item in book.get_items():
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue

        item_count += 1
        if item_count <= skip_items:
            continue  # Skipping more initial non-narrative sections

        soup = BeautifulSoup(item.content, 'html.parser')
        for p in soup.find_all('p'):
            paragraph_text = p.get_text().strip()
            # Drop digit-only paragraphs (presumably page numbers) and fragments
            # of three words or fewer.
            if paragraph_text and not paragraph_text.isdigit() and len(paragraph_text.split()) > 3:
                kept_paragraphs.append(paragraph_text + '\n\n')

    # Return the collected text; without this the function would yield None.
    return ''.join(kept_paragraphs)

def save_as_txt(filename, text):
    """
    Writes text to a ``.txt`` file in the cleaned-resources folder.

    :param filename: File name without the ``.txt`` extension; the file is written
        under ``../../Resources/Cleaned/`` relative to the current working directory.
    :param text: Text to save; leading and trailing whitespace is removed first.
    """
    cleaned_path = "../../Resources/Cleaned/" + filename + ".txt"
    # Write mode replaces any existing file of the same name.
    with open(cleaned_path, 'w', encoding='utf-8') as target:
        target.write(text.strip())

In [38]:
# Folder holding the raw source documents. Every path below is this prefix plus
# a file name, so pointing the notebook at another checkout means changing only
# this one line.
RAW_DIR = r"C:\Users\dontb\01\001\Repos\Dying-Earth\Resources\Raw"

path_1984 = RAW_DIR + r"\1984.pdf"
path_androids = RAW_DIR + r"\androids.pdf"
path_stardust = RAW_DIR + r"\stardust.pdf"
path_tom = RAW_DIR + r"\tom_sawyer.pdf"
path_cosmos = RAW_DIR + r"\cosmos.pdf"
path_into = RAW_DIR + r"\into_thin_air.epub"

In [39]:
# Extract each book with its own skip settings: skip_pages drops that many leading
# pages, skip_lines drops that many leading lines from every remaining page.
txt_1984 = extract_text_from_pdf(path_1984, skip_pages=0, skip_lines=2)
txt_androids = extract_text_from_pdf(path_androids, skip_pages=2, skip_lines=0)
txt_stardust = extract_text_from_pdf(path_stardust, skip_pages=4, skip_lines=0)
txt_tom = extract_text_from_pdf(path_tom, skip_pages=1, skip_lines=0)
txt_cosmos = extract_text_from_pdf(path_cosmos, skip_pages=5, skip_lines=0)
# NOTE(review): extract_text_from_epub, as defined in the visible cells, is a
# placeholder whose extraction logic is elided and which returns an empty string,
# so txt_into ends up empty. extract_narrative_content_from_epub looks like the
# intended extractor — confirm before relying on this output.
txt_into = extract_text_from_epub(path_into, skip_items=10)

In [None]:
# Save every extracted text under its output name, in this order.
for _output_name, _book_text in (
    ("cosmos_cleaned", txt_cosmos),
    ("tom_sawyer_cleaned", txt_tom),
    ("into_thin_air_cleaned", txt_into),
    ("1984_cleaned", txt_1984),
    ("androids_cleaned", txt_androids),
    ("stardust_cleaned", txt_stardust),
):
    save_as_txt(_output_name, _book_text)