In [None]:
# Imports

import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import re
from PyPDF2 import PdfReader

In [None]:
# Front matter of the "Into Thin Air" text (title page through the
# "INTRODUCTION" heading). remove_initial_text() searches for this exact
# string and keeps only what follows it, so the content below must stay
# character-for-character identical to the raw file (including its typos).
removal_text = """Into thin air
by
Jon Krakauer


Copyright (c) 1997 by Jon Krakauer

All rights reserved under International and Pan-American Copyright
Conventions.
Published in the United States by Villard Books, a division of Random
House, Inc New York, and simultaneously in Canada by Random House of
Canada Limited, Toronto.
ViLLard Books is a registered trademark of Random House, Inc. Portions
of this work were originally published in Outside.
Grateful acknowledgment is made to the following for permission to
reprint previously published material:

BATON WICKS PUBLICATIONS: Excerpts from Upon That Mountain by Eric
Shipton (Hodder, London, 1943).  This title is now collected in the
omnibus Eric Sbipton The Six Mountain Travel Books (Diadem, London, and
the Mountaineers, Seattle, 1995).  Reprinted by permission of Nick
Shipton and Baton Wicks Publications, Macclesfield, Cheshire, England.
HAYNES PUBLISHING: Excerpts from Everest by Walt Unsworth.  Published
by Oxford Illustrated Press, an imprint of Haynes Publishing,
Sparkford, Nr Yeovil, Somerset, BA22 7jj.  Reprinted by permission of
the author and publisher.
SIMON AND SCHUSTER AND A. P. WATT LTD: Six lines from 'The Second
Coming" by William Butler Yeats, from The Collected Works of W B.
Yeats, Volume 1: The Poems revised and edited by Richard J. Finneran.
Copyright 1924 by Macmillan Publishing Company.  Copyright renewed 1952
by Bertha Georgie Yeats.  Reprinted by permission of Simon and Schuster
and A. P. Watt Ltd.

on behalf of Michael Yeats.
For Linda; and in memory of Andy Harris, Doug Hansen, Rob Hall, Yasuko
Namba, Scott Fischer, Ngawang Topche Sherpa, Chen Yu-Nan, Bruce Herrod,
and Lopsang jangbu Sherpa

Library of Congress Cataloging-in-Publication Data Krakauer, Jon.

Into thin air: a personal account of the Mount Everest disaster Jon
Krakauer.

P. CM.

Includes bibliographical references.

ISBN 0-679-45752-6

1. Mountaineering accidents-Everest, Mount (China and Nepal).

2. Mount Everest Expedition (1996).  3. Krakauer, Jon.  I. Title.

GV199.44.E85K725 1997 796.5'2"-095496--dc2 i 96-30031 Random House
website address: http://www.randomhouse.com/

Printed in the United States of America on acid-free paper

Book design by Caroline Cunningham

Men play at tragedy because they do not believe in the reality of life
tragedy which is actually being staged in the civilized world.

Jose Ortega Gasset

INTRODUCTION"""

In [None]:
# Functions 

def open_book(filename):
    """Return the full contents of a raw book text file.

    Args:
        filename (str): Base name of the file, without the ``.txt``
            extension. It is looked up under ``../../Resources/Raw/``.

    Returns:
        str: The file contents, decoded as UTF-8.
    """
    source_path = "../../Resources/Raw/" + filename + ".txt"
    with open(source_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def remove_hyphenation(content):
    """Re-join words that were split by a hyphen followed by whitespace.

    Every ``word-<whitespace>word`` occurrence is collapsed into a single
    word; the whitespace may be spaces or line breaks. Hyphens that are
    directly followed by a word character (``self-made``) are left alone.

    Args:
        content (str): Text to repair.

    Returns:
        str: The text with split words glued back together.
    """
    split_word = re.compile(r'(\w+)-\s+(\w+)')
    # Keep both halves, dropping the hyphen and the whitespace between them.
    return split_word.sub(lambda found: found.group(1) + found.group(2), content)

def epub_loader(filepath):
    """Load an EPUB from the raw-resources folder and return its text.

    The text of every document item in the book is extracted with
    BeautifulSoup and the pieces are joined with newlines, in the order
    ``book.get_items()`` yields them.

    Args:
        filepath (str): File name of the EPUB, relative to
            ``../../Resources/Raw/``.

    Returns:
        str: The concatenated text of all document items, or an empty
        string when the book contains none.
    """
    # Read the EPUB file
    book = epub.read_epub(r"../../Resources/Raw/"+filepath)

    # Collect the text of *every* document item. Re-assigning a single
    # variable inside the loop would keep only the last document and fail
    # with an unbound local when the book has no document items at all.
    sections = []
    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            # Parse the item's HTML and keep only its visible text
            soup = BeautifulSoup(item.content, 'html.parser')
            sections.append(soup.get_text())

    return '\n'.join(sections)

def remove_page_markers(text):
    """Strip the TaleBooks page markers of the Tom Sawyer PDF from ``text``.

    Two kinds of marker are deleted, together with the whitespace that
    follows them:

    * ``www.TaleBooks.comPage  <n> ,  Adventures of Tom Sawyer, The - Mark
      Twain`` for any page number ``<n>``;
    * the bare ``Page  2 ,  Adventures of Tom Sawyer, The - Mark Twain``
      variant, which has no site prefix.

    Args:
        text (str): Text extracted from the PDF.

    Returns:
        str: The text with all such markers removed.
    """
    # Applied in this order: the generic marker first, then the special case.
    marker_patterns = (
        r"www\.TaleBooks\.comPage\s+\d+\s+,  Adventures of Tom Sawyer, The - Mark Twain\s+",
        r"Page\s+2\s+,  Adventures of Tom Sawyer, The - Mark Twain\s+",
    )
    for marker in marker_patterns:
        text = re.sub(marker, "", text)
    return text



def pdf_reader(filepath, pages_to_ignore):
    """Extract the text of a PDF, skipping its leading pages.

    Args:
        filepath (str): File name of the PDF, relative to
            ``../../Resources/Raw/``.
        pages_to_ignore (int): Index of the first page to read; pages
            before this index are skipped.

    Returns:
        str: The ``extract_text()`` output of every remaining page,
        concatenated in page order with no separator.
    """
    document = PdfReader(r"../../Resources/Raw/" + filepath)
    page_count = len(document.pages)

    # Pages are addressed by index so that the leading ones can be skipped.
    return "".join(
        document.pages[index].extract_text()
        for index in range(pages_to_ignore, page_count)
    )

def remove_initial_text(text, initial_text=None):
    """Drop everything up to and including a known block of front matter.

    Args:
        text (str): The full text to trim.
        initial_text (str | None): The block to look for. When omitted,
            the module-level ``removal_text`` constant is used.

    Returns:
        str: The text that follows the first occurrence of the block, with
        surrounding whitespace stripped. If the block does not occur,
        ``text`` is returned unchanged.
    """
    if initial_text is None:
        initial_text = removal_text

    # Test the result of find() itself: adding the marker length first would
    # hide the -1 "not found" value and silently chop off the start of texts
    # that do not contain the marker.
    start_index = text.find(initial_text)
    if start_index == -1:
        return text

    return text[start_index + len(initial_text):].strip()

def clean_text(text):
    """Normalise whitespace paragraph by paragraph.

    The text is cut into paragraphs at every run of three newlines. Each
    paragraph is stripped, and every internal run of whitespace is
    collapsed to a single space.

    Args:
        text (str): Raw extracted text.

    Returns:
        str: The cleaned paragraphs, one per line.
    """
    tidy_paragraphs = (
        re.sub(r'\s+', ' ', chunk.strip())
        for chunk in text.split('\n\n\n')
    )
    return '\n'.join(tidy_paragraphs)

# Spelled-out numbers accepted in a heading ("Chapter One", "Part Twenty-One").
_HEADING_NUMBER_WORDS = (
    "one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|"
    "thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|"
    "twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety"
)

# Roman numerals I..XCIX. The lookahead forces at least one numeral letter,
# because every part of the numeral expression is optional on its own.
_HEADING_ROMAN = r"(?=[IVXL])(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})"

# "CHAPTER"/"PART", whitespace, then a real number: digits, a Roman numeral
# or a number word. The trailing (?!\w) makes sure the number is a whole
# word, so prose such as "Part of the story" or "chapter in" never matches.
_HEADING_PATTERN = re.compile(
    r"^(?:CHAPTER|PART)\s+(?:\d+|"
    + _HEADING_ROMAN
    + r"|(?:%s)(?:[- ](?:%s))?" % (_HEADING_NUMBER_WORDS, _HEADING_NUMBER_WORDS)
    + r")(?!\w)",
    re.IGNORECASE,
)


def remove_chapter_headers_and_citations(text):
    """Drop chapter/part heading lines and citation-style lines.

    A line is removed when it

    * starts with ``CHAPTER <n>`` or ``PART <n>`` (case-insensitive), where
      ``<n>`` is a decimal number, a Roman numeral or a spelled-out number;
    * starts with ``-`` or ``*``.

    Lines that merely begin with the words "chapter" or "part" followed by
    ordinary prose are kept.

    Args:
        text (str): Text to filter.

    Returns:
        str: The remaining lines, joined with newlines.
    """
    kept_lines = [
        line
        for line in text.split('\n')
        if not _HEADING_PATTERN.match(line) and not line.startswith(('-', '*'))
    ]
    return '\n'.join(kept_lines)

def remove_unwanted_line_breaks(text):
    """
    Re-flow hard-wrapped text, keeping a line break only after '.' or ':'.

    Each input line is stripped and appended to the output. The separator
    placed before it is a newline when the output so far ends with a period
    or a colon, and a single space otherwise. Nothing is emitted for blank
    lines at the very start; later blank lines still contribute their
    separator, so they can leave extra spaces or a trailing newline behind.

    :param text: str - The text to be processed.
    :return: str - The processed text.
    """
    pieces = []
    tail = ''  # last character emitted so far; '' while nothing was emitted

    for raw_line in text.split('\n'):
        body = raw_line.strip()

        if not tail:
            # Nothing emitted yet: no separator needed
            piece = body
        elif tail in '.:':
            # The output ends a sentence or introduces something: keep the break
            piece = '\n' + body
        else:
            piece = ' ' + body

        if piece:
            pieces.append(piece)
            tail = piece[-1]

    return ''.join(pieces)



def remove_blank_lines(text):
    """Return ``text`` without its empty or whitespace-only lines.

    Args:
        text (str): Text whose lines are separated by ``\\n``.

    Returns:
        str: The remaining lines, unchanged, joined with newlines.
    """
    return '\n'.join(row for row in text.split('\n') if row.strip())

def remove_file_paths_from_content(content):
    """Drop every line that contains a ``file:///C|/....txt`` path.

    Args:
        content (str | iterable of str): Either the whole text, which is
            split into lines first, or an iterable of lines.

    Returns:
        str: The surviving lines joined with newlines.
    """
    path_marker = re.compile(r'file:///C\|/.+\.txt')

    # A string is processed line by line; anything else is taken as lines already.
    rows = content.splitlines() if isinstance(content, str) else content

    return '\n'.join(row for row in rows if path_marker.search(row) is None)

def remove_specific_patterns(content):
    """Delete "<digits> <UPPERCASE ...>" fragments from every line.

    A fragment is one or more digits, then whitespace, then a run of
    uppercase letters and whitespace, optionally followed by more digits
    and an apostrophe (e.g. ``12 THE KILLING MACHINE``).

    NOTE: the match is not anchored, so a fragment in the middle of a
    sentence is removed as well ("had 3 Apples" becomes "had pples").

    Args:
        content (str | iterable of str): Either the whole text, which is
            split into lines first, or an iterable of lines.

    Returns:
        str: The processed lines joined with newlines.
    """
    fragment = re.compile(r'\d+\s+[A-Z\s]+(?:\d+\'?)?')

    # A string is processed line by line; anything else is taken as lines already.
    rows = content.splitlines() if isinstance(content, str) else content

    return '\n'.join(fragment.sub('', row) for row in rows)

def save_as_txt(filename, text):
    """Write ``text`` to a UTF-8 file in the cleaned-resources folder.

    Args:
        filename (str): Base name of the output file, without the ``.txt``
            extension. It is written under ``../../Resources/Cleaned/``.
        text (str): Text to store; leading and trailing whitespace is
            stripped before writing.
    """
    target_path = "../../Resources/Cleaned/" + filename + ".txt"
    with open(target_path, mode='w', encoding='utf-8') as out:
        out.write(text.strip())
        
def remove_initial(text):
    """Drop the title block of "The Killing Machine" and all text before it.

    Args:
        text (str): The full text to trim.

    Returns:
        str: The text that follows the first occurrence of the title
        block, with surrounding whitespace stripped. If the title block
        does not occur, ``text`` is returned unchanged.
    """
    # Title block to look for (including the blank lines that follow it)
    removal_text = """The Killing Machine
By Jack Vance
Book 2 in the "Demon Prince" Series



"""
    # Test the result of find() itself: adding the block length first would
    # hide the -1 "not found" value and silently chop off the start of texts
    # that do not contain the title block.
    start_index = text.find(removal_text)
    if start_index == -1:
        return text

    return text[start_index + len(removal_text):].strip()
        

In [None]:
# Cosmos: read the PDF from page index 8 onward, drop heading/citation lines,
# then merge the hard-wrapped lines.
cosmos_text = remove_unwanted_line_breaks(
    remove_chapter_headers_and_citations(pdf_reader("cosmos.pdf", 8))
)

In [None]:
# Tom Sawyer: read the PDF from page index 1 onward, strip the TaleBooks page
# markers, drop heading/citation lines, then merge the hard-wrapped lines.
ts_text = remove_unwanted_line_breaks(
    remove_chapter_headers_and_citations(
        remove_page_markers(pdf_reader("tom_sawyer.pdf", 1))
    )
)

In [None]:
# 1984: read the PDF from page index 1 onward, drop heading/citation lines,
# then merge the hard-wrapped lines.
text_1984 = remove_unwanted_line_breaks(
    remove_chapter_headers_and_citations(pdf_reader("1984.pdf", 1))
)

In [None]:
# Load the raw text of into_thin_air.txt from the raw-resources folder.
into_air_text = open_book("into_thin_air")

In [None]:
# Load the raw text of killing_machine.txt from the raw-resources folder.
killing_machine_text = open_book("killing_machine")

In [None]:
# The Killing Machine clean-up, innermost call first:
#   1. drop lines containing file:///C|/...txt paths
#   2. cut the title block and everything before it
#   3. delete "<digits> <UPPERCASE>" fragments
#   4. merge hard-wrapped lines
#   5. re-join words split by a hyphen (after step 4, because the line
#      breaks have become spaces by then)
killing_machine_text = remove_hyphenation(
    remove_unwanted_line_breaks(
        remove_specific_patterns(
            remove_initial(
                remove_file_paths_from_content(killing_machine_text)
            )
        )
    )
)

In [None]:
# Into Thin Air: cut the front matter, then merge the hard-wrapped lines.
into_air_text = remove_unwanted_line_breaks(remove_initial_text(into_air_text))

In [None]:
# Stardust: read the PDF from page index 4 onward (the first four pages are skipped).
stardust_text = pdf_reader("stardust.pdf", 4)

In [None]:
# Stardust: strip page markers, drop heading/citation lines, then merge the
# hard-wrapped lines.
# NOTE(review): remove_page_markers only targets the Tom Sawyer / TaleBooks
# marker text, so that step looks like a no-op for this book - confirm.
stardust_text = remove_unwanted_line_breaks(
    remove_chapter_headers_and_citations(remove_page_markers(stardust_text))
)

In [None]:
# Print the cleaned Stardust text for visual inspection.
print(stardust_text)

In [None]:
# Androids: read the PDF from page index 3 onward (the first three pages are skipped).
androids_text = pdf_reader("androids.pdf", 3)

In [None]:
# Androids: strip page markers, drop heading/citation lines, then merge the
# hard-wrapped lines.
# NOTE(review): remove_page_markers only targets the Tom Sawyer / TaleBooks
# marker text, so that step looks like a no-op for this book - confirm.
androids_text = remove_unwanted_line_breaks(
    remove_chapter_headers_and_citations(remove_page_markers(androids_text))
)


In [None]:
# Write every cleaned book to the cleaned-resources folder as <name>.txt.
save_as_txt(filename="cosmos_cleaned", text=cosmos_text)
save_as_txt(filename="tom_sawyer_cleaned", text=ts_text)
save_as_txt(filename="into_thin_air_cleaned", text=into_air_text)
save_as_txt(filename="1984_cleaned", text=text_1984)
save_as_txt(filename="killing_machine_cleaned", text=killing_machine_text)
save_as_txt(filename="androids_cleaned", text=androids_text)
save_as_txt(filename="stardust_cleaned", text=stardust_text)