In [9]:
# Imports

import re
from PyPDF2 import PdfReader

In [10]:
# Functions

def pdf_reader(filepath, skip_pages=2):
    """
    Reads a PDF file and extracts the text of every page after the leading
    ``skip_pages`` pages.

    The file is opened from the ``../../Resources/Raw/`` directory (the given
    ``filepath`` is appended to that prefix). Page texts are concatenated
    directly; no separator is inserted between pages.

    Args:
        filepath (str): File name (or relative path) of the PDF inside the
            raw-resources directory.
        skip_pages (int): Number of leading pages to ignore. Defaults to 2.
            Negative values are treated as 0.

    Returns:
        str: The concatenated text of the remaining pages; an empty string if
            the document has no more than ``skip_pages`` pages.
    """
    # Open the PDF located under the raw-resources directory
    reader = PdfReader(r"../../Resources/Raw/" + filepath)

    # Clamp so a negative value cannot turn into indexing from the end
    first_page = max(skip_pages, 0)
    page_count = len(reader.pages)

    # Build the result in one pass instead of growing a string page by page
    return "".join(
        reader.pages[page_number].extract_text()
        for page_number in range(first_page, page_count)
    )

def remove_page_and_chapter_num(text):
    """
    Strips page-number and chapter-heading fragments from extracted text.

    Working in multi-line mode, the following line-anchored matches are
    deleted (alternatives are tried in this order):
      1. digits followed only by whitespace up to a line end;
      2. digits, whitespace, then only capitals/whitespace up to a line end;
      3. as (2) but with a colon-separated second run of capitals/whitespace;
      4. any run of digits at the start of a line.

    Only the matched characters are removed; a line break that is not part of
    a match stays in place, so a removed line leaves an empty line behind.

    Args:
        text (str): The text to clean.

    Returns:
        str: The text with the matched fragments removed.
    """
    # Note: \s also matches '\n', so a match may extend across line breaks.
    page_or_heading = r'^\d+\s*$|^\d+\s+[A-Z\s]+$|^\d+\s+[A-Z\s]+:\s+[A-Z\s]+$|^\d+'

    # Substitute every match with nothing
    return re.sub(page_or_heading, '', text, flags=re.MULTILINE)

def remove_unwanted_line_breaks(text):
    """
    Joins hard-wrapped lines into paragraphs, keeping a line break only after a
    line that ends with a period.

    Every line is stripped of surrounding whitespace and blank lines are
    dropped. The result therefore never starts with a separator, and an empty
    line can neither add an extra space inside a paragraph nor leave a space
    at the start of the next one.

    :param text: str - The text to be processed.
    :return: str - The processed text; empty if ``text`` has no non-blank lines.
    """
    # Alternating sequence: line, separator, line, separator, ..., line
    pieces = []

    for raw_line in text.split('\n'):
        # Strip leading and trailing whitespace from the line
        trimmed_line = raw_line.strip()

        # Blank lines carry no content; skipping them avoids stray separators
        if not trimmed_line:
            continue

        if pieces:
            # pieces[-1] is always the previous non-blank line here: keep the
            # break if it ended with a period, otherwise join with a space
            pieces.append('\n' if pieces[-1].endswith('.') else ' ')

        pieces.append(trimmed_line)

    return ''.join(pieces)

def remove_single_line_roman_numerals(text):
    """
    Removes lines whose only content is a well-formed Roman numeral.

    A line is dropped when, after stripping surrounding whitespace, it is a
    standard upper-case Roman numeral (I to MMMCMXCIX). Lines that merely
    consist of the letters I, V, X, L, C, D, M without forming a valid numeral
    (for example the words "CIVIL" or "MILD") are kept, as are empty lines.

    :param text: str - The text to be processed.
    :return: str - The processed text.
    """
    # Strict numeral grammar: thousands, hundreds, tens, units. Every group may
    # be empty, so the lookahead is what rejects an empty line.
    roman_numeral_pattern = (
        r'(?=[IVXLCDM])M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})'
    )

    # Keep every line that is not, as a whole, a Roman numeral
    filtered_lines = [
        line
        for line in text.split('\n')
        if not re.fullmatch(roman_numeral_pattern, line.strip())
    ]

    # Join the remaining lines back into a single string
    return '\n'.join(filtered_lines)

def save_as_txt(filename, text):
    """
    Writes text to a UTF-8 ``.txt`` file in the cleaned-resources directory.

    The target is ``../../Resources/Cleaned/<filename>.txt``; an existing file
    is overwritten. Leading and trailing whitespace is stripped from the text
    before it is written.

    Args:
        filename (str): Base name of the output file, without extension.
        text (str): The text to write.
    """
    # Assemble the destination path from the fixed directory, name and suffix
    target_path = "".join(("../../Resources/Cleaned/", filename, ".txt"))

    with open(target_path, 'w', encoding='utf-8') as out_file:
        # Surrounding whitespace is dropped at write time
        out_file.write(text.strip())

In [11]:
# Cleaning pipeline, innermost call first: read the PDF, drop page/chapter
# numbers, drop Roman-numeral-only lines, then merge wrapped lines.
de_2_text = remove_unwanted_line_breaks(
    remove_single_line_roman_numerals(
        remove_page_and_chapter_num(
            pdf_reader("dying_earth2.pdf")
        )
    )
)

In [12]:
# Show only the opening of the cleaned text as a sanity check; printing the
# whole book floods the notebook output and bloats the saved file.
PREVIEW_CHARS = 1000
print(de_2_text[:PREVIEW_CHARS])

 The Overworld! ON THE HEIGHTS above the river Xzan, at the site of certain ancient ruins, Iucounu the Laughing Magician had built a manse to his private taste: an eccentric structure of steep gables, balconies, sky-walks, cupolas, together with three spiral green glass towers through which the red sunlight shone in twisted glints and peculiar colors.
Behind the manse and across the valley, low hills rolled away like dunes to the limit of vision. The sun projected shifting crescents of black shadow; otherwise the hills were unmarked, empty, solitary. The Xzan, rising in the Old Forest to the east of Almery, passed below, then three leagues to the west made junction with the Scaum. Here was Azenomei, a town old beyond memory, notable now only for its fair, which attracted folk from all the region. At Azenomei Fair Cugel had established a booth for the sale of talismans.
Cugel was a man of many capabilities, with a disposition at once flexible and pertinacious. He was long of leg, deft o

In [None]:
# Persist the cleaned text under the cleaned-resources directory
save_as_txt(filename="dying_earth2_cleaned", text=de_2_text)