In [33]:
# Imports

import re
from PyPDF2 import PdfReader

In [40]:
# Functions

def pdf_reader(filepath):
    """
    Extract the text of a PDF while leaving out its first two pages.

    Args:
        filepath (str): File name of the PDF, appended to the
            "../../Resources/Raw/" directory prefix.

    Returns:
        str: The text of the third page onward, concatenated in page order
            with nothing inserted between pages. An empty string when the
            PDF has two pages or fewer.
    """
    pdf = PdfReader(r"../../Resources/Raw/" + filepath)
    page_count = len(pdf.pages)

    # Indices 0 and 1 are the two leading pages that get skipped.
    return "".join(
        pdf.pages[index].extract_text() for index in range(2, page_count)
    )

def ignore_watermark(text):
    """
    Strip every occurrence of the fixed watermark block from the text.

    The watermark is matched literally: "Click here to buy" followed by the
    characters of "ABBYY PDF Transformer 2.0 www.ABBYY.com" (without the
    spaces), one character per line. Text that does not contain this exact
    sequence is returned unchanged.

    Args:
        text (str): The text to clean.

    Returns:
        str: The text with all copies of the watermark deleted.
    """
    # The line breaks inside the literal are part of the match.
    stamp = """Click here to buyA
B
B
Y
Y
P
D
F
T
r
a
n
s
f
o
r
m
e
r
2
.
0
w
w
w
.
A
B
B
Y
Y
.
c
o
m"""

    return text.replace(stamp, '')

def remove_page_and_chapter_num(text):
    """
    Delete page-number and chapter-heading fragments that start a line.

    At the start of each line the following forms are tried in order and the
    first one that matches is removed:

    1. digits followed only by optional whitespace up to an end of line;
    2. digits, whitespace, then upper-case letters/whitespace up to an end
       of line;
    3. digits, whitespace, upper-case words, a colon, whitespace, and more
       upper-case words up to an end of line;
    4. otherwise, just the leading run of digits.

    Only the matched characters are deleted, so a removed heading usually
    leaves its line break behind as an empty line. Because whitespace in the
    pattern includes newlines, a single match can extend over several lines.

    Args:
        text (str): The text to clean.

    Returns:
        str: The text with the matched fragments removed.
    """
    heading_re = re.compile(
        r'^\d+\s*$|^\d+\s+[A-Z\s]+$|^\d+\s+[A-Z\s]+:\s+[A-Z\s]+$|^\d+',
        flags=re.MULTILINE,
    )
    return heading_re.sub('', text)

def remove_unwanted_line_breaks(text):
    """
    Turn every line break that does not directly follow a period into a space.

    A newline is kept only when the character immediately before it is ".";
    a newline at the very start of the text, or one preceded by any other
    character (including a space or another newline), becomes a single space.

    Args:
        text (str): The text to process.

    Returns:
        str: The text with the unwanted line breaks replaced by spaces.
    """
    # Negative lookbehind: match "\n" unless the previous character is ".".
    soft_break = re.compile(r'(?<!\.)\n')
    return soft_break.sub(' ', text)


def remove_blank_lines(text):
    """
    Drop every line that is empty or contains only whitespace.

    Lines are delimited by "\\n". Lines that are kept are left untouched,
    including any leading or trailing whitespace they carry.

    Args:
        text (str): The text to filter.

    Returns:
        str: The remaining lines joined back together with "\\n".
    """
    # A line survives only if something is left after stripping whitespace.
    kept_rows = [row for row in text.split('\n') if row.strip()]
    return '\n'.join(kept_rows)

def save_as_txt(filename, text):
    """
    Write the text to a UTF-8 ".txt" file in the cleaned-resources folder.

    The file is created at "../../Resources/Cleaned/<filename>.txt" and is
    overwritten if it already exists. Leading and trailing whitespace is
    stripped from the text before it is written.

    Args:
        filename (str): Base name of the output file, without the ".txt"
            extension.
        text (str): The text to store.
    """
    destination = "../../Resources/Cleaned/" + filename + ".txt"
    with open(destination, 'w', encoding='utf-8') as out:
        out.write(text.strip())

In [41]:
# Cleaning pipeline for "dying_earth3.pdf". The order of the steps matters:
# the watermark literal and the page/chapter pattern both depend on the
# original line breaks, so they run before line breaks are folded into spaces.
de_3_text = pdf_reader("dying_earth3.pdf")
# Delete the multi-line watermark while its line breaks are still intact.
de_3_text = ignore_watermark(de_3_text)
# Strip page numbers / chapter headings, which are matched at line starts.
de_3_text = remove_page_and_chapter_num(de_3_text)
# Join lines, keeping only the breaks that directly follow a period.
de_3_text = remove_unwanted_line_breaks(de_3_text)
# Finally drop lines that are empty or whitespace-only.
de_3_text = remove_blank_lines(de_3_text)


In [None]:
# Bare expression: in a notebook cell this displays the cleaned string's
# value (line breaks shown as escape sequences) for a quick inspection.
de_3_text

In [None]:
# Print the cleaned text so its line breaks are rendered as actual lines.
print(de_3_text)

In [None]:
# Persist the result as "../../Resources/Cleaned/dying_earth3_cleaned.txt".
save_as_txt("dying_earth3_cleaned", de_3_text)