# In [3]:

import pdfplumber
from PyPDF2 import PdfReader, PdfWriter

import os
import re
import pdfplumber
from PyPDF2 import PdfReader, PdfWriter

def split_and_rename_pdf(file_path: str):
    """
    Splits a PDF into test and answer sections at the 'Answer Section'.
    After splitting, the original file is renamed with a prefix '_'.

    The split point is the first page whose extracted text contains the
    literal string 'Answer Section'. Pages before that page are written to
    '<name>_test.pdf'; that page and every page after it are written to
    '<name>_answ.pdf'. Both files are created next to the original.

    Parameters:
    file_path (str): The full path to the original PDF file.

    Returns:
    tuple: (test_pdf_path, answer_pdf_path, new_file_path), where
    new_file_path is the location of the renamed original.

    Raises:
    ValueError: If no page contains 'Answer Section', or if the marker is on
    the very first page (the test section would then be empty).
    """
    # Using pdfplumber to find the correct split index. The generator is
    # consumed lazily, so text extraction stops at the first matching page.
    # extract_text() may return None for a page without text, hence `or ""`.
    with pdfplumber.open(file_path) as pdf:
        split_index = next(
            (
                page_number
                for page_number, page in enumerate(pdf.pages)
                if "Answer Section" in (page.extract_text() or "")
            ),
            None,
        )

    if split_index is None:
        raise ValueError("The 'Answer Section' marker could not be found in the document.")
    if split_index == 0:
        # Splitting here would write a zero-page test PDF and still rename the
        # original; fail before touching the file system instead.
        raise ValueError(
            "The 'Answer Section' marker is on the first page; there is no test section to split off."
        )

    # Setup paths for the new PDF files
    directory = os.path.dirname(file_path)
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    test_pdf_path = os.path.join(directory, f"{base_name}_test.pdf")
    answer_pdf_path = os.path.join(directory, f"{base_name}_answ.pdf")
    new_file_path = os.path.join(directory, f"_{base_name}.pdf")

    # Load the PDF file with PyPDF2 for splitting
    reader = PdfReader(file_path)
    test_writer = PdfWriter()
    answer_writer = PdfWriter()

    # Pages before the marker page belong to the test; the marker page and
    # everything after it belong to the answers.
    for page_number, page in enumerate(reader.pages):
        target = test_writer if page_number < split_index else answer_writer
        target.add_page(page)

    # Save the split PDFs to files
    with open(test_pdf_path, "wb") as f:
        test_writer.write(f)
    with open(answer_pdf_path, "wb") as f:
        answer_writer.write(f)

    # Rename the original file only after both outputs were written.
    os.rename(file_path, new_file_path)

    print(f"Test section saved to: {test_pdf_path}")
    print(f"Answer section saved to: {answer_pdf_path}")
    print(f"Original file renamed to: {new_file_path}")
    return test_pdf_path, answer_pdf_path, new_file_path

def process_directory(directory_path, *, skip_processed=True):
    """
    Processes all PDF files in a given directory to split and rename them.

    Each file whose name ends in '.pdf' (case-insensitive) is passed to
    split_and_rename_pdf. A failure on one file is reported on stdout and
    does not stop the remaining files from being processed.

    Parameters:
    directory_path (str): The path to the directory containing PDF files.
    skip_processed (bool): When True (default), files that look like the
        result of an earlier run are skipped: names starting with '_'
        (renamed originals) and names ending in '_test' or '_answ' before
        the extension (generated sections). This makes it safe to run the
        function on the same directory more than once. Pass False to
        attempt every PDF regardless of its name.
    """
    # Sorted so that the processing order (and the printed log) is the same
    # on every run; os.listdir() returns entries in arbitrary order.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        stem = os.path.splitext(filename)[0]
        if skip_processed and (stem.startswith("_") or stem.endswith(("_test", "_answ"))):
            print(f"Skipped already-processed file: {filename}")
            continue

        file_path = os.path.join(directory_path, filename)
        try:
            split_and_rename_pdf(file_path)
        except Exception as e:
            # Best-effort batch: report the problem and move on to the next file.
            print(f"Failed to process file {filename}: {e}")
        else:
            print(f"Processed file: {filename}")


# Example usage
if __name__ == "__main__":
    # Single-file usage, kept for reference: point path_to_pdf_file at one PDF
    # and pass it to split_and_rename_pdf.
    # path_to_pdf_file = "./testData/0115ExamAI_EV.pdf"
    # split_and_rename_pdf(path_to_pdf_file)
    
    # Batch usage: run the split over every PDF in one folder. The path below is
    # an absolute, machine-specific location; change it to your own directory.
    directory_path = "/Users/markgyao/Downloads/CompactExamsAI-EV"
    process_directory(directory_path)



# Sample output captured from an earlier single-file run:
# Test section saved to: ./testData/0115ExamAI_EV_test.pdf
# Answer section saved to: ./testData/0115ExamAI_EV_answ.pdf
# Original file renamed to: ./testData/_0115ExamAI_EV.pdf
