In [None]:

import pdfplumber
from PyPDF2 import PdfReader, PdfWriter

import os
import re
import pdfplumber
from PyPDF2 import PdfReader, PdfWriter

def split_and_rename_pdf(file_path):
    """
    Split an exam PDF at its 'Answer Section' page and archive the original.

    The first page whose extracted text contains 'Answer Section' marks the
    start of the answers. Pages before it are written to '<name>_test.pdf',
    that page and every page after it to '<name>_answ.pdf' (both next to the
    source file), and the source file is then renamed to '_<name>.pdf'.

    Parameters:
    file_path (str): The full path to the original PDF file.

    Returns:
    tuple: (test_pdf_path, answer_pdf_path, new_file_path)

    Raises:
    ValueError: If no page contains the 'Answer Section' marker.
    """
    # pdfplumber is used only for text extraction, to locate the split page
    with pdfplumber.open(file_path) as pdf:
        split_index = next(
            (
                page_number
                for page_number, page in enumerate(pdf.pages)
                if "Answer Section" in (page.extract_text() or "")
            ),
            None,
        )

    if split_index is None:
        raise ValueError("The 'Answer Section' marker could not be found in the document.")

    # PyPDF2 does the actual page copying
    reader = PdfReader(file_path)

    # Output files live alongside the source and share its stem
    directory, file_name = os.path.split(file_path)
    base_name = os.path.splitext(file_name)[0]
    test_pdf_path = os.path.join(directory, f"{base_name}_test.pdf")
    answer_pdf_path = os.path.join(directory, f"{base_name}_answ.pdf")
    new_file_path = os.path.join(directory, f"_{base_name}.pdf")

    # Route each page to the writer for its section
    test_writer, answer_writer = PdfWriter(), PdfWriter()
    for page_number, page in enumerate(reader.pages):
        destination = test_writer if page_number < split_index else answer_writer
        destination.add_page(page)

    # Test section is written first, then the answer section
    outputs = ((test_writer, test_pdf_path), (answer_writer, answer_pdf_path))
    for writer, output_path in outputs:
        with open(output_path, "wb") as output_file:
            writer.write(output_file)

    # Archive the source under a '_' prefix once both parts are on disk
    os.rename(file_path, new_file_path)

    print(f"Test section saved to: {test_pdf_path}")
    print(f"Answer section saved to: {answer_pdf_path}")
    print(f"Original file renamed to: {new_file_path}")
    return test_pdf_path, answer_pdf_path, new_file_path

def process_directory(directory_path):
    """
    Split and rename every not-yet-processed PDF found directly in a directory.

    Files that look like the product of an earlier run are skipped so the
    function can be re-run on the same directory without splitting its own
    outputs again: names starting with '_' (archived originals) and names
    ending in '_test.pdf' or '_answ.pdf' (split results).

    A failure on one file is reported with print and does not stop the
    remaining files from being processed.

    Parameters:
    directory_path (str): The path to the directory containing PDF files.
    """
    generated_suffixes = ("_test.pdf", "_answ.pdf")

    # Sorted so the processing order does not depend on the filesystem
    for filename in sorted(os.listdir(directory_path)):
        lowered = filename.lower()
        if not lowered.endswith(".pdf"):
            continue
        # Skip archived originals and previously generated split files
        if filename.startswith("_") or lowered.endswith(generated_suffixes):
            continue

        file_path = os.path.join(directory_path, filename)
        try:
            split_and_rename_pdf(file_path)
            print(f"Processed file: {filename}")
        except Exception as e:
            # Best-effort batch: report the problem and move on to the next file
            print(f"Failed to process file {filename}: {e}")


# Example usage
if __name__ == "__main__":
    # Single-file usage (disabled): point path_to_pdf_file at one exam PDF
    # and call split_and_rename_pdf on it directly.
    # path_to_pdf_file = "./testData/0115ExamAI_EV.pdf"
    # split_and_rename_pdf(path_to_pdf_file)

    # Batch usage: split and rename every PDF found in this directory.
    # NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
    directory_path = "/Users/markgyao/Downloads/test01"
    process_directory(directory_path)



In [12]:

import json
import re
import json
import PyPDF2

def parse_exam_answers0(pdf_path):
    """
    Extract answer-key records from a PDF and save them as '<pdf stem>_0.json'.

    The text of every page is concatenated (one newline after each page) and
    scanned for records of the form
    '<number> ANS: ... PTS: <digits> REF: <word> NAT: <token>'.
    Each record is stored under its question number with the string fields
    ANS, PTS, REF and NAT.

    Parameters:
    pdf_path (str): Path to the answer-section PDF.

    Returns:
    str: The JSON document (4-space indent) that was written to disk.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    full_text = "".join(page.extract_text() + "\n" for page in reader.pages)

    # DOTALL lets the lazy ANS capture run across line breaks
    record_re = re.compile(
        r'(\d+)\s+ANS:\s*(.*?)\s*PTS:\s*(\d+)\s*REF:\s*(\w+)\s*NAT:\s*(\S+)',
        re.DOTALL,
    )

    # A repeated question number keeps its last occurrence
    records = {
        found.group(1): {
            "ANS": found.group(2).strip(),
            "PTS": found.group(3).strip(),
            "REF": found.group(4).strip(),
            "NAT": found.group(5).strip(),
        }
        for found in record_re.finditer(full_text)
    }

    serialized = json.dumps(records, indent=4)

    # The JSON file sits next to the PDF, with '_0.json' replacing the extension
    output_path = os.path.splitext(pdf_path)[0] + '_0.json'
    with open(output_path, 'w') as output_file:
        output_file.write(serialized)

    return serialized


def normalize_text(text):
    """
    Hook for cleaning extracted PDF text before it is parsed.

    Currently a pass-through: the whitespace-collapsing step below is
    commented out, so the input is returned unchanged.
    """
    # Disabled: collapse runs of whitespace (including line breaks) into one space
    # text = re.sub(r'\s+', ' ', text)
    return text

def parse_exam_answers1(pdf_path):
    """
    Extract answer-key records from a PDF and save them as '<pdf stem>_1.json'.

    The text of every page is concatenated (one newline after each page),
    passed through normalize_text, and scanned for records of the form
    '<number> ANS: <answer> PTS: <digits> REF: <token> NAT: <standard>'.
    Fields may be separated by any whitespace, including line breaks.
    Each record is stored under its question number with the string fields
    ANS, PTS, REF and NAT.

    Unlike parse_exam_answers0, the JSON keeps non-ASCII characters as-is
    (ensure_ascii=False); the file is therefore written explicitly as UTF-8
    so the result does not depend on the platform's default encoding.

    Parameters:
    pdf_path (str): Path to the answer-section PDF.

    Returns:
    str: The JSON document (4-space indent) that was written to disk.
    """
    # Read the PDF content; `or ""` keeps a page with no extractable text
    # from breaking the concatenation
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    pdf_content = "".join(
        (page.extract_text() or "") + "\n" for page in pdf_reader.pages
    )

    # Normalize the extracted text
    pdf_content = normalize_text(pdf_content)

    # The answer must start with a non-whitespace character; DOTALL lets it
    # continue across line breaks until the next 'PTS:' label
    pattern = re.compile(
        r'(\d+)\s+ANS:\s+(\S.*?)\s+PTS:\s+(\d+)\s+REF:\s+(\S+)\s+NAT:\s+([A-Z0-9.]+)',
        re.DOTALL,
    )

    # A repeated question number keeps its last occurrence
    data = {}
    for match in pattern.finditer(pdf_content):
        q_id, ans, pts, ref, nat = (field.strip() for field in match.groups())
        data[q_id] = {
            "ANS": ans,
            "PTS": pts,
            "REF": ref,
            "NAT": nat
        }

    # Convert to JSON, keeping non-ASCII characters readable
    json_data = json.dumps(data, indent=4, ensure_ascii=False)

    # Save JSON next to the PDF; explicit UTF-8 because the text may be non-ASCII
    json_path = os.path.splitext(pdf_path)[0] + '_1.json'
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json_file.write(json_data)

    return json_data

r"""
To deal with the situation where the ANS, PTS, REF, and NAT fields for each question might not all be on the same line, we can adjust our approach to ensure that our regex pattern is able to capture multiline data.

Adjusting the Regular Expression
We'll use the re.DOTALL flag, which allows the . character in the regex pattern to match newline characters as well. This way, our regex pattern can capture fields that span multiple lines. Additionally, we'll make sure our pattern is flexible enough to match varying amounts of whitespace and the possibility of fields appearing on separate lines.

Detailed Pattern Explanation
Here's the regex pattern and a detailed explanation of how it works:

The pattern used by parse_exam_answers1:
pattern = re.compile(r'(\d+)\s+ANS:\s+(\S.*?)\s+PTS:\s+(\d+)\s+REF:\s+(\S+)\s+NAT:\s+([A-Z0-9.]+)', re.DOTALL)
(\d+): Captures the question number, which consists of one or more digits.
\s+: Matches one or more whitespace characters, including newlines.
ANS:\s+: Matches the literal string "ANS:" followed by one or more whitespace characters.
(\S.*?): Non-greedy match that starts at a non-whitespace character and may span newlines (due to re.DOTALL), capturing the answer.
\s+PTS:\s+: Matches one or more whitespace characters, "PTS:", and one or more whitespace characters.
(\d+): Captures the points, which consist of one or more digits.
\s+REF:\s+: Matches one or more whitespace characters, "REF:", and one or more whitespace characters.
(\S+): Captures the reference, which consists of one or more non-whitespace characters.
\s+NAT:\s+: Matches one or more whitespace characters, "NAT:", and one or more whitespace characters.
([A-Z0-9.]+): Captures the national standard, which consists of uppercase letters, digits, or periods.
"""

'\nTo deal with the situation where the ANS, PTS, REF, and NAT fields for each question might not all be on the same line, we can adjust our approach to ensure that our regex pattern is able to capture multiline data.\n\nAdjusting the Regular Expression\nWe\'ll use the re.DOTALL flag, which allows the . character in the regex pattern to match newline characters as well. This way, our regex pattern can capture fields that span multiple lines. Additionally, we\'ll make sure our pattern is flexible enough to match varying amounts of whitespace and the possibility of fields appearing on separate lines.\n\nDetailed Pattern Explanation\nHere\'s the regex pattern and a detailed explanation of how it works:\n\npython\nCopy code\npattern = re.compile(r\'(\\d+)\\s+ANS:\\s+(.*?)\\s+PTS:\\s+(\\d+)\\s+REF:\\s+(\\S+)\\s+NAT:\\s+([A-Z0-9.]+)\', re.DOTALL)\n(\\d+): Captures the question number, which consists of one or more digits.\n\\s+: Matches one or more whitespace characters, including newlines.\

In [14]:

# Answer-section PDF to parse (relative to the notebook's working directory).
pdf_file_path = './testData/0115ExamAI_EV_answ.pdf'

# Alternative parser, kept for comparison; it writes '<stem>_0.json' instead.
#answer_data = parse_exam_answers0(pdf_file_path)
# Parse the answer key; also writes '<stem>_1.json' next to the PDF and
# returns the same JSON text.
answer_data = parse_exam_answers1(pdf_file_path)
#print(answer_data)