In [316]:
import pdfplumber

In [317]:
# Relative path of the PDF that the cells below parse.
fname = 'Data/EntrepreneurshipTest#7.pdf'

# Opened once; later cells index into `pdf.pages`.
# NOTE(review): nothing in the visible cells closes this handle.
pdf = pdfplumber.open(fname)

In [318]:
import re
# Line classifiers used while walking the text of a page.
question_start = re.compile(r'^\d+\.')     # digits followed by a dot, e.g. "12."
choice_start   = re.compile(r'^[A-Z]\.')   # one capital letter followed by a dot, e.g. "A."
choice_split   = re.compile(r' [A-Z]\. ')  # not referenced by get_questions; kept for other cells


def _split_choice_line(line):
    """Split one line of answer choices into its individual choices.

    A line may hold two choices side by side ("A. ... C. ..." or
    "B. ... D. ..."). The line is cut just before the first ' C. ' or,
    when there is none, just before the first ' D. '. A line with neither
    marker is returned unchanged as a one-item list.
    """
    for marker in (' C. ', ' D. '):
        cut = line.find(marker)
        # The line starts with a letter, so the marker (which starts with a
        # space) can never sit at index 0; "> 0" therefore means "found".
        if cut > 0:
            # cut + 1 skips the space in front of the second choice's letter.
            return [line[:cut], line[cut + 1:]]
    return [line]


def get_questions(page):
    """Parse the multiple-choice questions found on one page.

    Args:
        page: Object exposing ``extract_text()`` (a pdfplumber page in this
            notebook). The method may return ``None`` or an empty string
            when the page carries no text; both yield an empty result.

    Returns:
        list: One ``[question_text, choices]`` pair per question, in the
        order they appear. ``question_text`` keeps its leading number and
        has wrapped lines joined with single spaces. ``choices`` is the
        sorted list of choice strings, each still carrying its "A. "-style
        label.

    Lines before the first question, and lines after a choice line that
    match neither pattern, are ignored.
    """
    questions            = []
    current_question     = None
    question_in_progress = False
    choices              = []

    # Guard against extract_text() handing back None for a page without text.
    text = page.extract_text() or ""

    for line in text.split("\n"):
        if question_start.match(line):
            # A new question begins: store the previous one first.
            if current_question:
                questions.append([current_question, sorted(choices)])
            current_question = line
            question_in_progress = True
            choices = []

        elif choice_start.match(line):
            # Once choices begin, the question text is complete.
            question_in_progress = False
            choices.extend(_split_choice_line(line))

        elif question_in_progress:
            # Wrapped continuation of the question text.
            current_question = f"{current_question} {line}"

    # The last question on the page has no following question to flush it.
    if current_question:
        questions.append([current_question, sorted(choices)])

    return questions

# Gather the questions from the pages at indexes 1 through 9.
all_questions = []
for page in range(1, 10):
    all_questions += get_questions(pdf.pages[page])



In [319]:
# Sanity check: how many questions were parsed (the saved output shows 100).
len(all_questions)

100

In [320]:
# An answer-key line is exactly "<digits>. <capital letter>", e.g. "17. C".
key_start      = re.compile(r'^\d+\. [A-Z]$')

def get_answers(page):
    """Collect the answer-key lines found on one page.

    Args:
        page: Object exposing ``extract_text()`` (a pdfplumber page in this
            notebook). The method may return ``None`` or an empty string
            when the page carries no text; both yield an empty list.

    Returns:
        list[str]: Every line of the page that consists solely of a number,
        a dot, a space and one capital letter, in page order and unmodified.
    """
    # Guard against extract_text() handing back None for a page without text.
    text = page.extract_text() or ""
    return [line for line in text.split("\n") if key_start.match(line)]

# Gather the answer-key lines from the pages at indexes 10 through 29.
all_answers = []
for page in range(10, 30):
    all_answers += get_answers(pdf.pages[page])


In [321]:
# Sanity check: how many answer-key lines were found (the saved output shows 100).
len(all_answers)

100

# Create the CSV

* Column A is the type of question: MC (multiple choice) and MR (multiple response). For True/False questions, use MC.
* Column B is not used but must be there.
* Column C is the point value of the question. It can be any value from 0 to 100, with up to two decimal places (e.g. 3.33).
* Column D is the question body. 
* Column E is the correct answer. The numbers 1-5 each correspond to one of the possible answers listed in columns F-J: use 1 to indicate a, 2 to indicate b, 3 to indicate c, 4 to indicate d, and 5 to indicate e. For True/False questions, use 1 for True and 0 for False. Clear any unused cells.
* Columns F-J are the possible answer choices. You can have 2 or more. 

In [322]:
def make_row(question_and_choices, answer):
    """Build one CSV row for a multiple-choice question.

    Args:
        question_and_choices: ``[question, choices]`` pair. ``question``
            starts with its number followed by a space ("12. What ...").
            ``choices`` is a list of strings that each start with a
            three-character label such as "A. ".
        answer: Answer-key string whose last character is the letter of the
            correct choice, e.g. "12. C".

    Returns:
        list: ``['MC', '', '1', question_text, answer_number, choice, ...]``
        with one trailing entry per choice (at most five, labels removed).
        A four-choice question yields nine entries.

    Raises:
        KeyError: if the answer letter is not one of A-E.
        IndexError: if ``question`` contains no space.
    """
    question, choices = question_and_choices

    # Answer letters map to 1-based positions; five answer columns exist.
    numerical_answers = {
        'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5
    }
    max_choices = len(numerical_answers)

    letter_answer = answer[-1]
    number_answer = numerical_answers[letter_answer]

    # Drop the number at the start of the question.
    question = question.split(" ", 1)[1]

    # Strip the "A. " label from however many choices were parsed, instead
    # of assuming there are exactly four.
    choice_texts = [choice[3:] for choice in choices[:max_choices]]

    return ['MC', '', '1', question, number_answer, *choice_texts]

In [323]:
# Questions and answers are matched purely by position, so the two lists
# must be the same length; stop with a clear message when they are not.
if len(all_questions) != len(all_answers):
    raise ValueError(
        f"Parsed {len(all_questions)} questions but {len(all_answers)} answers; "
        "cannot pair them by position."
    )

csv_rows = [
    make_row(question_and_choices, answer)
    for question_and_choices, answer in zip(all_questions, all_answers)
]

# Write the results as CSV

In [324]:
import csv

# Output file, written to the current working directory.
file_name = 'EntrepreneurshipTest#7.csv'

# newline='' is passed so the csv writer controls the line endings itself.
with open(file_name, mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    for csv_row in csv_rows:
        csv_writer.writerow(csv_row)
