In [28]:
import os
import re
import PyPDF2

In [40]:
# One complete word-list entry, captured as four groups:
#   1. the entry number (digits),
#   2. the Chinese characters (plus a few punctuation/superscript marks and newlines),
#   3. the pinyin: words containing a tone-marked vowel, or one of the listed
#      unmarked syllables, each followed by whitespace or separator punctuation,
#   4. everything after that up to the next digit (the definition).
_ENTRY_PATTERN = re.compile(
    r"(\d+)\s+([\u4e00-\u9fff｜（）、〇¹²…\n]+)\s+((?:(?:(?:\w*[āáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ][\w']*)|la|le|zi|ma|ba|de|di|ge|jie|men|ren|ne|ya|wa|zhe|¹|²|…)[\s｜（）·]+)+)([^\d]+)"
)
# A line starts a new entry when it has digits followed (later) by a CJK character.
_ENTRY_START = re.compile(r"\d+.+[\u4e00-\u9fff]")
_CJK_CHAR = re.compile(r"[\u4e00-\u9fff]")


def _format_entry(entry: str):
    """Return the output line for one accumulated entry, or None to write nothing."""
    match = _ENTRY_PATTERN.search(entry)
    if match:
        # Entries may span several source lines; flatten each field onto one line.
        fields = [field.replace("\n", " ").strip() for field in match.groups()]
        return "[-]".join(fields) + "\n"
    if _CJK_CHAR.search(entry):
        # Contains Chinese text but did not parse: keep it, flagged for manual review.
        return f"***{entry}\n"
    return None


def process_pdf(pdf_reader: "PyPDF2.PdfReader", out_filename: str, verbose=False):
    """Extract word-list entries from a PDF and append them to a text file.

    Each page is processed independently. Lines are accumulated into the
    current entry until a line that starts the next entry is seen; only then
    is the accumulated entry parsed and written, because an entry can span
    several lines.

    Parsed entries are written as ``num[-]chars[-]pinyin[-]definition``.
    Entries that contain Chinese characters but do not match the entry
    pattern are written verbatim with a ``***`` prefix. Anything else
    (e.g. page headers) is dropped.

    Args:
        pdf_reader: Object exposing ``pages``, each with ``extract_text()``.
        out_filename: Path of the output file. It is opened in append mode,
            so calling this twice with the same path duplicates the entries.
        verbose: When true, print every accumulated entry before parsing it.
    """
    # UTF-8 is set explicitly: the output is Chinese/pinyin text and must not
    # depend on the platform's default encoding.
    with open(out_filename, "a", encoding="utf-8") as out_file:
        for page in pdf_reader.pages:
            page_text = page.extract_text() or ""  # tolerate pages with no text

            current_entry = ""
            # The trailing sentinel looks like an entry start, which forces the
            # last real entry on the page to be flushed.
            for line in page_text.split("\n") + ["0000 完了"]:
                # data cleaning: collapse runs of spaces, normalise ellipses
                line = re.sub(r"[ ]+", " ", line)
                line = line.replace("...", "…")

                if not _ENTRY_START.search(line):
                    # Still inside the current entry: keep concatenating lines.
                    current_entry = f"{current_entry}\n{line}"
                    continue

                # The next entry has started, so the current one is complete.
                if verbose:
                    print(f"ENTRY START\n{current_entry}\nENTRY END\n\n")

                entry_text = _format_entry(current_entry)
                if entry_text is not None:
                    out_file.write(entry_text)

                current_entry = line

In [42]:
# Collect the word-list PDFs from ../data/pdf (relative to the working directory).
# Only names ending in .pdf are kept, so stray files in that folder
# (e.g. macOS .DS_Store) are never handed to PyPDF2.PdfReader later on.
pdf_dir = os.path.join(os.getcwd(), "..", "data", "pdf")
pdf_files = sorted(
    os.path.join(pdf_dir, pdf_name)
    for pdf_name in os.listdir(pdf_dir)
    if pdf_name.lower().endswith(".pdf")
)
pdf_files

['/Users/elim-mbp-01/Documents/cngrind/third_pass/src/../data/pdf/New-HSK-1-Word-List.pdf',
 '/Users/elim-mbp-01/Documents/cngrind/third_pass/src/../data/pdf/New-HSK-2-Word-List.pdf',
 '/Users/elim-mbp-01/Documents/cngrind/third_pass/src/../data/pdf/New-HSK-3-Word-List.pdf',
 '/Users/elim-mbp-01/Documents/cngrind/third_pass/src/../data/pdf/New-HSK-4-Word-List.pdf',
 '/Users/elim-mbp-01/Documents/cngrind/third_pass/src/../data/pdf/New-HSK-5-Word-List.pdf',
 '/Users/elim-mbp-01/Documents/cngrind/third_pass/src/../data/pdf/New-HSK-6-Word-List.pdf']

In [43]:
# Parse every PDF and append its entries to "<pdf stem>.txt" in the working directory.
for pdf_file in pdf_files:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # Use os.path to get the stem: str.strip(".pdf") removes any of the
    # characters '.', 'p', 'd', 'f' from BOTH ends rather than the suffix, and
    # splitting on "/" is not portable across operating systems.
    pdf_name = os.path.splitext(os.path.basename(pdf_file))[0]
    print(pdf_name, len(pdf_reader.pages))

    process_pdf(pdf_reader, f"{pdf_name}.txt", verbose=False)

New-HSK-3-Word-List 48
