In [2]:
import PyPDF2
import pandas as pd
import os
from pprint import pprint
import re
import numpy as np

In [64]:
def process_pdf(pdf_reader: PyPDF2.PdfReader, out_filename: str, verbose=False):
    
    pattern = r"(\d+)\s+([\u4e00-\u9fff｜（）、〇]+)\s+((?:(?:(?:\w*[āáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ][\w']*)|le|zi|ma|ba|de|di|ge|jie|men|ren|ne|zhe)[\s｜（）·]+)+)([^\n]+)"
    
    def sprint(string): # sanity print
        print(string) if verbose else None
    
    with open(out_filename, "a+") as out_file:
        for page in pdf_reader.pages:
            page_text = page.extract_text()

            # since some entries are multi-lined, need to concat lines together
            # main idea here is to pay attention to entry number
            # we won't process an entry until we know that we have all the relevant data for that entry
            # this can only be gauranteed until we come across the starting point of the next entry
            current_entry = ""
            for line in page_text.split("\n"):
                # perform some cleaning on the current line
                line = re.sub(r"[ ]+", " ", line)

                # check to see if we've hit the start of the next entry
                if re.search(r"[\u4e00-\u9fff]", line):
                    # if so, start processing the current entry
                    sprint(f"ENTRY START\n{current_entry}\nENTRY END\n\n")

                    find_result = re.findall(
                        pattern,
                        current_entry,
                    )

                    if len(find_result):  # check if we found a match
                        (num, chars, pingyin, definition) = find_result[0]
                        num = num.replace("\n", " ").strip()
                        chars = chars.replace("\n", " ").strip()
                        pingyin = pingyin.replace("\n", " ").strip()
                        definition = definition.replace("\n", " ").strip()

                        entry_text = f"{num}[-] {chars}[-] {pingyin}[-] {definition}\n"
                        out_file.write(entry_text)
                    else:
                        # check to see if we should've found a match
                        if re.search(r"[\u4e00-\u9fff]", current_entry):
                            entry_text = f"***{current_entry}\n"
                            out_file.write(entry_text)
                        else:
                            pass

                    # update current_entry to be the new entry
                    current_entry = line
                else:
                    # if we haven't yet stumbled across the start of the next entry, keep concatenating new lines to the current entry
                    current_entry = f"{current_entry}\n{line}"

In [65]:
pdf_dir = os.path.join(os.getcwd(), "data", "pdf")
pdf_files = sorted(
    [os.path.join(pdf_dir, pdf_name) for pdf_name in os.listdir(pdf_dir)]
)
pdf_files

['/Users/elim-mbp-01/Documents/cngrind/third_pass/data/pdf/New-HSK-1-Word-List.pdf',
 '/Users/elim-mbp-01/Documents/cngrind/third_pass/data/pdf/New-HSK-2-Word-List.pdf',
 '/Users/elim-mbp-01/Documents/cngrind/third_pass/data/pdf/New-HSK-3-Word-List.pdf',
 '/Users/elim-mbp-01/Documents/cngrind/third_pass/data/pdf/New-HSK-4-Word-List.pdf',
 '/Users/elim-mbp-01/Documents/cngrind/third_pass/data/pdf/New-HSK-5-Word-List.pdf',
 '/Users/elim-mbp-01/Documents/cngrind/third_pass/data/pdf/New-HSK-6-Word-List.pdf']

In [66]:
for pdf_file in pdf_files:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    pdf_name = pdf_file.split("/")[-1].strip(".pdf")
    print(pdf_name, len(pdf_reader.pages))
    
    process_pdf(pdf_reader, f"{pdf_name}.csv", verbose=True)

New-HSK-1-Word-List 25
ENTRY START

1 
 No Chinese Pinyin English 
ENTRY END


ENTRY START
1 爱 ài love 
ENTRY END


ENTRY START
2 爱好 ài hào hobby 
ENTRY END


ENTRY START
3 八 bā eight 
ENTRY END


ENTRY START
4 爸爸｜爸 bàba ｜ bà dad 
ENTRY END


ENTRY START
5 吧 ba (interjection particle) 
ENTRY END


ENTRY START
6 白（形） bái white 
ENTRY END


ENTRY START
7 白天 bái tiān day 
ENTRY END


ENTRY START
8 百 bǎi hundred 
ENTRY END


ENTRY START
9 班 bān class 
ENTRY END


ENTRY START
10 半 bàn half 
ENTRY END


ENTRY START
11 半年 bàn nián half a year 
ENTRY END


ENTRY START
12 半天 bàn tiān half day 
ENTRY END


ENTRY START
13 帮 bāng help 
ENTRY END


ENTRY START
14 帮忙 bāng máng help 
ENTRY END


ENTRY START
15 包 bāo package 
ENTRY END


ENTRY START
16 包子 bāo zi bun 
ENTRY END


ENTRY START
17 杯 bēi cup 
ENTRY END


ENTRY START
18 杯子 bēi zi cup 
ENTRY END


ENTRY START
19 北 běi north 
ENTRY END


ENTRY START
20 北边 běi biān North side 
ENTRY END


ENTRY START

2 
ENTRY END


ENTRY START
 22 本（量） běn (m