In [None]:
import pathlib
import re
import pandas as pd

# (內部使用)
def adjust_file_name(filename: str):
    """Reduce a user-supplied path to a bare ``.md`` file name.

    Everything up to and including the last "/" is dropped and ".md" is
    appended when the name does not already end with it. The resulting name
    must exist as a regular file inside the "原文/" directory (resolved
    against the current working directory).

    Returns:
        The directory-less file name, always ending in ".md".

    Raises:
        FileNotFoundError: if "原文/<name>" is not an existing file.
    """
    base_name = filename.split('/')[-1]
    md_name = base_name if base_name.endswith(".md") else base_name + ".md"
    if pathlib.Path("原文/" + md_name).is_file():
        return md_name
    raise FileNotFoundError("未找到相關文件！")

# 檢查原文格式。只簡單檢查標點符號。[...] 內的需自行搜索檢查。
# 注意，原書籍裏就存在的錯誤可能也被檢查出來。
def _check_line_punctuation(line: str, no: int) -> None:
    """Print one diagnostic per punctuation-spacing problem found in ``line``.

    ``line`` must end with a newline; that final character is only ever
    looked at as the "next" character of the one before it. ``no`` is the
    1-based line number used in the messages; reported columns are 1-based.

    Checks performed:
      * ``[^`` must not be preceded by a space; any other ``[`` must be
        preceded by a space, an opening quote or ``(``.
      * Every ``[`` needs a ``]`` on the same line; a ``]`` closing a
        non-``[^`` bracket must be followed by a space, newline, ``)`` or a
        closing quote. Text between the brackets is not checked.
      * ``, . ; : ! ?`` and closing quotes must be followed by a space,
        newline or one of the closing characters accepted below.
      * Opening quotes and ``—`` must be preceded by a space or an opener;
        ``—`` must not be doubled and must be followed by a space, newline
        or a closer.
    """
    last = len(line) - 1  # index of the trailing newline
    index = 0
    while index < last:
        letter = line[index]
        # At index 0 this wraps around to the newline. Callers only pass
        # lines starting with "**" or "·", and neither branch for those
        # characters reads ``prev``.
        prev = line[index - 1]
        nxt = line[index + 1]
        if letter == '[':
            is_ref = nxt == "^"
            if is_ref and prev == " ":
                print(f"行-{no},列-{index+1}: [^...] 前不要空格。")
            elif not is_ref and prev not in " ‘“(":
                print(f"行-{no},列-{index+1}: [ 前無空格。")
            # Jump to the matching "]" so the bracket's content is skipped.
            close = line.find("]", index + 1, last)
            if close == -1:
                # Reported at the end-of-line column, then scanning stops.
                print(f"行-{no},列-{last+1}: [ 沒有與之匹配的 ]。")
                index = last
            else:
                index = close
                if not is_ref and line[close + 1] not in " \n)’”":
                    print(f"行-{no},列-{close+1}: ] 後無空格。")
        elif letter in ',.;:!?':
            if nxt not in " \n’”)]_":
                print(f"行-{no},列-{index+1}: {letter} 後無空格。")
        elif letter == '‘':
            if prev not in " “[(":
                print(f"行-{no},列-{index+1}: ‘ 前無空格。")
        elif letter == '“':
            if prev not in " ‘[(":
                print(f"行-{no},列-{index+1}: “ 前無空格。")
        elif letter == '’':
            if nxt not in " \n,.:;”)]_":
                print(f"行-{no},列-{index+1}: ’ 後無空格。")
        elif letter == '”':
            if nxt not in " \n,.:;)]_":
                print(f"行-{no},列-{index+1}: ” 後無空格。")
        elif letter == '—':
            if prev not in " [(‘“":
                print(f"行-{no},列-{index+1}: — 前無空格。")
            if nxt == '—':
                print(f"行-{no},列-{index+1}: — 太長了！刪掉一半。")
                index += 1  # skip the second dash of the pair
            elif nxt not in " \n’”)]":
                print(f"行-{no},列-{index+1}: — 後無空格。")
        index += 1


def detect_format(filename: str):
    """Check punctuation spacing in a source text under "原文/".

    Only lines starting with "**" or "·" are examined; every problem is
    printed as "行-<line>,列-<column>: <message>". Text inside ``[...]`` is
    skipped and has to be reviewed by hand. Errors already present in the
    original book may be reported as well.

    Args:
        filename: path or name of the file; normalised by adjust_file_name.

    Raises:
        FileNotFoundError: propagated from adjust_file_name when the file is
            missing.
    """
    filename = adjust_file_name(filename)
    with open("原文/"+filename, encoding="utf-8") as f:
        for no, line in enumerate(f, start=1):
            if line == "\n":
                continue
            # A final line without a newline would otherwise be sliced one
            # character short and its last character never checked.
            if not line.endswith("\n"):
                line += "\n"
            if line.startswith("**"):
                # Drop the three characters before the newline (presumably
                # the closing "_**" of a summary line) so they are not
                # checked as punctuation.
                line = line[:-4] + "\n"
            elif not line.startswith("·"):
                continue
            _check_line_punctuation(line, no)


In [None]:
# 把一串羅馬字簡單轉爲漢字
def lat2han(lat:str, data):
    """Roughly convert a romanised string into Han characters.

    The input is lower-cased and every ``[...]`` span is removed. The rest is
    split into punctuation marks and words (runs of letters, apostrophes and
    hyphens); all other characters, such as digits and spaces, are dropped.
    Each word is replaced by the ``han`` value of the first row of ``data``
    whose ``lat`` equals it, or kept unchanged when no row matches.
    Punctuation is mapped to its full-width counterpart and ``—`` becomes
    ``——``.

    Only the first matching entry is used, so homophones may come out wrong
    and need manual correction.

    Args:
        lat: the romanised text.
        data: a pandas DataFrame with ``lat`` and ``han`` columns.

    Returns:
        The converted string, with no separators between tokens.
    """
    word_pattern = r"['a-zA-ZÜüÔôÖöÆæ-]+"
    # Raw strings: "\[" in a normal literal is an invalid escape sequence.
    lat = re.sub(r"\[.+?\]", "", lat.lower())
    tokens = re.findall(r"[,.;:‘’“”!?()—]|" + word_pattern, lat)
    punc = str.maketrans(',.;:‘’“”!?()—', '，。；：‘’“”！？（）—')
    preview = []
    for item in tokens:
        if re.search(word_pattern, item) is not None:
            select = data[data['lat'] == item]['han'].to_list()
            preview.append(select[0] if select else item)
        elif item == '—':
            preview.append("——")
        else:
            preview.append(item.translate(punc))
    return "".join(preview)

# 將按照“原文”中的文件生成 “漢字對照”格式 的文件。方便編輯。
def generate_hanzi_template(filename: str):
    """Create an editable "漢字對照" template from a file under "原文/".

    Every source line is copied to "漢字對照/<name>". After certain lines a
    blank line and a "> " line with a draft translation are added:

      * "# "  line: the whole line converted with lat2han;
      * "## " line: the title text copied as-is;
      * "**_" line: the lat2han conversion wrapped in "**_" ... "_**";
      * "·"   line: the text before the first space (minus the leading "·")
        followed by the lat2han conversion.

    The word list is read from "data.csv" in the current working directory.
    If the target file already exists, a warning is printed and nothing is
    written.

    Args:
        filename: path or name of the file; normalised by adjust_file_name.

    Returns:
        None in every case.

    Raises:
        FileNotFoundError: propagated from adjust_file_name when the source
            file is missing.
    """
    filename = adjust_file_name(filename)
    if pathlib.Path("漢字對照/"+filename).exists():
        print("警告：目標文件已存在，如需重新生成，請先手動刪除！\n"
              f"目標文件路徑：'/漢字對照/{filename}'。")
        return None
    data = pd.read_csv("data.csv")
    with (open("原文/"+filename, "r", encoding="utf-8") as rf,
          open("漢字對照/"+filename, 'w', encoding='utf-8') as wf):
        for line in rf:
            # A last line without a newline would lose its final character
            # to line[3:-1] and leave no blank line before the "> " line.
            if not line.endswith("\n"):
                line += "\n"
            wf.write(line)
            if line.startswith("# "):
                wf.write("\n> " + lat2han(line, data) + "\n")
            elif line.startswith("## "):
                wf.write("\n> " + line[3:-1] + "\n")
            elif line.startswith("**_"):
                wf.write("\n> **_" + lat2han(line, data) + "_**\n")
            elif line.startswith("·"):
                verse_no = line.split(" ", 1)[0][1:]
                wf.write("\n> " + verse_no + " " + lat2han(line, data) + "\n")

    # Reported only after both files are closed, so the output is on disk.
    print(f"已完成，請查看 '/漢字對照/{filename}'。")
    return None

# test
# generate_hanzi_template("原文/7、哥林多書信 1.md")

In [None]:
# 加載
def load_trans_book(filename):
    """Parse a translated book from "漢字對照/" into a dictionary.

    Expected layout: each book-name ("# "), chapter-title ("## "), verse
    ("·") or summary ("**_") line is followed by a blank line, a "> "
    translation line and another blank line. A line starting with "------"
    switches to footnote mode, in which otherwise unrecognised lines are
    collected as footnotes.

    Args:
        filename: path or name of the file; normalised by adjust_file_name,
            which also requires a same-named file to exist under "原文/".

    Returns:
        On success a dict with
          * 'book_name': {'line_no', 'lat', 'han'} (empty if the file has no
            "# " line),
          * 'chapters': list of {'verses', 'line_no', 'title'}, where each
            verse is {'line_no', 'lat', 'han'} with the original markup kept,
          * 'footnotes': list of footnote lines.
        On the first format error a message is printed and None is returned.

    Raises:
        FileNotFoundError: from adjust_file_name, or from open() when the
            file is missing under "漢字對照/".
    """
    book = {'book_name': {}, 'chapters': [], 'footnotes': []}
    filename = adjust_file_name(filename)
    with open("漢字對照/"+filename, encoding="utf-8") as f:
        line_no = 0
        in_footnotes = False
        while (line := f.readline()) != '':  # '' means end of file
            line_no += 1

            # Book-name line
            if line.startswith("# "):
                book['book_name']['line_no'] = line_no
                book['book_name']['lat'] = line.removeprefix("# ").removesuffix("\n")
                if f.readline() != '\n':
                    print(f"第 {line_no} 行：書名行下未空一行，函數中止！")
                    return None
                han = f.readline()
                if not han.startswith("> "):
                    print(f"第 {line_no} 行：書名行未翻譯，程序中止！")
                    return None
                line_no += 2  # now the number of the translation line
                book['book_name']['han'] = han.removeprefix("> ").removesuffix("\n")
                if f.readline() != '\n':
                    print(f"第 {line_no} 行：書名翻譯行下未空一行，函數中止！")
                    return None
                line_no += 1

            # Chapter-title line
            elif line.startswith("## "):
                book['chapters'].append({})
                book['chapters'][-1]['verses'] = []
                book['chapters'][-1]['line_no'] = line_no
                book['chapters'][-1]['title'] = line.removeprefix("## ").removesuffix("\n")
                if f.readline() != '\n':
                    print(f"第 {line_no} 行：章標題行下未空一行，函數中止！")
                    return None
                han = f.readline()  # must be present, but is not stored
                if not han.startswith("> "):
                    print(f"第 {line_no} 行：章標題行未有對應的中文版本，程序中止！")
                    return None
                line_no += 2
                if f.readline() != '\n':
                    # Names the translation line, as the book-name and verse
                    # branches do; this message used to repeat the title-line
                    # message above.
                    print(f"第 {line_no} 行：章標題翻譯行下未空一行，函數中止！")
                    return None
                line_no += 1

            # Verse or summary line (original markup is kept)
            elif line.startswith("·") or line.startswith("**_"):
                if len(book['chapters']) == 0:
                    print(f"第 {line_no} 行：該小節之前未創建章，函數中止！")
                    return None
                verse = {}
                verse['line_no'] = line_no
                verse['lat'] = line.removesuffix("\n")
                if f.readline() != '\n':
                    print(f"第 {line_no} 行：小節下未空一行，函數中止！")
                    return None
                han = f.readline()
                if not han.startswith("> "):
                    print(f"第 {line_no} 行：該小節未翻譯，函數中止！")
                    return None
                line_no += 2
                # A summary's translation must carry the same "**_" markup.
                if line.startswith("**_") and not han.startswith("> **_"):
                    print(f"第 {line_no} 行：概述行格式未對應，函數中止！")
                    return None
                verse['han'] = han.removesuffix("\n")
                book['chapters'][-1]['verses'].append(verse)
                if f.readline() != '\n':
                    print(f"第 {line_no} 行：小節翻譯下未空一行，函數中止！")
                    return None
                line_no += 1

            # Blank line
            elif line.startswith("\n"):
                pass

            # Footnote separator, then footnote lines
            elif line.startswith("------"):
                in_footnotes = True
            elif in_footnotes:
                book['footnotes'].append(line.removesuffix("\n"))

            # Anything else is a format error
            else:
                print(f"第 {line_no} 行：檢測到未知行！函數中止！")
                return None
    return book

# 分字
def fenzi(books:list[dict]):
    """Count (romanised syllable, Han character) pairs across books.

    For every verse the romanised line is lower-cased, ``[...]`` spans are
    removed, and it is split into syllables (runs of letters and
    apostrophes). The translation is split into single Han characters, with
    a ``{...}`` group counting as one character. The two lists are paired
    position by position.

    Args:
        books: dictionaries in the shape returned by load_trans_book.

    Returns:
        A dict mapping ``(lat, han)`` to its number of occurrences, or None
        (after printing the offending verse) as soon as a verse has a
        different number of syllables and characters.
    """
    dict_zi: dict[tuple[str, str], int] = {}
    for book in books:
        for chapter in book['chapters']:
            for verse in chapter['verses']:
                han_line = verse['han']
                # Raw strings: "\[" and "\{" are invalid escapes otherwise.
                lat_line = re.sub(r"\[.+?\]", "", verse['lat'].lower())
                list_han = re.findall(r"\{.+?\}|"
                                      "[\u4E00-\u9FA5❓□㾎𧮙䫲𤖼𠡒𣥼䂸㔶䥛䀹㬹㧒]", han_line)
                list_lat = re.findall(r"['a-zA-ZÜüÔôÖöÆæ]+", lat_line)
                # The translation must have one character per syllable.
                if len(list_han) != len(list_lat):
                    print(f"{book['book_name']['han']}-{chapter['title']}-第 {verse['line_no']} 行：翻譯字數不符合原文：\n")
                    print(f"{verse['line_no']}: {lat_line}\n函數中止！")
                    return None
                for pair in zip(list_lat, list_han):
                    dict_zi[pair] = dict_zi.get(pair, 0) + 1

                    # Debugging aid for locating a wrong pair; keep disabled.
                    # if pair == ("gün", "倦"):
                    #     print(f"{chapter['title']}-第{verse['line_no']}行: {verse['lat']}")
    return dict_zi

# 分詞
def fenci(books:list[dict]):
    """Count (romanised word, Han word) pairs across books.

    A word is a run of letters, apostrophes and hyphens in the lower-cased
    romanised line (``[...]`` spans removed). Its Han counterpart is made of
    as many consecutive characters of the translation as the word has
    syllables, where a ``{...}`` group counts as one character.

    Args:
        books: dictionaries in the shape returned by load_trans_book.

    Returns:
        A dict mapping ``(lat_word, han_word)`` to its number of
        occurrences, or None (after printing the offending verse) as soon as
        a verse has a different number of syllables and characters.
    """
    syllable_pattern = r"['a-zA-ZÜüÔôÖöÆæ]+"
    dict_ci: dict[tuple[str, str], int] = {}
    for book in books:
        for chapter in book['chapters']:
            for verse in chapter['verses']:
                han_line = verse['han']
                # Raw strings: "\[" and "\{" are invalid escapes otherwise.
                lat_line = re.sub(r"\[.+?\]", "", verse['lat'].lower())
                # Per-character split, used for the consistency check below.
                list_han = re.findall(r"\{.+?\}|"
                                      "[\u4E00-\u9FA5❓□㾎𧮙䫲𤖼𠡒𣥼䂸㔶䥛䀹㬹㧒]", han_line)
                list_lat = re.findall(syllable_pattern, lat_line)
                # The translation must have one character per syllable.
                if len(list_han) != len(list_lat):
                    print(f"{book['book_name']['han']}-第 {verse['line_no']} 行：翻譯字數不符合原文：\n")
                    print(f"{verse['line_no']}: {lat_line}\n函數中止！")
                    return None
                list_lat_ci = re.findall(r"['a-zA-ZÜüÔôÖöÆæ-]+", lat_line)
                index = 0
                for lat_ci in list_lat_ci:
                    # Count real syllables, not hyphens + 1: a stray, leading,
                    # trailing or doubled "-" would otherwise consume Han
                    # characters that belong to the following words.
                    zi_count = len(re.findall(syllable_pattern, lat_ci))
                    if zi_count == 0:
                        continue  # token made of hyphens only
                    han_ci = "".join(list_han[index:index+zi_count])
                    index += zi_count
                    key = (lat_ci, han_ci)
                    dict_ci[key] = dict_ci.get(key, 0) + 1
    return dict_ci

# test
# Load the translated books and build the word-frequency table from them.
book1 = load_trans_book("漢字對照/1、馬太傳福音書.md")
book2 = load_trans_book("漢字對照/2、馬可傳福音書.md")
book3 = load_trans_book("漢字對照/3、路加傳福音書.md")
book4 = load_trans_book("漢字對照/4、約翰傳福音書.md")
book5 = load_trans_book("漢字對照/5、使徒行傳.md")
book6 = load_trans_book("漢字對照/6、羅馬書信.md")
# load_trans_book returns None for a book that failed to parse. Such entries
# are dropped here because fenci indexes every element as a dict.
beek = [book for book in (book1, book2, book3, book4, book5, book6)
        if book is not None]
if beek:
    d = fenci(beek)
    # Uncomment to write the result, most frequent first, to data.csv:
    # pd.DataFrame({"lat":[a for (a,b) in d.keys()], 
    #             "han": [b for (a,b) in d.keys()], 
    #             "count": [v for v in d.values()]
    #             }).sort_values("count", ascending=False)\
    #             .to_csv("data.csv")