In [9]:
import re

import pandas as pd

# Hand-picked sample word list.
# NOTE(review): presumably Kabardian ("kbd", as in the model names used in later cells) — confirm.
# NOTE(review): not referenced by any other cell shown in this notebook excerpt.
words = [
    "къеукIуриехыу",
    "къытехуауэ",
    "зэрыщыжыпIар",
    "Жъесыну",
    "нэужьщ",
    "зэрынэсу",
    "ЩIыр",
    "зэрыпщIэщи",
    "щIашыжащ",
    "къыпежьэри",
    "хуеякъым",
    "сынэгъэс",
    "хьэпшып",
    "тесынри",
    "НтIэ",
    "Шыдыгум",
    "уафэм",
    "пулеметышэу",
    "шынагъуэу",
    "пэшым",
    "гъунэгъу",
    "къысхуеину",
    "пщIыр",
]

In [34]:
import os
from difflib import SequenceMatcher
import re


def _get_text_by_lang(book_base_dir, lang, encoding="utf-8"):
    """Read every page text stored under ``<book_base_dir>/<lang>/txts``.

    Args:
        book_base_dir: Directory that contains one sub-directory per language/variant.
        lang: Name of the sub-directory to read.
        encoding: Text encoding of the page files. Defaults to UTF-8 so that
            non-ASCII (e.g. Cyrillic) text is decoded the same way on every
            platform instead of depending on the locale's default encoding.

    Returns:
        A list with the full content of each entry of the ``txts`` directory,
        ordered by sorted entry name.

    Raises:
        OSError: If the directory or one of its entries cannot be read.
        UnicodeDecodeError: If an entry is not valid text in ``encoding``.
    """
    lang_text_dir = os.path.join(book_base_dir, lang, "txts")

    lang_text_by_page = []
    # Name-sorted so that two variants with identically named page files line up by index.
    for text_f in sorted(os.listdir(lang_text_dir)):
        with open(os.path.join(lang_text_dir, text_f), encoding=encoding) as f:
            lang_text_by_page.append(f.read())

    return lang_text_by_page


def extract_diff_words(line_1, line_2):
    """Return the words of each line that are not part of the word-level alignment.

    Every character that is neither a word character nor whitespace is removed
    from both lines, the lines are split on whitespace, and the two word
    sequences are aligned with ``difflib.SequenceMatcher``.

    Args:
        line_1: First line of text.
        line_2: Second line of text.

    Returns:
        A pair ``(unmatched_1, unmatched_2)`` of tuples holding, in original
        order, the words of ``line_1`` and of ``line_2`` that lie outside the
        matching blocks.
    """
    non_word = re.compile(r"[^\w\s]")
    tokens_a = non_word.sub("", line_1).split()
    tokens_b = non_word.sub("", line_2).split()

    only_in_a = []
    only_in_b = []

    # Opcodes are derived from the matching blocks: every non-"equal" opcode
    # spans exactly the gap between two consecutive matching blocks.
    opcodes = SequenceMatcher(None, tokens_a, tokens_b).get_opcodes()
    for tag, a_lo, a_hi, b_lo, b_hi in opcodes:
        if tag == "equal":
            continue
        only_in_a += tokens_a[a_lo:a_hi]
        only_in_b += tokens_b[b_lo:b_hi]

    return tuple(only_in_a), tuple(only_in_b)


def find_diff_words_by_lang(book_base_dir, lang_1, lang_2):
    """Collect, line by line, the words that differ between two text variants of a book.

    Pages of both variants are loaded with ``_get_text_by_lang`` and paired by
    index; within a page, lines are paired by index and compared with
    ``extract_diff_words``.

    Args:
        book_base_dir: Directory containing one sub-directory per variant.
        lang_1: Sub-directory name of the first variant.
        lang_2: Sub-directory name of the second variant.

    Returns:
        A pair of equally long lists. Entry ``i`` of each list is the tuple of
        differing words found on the ``i``-th compared line for the respective
        variant.

    Raises:
        ValueError: If the two variants do not have the same number of pages.
    """
    from itertools import zip_longest

    lang_1_text_by_page = _get_text_by_lang(book_base_dir, lang_1)
    lang_2_text_by_page = _get_text_by_lang(book_base_dir, lang_2)

    if len(lang_1_text_by_page) != len(lang_2_text_by_page):
        raise ValueError("Text lengths for both languages must be the same.")

    diff_words_1 = []
    diff_words_2 = []

    for page_text_1, page_text_2 in zip(lang_1_text_by_page, lang_2_text_by_page):
        # Pad the shorter page with empty lines: a plain zip() would silently
        # drop the trailing lines of the longer page, hiding exactly the
        # differences this function is meant to report.
        line_pairs = zip_longest(
            page_text_1.splitlines(), page_text_2.splitlines(), fillvalue=""
        )
        for line_1, line_2 in line_pairs:
            diff_w_1, diff_w_2 = extract_diff_words(line_1, line_2)
            diff_words_1.append(diff_w_1)
            diff_words_2.append(diff_w_2)

    return diff_words_1, diff_words_2

In [35]:
# Root directory of the processed book. find_diff_words_by_lang reads each variant's
# pages from "<root>/<variant>/txts".
_book_base_dir = "../data/dag_results/pdf_processing/dysche_zhyg.pdf"
# Sub-directory names of the two variants to compare.
# NOTE(review): these look like OCR model checkpoint names — confirm.
_lang_1 = "kbd_0.229_2995_10800"
_lang_2 = "kbd_0.009_4360_66700"
# NOTE(review): built here but not used by any cell shown in this notebook excerpt.
_output_file = os.path.join(_book_base_dir, f"merged_diff_{_lang_1}_vs_{_lang_2}.html")
# Two parallel lists: one tuple of differing words per compared line, per variant.
words_1, words_2 = find_diff_words_by_lang(_book_base_dir, _lang_1, _lang_2)

In [42]:
import pandas as pd

# One row per compared line; each column holds that variant's differing words, space-joined.
df = pd.DataFrame(
    [
        {_lang_1: " ".join(left_words), _lang_2: " ".join(right_words)}
        for left_words, right_words in zip(words_1, words_2)
    ]
)

In [43]:
# Preview the first rows of the per-line differences (empty cells mean no differing words).
df.head()

Unnamed: 0,kbd_0.229_2995_10800,kbd_0.009_4360_66700
0,,
1,,
2,х,I у
3,Ж,I
4,,


In [49]:
# Directory holding the exported ``.traineddata`` models.
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable base dir.
best_traineddata_dir = "/Users/panagoa/PycharmProjects/tesstrain/data/kbd_ng/tessdata_best"


def _traineddata_error(file_name):
    """Parse the numeric field of a ``<model>_<float>_<int>_<int>.traineddata`` name.

    NOTE(review): the float field is presumably the error rate of the checkpoint — confirm.
    Names that do not follow the pattern yield ``inf`` so they are ranked last.
    """
    stem = file_name[: -len(".traineddata")]
    try:
        return float(stem.rsplit("_", 3)[1])
    except (IndexError, ValueError):
        return float("inf")


# Keep model files only: a stray entry such as ".DS_Store" sorts before any "kbd_..." name
# and would otherwise be picked by a plain name sort.
_traineddata_names = [
    name for name in os.listdir(best_traineddata_dir) if name.endswith(".traineddata")
]
if not _traineddata_names:
    raise FileNotFoundError(f"No .traineddata files found in {best_traineddata_dir}")

# Compare the parsed number rather than the raw string, so that e.g. "10.500" does not
# rank ahead of "2.300"; the name itself breaks ties deterministically.
best_traineddata_name = min(
    _traineddata_names,
    key=lambda name: (_traineddata_error(name), name),
)
print(best_traineddata_name)

['kbd_ng_0.097_1738_38300.traineddata', 'kbd_ng_0.106_1737_38100.traineddata', 'kbd_ng_0.123_1725_37800.traineddata', 'kbd_ng_0.128_1178_24200.traineddata', 'kbd_ng_0.135_1173_24100.traineddata', 'kbd_ng_0.143_709_13200.traineddata', 'kbd_ng_0.149_703_13000.traineddata', 'kbd_ng_0.156_700_12900.traineddata', 'kbd_ng_0.171_693_12600.traineddata', 'kbd_ng_0.180_690_12500.traineddata', 'kbd_ng_0.185_228_3700.traineddata', 'kbd_ng_0.220_224_3600.traineddata', 'kbd_ng_0.240_222_3500.traineddata', 'kbd_ng_0.261_16_300.traineddata', 'kbd_ng_0.290_12_200.traineddata', 'kbd_ng_0.305_7_100.traineddata']
