In [1]:
import pickle
from typing import TypedDict

class RawSentence(TypedDict):
    sentence: str
    words: list[str]
    translation: str
    part_of_speech: list[list[str]]

with open("output/annotated_translated_tokenized_corpus_by_book.pkl", "rb") as f:
    translation_pair_by_book: dict[str, list[RawSentence]] = pickle.load(f)

In [2]:

class Sentence(TypedDict):
    book: str
    sentence: str
    words: list[str]
    part_of_speech: list[list[str]]
    translation: str

flatten_corpus: list[Sentence] = [Sentence(
    sentence=sentence["sentence"],
    words=sentence["words"],
    part_of_speech=sentence["part_of_speech"],
    translation=sentence["translation"],
    book=book,
) for book, sublist in translation_pair_by_book.items() for sentence in sublist]

In [3]:
ABBREVIATIONS = {
    "アイヌ語アーカイブ": "アア",
    "アイヌ語鵡川方言日本語‐アイヌ語辞典": "鵡川",
    "アイヌ語會話字典": "会話",
    "平取町アイヌ口承文芸": "平取",
    "アイヌ語音声資料": "音声",
    "鍋沢元蔵筆録ノート": "鍋沢",
    "AA研アイヌ語資料": "ＡＡ",
    "アイヌ語アーカイブ音声資料": "ア音",
    "アイヌタイムズ": "アタ",
    "アイヌ語口承文芸コーパス": "口承",
    "浅井タケ昔話全集I,II": "浅井",
    "千歳のアイヌ語（初級）": "千初",
    "アイヌ神謡集": "神謡",
    "幌別のアイヌ語（初級）": "幌初",
    "アイヌ民族文化センター研究紀要": "紀要",
    "美幌のアイヌ語（中級）": "美中",
    "千歳のアイヌ語（中級）": "千中",
    "ニューエクスプレスプラス アイヌ語": "ニュ",
    "沙流のアイヌ語（中級）": "沙中",
    "ウポポイ館内展示": "ウポ",
    "十勝のアイヌ語（初級）": "十初",
    "石狩川のアイヌ語（中級）": "石中",
    "十勝のアイヌ語（中級）": "十中",
    "カムイユカㇻを聞いてアイヌ語を学ぶ": "ユカ",
    "アコㇿイタㇰ": "アコ",
    "第27回アイヌ語弁論大会": "弁論",
    "沙流のアイヌ語（初級）": "沙初",
    "いしかりがわのアイヌご（入門）": "石入",
    "カラフトのアイヌ語（中級）": "カ中",
    "とかちのアイヌご（入門）": "ト入",
    "美幌のアイヌ語（初級）": "美初",
    "静内のアイヌ語（初級）": "静初",
    "幌別のアイヌ語（中級）": "幌中",
    "石狩川のアイヌ語（初級）": "石初",
    "びほろのあいぬご（入門）": "ビ入",
    "静内のアイヌ語（中級）": "静中",
    "カラフトのアイヌ語（初級）": "カ初",
    "ちとせのアイヌご（入門）": "チ入",
    "プラハ宣言": "プラ",
    "千徳太郎治のピウスツキ宛書簡": "千ピ",
    "からふとのアイヌご（入門）": "カ入",
    "ほろべつのアイヌご（入門）": "ホ入",
    "さるのアイヌご（入門）": "サ入",
    "しずないのアイヌご（入門）": "シ入",
    "萱野茂の国会演説": "萱国",
}

In [4]:
# search for hemanta with the next word being noun
def search_word(word: str):
    for sentence in flatten_corpus:
        if word in sentence["words"]:
            print(ABBREVIATIONS[sentence["book"]], sentence["sentence"].replace(word, f"**{word}**"))
            print(sentence["translation"])
            print("-" * 100)

In [5]:
search_word("pewtanke")

鵡川 ku=kor hapo ekuskonna hese ka niwkes wa iyapo kimatek wa hapo penramu or un hussa eciw kor an yakka  wen wa ape huci kamuy or un hemanta ka ye kor hoyupu wa rorun puray or un arpa wa rorun puray cakke wa kani pon pe ne kusu **pewtanke** sekor ku=yaynu korka hemanta hawehe otusuy resuy ki wa yakka wen wa hosipi wa suy hapo kisma kor kimatek=an akusu hapo ponno ponno poka pirka wa earkinne kimatek siri ku=nukar
私の母が突然息もできなくなって父は慌てて、母の胸の上にフッサしていたが、だめで、火の神様に何か言いながら走って東の窓へ行って、東の窓を開けて、私は小さかったので、ペウタンケだと思っていたけれど、何か声を二度三度出してもだめで、戻って母を掴むと慌てていたのだが、母は少しだけよくなって本当に驚いた様子を私は見た。
----------------------------------------------------------------------------------------------------
鵡川 niwen horippa anakne okkayo utar anak emus kor wa humse kor arpa, os… osmake menoko utar anakne **pewtanke** kor inawcipa or un arpa siri ku=nukar.
ニウェン・ホリッパは、男の人たちは刀を持ってかけ声を上げながら行って、その後ろで女の人たちは危急の叫び声を上げて、イナウチパ（祭壇）へ行く様子を私は見た。
--------------------------------------------------------------------------------------------------

In [6]:
# search for hemanta with the next word being noun
def search_word_with_next_word_noun(word: str, context_length: int = 1):
    for index, sentence in enumerate(flatten_corpus):
        previous_contexts = flatten_corpus[index - context_length:index] if index - context_length >= 0 else []
        next_contexts = flatten_corpus[index + 1:index + context_length + 1] if index + context_length + 1 <= len(flatten_corpus) else []
        previous_contexts = [context for context in previous_contexts if context["book"] == sentence["book"]]
        next_contexts = [context for context in next_contexts if context["book"] == sentence["book"]]

        if word in sentence["words"]:
            word_indices = [i for i, w in enumerate(sentence["words"]) if w == word]
            for i in word_indices:
                if i + 1 < len(sentence["words"]) and "noun" in sentence["part_of_speech"][i + 1]:
                    print(
                        ABBREVIATIONS[sentence["book"]],
                        " ".join([context["sentence"] for context in previous_contexts]),
                        sentence["sentence"].replace(word, f"**{word}**"),
                        " ".join([context["sentence"] for context in next_contexts]),
                    )
                    print(sentence["translation"])
                    print("-" * 100)

In [7]:
search_word_with_next_word_noun('hemanta', 5)

鵡川 mokor ku=kor k=uspe k=us kusu sanke wa ku=nukar akusu sinnayno an wa k=us ka k=eaykap. kani anakne sinen ne k=an pe ne kusu ku=yaykoparooyki kor coka anakne irwak ci=ne. yupke rera ek noyne hawas . **hemanta** cikuni hoka omare wa ene puspus siri an. nepki. eoripak toyta kur nep ka iki?  cep
何の木だか、火にくべたらこのようにはねているよ。
----------------------------------------------------------------------------------------------------
鵡川 upaskep tanto anakne sirpirka wa si… sikus at ruwe ne. pet karanke. pet karanke. wenpe sani kasi kewe. kasi kewe ku=kor hapo ekuskonna hese ka niwkes wa iyapo kimatek wa hapo penramu or un hussa eciw kor an yakka  wen wa ape huci kamuy or un **hemanta** ka ye kor hoyupu wa rorun puray or un arpa wa rorun puray cakke wa kani pon pe ne kusu pewtanke sekor ku=yaynu korka **hemanta** hawehe otusuy resuy ki wa yakka wen wa hosipi wa suy hapo kisma kor kimatek=an akusu hapo ponno ponno poka pirka wa earkinne kimatek siri ku=nukar sasuy sir k=onaha anakne sonno paro yupke. et

In [24]:
from typing import cast
from utils.search import Keyword, Word, find


def has_pos(pos: str, sentence: Sentence) -> bool:
    return any(pos in p for p in sentence["part_of_speech"])


def search_word_by_parameters(
    keywords: list[Keyword],
    context_length: int = 1,
):
    for i, sentence in enumerate(flatten_corpus):
        previous_contexts = (
            flatten_corpus[i - context_length : i] if i - context_length >= 0 else []
        )
        next_contexts = (
            flatten_corpus[i + 1 : i + context_length + 1]
            if i + context_length + 1 <= len(flatten_corpus)
            else []
        )
        previous_contexts = [
            context
            for context in previous_contexts
            if context["book"] == sentence["book"]
        ]
        next_contexts = [
            context for context in next_contexts if context["book"] == sentence["book"]
        ]
        extended_sentence = cast(
            Sentence,
            {
                "book": sentence["book"],
                "sentence": " ".join(
                    [context["sentence"] for context in previous_contexts]
                )
                + " "
                + sentence["sentence"]
                + " "
                + " ".join([context["sentence"] for context in next_contexts]),
                "words": [
                    *[
                        word
                        for context in previous_contexts
                        for word in context["words"]
                    ],
                    *sentence["words"],
                    *[word for context in next_contexts for word in context["words"]],
                ],
                "part_of_speech": [
                    *[
                        pos
                        for context in previous_contexts
                        for pos in context["part_of_speech"]
                    ],
                    *sentence["part_of_speech"],
                    *[
                        pos
                        for context in next_contexts
                        for pos in context["part_of_speech"]
                    ],
                ],
                "translation": " ".join([context["translation"] for context in previous_contexts]) + " " + sentence["translation"] + " " + " ".join([context["translation"] for context in next_contexts]),
            },
        )


        extended_words = [
            Word(word, pos)
            for word, pos in zip(
                extended_sentence["words"], extended_sentence["part_of_speech"]
            )
        ]

        # found = find(
        #     [Keyword("hemanta", None), Keyword(None, "noun")], extended_words, 0, 0
        # )
        # if found:
        #     print("ew", extended_sentence["words"])
        #     print("ep", extended_sentence["part_of_speech"])
        #     print(found)
        #     break

        previous_contexts_words = sum(len(context["words"]) for context in previous_contexts)
        next_contexts_words = sum(len(context["words"]) for context in next_contexts)
        found = find(keywords, extended_words[previous_contexts_words:len(extended_words) - next_contexts_words])

        found_intervals = [(f + len(previous_contexts), f + len(keywords) + len(previous_contexts)) for f in found]
        if found:
            print(ABBREVIATIONS[extended_sentence["book"]], extended_sentence["sentence"])
            # print(extended_sentence["part_of_speech"])
            print(" ".join( f"**{word}**" if any(f <= i < e for f, e in found_intervals) else word for i, word in enumerate(extended_sentence["words"])))
            print(extended_sentence["translation"])
            print("-" * 100)

search_word_by_parameters([Keyword("hemanta", None), Keyword(None, "noun")], 5)

鵡川 mokor ku=kor k=uspe k=us kusu sanke wa ku=nukar akusu sinnayno an wa k=us ka k=eaykap. kani anakne sinen ne k=an pe ne kusu ku=yaykoparooyki kor coka anakne irwak ci=ne. yupke rera ek noyne hawas . hemanta cikuni hoka omare wa ene puspus siri an. nepki. eoripak toyta kur nep ka iki?  cep
mokor ku= kor k= uspe **k=** **us** kusu sanke wa ku= nukar akusu sinnayno an wa k= us ka k= eaykap kani anakne sinen ne k= an pe ne kusu ku= yaykoparooyki kor coka anakne irwak ci= ne yupke rera ek noyne hawas hemanta cikuni hoka omare wa ene puspus siri an nepki eoripak toyta kur nep ka iki? cep
寝る 私の履物を履くために出して見たところ、違っていて履くことができない。 私はひとりだから自炊し 私たちは、きょうだいです。 台風が来るような話だ。 何の木だか、火にくべたらこのようにはねているよ。 仕事 …を敬う、…を恐れ多く思う 農夫、農業をする男の人 何かしてるかい？ 魚
----------------------------------------------------------------------------------------------------
鵡川 upaskep tanto anakne sirpirka wa si… sikus at ruwe ne. pet karanke. pet karanke. wenpe sani kasi kewe. kasi kewe ku=kor hapo ekuskonna hese ka niwkes wa iyapo kimat