# 第5章: 係り受け解析

### 40. 係り受け解析結果の読み込み（形態素）

In [92]:
from pathlib import Path
from collections import defaultdict
from pprint import pprint, pformat

class Morph:
    """A single morpheme.

    Holds the four fields this chapter works with: the surface form, the
    base (dictionary) form, the part of speech and its first sub-category.
    """

    def __init__(self, surface: str, base: str, pos: str, pos1: str):
        self.surface = surface  # surface form as it appears in the text
        self.base = base        # base (dictionary) form
        self.pos = pos          # part of speech
        self.pos1 = pos1        # part-of-speech sub-category 1

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(surface={self.surface!r}, "
            f"base={self.base!r}, pos={self.pos!r}, pos1={self.pos1!r})"
        )

    def __format__(self, format_spec: str = "") -> str:
        """Return a multi-line ``field: value`` description of the morpheme.

        ``format()`` and f-strings always pass a format spec, so the
        parameter must exist; it defaults to ``""`` so that a direct
        ``morph.__format__()`` call keeps working. The spec itself is
        ignored.
        """
        return (
            f"surface: {self.surface}\n"
            f"base: {self.base}\n"
            f"pos: {self.pos}\n"
            f"pos1: {self.pos1}\n"
        )

class Chunk:
    """One chunk (bunsetsu) of a dependency-parsed sentence."""

    def __init__(self, morph_list: list, dst: int, srcs: list):
        self.morph_list = morph_list  # morphemes that make up this chunk
        self.dst = dst                # index of the chunk this one depends on
        self.srcs = srcs              # indices of the chunks depending on this one

    def __format__(self, keywargs: str = "") -> str:
        """Return a multi-line description of the chunk.

        ``keywargs`` receives the format spec that ``format()`` and
        f-strings pass in; it is ignored. It defaults to ``""`` so that a
        direct ``chunk.__format__()`` call also works.
        """
        return (
            f"morph_list: \n{pformat(self.morph_list)}\n"
            f"dst: {self.dst}\n"
            f"srcs: {self.srcs}\n"
        )


def _parse_morph_line(line: str) -> dict[str, str]:
    """Turn one ``surface<TAB>feature,feature,...`` line into a morpheme dict."""
    # Feature layout: pos, pos1, pos2, pos3, conjugation type, conjugation
    # form, base form[, reading, pronunciation]. The last two fields may be
    # missing, so the base form is addressed from the front (index 6).
    surface, tab, feature = line.partition("\t")
    outputs = feature.split(",")
    if not tab or len(outputs) < 7:
        raise ValueError(f"malformed morpheme line: {line!r}")
    return {
        "surface": surface,
        "base": outputs[6],
        "pos": outputs[0],
        "pos1": outputs[1],
    }


def _close_chunk(chunk_list: list, morph_list: list, dst: int | None) -> None:
    """Append the chunk under construction to ``chunk_list`` if it has morphemes."""
    if not morph_list:
        return
    if dst is None:
        raise ValueError("morpheme lines appeared before any chunk header")
    chunk_list.append(Chunk(morph_list=morph_list, dst=dst, srcs=[]))


def _link_sources(chunk_list: list) -> None:
    """Fill each chunk's ``srcs`` with the indices of the chunks that point at it."""
    for index, chunk in enumerate(chunk_list):
        # dst == -1 marks the root; out-of-range targets are ignored as well.
        if 0 <= chunk.dst < len(chunk_list):
            chunk_list[chunk.dst].srcs.append(index)


def read_cabocha(filepath: str | Path) -> list[list[list[Chunk]]]:
    """Read a CaboCha lattice-format file into paragraphs of sentences of chunks.

    The input is read line by line:

    * ``* <index> <dst>D ...`` opens a new chunk whose head is chunk ``dst``.
    * ``surface<TAB>features`` adds a morpheme to the open chunk.
    * ``EOS`` ends the current sentence; an ``EOS`` that directly follows
      another ``EOS`` ends the current paragraph.
    * Blank lines are skipped.

    Args:
        filepath: Path of the parsed file; it is decoded as UTF-8.

    Returns:
        A list of paragraphs. Each paragraph is a list of sentences and each
        sentence is a list of ``Chunk`` objects. Every chunk's ``morph_list``
        holds dicts with the keys ``surface``, ``base``, ``pos`` and ``pos1``.

    Raises:
        ValueError: If a morpheme line is malformed or appears before any
            chunk header.
    """
    paragraph = []      # finished paragraphs
    sentence_list = []  # finished sentences of the paragraph being read
    chunk_list = []     # finished chunks of the sentence being read
    morph_list = []     # morphemes of the chunk being read
    dst = None          # head index of the chunk being read

    with open(filepath, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\r\n")
            stripped = line.strip()
            if stripped == "":
                continue

            if stripped == "EOS":
                _close_chunk(chunk_list, morph_list, dst)
                morph_list, dst = [], None
                if chunk_list:
                    # End of a sentence.
                    _link_sources(chunk_list)
                    sentence_list.append(chunk_list)
                    chunk_list = []
                elif sentence_list:
                    # EOS with nothing pending: the paragraph is over.
                    paragraph.append(sentence_list)
                    sentence_list = []
                continue

            # Chunk header. Morpheme lines always contain a tab, which keeps a
            # morpheme whose surface is "*" from being mistaken for a header.
            if stripped.startswith("*") and "\t" not in line:
                _close_chunk(chunk_list, morph_list, dst)
                morph_list = []
                # Third field looks like "6D" or "-1D"; drop the trailing letter.
                dst = int(stripped.split(" ")[2][:-1])
                continue

            morph_list.append(_parse_morph_line(line))

    # Flush whatever is still open when the file does not end with EOS lines.
    _close_chunk(chunk_list, morph_list, dst)
    if chunk_list:
        _link_sources(chunk_list)
        sentence_list.append(chunk_list)
    if sentence_list:
        paragraph.append(sentence_list)

    return paragraph

# Parse the CaboCha output file; `paragraph` is inspected by the code further down.
paragraph = read_cabocha("./data/ai.ja.txt.parsed")

[{'surface': '答え', 'base': '答える', 'pos': '動詞', 'pos1': '自立'}, {'surface': 'て', 'base': 'て', 'pos': '助詞', 'pos1': '接続助詞'}, {'surface': 'いる', 'base': 'いる', 'pos': '動詞', 'pos1': '非自立'}, {'surface': '。', 'base': '。', 'pos': '記号', 'pos1': '句点'}]


In [95]:
# Quick inspection of the parsed result.

# Total number of paragraphs.
print(len(paragraph))

# Sentence count of every paragraph that is not exactly one sentence long.
for sentence_list in paragraph:
    if len(sentence_list) != 1:
        print(len(sentence_list))

# Dump every chunk of every sentence in the last paragraph.
for chunk_list in paragraph[-1]:
    for chunk in chunk_list:
        print(f"{chunk}")

73
2
2
2
2
4
3
2
morph_list: 
[{'base': '対談', 'pos': '名詞', 'pos1': 'サ変接続', 'surface': '対談'},
 {'base': 'で', 'pos': '助詞', 'pos1': '格助詞', 'surface': 'で'}]
dst: 20
srcs: []

morph_list: 
[{'base': '須藤', 'pos': '名詞', 'pos1': '固有名詞', 'surface': '須藤'},
 {'base': 'は', 'pos': '助詞', 'pos1': '係助詞', 'surface': 'は'}]
dst: 16
srcs: []

morph_list: 
[{'base': '「', 'pos': '記号', 'pos1': '括弧開', 'surface': '「'},
 {'base': 'これ', 'pos': '名詞', 'pos1': '代名詞', 'surface': 'これ'},
 {'base': 'まで', 'pos': '助詞', 'pos1': '副助詞', 'surface': 'まで'}]
dst: 6
srcs: []

morph_list: 
[{'base': 'けっこう', 'pos': '副詞', 'pos1': '一般', 'surface': 'けっこう'}]
dst: 6
srcs: []

morph_list: 
[{'base': '長時間', 'pos': '名詞', 'pos1': '副詞可能', 'surface': '長時間'}]
dst: 6
srcs: []

morph_list: 
[{'base': '議論', 'pos': '名詞', 'pos1': 'サ変接続', 'surface': '議論'},
 {'base': 'を', 'pos': '助詞', 'pos1': '格助詞', 'surface': 'を'}]
dst: 6
srcs: []

morph_list: 
[{'base': '行う', 'pos': '動詞', 'pos1': '自立', 'surface': '行っ'},
 {'base': 'て', 'pos': '助詞', 'pos1': '接続助詞', 