### JMdict Lookup

In [1]:
# Relative paths to the dictionary XML files; they resolve against the kernel's
# working directory, so the notebook must be run from its own folder.
# JMDICT is parsed by the next cell.
JMDICT = '../ChiitransLite/data/JMdict.xml'
# Presumably the JMnedict (proper names) dictionary; not referenced by any cell
# visible in this notebook — TODO confirm whether it is still needed.
JMneDICT = '../ChiitransLite/data/JMnedict.xml'

In [2]:
import lxml.etree as ET

# Parse the whole dictionary XML file named by JMDICT into an element tree.
tree = ET.parse(JMDICT)
# Root element of the parsed document; the lookup cell runs its XPath query on it.
root = tree.getroot()

In [4]:
# Headword (<keb> text) to look up.
search_text = '呟き'
# The headword is bound as the XPath variable $keb instead of being formatted
# into the expression, so search text containing quotes cannot break the query.
xpath = ".//entry[k_ele/keb[text()=$keb]]"
# Every <entry> that has a <k_ele>/<keb> whose text equals search_text exactly.
matching_entries = root.xpath(xpath, keb=search_text)
matching_entries

[<Element entry at 0x1441d10dc40>]

In [16]:
# NOTE(review): the recorded output of this cell (げかしゅじゅつ / 'surgery') does not
# appear to correspond to search_text '呟き' in the lookup cell; it looks like the
# result of an earlier search. Re-run from a fresh kernel to confirm.

# Take the first match; raises IndexError when the lookup found nothing.
e = matching_entries[0]
# Collect the text of every descendant element for each tag of interest,
# in document order: English glosses, kana readings, part-of-speech labels.
gloss, reb, pos = (
    [node.text for node in e.iter(tag)]
    for tag in ('gloss', 'reb', 'pos')
)
gloss, reb, pos

(['surgery', 'surgical procedure'],
 ['げかしゅじゅつ'],
 ['noun (common) (futsuumeishi)'])

In [34]:
class Entry:
    """Display fields collected from a single dictionary ``<entry>`` element.

    Attributes:
        gloss: text of every ``<gloss>`` descendant, in document order.
        pos: text of every ``<pos>`` descendant, in document order.
        furigana: text of every ``<reb>`` (reading) descendant, in document order.
    """

    def __init__(self, ent):
        """Collect glosses, part-of-speech labels and readings from ``ent``.

        Args:
            ent: an element object exposing ``iter(tag)`` whose items have a
                ``text`` attribute (lxml and ``xml.etree`` elements both do).
        """
        self.gloss = self._texts(ent, 'gloss')
        self.pos = self._texts(ent, 'pos')
        self.furigana = self._texts(ent, 'reb')

    @staticmethod
    def _texts(ent, tag):
        """Return the text of every ``tag`` descendant of ``ent``, skipping empty ones."""
        # Elements without text yield None; dropping them keeps the joins in
        # format_info() from failing and avoids printing the word "None".
        return [node.text for node in ent.iter(tag) if node.text]

    def format_info(self):
        """Return the entry as three lines: readings, glosses, parts of speech.

        Readings are separated by ``", "``; glosses and part-of-speech labels by
        ``"; "``. Separators appear only between items, never after the last one.
        """
        return '\n'.join((
            ', '.join(self.furigana),
            '; '.join(self.gloss),
            '; '.join(self.pos),
        ))

    def print_info(self):
        """Print the text produced by ``format_info()`` to standard output."""
        print(self.format_info())

In [35]:
# Wrap the entry element selected earlier (`e`) and print its readings,
# glosses and part-of-speech labels.
ent = Entry(e)
ent.print_info()

げかしゅじゅつ
surgery; surgical procedure; 
noun (common) (futsuumeishi)

In [36]:
import lxml.etree as ET
class EntryFinder:
    """Look up dictionary entries by kanji headword (``<keb>``) in a parsed XML file."""

    # The headword is bound as the XPath variable $keb rather than formatted
    # into the expression, so quotes in the search text cannot break the query.
    _KEB_XPATH = ".//entry[k_ele/keb[text()=$keb]]"

    def __init__(self, JMDICT_PATH):
        """Parse the dictionary XML at ``JMDICT_PATH`` and keep its root element.

        Args:
            JMDICT_PATH: path (or file-like source) accepted by ``ET.parse``.
        """
        self.__tree = ET.parse(JMDICT_PATH)
        self.root = self.__tree.getroot()

    def find_entry(self, text):
        """Return an ``Entry`` for each ``<entry>`` whose ``<keb>`` text equals ``text``.

        Args:
            text: headword to match exactly.

        Returns:
            A list of ``Entry`` objects; empty when nothing matches.
        """
        # Query this finder's own tree with the caller's text; the previous
        # version read the notebook globals `search_text` and `root` instead.
        matching_entries = self.root.xpath(self._KEB_XPATH, keb=text)
        return [Entry(e) for e in matching_entries]
    
        

### Morpheme Segmentation

In [39]:
%pip install janome

Collecting janome
  Downloading Janome-0.5.0-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading Janome-0.5.0-py2.py3-none-any.whl (19.7 MB)
   ---------------------------------------- 0.0/19.7 MB ? eta -:--:--
   ----------- ---------------------------- 5.8/19.7 MB 32.0 MB/s eta 0:00:01
   -------------------------- ------------- 12.8/19.7 MB 33.6 MB/s eta 0:00:01
   ---------------------------------------- 19.7/19.7 MB 34.4 MB/s eta 0:00:00
Installing collected packages: janome
Successfully installed janome-0.5.0
Note: you may need to restart the kernel to use updated packages.


In [41]:
from janome.tokenizer import Tokenizer

# Build a Janome tokenizer with no arguments.
tokenizer = Tokenizer()
# Sample sentence used to exercise the segmentation.
text = "救命救急センターに到着すると、ただちに緊急手術が行われることになった。"
# Print each token's surface form next to its base_form
# (e.g. 行わ -> 行う in the recorded output).
for token in tokenizer.tokenize(text):
    print(f"Surface: {token.surface}, Lemma: {token.base_form}")

Surface: 救命, Lemma: 救命
Surface: 救急, Lemma: 救急
Surface: センター, Lemma: センター
Surface: に, Lemma: に
Surface: 到着, Lemma: 到着
Surface: する, Lemma: する
Surface: と, Lemma: と
Surface: 、, Lemma: 、
Surface: ただちに, Lemma: ただちに
Surface: 緊急, Lemma: 緊急
Surface: 手術, Lemma: 手術
Surface: が, Lemma: が
Surface: 行わ, Lemma: 行う
Surface: れる, Lemma: れる
Surface: こと, Lemma: こと
Surface: に, Lemma: に
Surface: なっ, Lemma: なる
Surface: た, Lemma: た
Surface: 。, Lemma: 。
