# Kindle VocabMate JSON字典產生器

## 小学館 大辞泉

In [None]:
import sys
import re
import bs4
from pyglossary import Glossary

def parse_definition(text):
    """Reduce one raw Daijisen entry to a handful of HTML paragraphs.

    The markup in *text* is parsed with BeautifulSoup's ``lxml`` parser. The
    text of the first ``<span class="見出G">`` (when present) and of every
    ``<meaning>`` element is wrapped in its own ``<p>`` element.

    Args:
        text: Markup of a single dictionary entry.

    Returns:
        The generated ``<p>`` elements joined by newlines; an empty string
        when neither kind of element is found.
    """
    soup = bs4.BeautifulSoup(text, 'lxml')
    headword = soup.find('span', '見出G')
    # The headword paragraph, if any, always comes before the meanings.
    paragraphs = [f"<p>{headword.text}</p>"] if headword else []
    paragraphs.extend(f"<p>{node.text}</p>" for node in soup.find_all('meaning'))
    return "\n".join(paragraphs)



# Convert the tab-separated Daijisen dump ("<headword>\t<definition>" per
# line) into a pyglossary glossary and export it as JSON.
Glossary.init()
glos = Glossary()

# Matches headwords of the form "reading【kanji】"; compiled once for the loop.
headword_re = re.compile(r'(.*)【(.*)】')
skipped_lines = 0

with open('Daijisen/outdjs', 'r', encoding='utf-8') as f:
    for line in f:
        ss = line.strip().split('\t')
        if len(ss) < 2:
            # Blank lines and lines without a definition column would raise
            # IndexError on ss[1]; skip them and count the non-blank ones.
            if ss[0]:
                skipped_lines += 1
            continue

        # Only the part before the first "|" of the headword column is used.
        word = ss[0].split('|')[0]

        m = headword_re.search(word)
        if m:
            # Reading first, then every "┊"-separated spelling inside 【…】.
            eqv_word = [m.group(1)]
            eqv_word.extend(m.group(2).split('┊'))
        else:
            # No well-formed 【…】 part (including a "【" that is never
            # closed): index the headword exactly as written.
            eqv_word = [word]

        final_eqv_word = []
        for variant in eqv_word:
            if '（' in variant:
                # A parenthesised part yields two lookup forms: one keeping
                # the enclosed text (brackets removed) and one dropping it.
                final_eqv_word.append(variant.replace('（', '').replace('）', ''))
                final_eqv_word.append(re.sub(r'（.*?）', '', variant))
            else:
                final_eqv_word.append(variant)
        # Remove repeated variants (order preserved) so the same definition
        # is not indexed twice under one headword later on.
        final_eqv_word = list(dict.fromkeys(final_eqv_word))

        defi = ss[1]
        # Entries whose first headword starts with "#" keep their raw
        # definition text; everything else is reduced to simple HTML.
        if not final_eqv_word[0].startswith('#'):
            defi = parse_definition(defi)
        glos.addEntryObj(glos.newEntry(
            final_eqv_word,
            defi,
            defiFormat="h",  # "m" for plain text, "h" for HTML
        ))

if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s) in Daijisen/outdjs")

glos.setInfo("title", "大辞泉")
glos.setInfo("author", "松村明")
glos.write("test/daijisen.json", format="Json")

In [None]:
import json
import uuid

In [None]:
# Read back the JSON file that the glossary export wrote to
# test/daijisen.json. The following cell iterates `jobj` as a mapping of
# (possibly "|"-joined) headword strings to definitions.
jobj = None
with open('test/daijisen.json', 'r', encoding='utf-8') as f:
    jobj = json.load(f)

In [None]:
# Split the exported glossary into the two lookup tables:
#   word_index_dict: headword -> list of definition ids (as strings)
#   defi_dict:       definition id (as string) -> definition
word_index_dict = {}
defi_dict = {}
defi_id = 1

for key in jobj:
    # A key may bundle several headwords joined by "|"; splitting a key that
    # contains no "|" simply yields the key itself.
    eqv_word = key.split('|')

    current_id = str(defi_id)
    defi_dict[current_id] = jobj[key]

    # Every headword of the entry points at the same definition id.
    for word in eqv_word:
        word_index_dict.setdefault(word, []).append(current_id)
    defi_id += 1

In [None]:
# Persist the headword index (.jidx) and the definition table (.jdict) as
# UTF-8 JSON. ensure_ascii=False writes non-ASCII characters as-is instead
# of \uXXXX escapes.
with open('test/daijisen.jidx', 'w', encoding='utf-8') as outf:
    json.dump(word_index_dict, outf, ensure_ascii=False)
with open('test/daijisen.jdict', 'w', encoding='utf-8') as outf:
    json.dump(defi_dict, outf, ensure_ascii=False)

In [None]:
# Metadata record written next to the index/definition files as .jifo.
# Both counts are taken from the tables currently held in memory.
daijisen_ifo = {
    "name": "小学館·大辞泉",
    "lang": "ja",
    "bookname": "大辞泉",
    "author": "松村明",
    "publisher": "小学館",
    "website": "",
    "wordcount": len(word_index_dict),
    "definitioncount": len(defi_dict),
    "description": ""
}
with open('test/daijisen.jifo', 'w', encoding='utf-8') as outf:
    json.dump(daijisen_ifo, outf, ensure_ascii=False)

In [None]:
import json

class VocabDictionary():
    """Base class holding the descriptive attributes of a dictionary.

    Attributes are stored exactly as passed in: ``name``, ``lang`` and
    ``cover_image``.
    """

    def __init__(self, name, lang, cover_image):
        self.name = name
        self.lang = lang
        self.cover_image = cover_image

    def query(self, term):
        """Return the definition for *term*; the base class always returns ''."""
        return ''


class JsonDictionary(VocabDictionary):
    """Dictionary backed by a ``<prefix>.jidx`` / ``<prefix>.jdict`` file pair.

    ``<prefix>.jidx`` is loaded into ``indexes`` (term -> list of definition
    ids) and ``<prefix>.jdict`` into ``definitions`` (definition id ->
    definition text). Both files are read once, at construction time.
    """

    def __init__(self, name, lang, cover_image, file_prefix_path):
        super().__init__(name, lang, cover_image)
        self.file_prefix_path = file_prefix_path
        self.indexes = {}
        self.definitions = {}

        self.load_indexes()
        self.load_definitions()

    def load_indexes(self):
        """Load ``<prefix>.jidx`` into ``self.indexes``.

        A missing, unreadable or malformed file is reported on stdout and
        leaves ``self.indexes`` unchanged.
        """
        file_jidx = f"{self.file_prefix_path}.jidx"
        try:
            with open(file_jidx, 'r', encoding='utf-8') as f:
                self.indexes = json.load(f)
        # OSError: file missing/unreadable. ValueError: covers both
        # json.JSONDecodeError and UnicodeDecodeError. Anything else
        # (KeyboardInterrupt, programming errors) is left to propagate.
        except (OSError, ValueError):
            print("Load jidx file Error!")

    def load_definitions(self):
        """Load ``<prefix>.jdict`` into ``self.definitions``.

        A missing, unreadable or malformed file is reported on stdout and
        leaves ``self.definitions`` unchanged.
        """
        file_jdict = f"{self.file_prefix_path}.jdict"
        try:
            with open(file_jdict, 'r', encoding='utf-8') as f:
                self.definitions = json.load(f)
        except (OSError, ValueError):
            print("Load jdict file Error!")

    def query(self, term):
        """Return every definition indexed under *term*.

        Args:
            term: Headword to look up; it must match an index key exactly.

        Returns:
            The definitions joined by ``"\\n<br/><br/>\\n"``, or ``''`` when
            *term* is not in the index.

        Raises:
            KeyError: If the index refers to a definition id that is absent
                from the definition table.
        """
        if term not in self.indexes:
            return ''
        defis = [self.definitions[wid] for wid in self.indexes[term]]
        return "\n<br/><br/>\n".join(defis)


In [None]:
# Load the generated Daijisen files (test/daijisen.jidx and .jdict) back in.
dict1 = JsonDictionary('Daijisen', 'ja', 'test.jpg', 'test/daijisen')

In [None]:
# Smoke test: look up one headword in the dictionary loaded above.
dict1.query('あ')

## 三省堂 大辞林

In [None]:
import re
import bs4
from tqdm import tqdm

def parse_definition(text):
    """Convert one raw Daijirin entry into simplified HTML.

    The entry markup is parsed with BeautifulSoup (``lxml`` parser) and the
    text of selected ``<span>`` classes is emitted in this order:

    1. the first ``hg`` span, as a ``<p>`` (skipped when absent);
    2. for every ``sg`` span: its first ``posg`` span, each ``t_core`` span
       followed by ``<br/>``, each ``t_large`` span (followed by ``<br/>``
       when there is at least one), and each ``infg`` group rendered as
       ``[lbl] inf``;
    3. every ``t_derivatives`` span of the entry, as a ``<p>``.

    Args:
        text: Markup of a single dictionary entry.

    Returns:
        The generated fragments joined by newlines.
    """
    result_texts = []
    objSoup = bs4.BeautifulSoup(text, 'lxml')

    # An entry without an `hg` span used to raise AttributeError here.
    tag_hg = objSoup.find('span', 'hg')
    if tag_hg:
        result_texts.append(f"<p>{tag_hg.text}</p>")

    for tag_sg in objSoup.find_all('span', 'sg'):
        tag_posg = tag_sg.find('span', 'posg')
        if tag_posg:
            result_texts.append(f"<p>{tag_posg.text}</p>")

        for tcord in tag_sg.find_all('span', 't_core'):
            result_texts.append(f"<p>{tcord.text}</p>")
        result_texts.append("<br/>")

        tag_tsubsence = tag_sg.find_all('span', 't_large')
        for tsubsence in tag_tsubsence:
            result_texts.append(f"<p>{tsubsence.text}</p>")
        if tag_tsubsence:
            result_texts.append("<br/>")

        for tag_infg in tag_sg.find_all('span', 'infg'):
            tag_lbl = tag_infg.find('span', 'lbl')
            tag_inf = tag_infg.find('span', 'inf')
            # Emit whichever parts exist; a group with both parts renders
            # as "[lbl] inf", and a missing part no longer raises.
            parts = []
            if tag_lbl:
                parts.append(f"[{tag_lbl.text}]")
            if tag_inf:
                parts.append(tag_inf.text)
            if parts:
                result_texts.append(" ".join(parts))

    # Derivatives are searched on the whole entry, so they are collected once
    # after the `sg` loop; doing it inside the loop repeated them once per
    # `sg` span.
    for tag_deri in objSoup.find_all('span', 't_derivatives'):
        result_texts.append(f"<p>{tag_deri.text}</p>")
    return "\n".join(result_texts)

# Build the Daijirin lookup tables from a tab-separated dump
# ("<headword>\t<entry markup>" per line):
#   word_index_dict: headword -> list of definition ids (as strings)
#   defi_dict:       definition id (as string) -> simplified HTML definition
word_index_dict = {}
defi_dict = {}

defi_id = 1
skipped_lines = 0

with open('test/daijirin.txt', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        ss = line.strip().split('\t')
        if len(ss) < 2:
            # Blank lines and lines without a markup column would raise
            # IndexError on ss[1]; skip them and count the non-blank ones.
            if ss[0]:
                skipped_lines += 1
            continue

        headword = ss[0]
        objSoup = bs4.BeautifulSoup(ss[1], 'lxml')
        tag_role = objSoup.find('span', {'role': 'text'})
        if not tag_role:
            # Entries without a role="text" span are not indexed at all.
            continue

        # "A（B）" is indexed under both A and B. A headword without a
        # complete （…） pair (including an unclosed "（") is used as written.
        m = re.search(r'(.*?)（(.*?)）', headword)
        if m:
            eqv_word = [m.group(1), m.group(2)]
        else:
            eqv_word = [headword]
        eqv_word.append(tag_role.text.replace('・', '').strip())

        # Drop empty and repeated keys (order preserved); otherwise the same
        # definition id is listed twice under one headword and a lookup
        # returns the definition twice.
        eqv_word = [w for w in dict.fromkeys(eqv_word) if w]

        current_id = str(defi_id)
        defi_dict[current_id] = parse_definition(ss[1])

        for word in eqv_word:
            word_index_dict.setdefault(word, []).append(current_id)
        defi_id += 1

if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s) in test/daijirin.txt")

In [None]:
# Spot check: print the definition ids indexed under two headwords, then the
# first definition stored for the second one.
print(word_index_dict['みちびく'])
print(word_index_dict['導く'])
print(defi_dict[word_index_dict['導く'][0]])

In [None]:
# Persist the headword index (.jidx) and the definition table (.jdict) as
# UTF-8 JSON. ensure_ascii=False writes non-ASCII characters as-is instead
# of \uXXXX escapes.
with open('test/daijirin.jidx', 'w', encoding='utf-8') as outf:
    json.dump(word_index_dict, outf, ensure_ascii=False)
with open('test/daijirin.jdict', 'w', encoding='utf-8') as outf:
    json.dump(defi_dict, outf, ensure_ascii=False)

In [None]:
# Metadata record written next to the index/definition files as .jifo.
# NOTE(review): unlike the other dictionaries in this notebook, both counts
# are hard-coded literals rather than computed from the tables in memory;
# confirm they still match the generated .jidx/.jdict files.
daijirin_ifo = {
    "name": "三省堂·スーパー大辞林",
    "lang": "ja",
    "bookname": "スーパー大辞林",
    "author": "松村 明",
    "publisher": "三省堂",
    "website": "",
    "wordcount": 409529,  # hard-coded; previously len(word_index_dict)
    "definitioncount": 265051,  # hard-coded; previously len(defi_dict)
    "description": ""
}
with open('test/daijirin.jifo', 'w', encoding='utf-8') as outf:
    json.dump(daijirin_ifo, outf, ensure_ascii=False)

In [None]:
# Load the generated Daijirin files (test/daijirin.jidx and .jdict) back in.
dict2 = JsonDictionary('Daijirin', 'ja', 'test.jpg', 'test/daijirin')

In [None]:
# Number of index keys loaded from the .jidx file.
len(dict2.indexes)

In [None]:
# Number of definitions loaded from the .jdict file.
len(dict2.definitions)

## 譯典通

In [None]:
import re
import bs4
from tqdm import tqdm

def parse_definition(text):
    """Convert one raw Dr.eye entry into simplified HTML.

    The entry markup is parsed with BeautifulSoup (``lxml`` parser) and the
    text of selected ``<span>`` classes is emitted in this order:

    1. the first ``hg`` span, as a ``<p>`` (skipped when absent);
    2. for every ``sg`` span: its first ``posg`` span, each ``t_core`` span
       followed by ``<br/>``, and each ``t_large`` span (followed by
       ``<br/>`` when there is at least one);
    3. every ``t_derivatives`` span of the entry, as a ``<p>``.

    Args:
        text: Markup of a single dictionary entry.

    Returns:
        The generated fragments joined by newlines.
    """
    result_texts = []
    objSoup = bs4.BeautifulSoup(text, 'lxml')

    # An entry without an `hg` span used to raise AttributeError here.
    tag_hg = objSoup.find('span', 'hg')
    if tag_hg:
        result_texts.append(f"<p>{tag_hg.text}</p>")

    for tag_sg in objSoup.find_all('span', 'sg'):
        tag_posg = tag_sg.find('span', 'posg')
        if tag_posg:
            result_texts.append(f"<p>{tag_posg.text}</p>")

        for tcord in tag_sg.find_all('span', 't_core'):
            result_texts.append(f"<p>{tcord.text}</p>")
        result_texts.append("<br/>")

        tag_tsubsence = tag_sg.find_all('span', 't_large')
        for tsubsence in tag_tsubsence:
            result_texts.append(f"<p>{tsubsence.text}</p>")
        if tag_tsubsence:
            result_texts.append("<br/>")

    # Derivatives are searched on the whole entry, so they are collected once
    # after the `sg` loop; doing it inside the loop repeated them once per
    # `sg` span.
    for tag_deri in objSoup.find_all('span', 't_derivatives'):
        result_texts.append(f"<p>{tag_deri.text}</p>")
    return "\n".join(result_texts)


# Build the Dr.eye lookup tables from a tab-separated dump
# ("<headword>\t<entry markup>" per line):
#   word_index_dict: headword -> list of definition ids (as strings)
#   defi_dict:       definition id (as string) -> simplified HTML definition
word_index_dict = {}
defi_dict = {}

defi_id = 1
skipped_lines = 0

with open('test/dreye.txt', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        ss = line.strip().split('\t')
        if len(ss) < 2:
            # Blank lines and lines without a markup column would raise
            # IndexError on ss[1]; skip them and count the non-blank ones.
            if ss[0]:
                skipped_lines += 1
            continue

        headword = ss[0]
        objSoup = bs4.BeautifulSoup(ss[1], 'lxml')
        tag_role = objSoup.find('span', {'role': 'text'})
        if not tag_role:
            # Entries without a role="text" span are not indexed at all.
            continue

        # "A（B）" is indexed under both A and B. A headword without a
        # complete （…） pair (including an unclosed "（") is used as written.
        m = re.search(r'(.*?)（(.*?)）', headword)
        if m:
            eqv_word = [m.group(1), m.group(2)]
        else:
            eqv_word = [headword]
        eqv_word.append(tag_role.text.replace('・', '').strip())

        # Drop empty and repeated keys (order preserved); otherwise the same
        # definition id is listed twice under one headword and a lookup
        # returns the definition twice.
        eqv_word = [w for w in dict.fromkeys(eqv_word) if w]

        current_id = str(defi_id)
        defi_dict[current_id] = parse_definition(ss[1])

        for word in eqv_word:
            word_index_dict.setdefault(word, []).append(current_id)
        defi_id += 1

if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s) in test/dreye.txt")

In [None]:
# Spot check: print the definition ids indexed under one headword, then the
# first definition stored for it.
print(word_index_dict['atmosphere'])
print(defi_dict[word_index_dict['atmosphere'][0]])

In [None]:
# Persist the headword index (.jidx) and the definition table (.jdict) as
# UTF-8 JSON. ensure_ascii=False writes non-ASCII characters as-is instead
# of \uXXXX escapes.
with open('test/dreye.jidx', 'w', encoding='utf-8') as outf:
    json.dump(word_index_dict, outf, ensure_ascii=False)
with open('test/dreye.jdict', 'w', encoding='utf-8') as outf:
    json.dump(defi_dict, outf, ensure_ascii=False)

In [None]:
# Metadata record written next to the index/definition files as .jifo.
# Both counts are taken from the tables currently held in memory.
dreye_ifo = {
    "name": "譯典通英漢雙向辭典",
    "lang": "en",
    "bookname": "譯典通英漢雙向辭典",
    "author": "英業達",
    "publisher": "英業達股份有限公司",
    "website": "",
    "wordcount": len(word_index_dict),
    "definitioncount": len(defi_dict),
    "description": ""
}
with open('test/dreye.jifo', 'w', encoding='utf-8') as outf:
    json.dump(dreye_ifo, outf, ensure_ascii=False)