In [12]:
import gzip
import pandas as pd
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

# JMDICT

In [2]:
with gzip.open('./dicts/JMdict_e.gz') as f:
    lines = []
    for line in f:
        if not line == b'<JMdict>\n':
            lines.append(line)
        else:
            break

with open('./dicts/jmdict_doc.txt', mode='wb') as f:
    f.writelines(lines)

In [3]:
with gzip.open('./dicts/JMdict_e.gz') as f:
    for _ in range(1000):
        print(f.readline())

b'<?xml version="1.0" encoding="UTF-8"?>\n'
b'<!-- Rev 1.09\n'
b'        Added the g_type attribute\n'
b'-->\n'
b'<!-- Rev 1.08\n'
b'        Delete <info> and <example> elements\n'
b'-->\n'
b'<!-- Rev 1.07\n'
b'        Revised POS tags for the adjectives\n'
b'-->\n'
b'<!-- Rev 1.06\n'
b'        Dropped the "*" from the end of the entry element.\n'
b'        Dropped the g_lang attribute in favour of xml:lang\n'
b'        Dropped the <lang> element and replaced it with <lsource> at the\n'
b'        sense level.\n'
b'        Moved <dial> from the entry level to the sense level.\n'
b'        Changed "info*" to "info?".\n'
b'-->\n'
b'<!-- Rev 1.05\n'
b'        Changed the <gram> element name to <pos>\n'
b'        Added the g_gend attribute\n'
b'        moved the s_inf element\n'
b'-->\n'
b'<!-- Rev 1.04\n'
b'        Changes:\n'
b'        Rename the project  "JMdict" and add the g_lang attribute to the\n'
b'        <gloss> entity - 08 May 1999\n'
b'        Moved the <gram>, <field> and <misc

In [4]:
with gzip.open('./dicts/JMdict_e.gz', mode='r') as f:
    tree = ET.parse(f)

In [5]:
root = tree.getroot()
root[1000][0].tag

'ent_seq'

In [6]:
jmdict = []
for entry in root:
    new_entry = {}
    for element in entry:
        if element.tag == 'ent_seq':
            new_entry['id'] = int(element.text)
        else:
            if element.tag not in new_entry:
                new_entry[element.tag] = []
            sub_dict = {}
            for sub_element in element:
                tag = sub_element.tag
                text = sub_element.text
                if tag in new_entry:
                    try:
                        sub_dict[tag] += ', '+text
                    except Exception as e:
                        pass
                else:
                    sub_dict[tag] = text
            new_entry[element.tag].append(sub_dict)
    jmdict.append(new_entry)


In [7]:
jmdict[102]

{'id': 1001290,
 'k_ele': [{'keb': '悍ましい', 'ke_inf': 'rarely-used kanji form'}],
 'r_ele': [{'reb': 'おぞましい'}],
 'sense': [{'pos': 'adjective (keiyoushi)',
   'misc': 'word usually written using kana alone',
   'gloss': 'repulsive'}]}

In [8]:
readings = pd.DataFrame([{'index': i, 'readings': len(entry['r_ele'])} for i, entry in enumerate(jmdict)])
readings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207032 entries, 0 to 207031
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   index     207032 non-null  int64
 1   readings  207032 non-null  int64
dtypes: int64(2)
memory usage: 3.2 MB


In [9]:
readings[readings.readings>2].head()

Unnamed: 0,index,readings
23,23,5
24,24,6
30,30,3
35,35,3
79,79,5


In [10]:
jmdict[23]

{'id': 1000310,
 'k_ele': [{'keb': '馬酔木'}],
 'r_ele': [{'reb': 'あせび'},
  {'reb': 'あしび'},
  {'reb': 'あせぼ'},
  {'reb': 'あせぶ'},
  {'reb': 'アセビ', 're_nokanji': None}],
 'sense': [{'pos': 'noun (common) (futsuumeishi)',
   'misc': 'word usually written using kana alone',
   'gloss': 'lily-of-the-valley'}]}

In [11]:
def find_id(id):
    for entry in jmdict:
        if entry['id'] == id:
            return entry

find_id(1006980)

{'id': 1006980,
 'k_ele': [{'keb': '其れから', 'ke_pri': 'ichi1'}],
 'r_ele': [{'reb': 'それから', 're_pri': 'ichi1'}],
 'sense': [{'pos': 'expressions (phrases, clauses, etc.)',
   'misc': 'word usually written using kana alone',
   'gloss': 'after that'}]}

## Generated tables

In [25]:
search_table = pd.read_csv('./tables/search_index.csv.gz', index_col=0)
search_table[600:625]

Unnamed: 0,id,entry
600,1002480,御転婆
601,1002480,於転婆
602,1002480,おてんば
603,1002490,お屠蘇
604,1002490,御屠蘇
605,1002490,おとそ
606,1002500,お土産
607,1002500,御土産
608,1002500,おみやげ
609,1002500,おみあげ


In [26]:
anki_entries = pd.read_csv('./tables/anki_entries.csv.gz', index_col=0)
anki_entries.iloc[600:625]

Unnamed: 0_level_0,expression,meaning,reading,part_of_speech
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1006970,それ,"1. that, it; 2. then, that point (in time), th...","其れ, 其 (rarely-used kanji form, search-only kan...",pronoun
1006980,それから,"And then, after that. (word usually written us...",其れから,"expressions (phrases, clauses, etc.)"
1006990,それだけ,"That much, as much, to that extent, only that,...","其れ丈, 其れだけ","adverb (fukushi), nouns which may take the gen..."
1007000,それで,"So, therefore, and, (and) then. (word usually ...",其れで (rarely-used kanji form),conjunction
1007010,それとも,"Or, or else. (word usually written using kana ...",其れとも (rarely-used kanji form),conjunction
1007020,それどころか,"On the contrary, far from it, if anything, in ...","それ処か, 其れ処か (rarely-used kanji form)",conjunction
1007030,それなり,"1. in itself, as it is, in its own way, as sui...","其れなり, 其れ形",adverb (fukushi)
1007040,それに,"Besides, in addition, also, moreover. (word us...",其れに (rarely-used kanji form),conjunction
1007050,それに加えて,In addition to this.,それにくわえて,"expressions (phrases, clauses, etc.)"
1007060,それほど,"To that degree, to that extent, that much. (wo...","それ程, 其れ程, 其程",adverb (fukushi)


In [27]:
anki_entries.iloc[601].T['meaning']

'And then, after that. (word usually written using kana alone)'

## Reading from kindle export

In [14]:
with open("G:/My Drive/learning/Languages/日本語/kindle notes exports/異世界拷問姫 (MF文庫J)-Notebook.html", encoding='utf8') as f:
    vocab_file = BeautifulSoup(f)


In [22]:
title = vocab_file.find("div", {'class':'bookTitle'})
print(title.text)
vocab = vocab_file.find_all("div", {'class':'noteText'})
print([entry.text.split('\n')[0] for entry in vocab])

異世界拷問姫 (MF文庫J)

['牧歌的', '木偶', '装飾', '動脈', '責め苛ん', '虚偽', '虚妄', '雑踏、', 'ざわつい', '沸点', '残虐', '嗜虐', '垣間見', '矮小', '一閃', '走馬灯', '残忍', '花弁']


# CC-CEDICT

In [7]:
with gzip.open('./dicts/cedict_1_0_ts_utf-8_mdbg.txt.gz', mode='rt', encoding='utf8') as f:
    for _ in range(500):
        print(f.readline())

# CC-CEDICT

# Community maintained free Chinese-English dictionary.

# 

# Published by MDBG

# 

# License:

# Creative Commons Attribution-ShareAlike 4.0 International License

# https://creativecommons.org/licenses/by-sa/4.0/

# 

# Referenced works:

# CEDICT - Copyright (C) 1997, 1998 Paul Andrew Denisowski

# 

# CC-CEDICT can be downloaded from:

# https://www.mdbg.net/chinese/dictionary?page=cc-cedict

# 

# Additions and corrections can be sent through:

# https://cc-cedict.org/editor/editor.php

# 

# For more information about CC-CEDICT see:

# https://cc-cedict.org/wiki/

# 

#! version=1

#! subversion=0

#! format=ts

#! charset=UTF-8

#! entries=121862

#! publisher=MDBG

#! license=https://creativecommons.org/licenses/by-sa/4.0/

#! date=2023-10-12T03:57:48Z

#! time=1697083068

% % [pa1] /percent (Tw)/

2019冠狀病毒病 2019冠状病毒病 [er4 ling2 yi1 jiu3 guan1 zhuang4 bing4 du2 bing4] /COVID-19, the coronavirus disease identified in 2019/

21三體綜合症 21三体综合症 [er4 shi2 yi1 san1 ti3 z

In [8]:
lines = []
with gzip.open('./dicts/cedict_1_0_ts_utf-8_mdbg.txt.gz', mode='rt', encoding='utf8') as f:
    for line in f:
        if line.startswith('#'):
            lines.append(line)
        else:
            break
with open('./dicts/cedict_doc.txt', mode='w') as f:
    f.writelines(lines)