In [2]:
%pip install --upgrade jamdict jamdict-data -q

Note: you may need to restart the kernel to use updated packages.


In [3]:
from jamdict import Jamdict
import json


# Shared dictionary handle; process_row() calls jam.lookup() on it for every kanji.
jam = Jamdict()

In [4]:
import unicodedata

def is_kanji(char):
    """Return True when ``char`` is a CJK unified or compatibility ideograph.

    The decision is based on the character's Unicode name; a character that
    has no name is reported as non-kanji.
    """
    # An empty default stands in for unnamed code points, so no marker matches.
    unicode_name = unicodedata.name(char, "")
    ideograph_markers = ("CJK UNIFIED IDEOGRAPH", "CJK COMPATIBILITY IDEOGRAPH")
    return any(marker in unicode_name for marker in ideograph_markers)




# Smoke test for is_kanji: three kanji followed by one hiragana character.
text = "言葉付き"

# Print each character next to the is_kanji verdict.
for char in text:
    print(char, is_kanji(char))

言 True
葉 True
付 True
き False


In [6]:
import pandas as pd

# Load the kanji table (JSON Lines, one record per line); later cells read its
# 'characters' and 'vocabulary' columns.
df = pd.read_json('kanjis.jsonl', lines=True)

In [7]:
df.head(20)

Unnamed: 0,characters,vocabulary
0,柾,[]
1,瘼,[]
2,祥,[]
3,襦,[]
4,當,[]
5,詬,[]
6,簇,[]
7,閈,[]
8,惕,[]
9,佟,[]


In [8]:
# Every value of the 'characters' column, for fast membership tests.
kanji_set = {character for character in df['characters']}

In [9]:
# Flatten the per-kanji 'vocabulary' lists into one set of already-known words.
vocab_set = {word for word_list in df['vocabulary'] for word in word_list}

In [10]:
len(kanji_set), len(vocab_set)

(9980, 6528)

In [12]:
# Steps:
# 1. For each kanji, search jamdict.
# 2. For each word in the returned entries,
#    discard it if any character is not in the kanji set.
from tqdm import tqdm
import romkan

# bar = tqdm(total=len(kanji_set))

def contains_non_kanji_chars(word):
    """Return True if ``word`` has a character that is not an accepted kanji.

    A character is accepted only when ``is_kanji`` recognises it and it is a
    member of the module-level ``kanji_set``. An empty word yields False.
    """
    return not all(is_kanji(symbol) and symbol in kanji_set for symbol in word)

def extract_reading(x):
    """Build the reading record for an entry dict from its first kana form.

    Only ``x['kana'][0]['text']`` is read. The result holds that kana string,
    a constant ``primary`` flag of True, and the romkan romanisation of the
    same string.
    """
    kana_text = x['kana'][0]['text']
    reading = dict(reading=kana_text, primary=True)
    reading['romaji'] = romkan.to_roma(kana_text)
    return reading

def extract_meanings(x):
    """Return the first English gloss text of every sense in entry dict ``x``.

    Senses are visited in order; a sense with no gloss whose ``lang`` is
    ``'eng'`` contributes nothing to the result.
    """
    no_match = object()  # sentinel, so any gloss text value is still kept
    first_glosses = []
    for sense in x['senses']:
        english_texts = (
            gloss['text'] for gloss in sense['SenseGloss'] if gloss['lang'] == 'eng'
        )
        # Pull at most one item: later glosses of the sense are never inspected.
        first_english = next(english_texts, no_match)
        if first_english is not no_match:
            first_glosses.append(first_english)
    return first_glosses

# Accumulator for the serial loop sketched (commented out) below; the parallel
# cell further down rebinds `vocabs` from its own results.
vocabs = []

# for i, x in df.iterrows():
#     kanji = x['characters']
   
#     res = jam.lookup(f'%{kanji}%')

#     for word in res.entries:
#         word = word.to_dict()
#         if word['kanji'][0]['text'] in vocab_set:
#             continue

#         if contains_non_kanji_chars(word['kanji'][0]['text']):
#             continue
#         vocabs.append({
#             'characters': word['kanji'][0]['text'],
#             'reading': extract_reading(word),
#             'meanings': extract_meanings(word),
#             'kanjis': [k for k in word['kanji'][0]['text']]
#         })
#     bar.update(1)

In [13]:
import romkan
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import pandas as pd

def process_row(index_row_tuple):
    """Collect new vocabulary records for the kanji held in one DataFrame row.

    Args:
        index_row_tuple: an ``(index, row)`` pair as yielded by
            ``DataFrame.iterrows()``; only ``row['characters']`` is read.

    Returns:
        A list of dicts with keys ``characters``, ``reading``, ``meanings``
        and ``kanjis``, one per dictionary entry that passed the filters.
        An entry is dropped when its first kanji form is already in
        ``vocab_set`` or contains a character rejected by
        ``contains_non_kanji_chars``.
    """
    _, row = index_row_tuple  # the index is not needed
    kanji = row['characters']

    # Only res.entries is consumed below, so skip the character and
    # named-entity lookups that jam.lookup performs by default.
    res = jam.lookup(f'%{kanji}%', lookup_chars=False, lookup_ne=False)

    vocab_list = []

    for entry in res.entries:
        entry = entry.to_dict()

        # An entry without any kanji form would raise IndexError on [0];
        # skip it instead of aborting the whole parallel run.
        kanji_forms = entry['kanji']
        if not kanji_forms:
            continue

        # The first kanji form is the spelling used for filtering and output.
        surface = kanji_forms[0]['text']

        if surface in vocab_set:
            continue

        if contains_non_kanji_chars(surface):
            continue

        vocab_list.append({
            'characters': surface,
            'reading': extract_reading(entry),
            'meanings': extract_meanings(entry),
            'kanjis': list(surface)
        })

    return vocab_list

# Fan the per-kanji lookups out over worker processes.
with ProcessPoolExecutor(max_workers=8) as executor:
    # tqdm ticks once per DataFrame row, so the total is the row count of df
    # (the thing actually iterated), not the size of the de-duplicated kanji_set.
    results = list(tqdm(executor.map(process_row, df.iterrows()), total=len(df)))

# Combine the per-row lists into a single flat list of vocab records.
# (The extra progress bar that used to follow was created after the work had
# already finished and was never closed, leaving a stray 0% bar in the output.)
vocabs = [vocab for sublist in results for vocab in sublist]

100%|██████████| 9980/9980 [30:00<00:00,  5.54it/s]  
  0%|          | 0/9980 [00:00<?, ?it/s]

In [16]:
len(vocabs)

297822

In [17]:
# One row per vocab record; the dict keys (characters, reading, meanings, kanjis)
# become the columns.
df_vocab = pd.DataFrame(vocabs)

In [18]:
df_vocab.head(20)

Unnamed: 0,characters,reading,meanings,kanjis
0,柾目,"{'reading': 'まさめ', 'primary': True, 'romaji': ...",[straight grain],"[柾, 目]"
1,正木,"{'reading': 'まさき', 'primary': True, 'romaji': ...",[Japanese spindletree (Euonymus japonicus)],"[正, 木]"
2,襦袢,"{'reading': 'じゅばん', 'primary': True, 'romaji':...",[undershirt (worn under a kimono)],"[襦, 袢]"
3,肌襦袢,"{'reading': 'はだじゅばん', 'primary': True, 'romaji...",[(Japanese-style) undershirt worn under nagaju...,"[肌, 襦, 袢]"
4,肉襦袢,"{'reading': 'にくじゅばん', 'primary': True, 'romaji...",[flesh-colored leotards],"[肉, 襦, 袢]"
5,長襦袢,"{'reading': 'ながじゅばん', 'primary': True, 'romaji...","[long, kimono-like garment, made of light fabr...","[長, 襦, 袢]"
6,赤古里,"{'reading': 'チョゴリ', 'primary': True, 'romaji':...",[chogori],"[赤, 古, 里]"
7,當十錢,"{'reading': 'とうじゅうぜん', 'primary': True, 'romaj...",[type of bronze Chinese coin first issued unde...,"[當, 十, 錢]"
8,瓦当,"{'reading': 'がとう', 'primary': True, 'romaji': ...",[decorative cap of an eave-end roof tile],"[瓦, 当]"
9,石敢當,"{'reading': 'いしがんどう', 'primary': True, 'romaji...",[shigandang],"[石, 敢, 當]"


In [19]:
# Persist the vocab table as JSON Lines: one JSON object per record.
df_vocab.to_json('vocab.jsonl', orient='records', lines=True)