# Parse

In [None]:
from bs4 import BeautifulSoup
import urllib, tqdm, json, pickle

from extras.utils import *
# `import urllib` on its own does not guarantee that the `request` submodule is loaded.
import urllib.request

db = load_hieroglyphDB_from_json('HieroglyphDB.json')

# CSS class string of the <a> elements collected from every vocabulary page.
KANJI_ANCHOR_CLASS = "subject-character subject-character--kanji subject-character--grid subject-character--unlocked"

voc_dict = {}  # vocabulary page URL -> list of hrefs scraped from that page

BADVOCS = []   # URLs of vocabulary pages on which no matching anchor was found

for lvl in range(1,61):
  voc_lvl = [i for i in db if i.level==lvl and i.hieroglyph_type==HieroglyphType.VOCAB]

  for voc in tqdm.tqdm(voc_lvl):
    link = voc.resource_paths.wanikani_link
    if link in voc_dict:
      # This URL was already fetched; do not hit the network again.
      continue
    # The context manager closes the HTTP response instead of leaking the connection.
    with urllib.request.urlopen(link, timeout = 120) as response:
      elems = BeautifulSoup(response).find_all('a', class_=KANJI_ANCHOR_CLASS)
    hrefs = [element['href'] for element in elems if 'href' in element.attrs]
    if not hrefs:
      # Store the URL string rather than the object: the later cells call
      # `.split('/')` on each entry and compare it with `wanikani_link`.
      BADVOCS.append(link)
      print(link)
    voc_dict[link] = hrefs
  
  print(f'\nlvl {lvl} Done\n')

  # Rewrite both result files after every level, so finished levels are on disk
  # even if a later request fails.
  with open('voc.json', 'w') as f:
    json.dump(voc_dict, f)
  with open('badvocs.pkl', 'wb') as f:
    pickle.dump(BADVOCS, f)

In [None]:
from bs4 import BeautifulSoup
import urllib, tqdm, json, pickle

from extras.utils import *
# Reload the working database and the cached lookup tables from disk.
db = load_hieroglyphDB_from_json('HieroglyphDB1.json')

# Vocabulary links: page URL -> list of kanji hrefs.
with open('voc.json', 'r') as voc_file:
  voc_dict = json.load(voc_file)

# Kanji links: page URL -> list of component hrefs
# (how kan.json was produced is not shown in this notebook).
with open('kan.json', 'r') as kan_file:
  kan_dict = json.load(kan_file)

# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# a badvocs.pkl that was produced locally.
with open('badvocs.pkl', 'rb') as badvocs_file:
  badvocs = pickle.load(badvocs_file)

In [4]:
# MOST IMPORTANT FIX
# Overwrite the reading mnemonic of the first entry whose symbol is 湯気.
yuge_entries = [entry for entry in db if entry.symbol == '湯気']
yuge_entries[0].mnemonics.reading = '''You’re getting all hot and sweaty, but the feeling makes you incredibly happy and gay (げ). Then someone asks, as they look at you in the steam: “Hey, you gay?” (ゆげ). You answer however you want to answer.'''

# KANJI FIX

In [132]:
# Build the kanji entry 脈 ("vein") for level 31 and register its components.
new_kanji = Hieroglyph(
  symbol='脈',
  level=31,
  hieroglyph_type=HieroglyphType.KANJI,
  meanings=['vein'],
  readings=Reading(
    onyomi=['みゃく'],
  ),
  mnemonics=Mnemonics(
    meaning="Under the moon while you sit on a cliff, you inject a syringe of tofu right into your vein. Ahh, tofu. You're addicted to it. You sneak out to the cliff at night to get your tofu fix, injecting it straight into your vein. Feel the tofu running through your veins. Feel the tofu goodness spreading all over your body. It's so intense you can even taste a little tofu in your mouth.",
    reading='''Eventually, your friends and family find you at the cliff, tripping on tofu. Each and every vein on your body is engorged, pulsating with the effects of tofu. Your friends try and take you home. "You cyan't myake (みゃく) me!" you slur, suffering from the effects of the tofu. "None of you cyan myake me go home! You cyan’t myake me! Nyah!" Say this out loud, it'll help you remember. Really focus on the "myake" be sure to say it like you're all bugged out from tofu overdose.'''
  ),
  resource_paths=ResourcePaths(
    wanikani_link='http://www.wanikani.com/kanji/%E8%84%88',
  )
)

# Component links for the new kanji, keyed by its own page URL.
kan_dict['http://www.wanikani.com/kanji/%E8%84%88'] = ['https://www.wanikani.com/radicals/moon', 'https://www.wanikani.com/radicals/cliff', 'https://www.wanikani.com/radicals/tofu']

# Insert directly after the level-31 entry 菜, but only if no 脈 kanji exists yet,
# so that re-running this cell does not add a second copy.
already_present = any(i.symbol=='脈' and i.hieroglyph_type==HieroglyphType.KANJI for i in db)
if not already_present:
  idx_where = [n for n, i in enumerate(db) if i.level==31 and i.symbol=='菜'][0]+1
  db.hieroglyphs = db.hieroglyphs[:idx_where] + [new_kanji] + db.hieroglyphs[idx_where:]

# Sanity check: every kanji referenced by a vocabulary item must be a kanji of
# the same or an earlier level. Only the URL part after '.com' is compared, so
# scheme and host spelling do not matter.
for lvl in range(1,61):
  voc_lvl = [i for i in db if i.level==lvl and i.hieroglyph_type==HieroglyphType.VOCAB]
  # A set makes the membership test in the inner loop a hash lookup.
  kan_prev = {i.resource_paths.wanikani_link.split('.com')[1] for i in db if i.level<=lvl and i.hieroglyph_type==HieroglyphType.KANJI}
  for voc in voc_lvl:
    voc_link = voc.resource_paths.wanikani_link
    for k in voc_dict[voc_link]:
      assert k.split('.com')[1] in kan_prev, f'{voc_link} (level {lvl}) references unavailable kanji {k}'

In [133]:
# center fix
# 央 occurs twice: a level-13 entry (treated here as the radical) and a level-4
# entry (treated here as the kanji). The level-13 entry takes over the level-4
# mnemonic, the level-4 entry gets a short "same as the radical" text, and the
# radical is moved to level 4.
center_rad_hits = [pos for pos, glyph in enumerate(db) if glyph.level == 13 and glyph.symbol == '央']
center_radical = db[center_rad_hits[0]]
center_kan_hits = [pos for pos, glyph in enumerate(db) if glyph.level == 4 and glyph.symbol == '央']
center_kanji = db[center_kan_hits[0]]

center_radical.mnemonics.meaning = center_kanji.mnemonics.meaning
center_kanji.mnemonics.meaning = 'The radical for center and the kanji for center are exactly the same. That means if you know your radicals well, you know the meaning of this kanji as well! How swell.'
center_radical.level = 4

# Target slot: right behind the level-4 radical 車.
car_hits = [pos for pos, glyph in enumerate(db)
            if glyph.level == 4 and glyph.symbol == '車' and glyph.hieroglyph_type == HieroglyphType.RADICAL]
idx_where = car_hits[0] + 1
radical_hits = [pos for pos, glyph in enumerate(db)
                if glyph.symbol == '央' and glyph.hieroglyph_type == HieroglyphType.RADICAL]
idx_from = radical_hits[0]

# NOTE(review): the target index is computed before the pop; if idx_from were
# smaller than idx_where the entry would land one slot later. Confirm ordering.
moved_entry = db.hieroglyphs.pop(idx_from)
db.hieroglyphs.insert(idx_where, moved_entry)

In [134]:
# man fix
# Same procedure as for the other duplicated symbols: the level-15 男 entry
# inherits the mnemonic of the level-4 男 entry, the level-4 entry gets a
# "same as the radical" text, and the radical is relocated to level 4.
man_symbol = '男'
man_rad_hits = [pos for pos, glyph in enumerate(db) if glyph.level == 15 and glyph.symbol == man_symbol]
man_radical = db[man_rad_hits[0]]
man_kan_hits = [pos for pos, glyph in enumerate(db) if glyph.level == 4 and glyph.symbol == man_symbol]
man_kanji = db[man_kan_hits[0]]

man_radical.mnemonics.meaning = man_kanji.mnemonics.meaning
man_kanji.mnemonics.meaning = 'The radical for man and the kanji for man are exactly the same. That means if you know your radicals well, you know the meaning of this kanji too! Man, kanji are easy sometimes.'
man_radical.level = 4

# Destination: immediately after the level-4 radical 彳.
anchor_hits = [pos for pos, glyph in enumerate(db)
               if glyph.level == 4 and glyph.symbol == '彳' and glyph.hieroglyph_type == HieroglyphType.RADICAL]
idx_where = anchor_hits[0] + 1
source_hits = [pos for pos, glyph in enumerate(db)
               if glyph.symbol == man_symbol and glyph.hieroglyph_type == HieroglyphType.RADICAL]
idx_from = source_hits[0]

# Take the radical out of its old slot and put it into the new one.
relocated = db.hieroglyphs.pop(idx_from)
db.hieroglyphs.insert(idx_where, relocated)

In [135]:
# house fix
# The 家 radical receives the kanji's meaning mnemonic, the kanji gets a text
# that refers back to the radical, and the radical moves to level 6 behind 禾.
house_rad_hits = [pos for pos, glyph in enumerate(db)
                  if glyph.symbol == '家' and glyph.hieroglyph_type == HieroglyphType.RADICAL]
idx_from = house_rad_hits[0]

house_radical = db[idx_from]
house_kan_hits = [pos for pos, glyph in enumerate(db)
                  if glyph.symbol == '家' and glyph.hieroglyph_type == HieroglyphType.KANJI]
house_kanji = db[house_kan_hits[0]]

house_radical.mnemonics.meaning = house_kanji.mnemonics.meaning
house_kanji.mnemonics.meaning = '''The house radical makes up the house kanji! Pretty easy. This kanji is also used to mean home, so try to remember that too. I guess not all houses are homes, and not all homes are houses. But there's a pretty big overlap, right? And in most situations, this kanji's got you covered.'''
house_radical.level = 6

# New position: the slot after the radical 禾 (looked up before the pop below).
grain_hits = [pos for pos, glyph in enumerate(db)
              if glyph.symbol == '禾' and glyph.hieroglyph_type == HieroglyphType.RADICAL]
idx_where = grain_hits[0] + 1
popped_radical = db.hieroglyphs.pop(idx_from)
db.hieroglyphs.insert(idx_where, popped_radical)

In [136]:
# genius/talent fix
# Point the 才 radical at the "talent" radical page.
talent_hits = [pos for pos, glyph in enumerate(db)
               if glyph.symbol == '才' and glyph.hieroglyph_type == HieroglyphType.RADICAL]
talent_radical = db[talent_hits[0]]
talent_radical.resource_paths.wanikani_link = 'https://www.wanikani.com/radicals/talent'

In [137]:
# courage fix
# The 勇 radical takes the kanji's meaning mnemonic, the kanji gets the
# "same as the radical" text, and the radical is moved to level 15.
courage_rad_hits = [pos for pos, glyph in enumerate(db)
                    if glyph.symbol == '勇' and glyph.hieroglyph_type == HieroglyphType.RADICAL]
idx_from = courage_rad_hits[0]

courage_radical = db[idx_from]
courage_kan_hits = [pos for pos, glyph in enumerate(db)
                    if glyph.symbol == '勇' and glyph.hieroglyph_type == HieroglyphType.KANJI]
courage_kanji = db[courage_kan_hits[0]]

courage_radical.mnemonics.meaning = courage_kanji.mnemonics.meaning
courage_kanji.mnemonics.meaning = '''The radical for courage and the kanji for courage are exactly the same. That means if you know your radicals well, you know the meaning of this kanji too!'''
courage_radical.level = 15

# Unlike the other moves, the target is one slot *before* the index of the
# radical 𭕄 (index - 1). NOTE(review): confirm this offset is intended.
anchor_hits = [pos for pos, glyph in enumerate(db)
               if glyph.symbol == '𭕄' and glyph.hieroglyph_type == HieroglyphType.RADICAL]
idx_where = anchor_hits[0] - 1
popped_radical = db.hieroglyphs.pop(idx_from)
db.hieroglyphs.insert(idx_where, popped_radical)

In [138]:
# New level-30 radical 片 ("One Sided"), inserted right after the first radical
# whose primary meaning is 'hills'.
one_sided_mnemonic = '''This looks a bit like a waiter walking toward you with a platter in their hands... but wait. It's only half of a waiter. It's a one sided waiter!'''

new_rad = Hieroglyph(
  symbol='片',
  level=30,
  hieroglyph_type=HieroglyphType.RADICAL,
  meanings=['One Sided'],
  mnemonics=Mnemonics(meaning=one_sided_mnemonic),
  resource_paths=ResourcePaths(wanikani_link='https://www.wanikani.com/radicals/one-sided'),
)

hills_hits = [pos for pos, glyph in enumerate(db)
              if glyph.meanings[0] == 'hills' and glyph.hieroglyph_type == HieroglyphType.RADICAL]
idx_where = hills_hits[0] + 1

db.hieroglyphs.insert(idx_where, new_rad)

In [139]:
# New level-30 radical 系 ("Lineage"), also inserted directly behind the first
# radical whose primary meaning is 'hills'.
lineage_mnemonic = '''A thread goes through the ground. It traces your lineage. You follow the thread all the way back through many generations, to find out your lineage involves Genghis Khan. Feel the thread and the dirt in your hand and actually trace the lineage.'''

new_rad = Hieroglyph(
  symbol='系',
  level=30,
  hieroglyph_type=HieroglyphType.RADICAL,
  meanings=['Lineage'],
  mnemonics=Mnemonics(meaning=lineage_mnemonic),
  resource_paths=ResourcePaths(wanikani_link='https://www.wanikani.com/radicals/lineage'),
)

hills_hits = [pos for pos, glyph in enumerate(db)
              if glyph.meanings[0] == 'hills' and glyph.hieroglyph_type == HieroglyphType.RADICAL]
idx_where = hills_hits[0] + 1

db.hieroglyphs.insert(idx_where, new_rad)

In [140]:
# New level-30 radical 冘 ("Pirate"), placed immediately after the radical 系.
pirate_mnemonic = '''This kind of looks like big (大), which I think you'll remember you learned by pretending this was a big guy. This big guy, however, has a funny leg... a peg leg. That makes him a pirate. Also, his arms are in the "YARRRR" position.'''

new_rad = Hieroglyph(
  symbol='冘',
  level=30,
  hieroglyph_type=HieroglyphType.RADICAL,
  meanings=['Pirate'],
  mnemonics=Mnemonics(meaning=pirate_mnemonic),
  resource_paths=ResourcePaths(wanikani_link='https://www.wanikani.com/radicals/pirate'),
)

lineage_hits = [pos for pos, glyph in enumerate(db)
                if glyph.symbol == '系' and glyph.hieroglyph_type == HieroglyphType.RADICAL]
idx_where = lineage_hits[0] + 1

db.hieroglyphs.insert(idx_where, new_rad)

In [141]:
# New level-30 radical 甲 ("Turtle Shell"), placed immediately after the radical 冘.
turtle_shell_mnemonic = '''Poor turtles. We don't like them here. This is a turtle shell. See the shell with just a tail remaining? It's the only thing we didn't remove off the turtle. It's just a shell with a tail, so basically it's just a turtle shell.'''

new_rad = Hieroglyph(
  symbol='甲',
  level=30,
  hieroglyph_type=HieroglyphType.RADICAL,
  meanings=['Turtle Shell'],
  mnemonics=Mnemonics(meaning=turtle_shell_mnemonic),
  resource_paths=ResourcePaths(wanikani_link='https://www.wanikani.com/radicals/turtle-shell'),
)

pirate_hits = [pos for pos, glyph in enumerate(db)
               if glyph.symbol == '冘' and glyph.hieroglyph_type == HieroglyphType.RADICAL]
idx_where = pirate_hits[0] + 1

db.hieroglyphs.insert(idx_where, new_rad)

In [142]:
# fix stiff
# Replace the meaning mnemonic of the kanji 硬.
stiff_hits = [pos for pos, glyph in enumerate(db)
              if glyph.symbol == '硬' and glyph.hieroglyph_type == HieroglyphType.KANJI]
stiff_kanji = db[stiff_hits[0]]
stiff_kanji.mnemonics.meaning = 'You feel like a stone that’s been renewed. How do renewed stones feel? Stiff. Very stiff. Think of a stone being renewed like it is having something like a spa day. Pedicure, manicure, the whole shebang. It’ll come out all stiff and hardened, cause that’s how good stones are.'

In [143]:
# Hand-written component lists that overwrite (or add) entries in kan_dict.
# Keys are kanji page URLs, values are the component links for that kanji.
# The numeric comments are carried over from the original cell
# (presumably WaniKani levels; the meaning of the "!" markers is not documented).
kan_dict.update({
  # 3
  'http://wanikani.com/kanji/%E6%AF%8D': ['https://wanikani.com/radicals/sun', 'https://wanikani.com/radicals/drop'],  # mom
  'http://wanikani.com/kanji/%E5%8C%97': ['https://wanikani.com/radicals/fingers', 'https://wanikani.com/radicals/spoon'],  # north
  'http://wanikani.com/kanji/%E5%86%AC': ['https://wanikani.com/radicals/winter', 'http://wanikani.com/radicals/two'],  # winter

  # 5
  'http://wanikani.com/kanji/%E7%A9%BA': ['https://wanikani.com/radicals/roof', 'https://wanikani.com/radicals/legs', 'https://wanikani.com/radicals/construction'],  # sky

  # 14

  # 15
  'http://wanikani.com/kanji/%E5%A4%89': ['https://wanikani.com/radicals/winter', 'https://wanikani.com/radicals/red'],  # change

  # 17
  'http://wanikani.com/kanji/%E6%81%8B': ['https://wanikani.com/radicals/red', 'https://wanikani.com/radicals/heart'],  # romance

  # 30!
  'http://wanikani.com/kanji/%E8%83%8C': ['https://wanikani.com/kanji/%E5%8C%97', 'https://wanikani.com/radicals/moon'],  # back

  # 31
  'http://wanikani.com/kanji/%E7%86%9F': ['https://www.wanikani.com/radicals/sock', 'https://www.wanikani.com/radicals/circle', 'https://www.wanikani.com/radicals/boil'],  # ripen
  'http://wanikani.com/kanji/%E6%B9%BE': ['https://www.wanikani.com/radicals/tsunami', 'https://www.wanikani.com/radicals/red', 'https://www.wanikani.com/radicals/bow'],  # gulf
  'http://wanikani.com/kanji/%E8%B7%A1': ['https://www.wanikani.com/radicals/foot', 'https://www.wanikani.com/radicals/red'],  # traces
  'http://wanikani.com/kanji/%E9%91%91': ['https://www.wanikani.com/radicals/gold', 'https://wanikani.com/kanji/%E7%9B%A3'],  # Model

  # 41!
  'http://wanikani.com/kanji/%E8%89%A6': ['https://www.wanikani.com/radicals/boat', 'https://wanikani.com/kanji/%E7%9B%A3'],  # warship

  # 45
  'http://wanikani.com/kanji/%E5%A1%BE': ['https://www.wanikani.com/radicals/sock', 'https://www.wanikani.com/radicals/circle', 'https://www.wanikani.com/radicals/dirt'],
  # !
  'http://wanikani.com/kanji/%E7%A1%AC': ['https://www.wanikani.com/radicals/stone', 'http://wanikani.com/kanji/%E6%9B%B4'],

  'http://wanikani.com/kanji/%E9%83%AD': ['https://www.wanikani.com/radicals/sock', 'https://www.wanikani.com/radicals/building'],

  'http://wanikani.com/kanji/%E4%BA%AB': ['https://www.wanikani.com/radicals/sock'],

  'http://wanikani.com/kanji/%E8%97%8D': ['https://www.wanikani.com/radicals/flowers', 'http://wanikani.com/kanji/%E7%9B%A3'],
})



In [144]:
# FINALLY ADD RADICALS
# Attach to every kanji the component links recorded for it in kan_dict, after
# checking that each component is a non-vocabulary entry of the same or an
# earlier level. Only the URL part after '.com' is compared.
for lvl in range(1,61):
  kan_lvl  = [i for i in db if i.level==lvl and i.hieroglyph_type==HieroglyphType.KANJI]
  # A set makes the per-component membership test a hash lookup.
  rad_prev = {i.resource_paths.wanikani_link.split('.com')[1] for i in db if i.level<=lvl and i.hieroglyph_type!=HieroglyphType.VOCAB}
  for kan in kan_lvl:
    kan_link = kan.resource_paths.wanikani_link
    rads = kan_dict[kan_link]
    for r in rads:
      assert r.split('.com')[1] in rad_prev, f'{kan_link} (level {lvl}) references unavailable component {r}'
    kan.resource_paths.radical_links = rads

# VOC FIX

In [73]:
# Show only the last path segment of every bad-vocabulary URL.
[bad_link.rsplit('/', 1)[-1] for bad_link in badvocs]

['%E4%B8%96%E8%BE%9E',
 '%E6%A2%85%E5%B9%B2',
 '%E5%AE%85',
 '%E8%8F%93%E5%AD%90%E5%B1%8B',
 '%E3%80%9C%E5%88%B8',
 '%E8%B2%B8%E3%81%97',
 '%E8%BE%BC%E3%82%80',
 '%E9%A3%9B%E3%81%B3%E8%BE%BC%E3%81%BF%E8%87%AA%E6%AE%BA',
 '%E3%80%9C%E7%95%91',
 '%E7%8F%8D',
 '%E5%A6%83',
 '%E8%BB%92%E4%B8%A6']

In [145]:
# Attach to every vocabulary item the kanji links scraped for it, after checking
# that each kanji belongs to the same or an earlier level. Only the URL part
# after '.com' is compared.
for lvl in range(1,61):
  voc_lvl = [i for i in db if i.level==lvl and i.hieroglyph_type==HieroglyphType.VOCAB]
  # A set makes the per-kanji membership test a hash lookup.
  kan_prev = {i.resource_paths.wanikani_link.split('.com')[1] for i in db if i.level<=lvl and i.hieroglyph_type==HieroglyphType.KANJI}
  for voc in voc_lvl:
    voc_link = voc.resource_paths.wanikani_link
    kans = voc_dict[voc_link]
    for k in kans:
      assert k.split('.com')[1] in kan_prev, f'{voc_link} (level {lvl}) references unavailable kanji {k}'
    voc.resource_paths.kanji_links = kans

In [None]:
# Hand-written kanji links for the vocabulary pages listed in `badvocs`.
# The pairing is positional: the n-th list belongs to the n-th URL in `badvocs`.
manual_kanji_links = [
  ['http://wanikani.com/kanji/%E4%B8%96', 'http://wanikani.com/kanji/%E8%BE%9E'],
  ['http://wanikani.com/kanji/%E6%A2%85', 'http://wanikani.com/kanji/%E5%B9%B2'],
  ['http://wanikani.com/kanji/%E5%AE%85'],
  ['http://wanikani.com/kanji/%E8%8F%93', 'http://wanikani.com/kanji/%E5%AD%90', 'http://wanikani.com/kanji/%E5%B1%8B'],
  ['http://wanikani.com/kanji/%E5%88%B8'],
  ['http://wanikani.com/kanji/%E8%B2%B8'],
  ['http://wanikani.com/kanji/%E8%BE%BC'],
  ['http://wanikani.com/kanji/%E9%A3%9B', 'http://wanikani.com/kanji/%E8%BE%BC', 'http://wanikani.com/kanji/%E8%87%AA', 'http://wanikani.com/kanji/%E6%AE%BA'],
  ['http://wanikani.com/kanji/%E7%95%91'],
  ['http://wanikani.com/kanji/%E7%8F%8D'],
  ['http://wanikani.com/kanji/%E5%A6%83'],
  ['http://wanikani.com/kanji/%E8%BB%92', 'http://wanikani.com/kanji/%E4%B8%A6'],
]

for position, bad_link in enumerate(badvocs):
  # First entry whose page URL equals the recorded bad URL.
  target = [entry for entry in db if entry.resource_paths.wanikani_link == bad_link][0]
  target.resource_paths.kanji_links = manual_kanji_links[position]

# SAVE

In [None]:
def _normalize_link(link):
  """Rewrite a link to the canonical form http://wanikani.com/...

  Replaces 'https' with 'http' and drops the 'www.' host prefix. The same rule
  is applied to page links and component links so both always compare equal.
  """
  return link.replace('https', 'http').replace('http://www.wanikani.com', 'http://wanikani.com')

# Pass 1: canonicalise every entry's own page link and collect the set of them.
for d in db:
  d.resource_paths.wanikani_link = _normalize_link(d.resource_paths.wanikani_link)
db_links = {d.resource_paths.wanikani_link for d in db}

# Pass 2: canonicalise the component links and verify that each one points at
# an entry that exists in the database.
for d in db:
  d.resource_paths.kanji_links = [_normalize_link(i) for i in d.resource_paths.kanji_links]
  d.resource_paths.radical_links = [_normalize_link(i) for i in d.resource_paths.radical_links]
  for i in d.resource_paths.kanji_links:
    assert i in db_links, f'{d.resource_paths.wanikani_link}: unknown kanji link {i}'
  for i in d.resource_paths.radical_links:
    assert i in db_links, f'{d.resource_paths.wanikani_link}: unknown radical link {i}'

In [10]:
from enum import Enum
from dataclasses import is_dataclass

# with open('HieroglyphDB.pkl', 'wb') as f:
#   pickle.dump(db, f)

def enum_to_value(obj):
  """Return ``obj.value`` for an Enum member; return any other object unchanged."""
  if isinstance(obj, Enum):
    return obj.value
  return obj

def dataclass_to_dict(obj):
  """Recursively convert ``obj`` into JSON-friendly containers.

  - dataclass instance -> dict of field name to converted field value
  - dict               -> dict with the same keys and converted values
  - list / tuple       -> list / tuple of converted items
  - Enum member        -> its value (via ``enum_to_value``)
  - anything else      -> returned unchanged

  Dataclass *classes* (as opposed to instances) are returned unchanged:
  ``is_dataclass`` is true for both, but only instances carry field values.
  """
  if is_dataclass(obj) and not isinstance(obj, type):
    return {
      field_name: dataclass_to_dict(getattr(obj, field_name))
      for field_name in obj.__dataclass_fields__
    }
  if isinstance(obj, dict):
    # Values may hold dataclasses or enums as well; keys are left as they are.
    return {key: dataclass_to_dict(value) for key, value in obj.items()}
  if isinstance(obj, list):
    return [dataclass_to_dict(item) for item in obj]
  if isinstance(obj, tuple):
    return tuple(dataclass_to_dict(item) for item in obj)
  if isinstance(obj, Enum):
    return enum_to_value(obj)
  return obj

# Target file; also used in the status message so the two cannot drift apart.
OUTPUT_PATH = 'HieroglyphDB1.json'

# Convert to JSON-serializable dictionary
db_dict = dataclass_to_dict(db)

# Write out to JSON (UTF-8, non-ASCII characters kept readable)
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
  json.dump(db_dict, f, ensure_ascii=False, indent=2)

print(f"HieroglyphDB successfully converted to {OUTPUT_PATH}")

HieroglyphDB successfully converted to HieroglyphDB.json
