In [14]:
import pandas as pd
from bs4 import BeautifulSoup


pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

In [15]:
column_names = ['uid', 'number', 'kanji', 'meaning', 'stroke order?', 'components', 'onyomi', 'mnemonic', 'usefullness', 'first kunyomi', 'first kunyomi meaning', 'first kunyomi usefulness', 'first jukugo', 'first jukugo meaning', 'first jukugo usefulness', 'header', 'description', 'full onyomi', 'full mnemonic', 'full kunyomi', 'full jukugo', 'full lookalikes', 'full used in', 'frequency rating']
kj_data = pd.read_csv('Selected Notes.txt', delimiter='\t', skiprows=3, names=column_names)

In [16]:
print(kj_data[:1].to_markdown())

|    | uid        |   number | kanji   | meaning          | stroke order?          |   components | onyomi     | mnemonic                                                                                                                                                                                                             | usefullness   | first kunyomi                                | first kunyomi meaning   | first kunyomi usefulness   | first jukugo                                              | first jukugo meaning   | first jukugo usefulness   | header                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [17]:
def parse_kunyomi(html):
    soup = BeautifulSoup(html, 'html.parser')
    rows = soup.find_all('tr')
    parsed_data = []
    
    for row in rows:
        columns = row.find_all('td')
        if len(columns) < 2:
            continue
        
        # Process first column (kanji and particles)
        first_td = columns[0]
        kanji_span = first_td.find('span', class_='kanji_character')
        kanji = kanji_span.get_text(strip=True) if kanji_span else ''
        
        particle_before = []
        particle_after = []
        
        if kanji_span:
            parent = kanji_span.parent
            kanji_index = parent.contents.index(kanji_span)
            particles = parent.find_all('span', class_='particles')
            
            for particle in particles:
                try:
                    particle_index = parent.contents.index(particle)
                    if particle_index < kanji_index:
                        particle_before.append(particle.get_text(strip=True))
                    else:
                        particle_after.append(particle.get_text(strip=True))
                except ValueError:
                    continue
        
        # Process second column (meaning, rating, and labels)
        second_td = columns[1]
        meaning = second_td.get_text('\n', strip=True).split('\n')[0].strip()
        stars = second_td.find('span', class_='usefulness-stars')
        rating = stars.get_text(strip=True) if stars else ''
        
        # Extract label HTML
        labels = second_td.find_all('a', class_='label')
        label_html = ' '.join(str(label) for label in labels)
        
        parsed_data.append({
            'kanji': kanji,
            'particle_before': ' '.join(particle_before),
            'particle_after': ' '.join(particle_after),
            'meaning': meaning,
            'usefulness': rating,
            'labels': label_html
        })
    
    return parsed_data

In [18]:
def parse_jukugo(html):
    soup = BeautifulSoup(html, 'html.parser')
    rows = soup.find_all('tr')
    parsed_data = []
    
    for row in rows:
        columns = row.find_all('td')
        if len(columns) < 2:
            continue
        
        # Extract kanji word and reading
        kanji_ruby = columns[0].find('ruby')
        kanji_element = kanji_ruby.find('span', class_='kanji_character') if kanji_ruby else None
        kanji = kanji_element.get_text(strip=True) if kanji_element else ''
        reading = kanji_ruby.find('rt').get_text(strip=True) if kanji_ruby and kanji_ruby.find('rt') else ''
        
        # Initialize particles
        particle_before = []
        particle_after = []
        
        if kanji_ruby and kanji_element:
            parent = kanji_element.parent
            kanji_index = parent.contents.index(kanji_element)
            particles = kanji_ruby.find_all('span', class_='particles')
            
            for particle in particles:
                try:
                    particle_index = parent.contents.index(particle)
                    if particle_index < kanji_index:
                        particle_before.append(particle.get_text(strip=True))
                    else:
                        particle_after.append(particle.get_text(strip=True))
                except ValueError:
                    continue
        
        # Extract main content
        all_ps = columns[1].find_all('p')
        meaning = ''
        description = ''
        components = ''
        
        if all_ps:
            main_p = all_ps[0]
            # Extract meaning and components from first paragraph
            meaning_text = main_p.get_text(' ', strip=True)
            meaning = meaning_text.split(' ★')[0].split(' ☆')[0].strip()
            
            # Extract components HTML after first <br>
            br_tag = main_p.find('br')
            if br_tag:
                components = ''.join(str(sibling) for sibling in br_tag.next_siblings).strip()
            
            # Extract description from remaining paragraphs
            description = ' '.join([str(p) for p in all_ps[1:]]).strip()
        
        # Extract usefulness rating
        stars = columns[1].find('span', class_='usefulness-stars')
        rating = stars.get_text(strip=True) if stars else ''
        
        # Extract labels and components as raw HTML
        labels = [str(label) for label in columns[1].find_all('a', class_='label label-info')]
        label_html = ' '.join(str(label) for label in labels)
        
        parsed_data.append({
            'kanji': kanji,
            'reading': reading,
            'particle_before': ' '.join(particle_before),
            'particle_after': ' '.join(particle_after),
            'meaning': meaning,
            'description': description,
            'usefulness': rating,
            'components': components,
            'labels': label_html
        })
    
    return parsed_data

In [19]:
kj_data['full jukugo'][0]

'<table class="definition"><tbody><tr>\n<td>\n<ruby>\n<span class="kanji_character"><ruby>もう一度<rp>(</rp><rt>もういちど</rt><rp>)</rp></ruby></span>\n</ruby>\n</td>\n<td>\n<p>\none more time!\n<span class="usefulness-stars" title="5 out of 5 stars">★★★★★</span>\n\n<br>\n<a class="component" href="http://www.kanjidamage.com/kanji/1-one-line-radical-%E4%B8%80">一</a> (one) + <a class="component" href="http://www.kanjidamage.com/kanji/1144-times-%E5%BA%A6">度</a> (times)\n = もう一度 (one more time!)\n</p>\n</td>\n</tr>\n<tr>\n<td>\n<ruby>\n<span class="kanji_character"><ruby>一緒<rp>(</rp><rt>いっしょ</rt><rp>)</rp></ruby></span>\n<span class="particles">に</span>\n</ruby>\n</td>\n<td>\n<p>\ntogether\n<span class="usefulness-stars" title="5 out of 5 stars">★★★★★</span>\n\n<br>\n<a class="component" href="http://www.kanjidamage.com/kanji/1-one-line-radical-%E4%B8%80">一</a> (one) + <a class="component" href="http://www.kanjidamage.com/kanji/257-together-%E7%B7%92">緒</a> (together)\n = 一緒 (together)\n</p>\n</

In [20]:
parse_jukugo(kj_data['full jukugo'][0])

[{'kanji': 'もう一度',
  'reading': 'もういちど',
  'particle_before': '',
  'particle_after': '',
  'meaning': 'one more time!',
  'description': '',
  'usefulness': '★★★★★',
  'components': '<a class="component" href="http://www.kanjidamage.com/kanji/1-one-line-radical-%E4%B8%80">一</a> (one) + <a class="component" href="http://www.kanjidamage.com/kanji/1144-times-%E5%BA%A6">度</a> (times)\n = もう一度 (one more time!)',
  'labels': ''},
 {'kanji': '一緒',
  'reading': 'いっしょ',
  'particle_before': '',
  'particle_after': 'に',
  'meaning': 'together',
  'description': '',
  'usefulness': '★★★★★',
  'components': '<a class="component" href="http://www.kanjidamage.com/kanji/1-one-line-radical-%E4%B8%80">一</a> (one) + <a class="component" href="http://www.kanjidamage.com/kanji/257-together-%E7%B7%92">緒</a> (together)\n = 一緒 (together)',
  'labels': ''},
 {'kanji': '一人',
  'reading': 'ひとり',
  'particle_before': '',
  'particle_after': 'で',
  'meaning': 'one person, alone',
  'description': '',
  'usefulne

In [21]:
parse_kunyomi(kj_data['full kunyomi'][11])

[{'kanji': 'した',
  'particle_before': '',
  'particle_after': '',
  'meaning': 'below',
  'usefulness': '★★★★★',
  'labels': ''},
 {'kanji': 'さ＊げる',
  'particle_before': 'を',
  'particle_after': '',
  'meaning': 'I lower it',
  'usefulness': '★★★☆☆',
  'labels': '<a class="label label-info" href="http://www.kanjidamage.com/tags/31" title="in practice, this word written in hiragana or katakana, not in kanji form, half the time.">1/2 KANA</a>'},
 {'kanji': 'くだ＊る',
  'particle_before': 'が',
  'particle_after': '',
  'meaning': 'it gets lowered',
  'usefulness': '★★☆☆☆',
  'labels': ''},
 {'kanji': 'くだ＊ さい',
  'particle_before': '',
  'particle_after': '',
  'meaning': 'please (formal spelling only)',
  'usefulness': '★★★☆☆',
  'labels': '<a class="label label-info" href="http://www.kanjidamage.com/tags/12" title="Yet another only-in-Japanese headache: All kanji are written as hiragana. . . TO SOME EXTENT.">KANA</a> <a class="label label-info" href="http://www.kanjidamage.com/tags/16" titl

In [22]:
# Initialize a new DataFrame
jukugo_data = []
parsed_jukugo = []
kunyomi_data = []

# Iterate over the existing data DataFrame
for index, row in kj_data.iterrows():
    
    if pd.notna(row['full jukugo']):
        parsed = parse_jukugo(row['full jukugo'])
        for voc in parsed:
            if voc['kanji'] not in parsed_jukugo:
                parsed_jukugo.append(voc['kanji'])
                jukugo_data.append({'number': row['number'], 'origin kanji': row['kanji'], 'kanji': voc['kanji'], 'reading': voc['reading'], 'particle_before': voc['particle_before'], 'particle_after': voc['particle_after'], 'meaning': voc['meaning'], 'usefulness': voc['usefulness'], 'components': voc['components'], 'labels': voc['labels'], 'description': voc['description']})
    else:
        print(index)
    
    if pd.notna(row['full kunyomi']):
        parsed = parse_kunyomi(row['full kunyomi'])
        for voc in parsed:
            
            if '＊' in voc['kanji'] or '*' in voc['kanji']:
                if '＊' in voc['kanji']:
                    seperator = '＊'
                elif '*' in voc['kanji']:
                    seperator = '*'

                reading = voc['kanji'].replace(seperator, '')
                kanji = row['kanji'] + seperator + voc['kanji'].split('＊')[-1].split('*')[-1]
            
            kunyomi_data.append({'number': row['number'], 'origin kanji': row['kanji'], 'kanji': kanji, 'particle_before': voc['particle_before'], 'particle_after': voc['particle_after'], 'meaning': voc['meaning'], 'usefulness': voc['usefulness'], 'labels': voc['labels'], 'reading': reading})

jukugo_df = pd.DataFrame(jukugo_data)
kunyomi_df = pd.DataFrame(kunyomi_data)

full_df = pd.DataFrame(jukugo_data + kunyomi_data)


16
20
22
24
35
40
47
55
56
57
65
71
85
91
106
122
135
172
211
270
307
350
372
373
433
438
440
445
463
482
486
488
515
519
523
532
542
544
547
549
558
572
576
578
579
585
591
592
603
608
619
638
643
650
694
695
699
709
723
725
729
745
748
769
771
772
775
795
804
809
812
819
822
826
834
838
843
848
851
865
895
899
906
908
912
923
929
939
956
968
975
980
1000
1001
1003
1015
1030
1035
1044
1052
1056
1060
1062
1064
1067
1070
1075
1080
1083
1086
1100
1102
1106
1114
1130
1137
1144
1161
1164
1171
1172
1173
1175
1184
1188
1192
1193
1209
1210
1217
1219
1236
1239
1247
1248
1262
1271
1279
1284
1289
1295
1302
1311
1322
1344
1345
1346
1348
1350
1355
1358
1360
1364
1366
1369
1379
1380
1382
1384
1385
1386
1387
1389
1397
1399
1400
1401
1406
1409
1411
1412
1421
1422
1423
1426
1431
1436
1450
1451
1455
1463
1467
1470
1471
1482
1485
1491
1495
1504
1515
1520
1529
1535
1539
1545
1546
1550
1553
1564
1566
1569
1573
1588
1589
1594
1595
1613
1615
1626
1628
1632
1634
1644
1648
1651
1652
1658
1664
1674
1678
1681


In [23]:
jukugo_df

Unnamed: 0,number,origin kanji,kanji,reading,particle_before,particle_after,meaning,usefulness,components,labels,description
0,1,一,もう一度,もういちど,,,one more time!,★★★★★,"<a class=""component"" href=""http://www.kanjidam...",,
1,1,一,一緒,いっしょ,,に,together,★★★★★,"<a class=""component"" href=""http://www.kanjidam...",,
2,1,一,一人,ひとり,,で,"one person, alone",★★★★★,"<a class=""component"" href=""http://www.kanjidam...","<a class=""label label-info"" href=""http://www.k...",
3,1,一,一般的,いっぱんてき,,な,ordinary,★★★★★,"<a href=""http://www.kanjidamage.com/kanji/1-on...",,<p>One of many many Japanese words for 'normal...
4,1,一,一日,ついたち,,,first day of the month,★★★★☆,"<a class=""component"" href=""http://www.kanjidam...","<a class=""label label-info"" href=""http://www.k...",
...,...,...,...,...,...,...,...,...,...,...,...
2501,1757,丈,丈夫,じょうぶ,,,"sturdy, firm,healthy",★☆☆☆☆,"<a class=""component"" href=""http://www.kanjidam...",,
2502,1758,拝,拝啓,はいけい,,,Dear Sir. . .,★☆☆☆☆,,"<a class=""label label-info"" href=""http://www.k...","<p>polite way to begin a letter: ""Dear Sir""</p>"
2503,1759,互,お互い,おたがい,,にzzz,they xzz each other.,★★★☆☆,,"<a class=""label label-info"" href=""http://www.k...",
2504,1759,互,相互的,そうごてき,,の,"mutual, reciprocal.",★★☆☆☆,"<a class=""component"" href=""http://www.kanjidam...","<a class=""label label-info"" href=""http://www.k...",


In [24]:
kunyomi_df

Unnamed: 0,number,origin kanji,kanji,particle_before,particle_after,meaning,usefulness,labels,reading
0,1,一,一*つ,,,one thing,★★★★☆,,ひとつ
1,2,二,二*つ,,,two things,★★★★☆,,ふたつ
2,3,三,三*つ,,,(also sometimes pronounced み＊っつ) three things....,★★★★☆,,みつ
3,5,子,三*つ,,,child,★★★★★,,みつ
4,6,女,三*つ,,,"woman - but don't say it by itself, it's like ...",★★★★★,,みつ
...,...,...,...,...,...,...,...,...,...
1479,1754,偽,偽*る,,,"a small deception, a white lie. Lying about yo...",★★★★☆,,いつわる
1480,1754,偽,偽*る,,xxx,"PREFIX meaning, 'fake-'",☆☆☆☆☆,"<a class=""label label-info"" href=""http://www.k...",いつわる
1481,1756,隔,隔＊てる,,,"to partition, to separate: separate the class ...",★★☆☆☆,,へだてる
1482,1757,丈,隔＊てる,,,height; stature,★☆☆☆☆,,へだてる


In [25]:
full_df

Unnamed: 0,number,origin kanji,kanji,reading,particle_before,particle_after,meaning,usefulness,components,labels,description
0,1,一,もう一度,もういちど,,,one more time!,★★★★★,"<a class=""component"" href=""http://www.kanjidam...",,
1,1,一,一緒,いっしょ,,に,together,★★★★★,"<a class=""component"" href=""http://www.kanjidam...",,
2,1,一,一人,ひとり,,で,"one person, alone",★★★★★,"<a class=""component"" href=""http://www.kanjidam...","<a class=""label label-info"" href=""http://www.k...",
3,1,一,一般的,いっぱんてき,,な,ordinary,★★★★★,"<a href=""http://www.kanjidamage.com/kanji/1-on...",,<p>One of many many Japanese words for 'normal...
4,1,一,一日,ついたち,,,first day of the month,★★★★☆,"<a class=""component"" href=""http://www.kanjidam...","<a class=""label label-info"" href=""http://www.k...",
...,...,...,...,...,...,...,...,...,...,...,...
3985,1754,偽,偽*る,いつわる,,,"a small deception, a white lie. Lying about yo...",★★★★☆,,,
3986,1754,偽,偽*る,いつわる,,xxx,"PREFIX meaning, 'fake-'",☆☆☆☆☆,,"<a class=""label label-info"" href=""http://www.k...",
3987,1756,隔,隔＊てる,へだてる,,,"to partition, to separate: separate the class ...",★★☆☆☆,,,
3988,1757,丈,隔＊てる,へだてる,,,height; stature,★☆☆☆☆,,,


In [26]:
full_df.to_csv('extracted-vocab.csv', index=False, header=True)
