In [1]:
# Load the beginner, intermediate, and advanced Korean wordlists.

import json

def _load_wordlist(path):
    """Parse one wordlist JSON file and return the decoded object."""
    # Being explicit about UTF-8 avoids depending on the platform's default
    # text encoding, and the context manager closes the file as soon as the
    # parse finishes instead of leaving the handle to the garbage collector.
    with open(path, encoding="utf-8") as wordlist_file:
        return json.load(wordlist_file)


kr_beginner = _load_wordlist("kr_beginner.json")
kr_intermediate = _load_wordlist("kr_intermediate.json")
kr_advanced = _load_wordlist("kr_advanced.json")


In [31]:
# The first entry of each 'multilanList' is always English ('영어');
# the second is Chinese.
wordlist = kr_beginner

# If the translation is '(no equivalent expression)', fall back to the
# entry's 'multi_definition' instead.
wordlist['channel']['item'][0]['senseInfo']['senseDataList'][0]['multilanList'][0]['multi_translation']

'personality'

In [10]:
# Subject categories attached to the first entry; these can be used to tally
# words up by category. Note that the key inside each element is spelled
# 'subjectCategiory' in the data.
wordlist['channel']['item'][0]['wordInfo']['actCategoryList']

[{'subjectCategiory': '감정, 기분 표현하기'}]

In [11]:
# Headword of the first entry, i.e. the Korean word itself.
wordlist['channel']['item'][0]['wordInfo']['org_word']

'마음'

In [14]:
import pypinyin
# Sanity check of pypinyin's output shape: the result shown below is a list of
# single-element lists, one per syllable or punctuation run, with tone marks.
pypinyin.pinyin("你好, 我的朋友。广州。重庆", style=pypinyin.Style.TONE)

[['nǐ'],
 ['hǎo'],
 [', '],
 ['wǒ'],
 ['de'],
 ['péng'],
 ['yǒu'],
 ['。'],
 ['guǎng'],
 ['zhōu'],
 ['。'],
 ['chóng'],
 ['qìng']]

In [19]:
def to_pinyin_flat(translation):
    """Render ``translation`` as one tone-marked pinyin string.

    The pieces returned by ``pypinyin.pinyin`` are concatenated in order. An
    apostrophe is inserted before a piece that starts with a vowel whenever
    the text built so far ends in a lowercase ASCII letter or a (possibly
    tone-marked) vowel, so that e.g. "西安" becomes "xī'ān".

    Args:
        translation: Text to convert.

    Returns:
        The concatenated pinyin string.
    """
    syllable_vowels = "aeiouāēīōūǖaáéíóúǘàèìòùǜǎěǐǒǔǚ"
    joinable_tail = "abcdefghijklmnopqrstuvwxyz" + syllable_vowels

    pieces = pypinyin.pinyin(translation, style=pypinyin.Style.TONE)

    result = ""
    for (piece,) in pieces:
        # The first piece never gets a separator; after that, a separator is
        # needed only when a letter is directly followed by a vowel-initial piece.
        needs_apostrophe = (
            len(result) != 0
            and result[-1] in joinable_tail
            and piece[0] in syllable_vowels
        )
        result += ("'" + piece) if needs_apostrophe else piece
    return result


def create_word_df(wordlists, difficulties):
    """Flatten dictionary wordlists into column-oriented data for a DataFrame.

    Only the first sense of each entry is used. Entries with no senses, or
    whose first sense has fewer than two translations, are printed and skipped.
    When the English translation is the placeholder
    ``'(no equivalent expression)'``, the entry's ``'multi_definition'`` text is
    used instead (if present and non-empty).

    Args:
        wordlists: Iterable of parsed wordlist dicts; entries are read from
            ``wordlist["channel"]["item"]``.
        difficulties: Difficulty labels, paired positionally with ``wordlists``.

    Returns:
        A dict mapping column name to a list of values (one per kept word),
        with keys ``korean_word``, ``english_translation``,
        ``mandarin_translation``, ``mandarin_pinyin``, ``category`` and
        ``difficulty``.

    Raises:
        AssertionError: If the first translation of a kept entry is not
            labelled ``"영어"`` (English).
    """
    # Placeholder used when a word has no direct English equivalent.
    no_equivalent = "(no equivalent expression)"

    df_data = {
        "korean_word": [],
        "english_translation": [],
        "mandarin_translation": [],
        "mandarin_pinyin": [],
        "category": [],
        "difficulty": [],
    }

    for wordlist, difficulty in zip(wordlists, difficulties):
        for word in wordlist["channel"]["item"]:
            korean_word = word["wordInfo"]["org_word"]

            senses = word["senseInfo"]["senseDataList"]
            if len(senses) == 0:
                # Nothing to translate from; report the word and move on.
                print(korean_word)
                continue

            translations = senses[0]["multilanList"]
            if len(translations) <= 1:
                # Both an English and a Mandarin entry are required.
                print(korean_word, translations)
                continue

            english_entry, mandarin_entry = translations[0], translations[1]
            # The positional lookup above relies on English always being first.
            assert english_entry["nation_code_name"] == "영어"

            english_translation = english_entry["multi_translation"]
            if (
                isinstance(english_translation, str)
                and english_translation.strip() == no_equivalent
            ):
                # Fall back to the definition text rather than exporting the
                # placeholder; keep the placeholder if no definition exists.
                english_translation = (
                    english_entry.get("multi_definition") or english_translation
                )

            # NOTE(review): the Mandarin entry may use its own placeholder for
            # missing equivalents; that case is not handled here - confirm
            # against the data.
            mandarin_translation = mandarin_entry["multi_translation"]

            category = "|".join(
                cat["subjectCategiory"] for cat in word["wordInfo"]["actCategoryList"]
            )

            df_data["korean_word"].append(korean_word)
            df_data["english_translation"].append(english_translation)
            df_data["mandarin_translation"].append(mandarin_translation)
            df_data["mandarin_pinyin"].append(to_pinyin_flat(mandarin_translation))
            df_data["category"].append(category)
            df_data["difficulty"].append(difficulty)

    return df_data

In [20]:
# Check the apostrophe handling: "西安" should render as "xī'ān", not "xīān".
to_pinyin_flat("西安")

"xī'ān"

In [59]:
# time to group up words by category.
def display_categories(wordlist):
    """Print how many words are tagged with each subject category.

    Args:
        wordlist: Parsed wordlist dict; entries are read from
            ``wordlist["channel"]["item"]``. A word tagged with several
            categories is counted once under each of them.

    Returns:
        None. Two lines are printed: the number of distinct categories, then a
        dict mapping each category name to its word count.
    """
    words_by_category = {}

    for entry in wordlist["channel"]["item"]:
        info = entry["wordInfo"]
        for tag in info["actCategoryList"]:
            words_by_category.setdefault(tag["subjectCategiory"], []).append(
                info["org_word"]
            )

    counts = {name: len(members) for name, members in words_by_category.items()}
    print(f"Category breakdown: {len(words_by_category)} categories")
    print(counts)

In [60]:
# Show how many words are tagged with each category in the intermediate wordlist.
display_categories(kr_intermediate)

Category breakdown: 37 categories
{'성격 표현하기': 98, '복장 표현하기': 69, '외모 표현하기': 54, '집 구하기': 102, '건강': 147, '감정, 기분 표현하기': 133, '학교생활': 102, '인간관계': 101, '음식 설명하기': 78, '공연과 감상': 52, '직장 생활': 96, '요리 설명하기': 94, '문제 해결하기(분실 및 고장)': 28, '직업과 진로': 72, '주말 및 휴가': 30, '연애와 결혼': 28, '날씨와 계절': 69, '사건, 사고, 재해 기술하기': 67, '집안일': 28, '가족 행사-명절': 2, '여행': 67, '한국 생활': 7, '교통 이용하기': 76, '문화 비교하기': 47, '실수담 말하기': 19, '지리 정보': 66, '언어': 29, '취미': 37, '가족 행사': 42, '환경 문제': 16, '공공기관 이용하기': 59, '초대와 방문': 9, '한국의 문학': 23, '물건 사기': 56, '컴퓨터와 인터넷': 31, '개인 정보 교환하기': 18, '대중 매체': 47}


In [21]:
import pandas as pd
# Build the study "queue": one row per word, ordered by difficulty and then
# by category, and export it to CSV.
df_data = create_word_df(
    [kr_beginner, kr_intermediate, kr_advanced],
    [1, 2, 3],  # 1 = beginner, 2 = intermediate, 3 = advanced
)
df = pd.DataFrame(df_data).sort_values(["difficulty", "category"])
df.to_csv("korean_wordlist.csv", index=False)


애기 []
스토리 []
