In [12]:
from konlpy.tag import Kkma, Hannanum, Komoran, Okt    # 형태소 분석기 모듈(Hannanum, Kkma, Komoran, Okt)을 탑재
from tqdm.notebook import tqdm
import pandas as pd
from hanja.impl import is_hanja as is_hanja
from hanja.hangul import is_hangul as is_hangul

In [46]:
class Word:
    """A word in hangul, optionally paired with its hanja spelling.

    Attributes:
        hangul: The hangul spelling passed to the constructor.
        hanja: The hanja spelling, or None when none is attached.
        is_hanja: True exactly when ``hanja`` is not None.
    """

    def __init__(self, hangul, hanja):
        self.hangul = hangul
        self.hanja = hanja
        self.is_hanja = hanja is not None

    def set_hanja(self, hanja):
        """Replace the hanja spelling and keep ``is_hanja`` in sync with it.

        Passing None clears the spelling and resets ``is_hanja`` to False.
        """
        self.hanja = hanja
        # The flag must follow the new value; forcing it to True here would
        # leave a word with no hanja still flagged as having one.
        self.is_hanja = hanja is not None

    def __repr__(self):
        return f'Word({self.hangul}, {self.hanja})'


class WordWithPOS(Word):
    """A ``Word`` that additionally stores a part-of-speech tag in ``pos``."""

    def __init__(self, hangul, hanja, pos):
        # The base class stores hangul/hanja and derives is_hanja.
        super().__init__(hangul, hanja)
        self.pos = pos

    def set_pos(self, pos):
        """Overwrite the stored part-of-speech tag."""
        self.pos = pos

    def __repr__(self):
        hangul, hanja, pos = self.hangul, self.hanja, self.pos
        return f'WordWithPOS({hangul}, {hanja}, {pos})'

In [7]:
# Read the parquet file into a DataFrame.
df = pd.read_parquet('./data/chosun_1990.parquet')
# Select every row of the 'body_archaic_hangul' column for the steps below.
data = df.loc[:, 'body_archaic_hangul']

In [9]:
# Preview at most the first 50 entries of the selected column.
data.head(50)

0     전두환(全斗煥)씨 국회(国会)증언 요지\n"계엄군(軍) 강경진압 광주(光州)사태 주...
1     ◇기자(記者)회견하는 전(全)씨\n 전두환(全斗煥) 전(前)대통령이 31일 자정직전...
2     4당(党)-재야(在野)의 새로운"헤쳐모여"관건\n청산(清算)이후의 90년정국(政局)...
3     전두환(全斗煥)씨 국회(国会)증언 요지\n"도청(道庁)앞 발포(発砲) 상황끝난뒤 보...
4     "루마니아 공산당(共産党) 해체"\n【부쿠레슈티=APUPI연합(聯合)】루마니아 공산...
5     북한(北韓)외교관 1명 추방\n자이르정부\n【킨샤사(자이르)=로이터연합(聯合)】 자...
6     전두환(全斗煥)씨 국회(国会)증언 요지\n논쟁(論爭)우려……정치(政治)자금내역 공개...
7     말꿈"대길(大吉)"…길운(吉運)부르는 동물\n경오년(庚午年) 말띠해‥말에 얽힌 이야...
8     사설(私說)\n정감록(鄭鑑録)\n(306)\n바람 또 바람 ①\n유(劉) 현(賢) ...
9     중산층—서민(庶民)대중 권익위해 헌신\n김대중(金大中) 평민당총재\n 90년은 우리...
10    1면서 계속\n 그는 재임중 정치자금조성문제에대해『기업또는 개인으로부터 정치자금을 ...
11    ◇증언(証言)하는 전(全)씨\n 31일 국회 5공(共)-광주특위(光州特委)에서의 증...
12    도덕성(道德性)회복바탕 강한 나라 이룩\n김재순(金在淳) 국회의장\n 지금 우리는 ...
13    팔(八) 면(面) 봉(鋒)\n 전두환(全斗煥)씨 증언,내용에 다수가 실망.그렇다고 ...
14    새해 내집 꿈\n 스포츠—레저용품 세일즈맨인이병갑(李秉甲)씨(34·서울 성북구 장위...
15    부음(訃音)\n▲이상태(李相泰)씨(전기통신공사경영기획실부장)부친상(父親喪)=30일오...
16    내일 의경기(競技)\n농구\n 대잔치4차대회▲코오롱—상업은행 (여지부·오후2시)▲남...
17    ◇올해부터 서해안개발관련 각종 공사가 본격 착수돼「서해안시대」에 한걸음 다가

In [43]:
def recognize_hanja(string):
    """Split ``string`` into hangul/hanja word pairs and the remaining text.

    The text is scanned from right to left. Each run of hanja characters is
    paired with the same number of hangul characters that follow it in scan
    order (i.e. that precede it in the text); characters that are neither are
    skipped over while a pairing is in progress and kept in the remainder.

    Args:
        string: Text to scan, one character at a time.

    Returns:
        A ``(words, remainder)`` tuple. ``words`` is a list of ``Word``
        objects in left-to-right order. ``remainder`` is the input with the
        paired hangul and hanja characters taken out; other characters stay
        (the sample run in this notebook leaves the empty ``()`` behind).
        Characters still pending when the scan ends are in neither output.
    """
    pending_hanja = []    # hanja collected since the last completed pair
    pending_hangul = []   # hangul collected for the pair in progress
    leftovers = []        # everything else, still in right-to-left order
    words = []

    for ch in reversed(string):
        if is_hanja(ch):
            pending_hanja.append(ch)
            continue

        # Hangul only counts towards a pair once some hanja is waiting.
        if not (pending_hanja and is_hangul(ch)):
            leftovers.append(ch)
            continue

        pending_hangul.append(ch)
        if len(pending_hangul) == len(pending_hanja):
            # Both buffers were filled right-to-left, so flip them back.
            words.append(Word(''.join(reversed(pending_hangul)),
                              ''.join(reversed(pending_hanja))))
            pending_hanja = []
            pending_hangul = []

    words.reverse()
    return words, ''.join(reversed(leftovers))

In [47]:
# Try the extractor on a sample headline: the first value is the list of
# hangul/hanja pairs, the second is the text left once those pairs are removed.
hanja_words, hanja_removed_string = recognize_hanja('젊은이반대(反對)의 힘 용납할때 화해점(和解點) 발견')
print(hanja_words)
print(hanja_removed_string)

[Word(반대, 反對), Word(화해점, 和解點)]
젊은이()의 힘 용납할때 () 발견


In [51]:
def parse_noun(string, stemmer):
    """Collect the tokens of ``string`` whose tag starts with ``'N'``.

    Args:
        string: Text handed to ``stemmer`` unchanged.
        stemmer: Callable that takes the text and returns an iterable of
            ``(word, tag)`` pairs.

    Returns:
        A ``(nouns, errors)`` tuple. ``nouns`` holds one ``WordWithPOS``
        (hanja set to None) per matching pair. ``errors`` holds the exception
        raised while tagging or iterating, if any; the whole pass sits in one
        ``try``, so it has at most one entry and ``nouns`` may then be partial.
    """
    nouns = []
    errors = []

    try:
        for token, label in stemmer(string):
            if not label.startswith('N'):
                continue
            nouns.append(WordWithPOS(token, None, label))
    except Exception as exc:
        errors.append(exc)

    if errors:
        print(f'number of exceptions: {len(errors)}')

    return nouns, errors

In [49]:
# Create a Komoran analyzer and keep a handle to its `pos` method so it can be
# passed around as a plain callable (parse_noun takes it as `stemmer`).
kmr = Komoran()
kmr_pos = kmr.pos

In [52]:
# Extract nouns from the text left after hanja removal; `error` receives the
# list of exceptions that parse_noun collected.
parsed, error = parse_noun(hanja_removed_string, kmr_pos)

print(parsed)

[WordWithPOS(젊은이, None, NNG), WordWithPOS(힘, None, NNG), WordWithPOS(용납, None, NNG), WordWithPOS(때, None, NNG), WordWithPOS(발견, None, NNG)]


In [55]:
from TOKEN import TOKENS

# API key sent as the `key` parameter of the stdict.korean.go.kr requests below.
TOKEN = TOKENS['표준국어대사전 API']

In [73]:
import requests

def API_search_word(query, timeout=10):
    """Call the stdict.korean.go.kr search endpoint for ``query``.

    Args:
        query: Search term sent as the ``q`` parameter.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Pass None to disable the timeout.

    Returns:
        The decoded JSON body, or None when the response body is empty.
    """
    url = "https://stdict.korean.go.kr/api/search.do"
    params = {
        'key': TOKEN,
        'type_search': 'search',
        'req_type': 'json',
        'q': query,
    }

    # Without a timeout a stalled connection would block the kernel forever.
    response = requests.get(url, params=params, timeout=timeout)

    if not response.text:
        return None

    return response.json()

In [71]:
def API_view_word(target_code, timeout=10):
    """Call the stdict.korean.go.kr view endpoint for one ``target_code``.

    Args:
        target_code: Entry identifier sent as the ``q`` parameter, with
            ``method`` set to ``TARGET_CODE``.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Pass None to disable the timeout.

    Returns:
        The decoded JSON body, or None when the response body is empty.
    """
    url = "https://stdict.korean.go.kr/api/view.do"
    params = {
        'key': TOKEN,
        'type_search': 'view',
        'req_type': 'json',
        'method': 'TARGET_CODE',
        'q': target_code,
    }

    # Without a timeout a stalled connection would block the kernel forever.
    response = requests.get(url, params=params, timeout=timeout)

    if not response.text:
        return None

    return response.json()

In [79]:
def get_first_search_result(word):
    """Return the first search hit for ``word``, or None when there is none.

    None is returned when the search call yields None, when the response has
    no ``channel`` or the channel has no ``item``, or when ``item`` is empty.
    """
    response = API_search_word(word)
    if response is None:
        return None

    if 'channel' not in response.keys():
        return None
    channel = response['channel']

    if 'item' not in channel.keys():
        return None
    hits = channel['item']

    return hits[0] if len(hits) > 0 else None

In [81]:
def get_first_target_code(word):
    """Return the ``target_code`` of the first search hit for ``word``.

    Returns None when there is no hit or the hit has no ``target_code`` key.
    """
    hit = get_first_search_result(word)
    if hit is not None and 'target_code' in hit.keys():
        return hit['target_code']
    return None

In [95]:
def check_if_hanja_from_target_code(target_code):
    """Look up a dictionary entry and report whether it is a hanja word.

    Args:
        target_code: Entry identifier passed to ``API_view_word``; may be None.

    Returns:
        A ``(is_hanja, hanja)`` tuple:
        * ``(None, None)`` when ``target_code`` is None, the lookup returns
          None, or the response lacks ``channel`` / ``item`` / ``word_info`` /
          ``word_type``.
        * ``(True, original_language)`` when ``word_type`` is '한자어' and the
          first ``original_language_info`` entry has ``original_language``.
        * ``(False, None)`` in every other case where ``word_type`` exists.
    """
    if target_code is None:
        print(f'target code is None')
        return None, None

    response = API_view_word(target_code)
    if response is None:
        print(f'{target_code} cannot be found in 표준국어대사전')
        return None, None

    # Walk down channel -> item -> word_info, giving up as soon as a level is missing.
    if 'channel' not in response.keys():
        return None, None
    channel = response['channel']

    if 'item' not in channel.keys():
        return None, None
    entry = channel['item']

    if 'word_info' not in entry.keys():
        return None, None
    info = entry['word_info']

    if 'word_type' not in info:
        return None, None

    if info['word_type'] == '한자어' and 'original_language_info' in info.keys():
        origins = info['original_language_info']
        if len(origins) > 0:
            first_origin = origins[0]
            if 'original_language' in first_origin.keys():
                return True, first_origin['original_language']

    # word_type is known, but no hanja spelling could be extracted.
    return False, None

In [97]:
def check_hanja(word):
    """Return the hanja spelling of the first dictionary hit for ``word``.

    Returns None when the hit is not a hanja word, and also (after printing a
    message) when the lookup could not decide either way.
    """
    code = get_first_target_code(word)
    # Local names avoid shadowing the module-level is_hanja import.
    found, hanja_form = check_if_hanja_from_target_code(code)

    if found is None:
        print(f'{word} cannot be found in 표준국어대사전')
        return None

    return hanja_form if found else None

In [103]:
# Only the first search hit is consulted, so a homograph may be returned
# (the recorded output is '火海').
check_hanja('화해')

'火海'

In [102]:
# Probe a second word through the same first-hit lookup.
check_hanja('심심')