<a href="https://colab.research.google.com/github/movie5/zutopia/blob/master/khaiii.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the khaiii sources, then configure, build and install the library
# and its Python package.
!git clone https://github.com/kakao/khaiii.git
!pip install cmake
!mkdir build
# NOTE(review): `build` is relative here but addressed as /content/build below,
# which presumably relies on the notebook starting in /content — confirm.
!cd build && cmake /content/khaiii
# Every step repeats its own `cd` into the build directory before calling make.
!cd /content/build/ && make all
!cd /content/build/ && make resource
!cd /content/build && make install
!cd /content/build && make package_python
# Install the Python package produced by the `package_python` target.
!pip install /content/build/package_python

In [None]:
# Runs the CMake configure step on its own; this is the same command the
# install cell already issues.
!cd build && cmake /content/khaiii

In [None]:
# Mount Google Drive at /gdrive so the data files below can be read from it.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

In [None]:
import pandas as pd
# data_path = your_path
# Load the training and validation sets from JSON files on the mounted Drive;
# adjust the paths to wherever your copies live.
train = pd.read_json('/gdrive/My Drive/Colab Notebooks/train.json')
val = pd.read_json('/gdrive/My Drive/Colab Notebooks/val.json')

In [None]:
# List the mount point to check that Drive is available.
!ls "/gdrive"

In [None]:
# Change the working directory to the Drive root; relative paths used later in
# the notebook (e.g. 'train_tokenized.json') resolve against it.
cd "/gdrive/My Drive"

In [None]:
# List the contents of the Drive root.
ls "/gdrive/My Drive"

In [None]:
#1 In the validation/test sets some playlists come with a title only and carry
# no songs or tags information at all. Select the validation rows where the
# concatenated tags + songs list is empty.
val[(val['tags'] + val['songs']).map(len) == 0]

In [None]:
# Looking at tags next to the playlists, words from the title are very often
# reused verbatim as tags. The last rows of the training set let you check this.
train.tail()

In [None]:
import json
import re
from collections import Counter
from typing import *

import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import pandas as pd
from khaiii import KhaiiiApi  # khaiii 레포는 https://github.com/kakao/khaiii 이쪽

In [None]:
def re_sub(series: pd.Series) -> pd.Series:
    """Clean a Series of strings before morphological analysis.

    Applies, in order: removal of the Hangul compatibility consonants in the
    range ㄱ-ㅎ (e.g. "ㅋㅋ"), removal of every character that is neither a
    word character nor whitespace, removal of ideographic spaces (U+3000),
    and finally collapsing runs of two or more ASCII spaces into one.

    The U+3000 removal runs before the space collapse on purpose: deleting it
    afterwards can join two single spaces into a double space that would
    never be collapsed.

    Args:
        series: Series of strings; it is accessed through ``.str``.

    Returns:
        A new Series holding the cleaned strings.
    """
    series = series.str.replace(pat=r'[ㄱ-ㅎ]', repl=r'', regex=True)  # drop lone consonants such as "ㅋ"
    series = series.str.replace(pat=r'[^\w\s]', repl=r'', regex=True)  # drop special characters
    series = series.str.replace(pat=r'[\u3000]+', repl=r'', regex=True)  # drop ideographic spaces
    series = series.str.replace(pat=r'[ ]{2,}', repl=r' ', regex=True)  # collapse repeated spaces last
    return series

def flatten(list_of_list : List) -> List:
    """Concatenate the items of every inner iterable into one flat list.

    Only one level of nesting is removed; the order of the items is kept.
    """
    merged = []
    for inner in list_of_list:
        merged.extend(inner)
    return merged

def get_token(title: str, tokenizer)-> List[Tuple]:
    """Analyze a title and return its morphemes as (lex, tag) tuples.

    Args:
        title: Text to analyze.
        tokenizer: Object exposing ``analyze(text)``; each item of its result
            must have a ``morphs`` iterable whose elements carry ``lex`` and
            ``tag`` attributes.

    Returns:
        One ``(morph.lex, morph.tag)`` tuple per morpheme, in analysis order.
        An empty list when the title is empty or contains only whitespace.
    """
    # Blank titles make the tokenizer raise, so never hand it an empty or
    # whitespace-only string (this also covers runs of spaces, tabs, newlines).
    if not title.strip():
        return []

    analyzed = tokenizer.analyze(title)
    return [(morph.lex, morph.tag) for word in analyzed for morph in word.morphs]

def get_all_tags(df) -> List:
    """Return every tag found in ``df['tags']`` as one flat list.

    The per-row values of the ``tags`` column are converted to a Python list
    and flattened one level with ``flatten``; duplicates are kept.
    """
    return flatten(df['tags'].values.tolist())

In [None]:
# Build the khaiii analyzer and run it over every tag of the training set.
tokenizer = KhaiiiApi()
all_tag = get_all_tags(train)
token_tag = [get_token(x, tokenizer) for x in all_tag]  # morphological analysis of each tag

In [None]:
# Preview the analysis results of the first ten tags.
token_tag[:10]

In [None]:
# Keep the tags that analyze to exactly one morpheme, i.e. the tag itself is a
# morpheme and is not split any further.
token_itself = [morphs[0] for morphs in token_tag if len(morphs) == 1]
# Every (morpheme, POS) pair over all tags.
flatten_token = flatten(token_tag)

In [None]:
# Report the number of tags, of single-morpheme tags and of morphemes overall.
print(f"{'# of original tag is':<23}", f'{len(all_tag):8,}')
print(f"{'# of morpheme itself is':<23}", f'{len(token_itself):8,}')
print(f"{'# of total token is':<23}", f'{len(flatten_token):8,}')

In [None]:
# Count how often each part-of-speech (POS) tag occurs among the
# single-morpheme tags, most frequent first.
pos = [tag for _lex, tag in token_itself]
pos_count = Counter(pos)
popular_pos = pos_count.most_common()

In [None]:
# The POS tag table is documented at
# https://github.com/kakao/khaiii/wiki/%EC%BD%94%ED%8D%BC%EC%8A%A4
objects = [tag for tag, _count in popular_pos]
performance = [count for _tag, count in popular_pos]
y_pos = np.arange(len(objects))

# Horizontal bar chart: one bar per POS tag, bar length = number of occurrences.
plt.barh(y_pos, performance, align='center', alpha=0.5)
plt.yticks(y_pos, objects)
plt.xlabel('Usage')
plt.title('Part of Speech - Tags')

plt.show()

플레이리스트 제목을 형태소 분석

In [None]:
# Clean the playlist titles in place, then store the (morpheme, POS) tuples of
# each cleaned title in a new 'ply_token' column.
train['plylst_title'] = re_sub(train['plylst_title'])
train.loc[:, 'ply_token'] = train['plylst_title'].map(lambda x: get_token(x, tokenizer))


In [None]:
# POS tags to keep: general noun, foreign word, proper noun, general adverb,
# number, root.
using_pos = ['NNG','SL','NNP','MAG','SN','XR']
# Keep only the title tokens whose POS tag is in using_pos. The column is
# computed once; the original assigned the identical expression twice.
train['ply_token_basic'] = train['ply_token'].map(
    lambda tokens: [token for token in tokens if token[1] in using_pos]
)

In [None]:
# Distinct (morpheme, POS) pairs among the single-morpheme tags, and the
# morpheme text of each pair.
unique_tag = set(token_itself)
unique_word = [lex for lex, _pos in unique_tag]

In [None]:
# The goal is to predict the ground-truth tags, so keep only the title
# morphemes that also occur among those tags.
# unique_word is a list; build a set once so each membership test is a hash
# lookup instead of a scan of the whole list for every token of every row.
tag_vocab = set(unique_word)
train['ply_token_tag'] = train['ply_token'].map(
    lambda tokens: [token for token in tokens if token[0] in tag_vocab]
)
train.head(10)

In [None]:
# Show the last 20 rows, including the newly added token columns.
train.tail(20)

In [None]:
# Save the tokenized training frame as table-oriented JSON. The path is
# relative, so the file is written to the current working directory.
train.to_json('train_tokenized.json', orient='table')

In [None]:
# Load tag_song_pair.json from Drive as a DataFrame (typ='frame').
taglist = pd.read_json('/gdrive/My Drive/Colab Notebooks/tag_song_pair.json', typ='frame')

In [None]:
# Load tag_group_list.json (table orientation) from Drive.
# NOTE(review): this rebinds `train`, so the tokenized frame built earlier is no
# longer reachable under that name — confirm this is intended.
train = pd.read_json('/gdrive/My Drive/Colab Notebooks/tag_group_list.json', orient = 'table')