In [13]:
from konlpy.tag import Twitter
import pickle
import re

In [10]:
# Shared KoNLPy tagger instance; tokenize() calls its .pos() method on each article.
# NOTE(review): newer KoNLPy releases appear to rename Twitter to Okt — confirm
# against the installed version before re-running.
tagger = Twitter()

## load pickle

In [None]:
# Load the pickled `kowiki` object from the data directory.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): `kowiki` is not referenced by any other cell visible in this notebook.
with open('../data/kowiki.pick', 'rb') as f:
    kowiki = pickle.load(f)

In [2]:
# Load the pickled namuwiki table into `frame`; the helper functions and the
# FrameIter corpus in later cells all read from this global.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../data/namuwiki.pick', 'rb') as f:
    frame = pickle.load(f)

In [3]:
# Sanity-check the load: print the table's shape, then display its first three rows.
print (frame.shape)
frame.head(3)

(842910, 4)


Unnamed: 0,contributors,namespace,text,title
0,"[namubot, R:hoon12560]",0,#redirect 느낌표\n,!
1,"[anatra95, chkong1998, Iviyuki, kirby10, max02...",0,[[파일:3444050440.jpg]]\n([[신 세계수의 미궁 2]]에서 뜬 !!...,!!아앗!!
2,"[ABC, AhnJ2000, aottkd3014, dream33, Fairy, ga...",0,"[include(틀:다른 뜻1, other1=말줄임표 등으로 사용하는 용어, rd1...",“……”


In [119]:
def get_article(title):
    """Look up an article body by exact title in the global ``frame``.

    Args:
        title: Title to compare against the ``title`` column.

    Returns:
        The ``text`` value of the first row whose title equals ``title``,
        or an empty string when no row matches.
    """
    hits = frame.loc[frame['title'] == title]
    if not hits.empty:
        # Several rows may share a title; only the first one is used.
        return hits.text.values[0]
    return ""

In [107]:
# Raw strings are used so that the backslash in `\#` reaches the regex engine
# verbatim instead of being an invalid Python string escape.
# A page whose text starts with "#redirect <target>" (target captured up to end of line).
pat_redirect = re.compile(r'^#redirect (.+)')
# "<title>#<anchor>": group 1 is the shortest non-empty prefix before a '#'.
pat_index = re.compile(r'(.+?)\#(.+)')

def redirect_filter(text):
    """Drop a trailing ``#anchor`` part from a redirect target.

    Args:
        text: Redirect target, possibly of the form ``title#anchor``.

    Returns:
        The part before the first ``#`` that has at least one character in
        front of it, or ``text`` unchanged when there is no such ``#``.
    """
    match = pat_index.match(text)
    if match:
        return match.group(1)
    return text

def check_redirect(text):
    """Detect a ``#redirect`` page and extract the title it points to.

    Args:
        text: Raw article text.

    Returns:
        The redirect target (whitespace-stripped, ``#anchor`` removed) when
        ``text`` starts with ``#redirect ``; otherwise ``False``.
    """
    match = pat_redirect.match(text)
    if match:
        return redirect_filter(match.group(1).strip())
    else:
        return False

In [79]:
# Markup patterns consumed by the filters in this cell.
pat_bracket = re.compile(r'\[\[(.+?)\]\]')         # any [[...]] construct
pat_file = re.compile(r'\[\[파일:(.+)\]\]')         # [[파일:...]] entry
pat_link = re.compile(r'\[\[(.+?)\|(.+?)\]\]')     # [[left|right]] pair
pat_comment = re.compile(r'\[\*(.+?)\]')           # [*...] span
pat_high = re.compile(r'\{\{\{(.+?)\}\}\}')        # {{{...}}} span
pat_frame = re.compile(r'\[include\(틀:(.+?)\)\]')  # [include(틀:...)] directive

def article_filter(text):
    """Resolve a redirect page to the text of the article it points at.

    Args:
        text: Raw article text.

    Returns:
        ``get_article(target)`` when ``check_redirect(text)`` yields a target
        (a single hop only); otherwise ``text`` unchanged.
    """
    chk = check_redirect(text)
    if chk:
        text = get_article(chk)
    return text

def bracket_filter(text):
    """Reduce one ``[[...]]`` construct to the plain text that should remain.

    Args:
        text: A complete ``[[...]]`` token (as matched by ``pat_bracket``).

    Returns:
        ``""`` for a ``[[파일:...]]`` token, the part after ``|`` for a
        ``[[left|right]]`` token, and otherwise the token with its two
        leading and two trailing characters removed.
    """
    if pat_file.match(text):
        return ""
    link = pat_link.match(text)
    if link:
        return link.group(2)
    return text[2:-2]

def context_filter(text):
    """Strip wiki markup from ``text``, keeping the readable content.

    The passes run in this order, each on the output of the previous one:

    1. ``[include(틀:X)]`` -> ``X``
    2. ``[[...]]``          -> ``bracket_filter`` result
    3. ``[*X]``             -> ``X``
    4. ``{{{X}}}``          -> ``X``

    Args:
        text: Article text containing markup.

    Returns:
        The text with the constructs above unwrapped or removed.
    """
    # Callables are used as replacements so that backslashes in the captured
    # text are inserted literally rather than read as template escapes.
    text = pat_frame.sub(lambda m: m.group(1), text)
    text = pat_bracket.sub(lambda m: bracket_filter(m.group(0)), text)
    # The earlier manual-offset loop re-inserted the whole match here while
    # still shifting later offsets by 3, which garbled any text containing
    # more than one [*...] span; unwrapping to group(1) matches that offset.
    text = pat_comment.sub(lambda m: m.group(1), text)
    text = pat_high.sub(lambda m: m.group(1), text)
    return text

In [80]:
def tokenize(content):
    """Split ``content`` into ``"word/tag"`` tokens using the global tagger.

    Args:
        content: Text handed to ``tagger.pos``.

    Returns:
        A list with one ``"word/tag"`` string per (word, tag) pair that
        ``tagger.pos(content)`` yields, in the same order.
    """
    tagged = []
    for word, tag in tagger.pos(content):
        tagged.append("{}/{}".format(word, tag))
    return tagged

In [None]:
class FrameIter:
    """Re-iterable wrapper that yields a filtered value for every row of a frame.

    Each call to ``iter()`` starts a fresh pass over ``frame.iterrows()``, so
    the same instance can be consumed more than once.

    Args:
        frame: Object exposing ``iterrows()`` that yields ``(index, row)`` pairs.
        filt: Callable applied to each row; defaults to the identity function.
    """

    def __init__(self, frame, filt=lambda x:x):
        self.filt = filt
        self.frame = frame

    def __iter__(self):
        rows = self.frame.iterrows()
        for pair in rows:
            # pair is (index, row); only the row is passed to the filter.
            yield self.filt(pair[1])

# word2vec

In [82]:
import gensim

In [83]:
# Corpus for Word2Vec: for every row, resolve redirects (article_filter), strip
# markup (context_filter) and tokenize the result. FrameIter restarts from the
# first row on each iter(), so the whole pipeline re-runs for every pass.
frameiter = FrameIter(frame, filt=lambda x : tokenize(context_filter(article_filter(x.text))))

In [85]:
# Create the Word2Vec model with no arguments (library defaults); the vocabulary
# and training data are supplied by the build_vocab / train cells that follow.
model = gensim.models.Word2Vec()

In [None]:
# First full pass over the corpus: every row is filtered and tokenized to build the vocabulary.
model.build_vocab(frameiter)

!
!!아앗!!
“……”
"모루" 신 파이
#
#FairyJoke
#FairyJoke #SDVX_Edit
#Fairy_dancing_in_lake
#The_Relentless
$
$00pah NiN10Doh!
%
%2A뮤트
%5C
%P
&
&fmt=
'Not' Based On True Story
'O Sole Mio
페텔기우스 로마네콩티
사테라(Re: 제로부터 시작하는 이세계 생활)
요코야마 슌소
레굴루스 코르니아스
에키드나(Re: 제로부터 시작하는 이세계 생활)
(...)
(What's the story) Morning Glory
(What's the story) Morning Glory?
(miss)understood
(…)
(구)뒤틀린 숲
(구)버섯의 성
(구)서울 버스 150
(구)소환사의 협곡
(웃음)
(주)모모
한화(기업)
*
*(Asterisk)
*NSYNC
*nix
*뮤트
*헬로, 플래닛.
*현애
+틱 언니
+틱언니
,
-
- dirty rouge -
-+
-1
-2CA
-3CA
-8
-Dreaming Girl-사랑,처음 뵙겠습니다
-ERROR
-Rayrain-
-_-
-aira-
…
.22 LR
.22 Long Rifle
.223 피스톨
.300
.308
.32 피스톨
.338
.338 Lapua Magnum
.357 매그넘
.38 Special
.38 스페셜
.40 S&W
.44 매그넘
.44 스페셜
.45 ACP
.45 자동 권총
.45-70 Government
.50
.50 BMG
.50 Beowulf
.577 T-Rex
.59
.5: The Gray Chapter
.600 Nitro Express
.950 JDJ
.flow
.hack
.hack 세상의 저편에
.hack/G.U.
.hack/G.U.시리즈
.hack/Link
.hack/Quantum
.hack/SIGN
.hack/Versus
.hack/Vol 시리즈
.kkrieger
0
0.02
0.03
0/1 ANGEL
00-Aretha
007
007 시리즈
0083
00유닛
010-44

In [None]:
# Train on the corpus; this iterates frameiter again, re-running the filters and tagger.
# NOTE(review): newer gensim releases appear to require explicit total_examples
# (e.g. model.corpus_count) and epochs arguments here — confirm against the
# installed gensim version.
model.train(frameiter)

In [None]:
# Persist the model next to the input pickles in the data directory.
model.save('../data/model')