In [3]:
from konlpy.tag import Twitter
import pandas as pd
import pickle
import re

In [4]:
# Shared part-of-speech tagger instance; tokenize() calls tagger.pos() on it.
# NOTE(review): newer KoNLPy releases appear to expose this tagger as `Okt`,
# with `Twitter` kept only as a deprecated alias — confirm against the installed version.
tagger = Twitter()

## load pickle

In [5]:
# Deserialize the actors table from its pickle file.
# Only unpickle files produced locally: pickle can execute arbitrary code on load.
with open('../data/actors.pick', mode='rb') as actors_file:
    actors = pickle.load(actors_file)

In [6]:
# Quick sanity check: dimensions first, then the first three rows.
print(actors.shape)
actors.head(n=3)

(8, 2)


Unnamed: 0,name,type
0,김태희,배우
1,공유,배우
2,이병헌,배우


In [7]:
# Deserialize the NamuWiki dump from its pickle file.
# Only unpickle files produced locally: pickle can execute arbitrary code on load.
with open('../data/namuwiki.pick', mode='rb') as namu_file:
    frame = pickle.load(namu_file)

In [8]:
# Quick sanity check: dimensions first, then the first three rows.
print(frame.shape)
frame.head(n=3)

(842910, 4)


Unnamed: 0,contributors,namespace,text,title
0,"[namubot, R:hoon12560]",0,#redirect 느낌표\n,!
1,"[anatra95, chkong1998, Iviyuki, kirby10, max02...",0,[[파일:3444050440.jpg]]\n([[신 세계수의 미궁 2]]에서 뜬 !!...,!!아앗!!
2,"[ABC, AhnJ2000, aottkd3014, dream33, Fairy, ga...",0,"[include(틀:다른 뜻1, other1=말줄임표 등으로 사용하는 용어, rd1...",“……”


In [22]:
def get_article(title):
    """Return the text of the article whose title equals `title`.

    Looks the title up in the module-level `frame`. When several rows share
    the title, the first one wins; when none does, an empty string is returned.
    """
    hits = frame.loc[frame['title'] == title]
    return "" if hits.empty else hits.text.values[0]

In [23]:
# Matches a redirect stub such as "#redirect 느낌표" and captures the target.
pat_redirect = re.compile(r'^#redirect (.+)')
# Splits "title#anchor" into its two halves. Written as a raw string so that
# "\#" reaches the regex engine untouched instead of being an invalid
# string escape (which newer Python versions warn about).
pat_index = re.compile(r'(.+?)\#(.+)')


def redirect_filter(text):
    """Strip a trailing ``#anchor`` from a redirect target.

    Args:
        text: Redirect target, e.g. ``"문서#s-2"``.

    Returns:
        The part before the first ``#`` that has at least one character on
        each side; `text` unchanged when there is no such ``#``.
    """
    match = pat_index.match(text)
    return match.group(1) if match else text


def check_redirect(text):
    """Detect a redirect stub and extract the title it points to.

    Args:
        text: Raw article text.

    Returns:
        The redirect target (whitespace-stripped, anchor removed) when `text`
        starts with ``#redirect ``; otherwise ``False``.
    """
    match = pat_redirect.match(text)
    if match:
        return redirect_filter(match.group(1).strip())
    return False

In [24]:
# Wiki-markup patterns shared by the filters below.
# [[...]] : any double-bracket span
pat_bracket = re.compile(r'\[\[(.+?)\]\]')
# [[파일:...]] : double-bracket span whose content starts with "파일:"
pat_file = re.compile(r'\[\[파일:(.+)\]\]')
# [[left|right]] : double-bracket span with a "|" separator
pat_link = re.compile(r'\[\[(.+?)\|(.+?)\]\]')
# [*...] : comment span
pat_comment = re.compile(r'\[\*(.+?)\]')
# {{{...}}} : triple-brace span
pat_high = re.compile(r'\{\{\{(.+?)\}\}\}')
# [include(틀:...)] : template include
pat_frame = re.compile(r'\[include\(틀:(.+?)\)\]')


def article_filter(text):
    """Resolve a redirect stub to the text of the article it points to.

    Only one level of redirection is followed. Text that is not a redirect
    is returned unchanged; for a redirect, whatever `get_article` returns
    for the target title is returned.
    """
    chk = check_redirect(text)
    if chk:
        text = get_article(chk)
    return text


def bracket_filter(text):
    """Reduce a single ``[[...]]`` span to the text that should remain.

    Args:
        text: The complete span, including the surrounding brackets.

    Returns:
        ``""`` for a ``[[파일:...]]`` span, the part after ``|`` for a
        ``[[left|right]]`` span, and otherwise the content between the
        brackets.
    """
    if pat_file.match(text):
        return ""
    match = pat_link.match(text)
    if match:
        return match.group(2)
    return text[2:-2]


def context_filter(text):
    """Strip wiki markup from `text`, keeping the readable content.

    The stages run in this order, each on the result of the previous one:

    1. ``[include(틀:X)]`` -> ``X``
    2. ``[[...]]``         -> result of `bracket_filter`
    3. ``[*X]``            -> ``X``
    4. ``{{{X}}}``         -> ``X``

    Stage 3 previously re-inserted the whole match while still shifting its
    running offset by three characters, so comment markers were never removed
    and any text containing two or more of them was spliced at the wrong
    position. Each stage now uses `Pattern.sub`, which needs no offset
    bookkeeping.

    Args:
        text: Raw article text.

    Returns:
        The text with the markup listed above unwrapped or removed.
    """
    text = pat_frame.sub(lambda m: m.group(1), text)
    text = pat_bracket.sub(lambda m: bracket_filter(m.group(0)), text)
    text = pat_comment.sub(lambda m: m.group(1), text)
    text = pat_high.sub(lambda m: m.group(1), text)
    return text

In [25]:
def tokenize(content):
    """Return the nouns found in `content` as "word/tag" strings.

    The module-level `tagger` produces (word, tag) pairs; only pairs tagged
    'Noun' are kept, in the order the tagger emits them.
    """
    nouns = []
    for word, tag in tagger.pos(content):
        if tag != 'Noun':
            continue
        nouns.append("{}/{}".format(word, tag))
    return nouns

In [26]:
class Words:
    """Re-iterable view over the rows of a frame, passed through a filter.

    Every call to ``iter()`` starts a fresh pass over ``frame.iterrows()``
    and yields ``filt(row)`` for each row, so the same instance can be
    consumed more than once.
    """

    def __init__(self, frame, filt = lambda x: x):
        # frame: object exposing iterrows(); filt: callable applied to each row.
        self.filt = filt
        self.frame = frame

    def __iter__(self):
        # iterrows() yields (index, row) pairs; only the row is of interest.
        rows = (row for _index, row in self.frame.iterrows())
        yield from map(self.filt, rows)

# word2vec

In [6]:
from os import listdir

In [7]:
# List the entries of the current working directory (exploratory check).
listdir('.')

['word2vec.py',
 'Cluster.ipynb',
 'Cluster.py',
 'check.json',
 '.ipynb_checkpoints',
 'word2vec.ipynb',
 'DataAnalysis.ipynb']

In [5]:
import gensim

In [28]:
# Lazy corpus: for each row, resolve redirects, strip markup, keep nouns.
namuWords = Words(
    frame,
    filt=lambda row: tokenize(context_filter(article_filter(row.text))),
)

In [29]:
# Second lazy pass over the same rows, built with the same filter chain:
# redirect resolution -> markup stripping -> noun extraction.
namuTrains = Words(
    frame,
    filt=lambda article: tokenize(context_filter(article_filter(article.text))),
)

In [None]:
# Untrained Word2Vec model; no arguments are passed, so every
# hyperparameter is left at the library default.
model = gensim.models.Word2Vec()

In [None]:
# Build the model vocabulary from the tokenized corpus (one full pass over namuWords).
model.build_vocab(namuWords)

In [None]:
# Train on the corpus. Current gensim releases require the corpus size and
# the number of passes to be given explicitly; both are read back from the
# model (corpus_count is recorded by build_vocab, epochs is the model setting).
model.train(namuTrains, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Persist the model to ../data/model.
model.save('../data/model')

In [None]:
# Nearest neighbours of the noun token '아이유/Noun'. Similarity queries are
# served by the model's KeyedVectors (`model.wv`); the model-level shortcut
# is deprecated and no longer exists in gensim 4.
model.wv.most_similar('아이유/Noun')