## 라이브러리 읽기

In [16]:
# 기본 라이브러리 로딩
import pandas as pd
import numpy as np
import re # 정규표현식

# 토픽 모델링 관련 라이브러리 로딩
import gensim 
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import LdaModel

# 시각화 라이브러리
from pprint import pprint
import pyLDAvis
from pyLDAvis import gensim

# 토크나이징 라이브러리
import MeCab
m = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ko-dic')
from konlpy.tag import Mecab
mecab = Mecab()

### Load Data

In [11]:
df = pd.read_csv("battlenetproData.csv", encoding='utf-8')

In [12]:
df.head()

Unnamed: 0.1,Unnamed: 0,level_0,index,link_id,user_id,title,contents,new_datetime
0,0,0,0,4564557252,비전력이부조카당,"아바투르 유저인데, 하나 건의합니다.",아바투르 유저인데 하나 건의합니다새로 생긴 하나무라 맵뭐 이것도 맵 자체에 할 말이...,2017-05-06 05:21:00
1,1,1,1,4567016696,네이팜데쓰,"매칭 시스템, 아무리 생각해도 문제가 많습니다",매칭좀 제대로 해주세요9연패가 말이 되는 처사입니까 며칠전에 16연승했다고 벌을 주...,2018-09-08 16:53:00
2,2,2,2,4567016695,souse,ㅈ같은 매칭좀 어캐좀 해봐요,왜 내가 잘해도 ㅈ같이못하는 새키들때문에 게임을 져야하죠 왜 심지어 못하는새키랑 또...,2018-09-08 16:22:00
3,3,3,3,4567026707,Best,아나 스킨관련..,신 스킨 살무사 아나 수면총 색 고정말고 스킨 색 대로 해주셧으면 좋겠습니다 처...,2018-09-08 15:13:00
4,4,4,4,4567036673,세월,5인팟 + 승률50%집착매칭 + 밸런스 파괴 = 빠대무간지옥,승률 50 만든다고 조합 내놓고5인팟은 여전히 솔팟 사이에서 개판치고다니고너프할 애...,2018-09-07 13:58:00


### Tokenizing

In [13]:
# 포스태깅 - 잘 되어있는지 체크 
tokens_check = []

for i in list(df["contents"]):
    k = mecab.pos(i)
    tokens_check.append(k)

print(tokens_check[:10])

NotImplementedError: Wrong number or type of arguments for overloaded function 'Tagger_parse'.
  Possible C/C++ prototypes are:
    MeCab::Tagger::parse(MeCab::Model const &,MeCab::Lattice *)
    MeCab::Tagger::parse(MeCab::Lattice *) const
    MeCab::Tagger::parse(char const *)


In [17]:
# 명사형 추출

tokens_all = []

for i in list(df["contents"]):
    k = mecab.nouns(i)
    tokens = []
    
    for j in range(len(k)):
        if len(k[j]) > 1:
            tokens.append(k[j])
        else:
            continue
    tokens_all.append(tokens)
    
print(tokens_all[:10])

NotImplementedError: Wrong number or type of arguments for overloaded function 'Tagger_parse'.
  Possible C/C++ prototypes are:
    MeCab::Tagger::parse(MeCab::Model const &,MeCab::Lattice *)
    MeCab::Tagger::parse(MeCab::Lattice *) const
    MeCab::Tagger::parse(char const *)


In [None]:
tokens_df = pd.DataFrame({"tokens":tokens_all})

In [None]:
tokens_df.head()

In [None]:
df10["tokens_contents"] = tokens_all

In [None]:
id2word = corpora.Dictionary(df10["tokens_contents"])

In [None]:
texts = df10["tokens_contents"]

In [85]:
corpus = [id2word.doc2bow(text) for text in texts]

print(corpus[:1])

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 3), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)]]


In [86]:
id2word[0]

'4년'

In [87]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('4년', 2),
  ('식', 1),
  ('못하는새키랑', 1),
  ('새키들때문', 1),
  ('게임', 2),
  ('요즘', 1),
  ('강등전', 1),
  ('같은판되서', 1),
  ('ㅈ같이못하', 1),
  ('밸런스', 1),
  ('새키들', 1),
  ('매칭', 1),
  ('나', 3),
  ('인성파탄나', 1),
  ('심한욕까', 1),
  ('사랑', 1),
  ('ㅈ같', 1),
  ('다이긴', 1),
  ('블리자드', 1),
  ('시스템', 1),
  ('히오스', 1),
  ('개사랑햇는데', 1),
  ('매칭시스템', 1),
  ('애정', 1),
  ('죽여버리고싶은욕구', 1),
  ('지경', 1),
  ('올해', 1)]]

In [105]:
# Build LDA model

%time lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=10, random_state=100, \
                                                  update_every=1, \
                                                  chunksize=100,\
                                                  passes=10, \
                                                  alpha='auto', \
                                                  per_word_topics=True)

CPU times: user 3min 7s, sys: 11.4 s, total: 3min 18s
Wall time: 3min 28s


In [106]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.040*"스킨" + 0.023*"오니겐지" + 0.009*"2주차" + 0.008*"초상화" + 0.007*"진심" + '
  '0.007*"경찰디바" + 0.007*"공지" + 0.006*"궁" + 0.006*"누더기" + 0.005*"첫째주"'),
 (1,
  '0.013*"서버" + 0.011*"완료" + 0.007*"기준" + 0.007*"점수" + 0.006*"그거" + 0.006*"뭔" '
  '+ 0.006*"승리" + 0.006*"레벨" + 0.006*"일반" + 0.005*"바리"'),
 (2,
  '0.050*"히오스" + 0.044*"게임" + 0.036*"이벤트" + 0.034*"진짜" + 0.020*"나" + '
  '0.014*"유저들" + 0.013*"블리자드" + 0.012*"이" + 0.011*"뭐" + 0.008*"판"'),
 (3,
  '0.036*"매칭" + 0.024*"빠대" + 0.019*"시공" + 0.014*"조합" + 0.012*"일반전" + '
  '0.012*"영리" + 0.011*"시스템" + 0.011*"빠른대전" + 0.009*"파티" + 0.009*"등급전"'),
 (4,
  '0.062*"것" + 0.035*"수" + 0.019*"생각" + 0.016*"때" + 0.012*"오버워치" + 0.011*"퀘스트" '
  '+ 0.011*"데" + 0.010*"영웅" + 0.009*"이" + 0.008*"말"'),
 (5,
  '0.019*"인공지능" + 0.010*"궁금" + 0.007*"니" + 0.007*"탈주자" + 0.006*"니들" + '
  '0.006*"아이템" + 0.006*"발리라" + 0.005*"처리" + 0.005*"ㅋㅋㅋ" + 0.004*"로딩"'),
 (6,
  '0.025*"욕" + 0.008*"15판" + 0.007*"ㅠㅠ" + 0.005*"브락시스" + 0.004*"항전" + '
  '0.004*"용검" + 0.004*"장난" + 0.004*"정예" + 0.0

### coherence

In [107]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus)) # a measure of how good the model is. lower the better.
 
# Compute Coherence Score using c_v
coherence_model_lda = CoherenceModel(model=lda_model, texts=df["tokens_contents"], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score_c_v: ', coherence_lda)


Perplexity:  -10.774218316624724

Coherence Score:  0.42473351325729614


In [None]:
# Compute Coherence Score using UMass
coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence="u_mass")
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score_u_mass: ', coherence_lda)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model=LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=docs, start=2, limit=40, step=6)

# Show graph
import matplotlib.pyplot as plt
limit=40; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [108]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis