# 전처리 및 토큰화 과정 step by step


### Soynlp (‘딥러닝을 이용한 자연어 처리 입문’도서 참고)

Non-null만 추출 -> 광고성 글 제거 -> soynlp normalizer -> soyspacing -> 정규식 적용(한글 및 영어 문자만 남기고 나머지 제거 + url형태 문자열 제거) -> LRNounExtractor_v2 명사추출 -> 불용어 제외


## 0. 환경 setup

In [1]:
import pandas as pd
import numpy as np
import re
import pickle

## 1. 광고 제거까지 정제된 데이터 로드

Non-null만 추출 -> 광고성 글 제거 까지 처리되어 csv저장된 data 로드

In [52]:
df_blog = pd.read_csv('./zerowaste/processed4200/blog_non-null_no-ads_4200.csv')
df_blog.head(3)

Unnamed: 0,URL,Date,Like,Content,Hashtag,SNS
0,https://blog.naver.com/lsy_sweet/222457159636,2021. 8. 15. 9:58,12.0,"안녕하세요, 친환경 살림을 하며 환경 보존을 위해 노력하는 단순이 입니다. 그동안 ...",#고체치약 #제로웨이스트 #고체치약본티 #본티치약 #Vontee,blog
1,https://blog.naver.com/newblack76/222468647865,2021. 8. 15. 9:49,,모두들 안녕하셨어요~ 입추가 지나고 말복이 지나니 이젠 바람이 틀려진듯해요^^ 금방...,#국산천연수세미 #제로웨이스트선물 #국내산천연수세미 #북어표수세미 #잘라쓰는수세미 ...,blog
2,https://blog.naver.com/somcandy117/222460526981,2021. 8. 15. 9:10,2.0,오빠가 유리빨대의 딱딱한 질감이 싫다고 해서 100퍼 자연주의 풀빨대를 사봤다 배송...,#풀빨대 #자연빨대 #제로웨이스트 #생분해빨대 #베트남풀빨대 #친환경빨대 #환경보호...,blog


In [53]:
df_insta = pd.read_csv('./zerowaste/processed4200/insta_non-null_no-ads_4200.csv')
df_insta.head(3)

Unnamed: 0,URL,Date,Like,Content,Hashtag,SNS
0,vege_yony,2021-08-15 23:59:36+00:00,49,- 그릭요거트 다들 식사로 먹지만 간식으로 먹는 사람 나야 나- 카페 조감 비건그릭...,"['#비건그릭요거트', '#쑥브라우니', '#비건쑥디저트']",instagram
1,eco_kapegg,2021-08-15 23:45:31+00:00,16,⠀ 더운날🥲 음료필수장착해야되쥬 중부사부소 조과장님의 텀블러 용기내👍👏 ⠀ 텀블러에...,"['#용기내', '#제로웨이스트', '#환경보호', '#텀블러', '#캠페인', '...",instagram
2,eco_kapegg,2021-08-15 23:42:02+00:00,14,⠀ 중부사무소 서대리님의 주말 장보기🤗 미리 장바구니를 챙겨서 봉투사용을 줄였습니다...,"['#제로웨이스트', '#캠페인', '#장바구니', '#일회용품줄이기', '#봉투줄...",instagram


In [54]:
df_cafe = pd.read_csv('./zerowaste/processed4200/cafe_non-null_no-ads_4200.csv')
df_cafe.head(3)

Unnamed: 0,URL,Date,Like,Content,Hashtag,SNS
0,https://cafe.naver.com/applestore99/1704?art=a...,2021.08.15. 23:25,0.0,​​​써보고 평이 가장 좋았던 갓성비 샴푸바 이구요. 가격후기별 순위모음​​​ <...,,cafe
1,https://cafe.naver.com/myomahealing/151141?art...,2021.08.15. 23:01,3.0,"글쓰기 전 기본 정보가 있는 '공부합시다' 게시판을 확인하시고, 중복되는 문의글이 ...",,cafe
2,https://cafe.naver.com/donggubat0/72?art=aW50Z...,2021.08.15. 22:39,2.0,"동구밭과 첫 인연을 맺게 해준, 그 첫 인연이후 쭉 저의 최애템은 '올바른 설거지 ...",,cafe


In [55]:
print(len(df_blog))
print(len(df_insta))
print(len(df_cafe))

4200
4200
4200


## 2. Soyspacing & SOYNLP normalizer 처리

#### soyspacing

띄어쓰기 에러 처리 및 이모티콘, 반복 글자 정제

In [None]:
import soyspacing
print(soyspacing.__version__)

In [None]:
from soyspacing.countbase import RuleDict, CountSpace

model2 = CountSpace()
model2.load_model('./soyspacing/demo_model/test.model', json_format=False)

verbose=False
mc = 10  # min_count
ft = 0.3 # force_abs_threshold
nt =-0.3 # nonspace_threshold
st = 0.3 # space_threshold

In [None]:
def fix_spacing(sent):
    sent_corrected, tags = model2.correct(
        sent,
        verbose=verbose,
        force_abs_threshold=ft,
        nonspace_threshold=nt,
        space_threshold=st,
        min_count=mc
    )
    return sent_corrected

In [None]:
df_blog['Content'] = df_blog['Content'].apply(lambda x: fix_spacing(x))
df_insta['Content'] = df_insta['Content'].apply(lambda x: fix_spacing(x))
df_cafe['Content'] = df_cafe['Content'].apply(lambda x: fix_spacing(x))

#### SOYNLP의 Normalizer

대화 데이터, 댓글 데이터에 등장하는 반복되는 이모티콘의 정리 및 한글, 혹은 텍스트만 남기기 위한 함수를 제공합니다.

예시)

emoticon_normalize('ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ쿠ㅜㅜㅜㅜㅜㅜ', num_repeats=3)

결과:  'ㅋㅋㅋㅜㅜㅜ'

In [None]:
from soynlp.normalizer import *

In [None]:
def normalize_all(df):
    for idx in df.index:
        df.loc[idx,'Content'] = emoticon_normalize(df.loc[idx,'Content'], num_repeats=3) 
        df.loc[idx,'Content'] = repeat_normalize(df.loc[idx,'Content'], num_repeats=2)
    return df

In [None]:
df_blog_norm = normalize_all(df_blog)
df_insta_norm = normalize_all(df_insta)
df_cafe_norm = normalize_all(df_cafe)

## 3. 게시글별 문장 리스트화 + 정규식 적용

게시글 내용을 문장단위로 나누어 리스트화 함

URL 제외, 한글과 영어만 남도록 정제

In [None]:
def separate_sentences(text, punc):
    # syonlp noun extractor 사용을 위해 문장사이 doublespace 삽입
    sentences=[]; onesentence=""
    for ele in text:
        onesentence += ele
        if ele in punc:
            sentences.append(onesentence+"  ")
            onesentence=""
    return sentences
            
def regular_expression(text): 
    # url제거
    url = '(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
    pattern_url = re.compile(url)
    tmp = pattern_url.sub('', text)
    # regex추출 규칙: 영어 또는 띄어 쓰기(1 개)를 포함한 한글
    pattern = re.compile('[^a-zA-Z| ㄱ-ㅣ 가-힣]')  
    result = pattern.sub('', tmp)  # 위에 설정한 "hangul"규칙을 "text"에 적용(.sub)시킴
    return result

In [None]:
def apply_sep_regex(text,punct):
    sentences = separate_sentences(text,punct)
    reg_sentences=[]
    for s in sentences:
        reg_sentences.append(regular_expression(s))
    return reg_sentences

In [None]:
#punctuation = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
punctuation = '''!.?~'''

df_blog_norm['Content'] = df_blog_norm['Content'].apply(lambda x: apply_sep_regex(x, punctuation))
df_insta_norm['Content'] = df_insta_norm['Content'].apply(lambda x: apply_sep_regex(x, punctuation))
df_cafe_norm['Content'] = df_cafe_norm['Content'].apply(lambda x: apply_sep_regex(x, punctuation))

In [None]:
def elminate_empty_lines(text):
    non_empty=[]
    for line in text:
        if len(line)>0 and line!=' ': non_empty.append(line)
    return non_empty         

In [None]:
df_blog_norm['Content'] = df_blog_norm['Content'].apply(lambda x: elminate_empty_lines(x))
df_insta_norm['Content'] = df_insta_norm['Content'].apply(lambda x: elminate_empty_lines(x))
df_cafe_norm['Content'] = df_cafe_norm['Content'].apply(lambda x: elminate_empty_lines(x))

## 4. 토큰화 

### SOYNLP의 Noun Extractor(v.2) 활용 

soynlp는 품사 태깅, 단어 토큰화들을 제공함.
비지도 학습으로 단어 토큰화 진행 - 데이터에서 자주 등장하는 단어들 (신조어 포함) 단어로 분석함.

soynlp=0.0.46+ 에서는 명사 추출기 version 2 를 제공합니다. 이전 버전의 명사 추출의 정확성과 합성명사 인식 능력, 출력되는 정보의 오류를 수정한 버전입니다. 

In [None]:
from soynlp.noun import LRNounExtractor_v2

noun_extractor = LRNounExtractor_v2(verbose=True, extract_compound=True)

#### 1. 각 게시글별 명사(+게시글별 빈도수) 추출

df['Content']의 각 게시글을 문장단위로 나뉜 elements를 가진 list이다.

sents = df_blog.loc[idx,'Content']

sents를 noun_extractor에 넣어서 각 게시글별 명사와 빈도수 및 score를 추출한다

In [None]:
def extract_nouns(df,freq):
    nouns=[]
    for idx in df.index:
        sents = df.loc[idx,'Content']
        try:
            nouns_per_posting = noun_extractor.train_extract(
            sents,
            min_noun_frequency=freq)
            nouns.append(nouns_per_posting)
        except:
            pass
    return nouns

In [None]:
# minfreq=3 설정 이유 - 서론,본론,결론에 적어도 한번씩 등장하는 단어라면 3+ 예상함.
nouns_minfreq_blog = extract_nouns(df_blog_norm,3)
nouns_minfreq_blog

In [None]:
nouns_minfreq_insta = extract_nouns(df_insta_norm,1)
nouns_minfreq_insta

In [None]:
nouns_minfreq_cafe = extract_nouns(df_cafe_norm,3)
nouns_minfreq_cafe

## 5. 불용어 (stopwords) 처리

명사가 아닌 단어와 불필요한 한글자 단어들을 제거한다.

In [None]:
stopwords=[]
stopwords = pd.read_csv("https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt").values.tolist()

stopwords에서 '우리', '저희' 는 빼야하는것같다. 제로웨이스트의 공동체 성격을 유지하기위해서.

In [None]:
stopwords.remove(['우리']); stopwords.remove(['저희']); stopwords.remove(['함께']); len(stopwords)

In [None]:
zerowaste_stopwords = ['제로','웨','이스트','이스터','제로 웨이스트','제로웨이스트','zero waster',\
                       'ZERO WASTER', 'Zero Waster','zerowaster','ZEROWASTER','ZeroWaster',
                       'zerowaste','ZEROWASTE','zero waste','ZERO WASTE','Zerowaste','Zero Waste',\
                       '제로 웨이스터','제로웨이스터','웨이스트','웨이스트 ', ' 웨이스트 ',' 웨이스트',\
                       '환경','친환경','너무', '그리고', '정말', '이렇게', '있어요', '때문', \
                       '정도', '조금', '분들', '진짜', '대한', '이번', '경우', '대신', '가지고',\
                       '그래서','엄청','아직','때문에','위한','그런데','그렇게','결국','것으로',\
                       '이거','있지만','사이','싶어서','나름','그것',\
                       '때문에','가장','것을','관련','있는데','것도','근데','무엇','있도록','물론',\
                       '보니','것이다','등을','더욱','등등','이것','같아서','있다면','있었어요',\
                       '있었는데','같습니다','않을까','않아도','어느','않게','너무나','이러','이곳',\
                       ]

In [None]:
for word in zerowaste_stopwords:
    stopwords.append(word)
print(len(stopwords))

In [None]:
include = ['낮', '땅', '밭', '꽃', '돌', '멋','맛', '폼', '물', '볕', '빛', '봄', '숲', '새', '산', '숨', '싹', '옷', '잎', '차', '흙', '힘']

In [None]:
# 불용어 제거
def eliminate_stopwords(nouns):
    filtered=[]
    for posting_nouns in nouns:
        filtered_per_post=[]
        for x in posting_nouns.keys():
            if (len(x) > 1 or  x in include) and (x not in stopwords): 
                filtered_per_post.append((x,posting_nouns[x]))
        filtered.append(filtered_per_post)
    return filtered

In [None]:
filtered_nouns_blog = eliminate_stopwords(nouns_minfreq_blog)
filtered_nouns_blog

In [None]:
filtered_nouns_insta = eliminate_stopwords(nouns_minfreq_insta)
filtered_nouns_insta

In [None]:
filtered_nouns_cafe = eliminate_stopwords(nouns_minfreq_cafe)
filtered_nouns_cafe

## extract된 nouns 저장(또는 미리 저장된 nouns불러오기)

뽑은 명사를 pickle파일로 저장했다. 각 플랫폼별, 게시글내 추출 명사와 빈도수 및 추출 점수가 저장된다.

In [None]:
import pickle

In [None]:
file_to_store = open("./zerowaste/processed4200/filtered_soynlp_blog.pkl", "wb")
pickle.dump(filtered_nouns_blog,file_to_store)

file_to_store.close()

In [None]:
file_to_store = open("./zerowaste/processed4200/filtered_soynlp_insta.pkl", "wb")
pickle.dump(filtered_nouns_insta,file_to_store)

file_to_store.close()

In [None]:
file_to_store = open("./zerowaste/processed4200/filtered_soynlp_cafe.pkl", "wb")
pickle.dump(filtered_nouns_cafe,file_to_store)

file_to_store.close()

저장된 pickle파일 불러오기

In [2]:
with open('./zerowaste/processed4200/filtered_soynlp_blog.pkl','rb') as input_file:
    filtered_nouns_blog = pickle.load(input_file) 

In [None]:
# with open('./zerowaste/processed4200/filtered_soynlp_insta.pkl','rb') as input_file:
#     filtered_nouns_insta = pickle.load(input_file) 

In [3]:
with open('./zerowaste/processed4200/filtered_soynlp_cafe.pkl','rb') as input_file:
    filtered_nouns_cafe = pickle.load(input_file) 

#### 인스타그램은 전처리된 hashtag가 추가된 noun 리스트를 사용

In [4]:
with open('./soynlp/hash_filtered_soynlp_insta.pkl','rb') as input_file:
    hash_filtered_nouns_insta = pickle.load(input_file) 

In [None]:
hash_filtered_nouns_insta[:3]

In [5]:
filtered_nouns_insta = hash_filtered_nouns_insta

In [None]:
filtered_nouns_blog[:2]

In [None]:
filtered_nouns_cafe[:2]

## 6. Data transformation

LDA 모델이 사용할 dictionary와 corpus를 생성한다.

### 6-1. dictionary 생성:

In [6]:
def get_tokenized(filtered_nouns):
    nouns_list=[]
    for posting in filtered_nouns:
        nouns_per_posting=[]
        if len(posting)>0:
            for word in posting:
                if len(word)>0:
                    nouns_per_posting.append(word[0])
        nouns_list.append(nouns_per_posting)
    return nouns_list

In [7]:
token_doc_blog = get_tokenized(filtered_nouns_blog)
token_doc_insta = get_tokenized(filtered_nouns_insta)
token_doc_cafe = get_tokenized(filtered_nouns_cafe)

In [8]:
token_doc = []
token_doc.extend(token_doc_blog)
token_doc.extend(token_doc_insta)
token_doc.extend(token_doc_cafe)

In [9]:
from gensim import corpora

dictionary = corpora.Dictionary(token_doc)
#corpus = [dictionary.doc2bow(text) for text in token_doc]

dictionary_blog = corpora.Dictionary(token_doc_blog)
#corpus_blog = [dictionary_blog.doc2bow(text) for text in token_doc_blog]

dictionary_insta = corpora.Dictionary(token_doc_insta)
#corpus_insta = [dictionary_insta.doc2bow(text) for text in token_doc_insta]

dictionary_cafe = corpora.Dictionary(token_doc_cafe)
#corpus_cafe = [dictionary_cafe.doc2bow(text) for text in token_doc_cafe]



In [10]:
print(len(dictionary))
print(len(dictionary_blog))
print(len(dictionary_insta))
print(len(dictionary_cafe))

8685
6301
2527
3067


In [11]:
dictionary[0]

'고민'

In [12]:
dictionary_blog[0]

'고민'

In [13]:
dictionary_insta[1]

'맛'

In [14]:
dictionary_cafe[1]

'샴푸바'

### 6-2. corpus 생성
이런..SOYNLP를 쓰면 corpus생성코드가 달리진다!!!!

In [15]:
filtered_nouns_blog[:2]

[[('고체치약', NounScore(frequency=14, score=1.0)),
  ('플라스틱', NounScore(frequency=5, score=1.0)),
  ('고민', NounScore(frequency=3, score=1.0)),
  ('사용', NounScore(frequency=17, score=1.0)),
  ('공항', NounScore(frequency=3, score=1.0)),
  ('주문', NounScore(frequency=4, score=1.0)),
  ('양치', NounScore(frequency=6, score=1.0))],
 [('천연수세미', NounScore(frequency=10, score=1.0)),
  ('화전상회', NounScore(frequency=5, score=1.0)),
  ('뉴블랙', NounScore(frequency=3, score=1.0)),
  ('수입산', NounScore(frequency=4, score=1.0)),
  ('원통', NounScore(frequency=4, score=1.0)),
  ('사용', NounScore(frequency=8, score=1.0)),
  ('국산', NounScore(frequency=8, score=1.0)),
  ('판매', NounScore(frequency=5, score=1.0)),
  ('노력', NounScore(frequency=3, score=1.0)),
  ('모습', NounScore(frequency=3, score=1.0)),
  ('압축', NounScore(frequency=6, score=1.0)),
  ('봉제', NounScore(frequency=5, score=1.0)),
  ('감사', NounScore(frequency=3, score=1.0)),
  ('저희', NounScore(frequency=4, score=1.0))]]

In [16]:
filtered_nouns_blog[0][0][1][0]

14

In [17]:
filtered_nouns_insta[:2]

[[('갸또', NounScore(frequency=1, score=0.5)),
  ('맛', NounScore(frequency=2, score=1.0))],
 [('텀블러', NounScore(frequency=2, score=1.0)),
  ('음료', NounScore(frequency=1, score=0.5))]]

In [18]:
filtered_nouns_cafe[:1]

[[('샴푸바', NounScore(frequency=18, score=0.8571428571428571)),
  ('비누', NounScore(frequency=3, score=0.6))]]

In [19]:
def get_index_for_words(nouns, dic):
    posting_list=[]
    for posting in nouns:
        per_post=[]
        for word in posting:
            for i in range(len(dic)):
                if dic[i] == word[0]:
                    per_post.append(i)
        posting_list.append(per_post)
    return posting_list 

In [20]:
def get_freq_for_words(nouns):
    posting_list=[]
    for posting in nouns:
        per_post=[]
        for word in posting:
            per_post.append(word[1][0])
        posting_list.append(per_post)
    return posting_list

In [21]:
def get_corpus(index_list, freq_list):
    posting_list=[]
    for i in range(len(index_list)):
        per_post = list(zip(index_list[i], freq_list[i]))
        posting_list.append(per_post)
    return posting_list

In [22]:
index_list_blog = get_index_for_words(filtered_nouns_blog, dictionary_blog)
freq_list_blog = get_freq_for_words(filtered_nouns_blog)
corpus_blog = get_corpus(index_list_blog, freq_list_blog)

In [23]:
corpus_blog[:2]

[[(1, 14), (6, 5), (0, 3), (3, 17), (2, 3), (5, 4), (4, 6)],
 [(17, 10),
  (19, 5),
  (10, 3),
  (13, 4),
  (15, 4),
  (3, 8),
  (8, 8),
  (18, 5),
  (9, 3),
  (11, 3),
  (14, 6),
  (12, 5),
  (7, 3),
  (16, 4)]]

In [24]:
index_list_insta = get_index_for_words(filtered_nouns_insta, dictionary_insta)
freq_list_insta = get_freq_for_words(filtered_nouns_insta)
corpus_insta = get_corpus(index_list_insta, freq_list_insta)

In [25]:
index_list_cafe = get_index_for_words(filtered_nouns_cafe, dictionary_cafe)
freq_list_cafe = get_freq_for_words(filtered_nouns_cafe)
corpus_cafe = get_corpus(index_list_cafe, freq_list_cafe)

In [26]:
corpus=[]
corpus.extend(corpus_blog)
corpus.extend(corpus_insta)
corpus.extend(corpus_cafe)

In [27]:
print(len(corpus))
print(len(corpus_blog))
print(len(corpus_insta))
print(len(corpus_cafe))

11258
4096
3360
3802


### 6-3. list of tokens 생성
(soynlp의 추출방식에서도 문서내의 단어 순서 변동없이 단어 추출하는 방법 찾아봐야함)

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

In [61]:
my_vocab=[]
for i in range(len(dictionary)):
    my_vocab.append(dictionary[i])

In [63]:
my_vocab_blog=[]
for i in range(len(dictionary_blog)):
    my_vocab_blog.append(dictionary_blog[i])

In [64]:
my_vocab_insta=[]
for i in range(len(dictionary_insta)):
    my_vocab_insta.append(dictionary_insta[i])

In [65]:
my_vocab_cafe=[]
for i in range(len(dictionary_cafe)):
    my_vocab_cafe.append(dictionary_cafe[i])

get DTM using my own vocab list

In [None]:
# 수정 필요
def get_dtm(df):
    postings=[]
    for idx in df.index:
        post = df.loc[idx,'Content']
    text=
    vector = CountVectorizer()
    dtm = vector.fit_transform(text).toarray()
    return dtm

In [71]:
# dtm_blog

vectorizer_blog = CountVectorizer()
vectorizer_blog.fit_transform(my_vocab_blog)
tf_blog = vectorizer_blog.transform(df_blog['Content'].tolist())
dtm_blog = tf.toarray()

In [None]:
# dtm instagram

content_list_inta = df_insta
vectorizer = CountVectorizer()
vectorizer.fit_transform(my_vocab_blog)
tf = vectorizer.transform(df_blog['Content'].tolist())
dtm = tf.toarray()

In [None]:
# dtm_cafe

vectorizer_cafe = CountVectorizer()
vectorizer_cafe.fit_transform(my_vocab_cafe)
tf_cafe = vectorizer_cafe.transform(df_cafe['Content'].tolist())
dtm_cafe = tf.toarray()

### save corpus, dictionary as pickle file

In [28]:
file_to_store = open("./soynlp/corpus.pkl", "wb")
pickle.dump(corpus,file_to_store)

file_to_store.close()

In [29]:
file_to_store = open("./soynlp/corpus_blog.pkl", "wb")
pickle.dump(corpus_blog,file_to_store)

file_to_store.close()

In [30]:
file_to_store = open("./soynlp/corpus_insta.pkl", "wb")
pickle.dump(corpus_insta,file_to_store)

file_to_store.close()

In [31]:
file_to_store = open("./soynlp/corpus_cafe.pkl", "wb")
pickle.dump(corpus_cafe,file_to_store)

file_to_store.close()

In [32]:
file_to_store = open("./soynlp/dictionary.pkl", "wb")
pickle.dump(dictionary,file_to_store)

file_to_store.close()

In [33]:
file_to_store = open("./soynlp/dictionary_blog.pkl", "wb")
pickle.dump(dictionary_blog,file_to_store)

file_to_store.close()

In [34]:
file_to_store = open("./soynlp/dictionary_insta.pkl", "wb")
pickle.dump(dictionary_insta,file_to_store)

file_to_store.close()

In [35]:
file_to_store = open("./soynlp/dictionary_cafe.pkl", "wb")
pickle.dump(dictionary_cafe,file_to_store)

file_to_store.close()

### corpus, dictionary pickle file 불러오기

pickle 파일은 우리 구글 공유폴더>sourcecode>pkl>soynlp 폴더에 있습니다.

In [36]:
import pandas as pd
import numpy as np
import re
import pickle

In [37]:
with open('./soynlp/dictionary.pkl','rb') as input_file:
    dictionary = pickle.load(input_file) 

In [38]:
with open('./soynlp/dictionary_blog.pkl','rb') as input_file:
    dictionary_blog = pickle.load(input_file) 

In [39]:
with open('./soynlp/dictionary_insta.pkl','rb') as input_file:
    dictionary_insta = pickle.load(input_file) 

In [40]:
with open('./soynlp/dictionary_cafe.pkl','rb') as input_file:
    dictionary_cafe = pickle.load(input_file) 

In [41]:
with open('./soynlp/corpus.pkl','rb') as input_file:
    corpus = pickle.load(input_file) 

In [42]:
with open('./soynlp/corpus_blog.pkl','rb') as input_file:
    corpus_blog = pickle.load(input_file) 

In [43]:
with open('./soynlp/corpus_insta.pkl','rb') as input_file:
    corpus_insta = pickle.load(input_file) 

In [44]:
with open('./soynlp/corpus_cafe.pkl','rb') as input_file:
    corpus_cafe = pickle.load(input_file) 

## 7. LDA 모델 훈련시키기

LDA model의 매개변수 중 pass란? 

훈련과정의 epoch와 같은 개념이다.

passes controls how often we train the model on the entire corpus (set to 10). Another word for passes might be “epochs”. iterations is somewhat technical, but essentially it controls how often we repeat a particular loop over each document. It is important to set the number of “passes” and “iterations” high enough.

### 7.1 initial guess model 생성

In [45]:
import gensim

In [46]:
k = 5 #5개의 토픽 임의 지정

# 통합
lda_model = gensim.models.LdaMulticore(corpus=corpus, workers=4,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,#num of docs to be used each training
                                           passes=80, #num of passes thru corpus during training
                                           alpha=0.01,
                                           eta='auto')

topics = lda_model.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.058*"사용" + 0.022*"수세미" + 0.017*"물" + 0.015*"천연세제" + 0.013*"설겆이" + 0.011*"소창"')
(1, '0.082*"계란" + 0.054*"신발정리대" + 0.033*"포장" + 0.026*"맛" + 0.023*"노력" + 0.023*"고양이"')
(2, '0.041*"맥락" + 0.025*"물품" + 0.013*"강지남" + 0.010*"수리마켓" + 0.009*"활동" + 0.009*"리필"')
(3, '0.078*"사용" + 0.034*"플라스틱" + 0.034*"제품" + 0.026*"음악" + 0.020*"비누" + 0.016*"재활용"')
(4, '0.026*"밀랍랩" + 0.025*"샴푸바" + 0.018*"맛집" + 0.014*"사용" + 0.013*"두피" + 0.010*"구골님"')


In [47]:
# 블로그
lda_model_blog = gensim.models.LdaMulticore(corpus=corpus_blog,
                                           id2word=dictionary_blog,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,#num of docs to be used each training
                                           passes=80, #num of passes thru corpus during training
                                           alpha=0.01,
                                           eta='auto')

topics = lda_model_blog.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.055*"맛" + 0.028*"카페" + 0.017*"수업" + 0.016*"생각" + 0.014*"커피" + 0.014*"오늘"')
(1, '0.068*"플라스틱" + 0.061*"사용" + 0.031*"칫솔" + 0.031*"실천" + 0.026*"우리" + 0.024*"지구"')
(2, '0.047*"생각" + 0.027*"사람" + 0.018*"물건" + 0.017*"시간" + 0.012*"시작" + 0.011*"활동"')
(3, '0.034*"제품" + 0.014*"사용" + 0.012*"뚜껑" + 0.011*"브랜드" + 0.010*"가능" + 0.009*"진행"')
(4, '0.157*"사용" + 0.045*"제품" + 0.043*"비누" + 0.036*"수세미" + 0.021*"물" + 0.020*"샴푸바"')


In [48]:
# 인스타
lda_model_insta = gensim.models.LdaMulticore(corpus=corpus_insta,
                                           id2word=dictionary_insta,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,#num of docs to be used each training
                                           passes=80, #num of passes thru corpus during training
                                           alpha=0.01,
                                           eta='auto')

topics = lda_model_insta.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.051*"생각" + 0.040*"비누" + 0.032*"맛" + 0.023*"포장" + 0.022*"물티슈" + 0.013*"시간"')
(1, '0.161*"사용" + 0.043*"제품" + 0.030*"실천" + 0.027*"플라스틱" + 0.023*"재활용" + 0.019*"지구"')
(2, '0.059*"함께" + 0.048*"피부" + 0.046*"흥분색" + 0.015*"농산물" + 0.015*"여행" + 0.010*"앰플"')
(3, '0.029*"우리" + 0.018*"제품" + 0.013*"에센스" + 0.012*"플라스틱" + 0.011*"수유" + 0.010*"대나무"')
(4, '0.037*"오늘" + 0.020*"자연" + 0.017*"건강" + 0.017*"쓰레기" + 0.016*"수세미" + 0.016*"텀블러"')


In [49]:
# 카페
lda_model_cafe = gensim.models.LdaMulticore(corpus=corpus_cafe,
                                           id2word=dictionary_cafe,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,#num of docs to be used each training
                                           passes=80, #num of passes thru corpus during training
                                           alpha=0.01,
                                           eta='auto')

topics = lda_model_cafe.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.087*"플라스틱" + 0.058*"재활용" + 0.027*"제품" + 0.019*"가능" + 0.017*"사용" + 0.015*"양말"')
(1, '0.014*"기업" + 0.013*"코로나" + 0.012*"시장" + 0.009*"투자" + 0.009*"주식" + 0.009*"기자"')
(2, '0.124*"사용" + 0.021*"포장" + 0.019*"비닐" + 0.017*"카페" + 0.016*"캠페인" + 0.014*"확인"')
(3, '0.050*"생각" + 0.040*"우리" + 0.032*"실천" + 0.025*"활동" + 0.022*"사람" + 0.016*"지구"')
(4, '0.069*"사용" + 0.058*"상품" + 0.035*"칫솔" + 0.030*"비누" + 0.030*"제품" + 0.013*"세제"')


### 7-2. 연습용
8번으로 넘어가면됩니다. 여기부터는 매개변수들 바꿔가며 시도해본것들입니다.

In [None]:
#통합, passes=20, 토픽5개의 각 top6 단어들을 확인
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = k, \
                                                id2word=dictionary, passes=20)
topics = ldamodel.print_topics(num_words=6) 
for topic in topics:
    print(topic)

In [None]:
# 통합, passes=20, 토픽5개의 각 top12 단어들을 확인
topics = ldamodel.print_topics(num_words=12) 
for topic in topics:
    print(topic)

In [None]:
#통합, passes = 80
ldamodel_highpass = gensim.models.ldamodel.LdaModel(corpus, num_topics = k, \
                                                id2word=dictionary, passes=80)
topics = ldamodel_highpass.print_topics(num_words=6)
for topic in topics:
    print(topic)

In [None]:
# 통합, LdaMulticore, alpha=0.01 지정, passes=10
lda_model_a = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,#num of docs to be used each training
                                           passes=10, #num of passes thru corpus during training
                                           alpha=0.01,
                                           eta=0.9)

topics = lda_model_a.print_topics(num_words=6)
for topic in topics:
    print(topic)

In [None]:
# 동일 매개변수 전달하면 동일한 결과나오는지 확인 --> 완전 동일하지 않지만 비슷함.
lda_model_a2 = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,#num of docs to be used each training
                                           passes=10, #num of passes thru corpus during training
                                           alpha=0.01,
                                           eta=0.9)

topics = lda_model_a2.print_topics(num_words=6)
for topic in topics:
    print(topic)

In [None]:
# passes를 더 높히면, 더 좋은 결과가 나오는지 확인 --> passes=10일때보다 군집된 토픽들어 더 뚜렷하게 다름
# 통합, LdaMulticore, alpha=0.01 지정, passes=80
lda_model_a_highpass = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,#num of docs to be used each training
                                           passes=80, #num of passes thru corpus during training
                                           alpha=0.01,
                                           eta=0.9)

topics = lda_model_a_highpass.print_topics(num_words=6)
for topic in topics:
    print(topic)

In [None]:
# 블로그
ldamodel_blog = gensim.models.ldamodel.LdaModel(corpus_blog, num_topics = k, \
                                                id2word=dictionary_blog, passes=20)
topics_blog = ldamodel_blog.print_topics(num_words=6)
for topic in topics_blog:
    print(topic)

In [None]:
# 인스타
ldamodel_insta = gensim.models.ldamodel.LdaModel(corpus_insta, num_topics = k, \
                                                 id2word=dictionary_insta, passes=20)
topics_insta = ldamodel_insta.print_topics(num_words=6)
for topic in topics_insta:
    print(topic)

In [None]:
# 카페
ldamodel_cafe = gensim.models.ldamodel.LdaModel(corpus_cafe, num_topics = k, \
                                                id2word=dictionary_cafe, passes=20)
topics_cafe = ldamodel_cafe.print_topics(num_words=6)
for topic in topics_cafe:
    print(topic)

## 8. 모델 topic coherence score

임의로 지정한 토픽수가 5개일때에 통합과 각 플랫폼별 coherence는 어느수준일까

In [50]:
from gensim.models import CoherenceModel

co-occurrence기반 PMI를 사용하는 방식인 coherence='c_uci'적용시 아얘 다른 scale의 값이 나옴.

In [None]:
# # Compute Coherence Score
# coherence_model_lda = CoherenceModel(model=ldamodel, texts=token_doc, \
#                                      dictionary=dictionary, coherence='c_v')
# coherence_lda = coherence_model_lda.get_coherence()
# print('통합 Coherence Score: ', coherence_lda)

In [None]:
coherence_model_lda = CoherenceModel(model=ldamodel, texts=token_doc, \
                                     dictionary=dictionary, coherence='c_uci')
coherence_lda = coherence_model_lda.get_coherence()
print('통합 Coherence Score: ', coherence_lda)

In [None]:
coherence_model_lda_blog = CoherenceModel(model=ldamodel_blog, texts=token_doc_blog, \
                                     dictionary=dictionary_blog, coherence='c_uci')
coherence_lda_blog = coherence_model_lda_blog.get_coherence()
print('블로그 Coherence Score: ', coherence_lda_blog)

In [None]:
coherence_model_lda_insta = CoherenceModel(model=ldamodel_insta, texts=token_doc_insta, \
                                     dictionary=dictionary_insta, coherence='c_uci')
coherence_lda_insta = coherence_model_lda_insta.get_coherence()
print('인스타그램 Coherence Score: ', coherence_lda_insta)

In [None]:
coherence_model_lda_cafe = CoherenceModel(model=ldamodel_cafe, texts=token_doc_cafe, \
                                     dictionary=dictionary_cafe, coherence='c_uci')
coherence_lda_cafe = coherence_model_lda_cafe.get_coherence()
print('카페 Coherence Score: ', coherence_lda_cafe)

## 9. Hyperparameter 튜닝

더 빠른 implementation을 위해 multicore 버젼의 ldamodel인 LdaMulticore()을 사용한다. LdaMulticore도 기존 ldamodel()과 동일하다.

참고: https://radimrehurek.com/gensim/models/ldamodel.html , https://radimrehurek.com/gensim/models/ldamulticore.html#module-gensim.models.ldamulticore 

- Number of Topics (K)
- Dirichlet hyperparameter **alpha**: Document-Topic Density - document내의 topic concentration
- Dirichlet hyperparameter **beta**: Word-Topic Density - topic내의 word mixture가 다양한 정도

In [None]:
# coherence값을 계산하는 함수
def compute_coherence_values(corpus, dictionary, k, a, b, tokens):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=20,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=tokens, dictionary=dictionary, coherence='c_uci')
    
    return coherence_model_lda.get_coherence()

#### 통합

In [None]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Hyperparameter tuning range정의:

# Topics (k값) range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=dictionary, 
                                                  k=k, a=a, b=b, tokens=token_doc)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('./LDAresults/lda_tuning_results.csv', index=False)
    pbar.close()

In [None]:
df = pd.DataFrame(model_results)

In [None]:
df = pd.read_csv('./LDAresults/lda_tuning_results.csv')

In [None]:
df.head()

In [None]:
dfcopy = df.copy()

In [None]:
import matplotlib.pyplot as plt

In [None]:
def draw_plot(df, Alpha, Beta):
    df=df[df['Validation_Set']=='100% Corpus']
    df=df[df['Alpha']==Alpha]
    df=df[df['Beta']==Beta]
    xAxis=df['Topics']
    yAxis=df['Coherence']
    
    plt.plot(xAxis,yAxis)
    plt.title('coherence for alpha={}, beta={}'.format(Alpha, Beta))
    plt.xlabel('Topics')
    plt.ylabel('Coherence')
    plt.show()

In [None]:
draw_plot(dfcopy,0.01,0.31)

In [None]:
draw_plot(dfcopy,0.31,0.61)

In [None]:
df_topics8 = dfcopy[dfcopy['Topics']==8]
df_topics8.sort_values(by="Coherence", ascending=False)

In [None]:
df_topics6 = dfcopy[dfcopy['Topics']==6]
df_topics6.sort_values(by="Coherence", ascending=False)

In [None]:
coherence_max8 = df_topics8['Coherence'].max()
coherence_max8

In [None]:
percent_improved = ((coherence_max8-coherence_lda)/coherence_lda)*100
percent_improved

In [None]:
coherence_max6 = df_topics6['Coherence'].max()
coherence_max6

In [None]:
percent_improved = ((coherence_max6-coherence_lda)/coherence_lda)*100
percent_improved

#### 인스타그램

In [None]:
grid = {}
grid['Validation_Set'] = {}

# Hyperparameter tuning range정의:

# Topics (k값) range
min_topics = 2
max_topics = 14
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus_insta)
corpus_sets = [gensim.utils.ClippedCorpus(corpus_insta, int(num_of_docs*0.75)), 
               corpus_insta]

corpus_title = ['75% Corpus', '100% Corpus']

model_results_insta = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=dictionary_insta, 
                                                  k=k, a=a, b=b, tokens=token_doc_insta)
                    # Save the model results
                    model_results_insta['Validation_Set'].append(corpus_title[i])
                    model_results_insta['Topics'].append(k)
                    model_results_insta['Alpha'].append(a)
                    model_results_insta['Beta'].append(b)
                    model_results_insta['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results_insta).to_csv('./LDAresults/lda_tuning_results_insta.csv', index=False)
    pbar.close()

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
df_insta = pd.read_csv('./LDAresults/lda_tuning_results_insta.csv')

In [None]:
df_insta = pd.DataFrame(model_results_insta)

In [None]:
df_insta.head()

In [None]:
dfcopy_insta = df_insta.copy()

In [None]:
draw_plot(dfcopy_insta,0.31,0.01)

In [None]:
draw_plot(dfcopy_insta,0.31,0.31)

In [None]:
draw_plot(dfcopy_insta,0.31,0.61)

In [None]:
# 인스타그램은 topics 10~13사이에서 alpha=0.31, beta=0.01로 적당한 토픽수가 결정되었음.

df_topics_insta13 = dfcopy_insta[dfcopy_insta['Topics']==13]
df_topics_insta13.sort_values(by="Coherence", ascending=False)

In [None]:
df_topics_insta10 = dfcopy_insta[dfcopy_insta['Topics']==10]
df_topics_insta10.sort_values(by="Coherence", ascending=False)

In [None]:
coherence_max_insta13 = df_topics_insta13['Coherence'].max()
coherence_max_insta13

In [None]:
percent_improved_insta = ((coherence_max_insta13-coherence_lda_insta)/coherence_lda_insta)*100
percent_improved_insta

In [None]:
coherence_max_insta10 = df_topics_insta10['Coherence'].max()
coherence_max_insta10

In [None]:
percent_improved_insta10 = ((coherence_max_insta10-coherence_lda_insta)/coherence_lda_insta)*100
percent_improved_insta10

#### 블로그

In [None]:
grid = {}
grid['Validation_Set'] = {}

# Hyperparameter tuning range정의:

# Topics (k값) range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus_blog)
corpus_sets = [gensim.utils.ClippedCorpus(corpus_blog, int(num_of_docs*0.75)), 
               corpus_blog]

corpus_title = ['75% Corpus', '100% Corpus']

model_results_blog = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=dictionary_blog, 
                                                  k=k, a=a, b=b, tokens=token_doc_blog)
                    # Save the model results
                    model_results_blog['Validation_Set'].append(corpus_title[i])
                    model_results_blog['Topics'].append(k)
                    model_results_blog['Alpha'].append(a)
                    model_results_blog['Beta'].append(b)
                    model_results_blog['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results_blog).to_csv('./LDAresults/lda_tuning_results_blog.csv', index=False)
    pbar.close()

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
df_blog = pd.read_csv('./LDAresults/lda_tuning_results_blog.csv')

In [None]:
df_blog = pd.DataFrame(model_results_blog)

In [None]:
df_blog.head()

In [None]:
dfcopy_blog = df_blog.copy()

In [None]:
draw_plot(dfcopy_blog,0.01,0.91)

In [None]:
draw_plot(dfcopy_blog,0.01,0.01)

In [None]:
draw_plot(dfcopy_insta,0.31,0.91)

In [None]:
df_topics_blog10 = dfcopy_blog[dfcopy_blog['Topics']==10]
df_topics_blog10.sort_values(by="Coherence", ascending=False)

In [None]:
df_topics_blog8 = dfcopy_blog[dfcopy_blog['Topics']==8]
df_topics_blog8.sort_values(by="Coherence", ascending=False)

In [None]:
df_topics_blog5 = dfcopy_blog[dfcopy_blog['Topics']==5]
df_topics_blog5.sort_values(by="Coherence", ascending=False)

In [None]:
coherence_max_blog10 = df_topics_blog10['Coherence'].max()
coherence_max_blog10

In [None]:
percent_improved_blog10 = ((coherence_max_blog10-coherence_lda_blog)/coherence_lda_blog)*100
percent_improved_blog10

==> 그래서 블로그 topic modeling 질문: k=10, 8, 5중 어떤것을 골라야할까??

#### 카페 

In [None]:
grid = {}
grid['Validation_Set'] = {}

# Hyperparameter tuning range정의:

# Topics (k값) range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus_cafe)
corpus_sets = [gensim.utils.ClippedCorpus(corpus_cafe, int(num_of_docs*0.75)), 
               corpus_cafe]

corpus_title = ['75% Corpus', '100% Corpus']

model_results_cafe = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=dictionary_cafe, 
                                                  k=k, a=a, b=b, tokens=token_doc_cafe)
                    # Save the model results
                    model_results_cafe ['Validation_Set'].append(corpus_title[i])
                    model_results_cafe ['Topics'].append(k)
                    model_results_cafe ['Alpha'].append(a)
                    model_results_cafe ['Beta'].append(b)
                    model_results_cafe ['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results_cafe ).to_csv('./LDAresults/lda_tuning_results_cafe.csv', index=False)
    pbar.close()

In [None]:
df_cafe = pd.read_csv('./LDAresults/lda_tuning_results_cafe.csv')

In [None]:
df_cafe = pd.DataFrame(model_results_cafe)

In [None]:
df_cafe.head()

In [None]:
dfcopy_cafe = df_cafe.copy()

In [None]:
draw_plot(dfcopy_cafe,0.31,0.91)

In [None]:
draw_plot(dfcopy_cafe,0.31,0.01)

In [None]:
draw_plot(dfcopy_cafe,0.31,0.31)

In [None]:
draw_plot(dfcopy_cafe,0.31,0.61)

In [None]:
df_topics_cafe10 = dfcopy_cafe[dfcopy_cafe['Topics']==10]
df_topics_cafe10.sort_values(by="Coherence", ascending=False)

In [None]:
df_topics_cafe9 = dfcopy_cafe[dfcopy_cafe['Topics']==9]
df_topics_cafe9.sort_values(by="Coherence", ascending=False)

In [None]:
coherence_max_cafe9 = df_topics_cafe9['Coherence'].max()
coherence_max_cafe9

In [None]:
percent_improved_cafe9 = ((coherence_max_cafe9-coherence_lda_cafe)/coherence_lda_cafe)*100
percent_improved_cafe9

## 10. final LDA 모델 확보 및 결과 시각화

In [None]:
pip install pyldavis

In [None]:
import warnings
warnings.filterwarnings('ignore')

### 통합

In [None]:
k=8
lda_model_final = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,#num of docs to be used each training
                                           passes=100, #num of passes thru corpus during training
                                           alpha=0.31,
                                           eta='auto')

In [None]:
topics = lda_model_final.print_topics(num_words=6)
for topic in topics:
    print(topic)

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook()

vis = gensimvis.prepare(lda_model_final, corpus, dictionary)
pyLDAvis.display(vis)

In [None]:
pyldavis_html_path='./LDAresults/LDAvis_all_8.html'
pyLDAvis.save_html(vis, pyldavis_html_path)

In [None]:
k=6
lda_model_final6 = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,#num of docs to be used each training
                                           passes=100, #num of passes thru corpus during training
                                           alpha=0.31,
                                           eta='auto')

In [None]:
topics = lda_model_final6.print_topics(num_words=6)
for topic in topics:
    print(topic)

In [None]:
pyLDAvis.enable_notebook()

vis2 = gensimvis.prepare(lda_model_final6, corpus, dictionary)
pyLDAvis.display(vis2)

In [None]:
pyldavis_html_path='./LDAresults/LDAvis_all_6.html'
pyLDAvis.save_html(vis2, pyldavis_html_path)

### 인스타

In [None]:
k=10
lda_model_final_insta = gensim.models.LdaMulticore(corpus=corpus_insta,
                                           id2word=dictionary_insta,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,#num of docs to be used each training
                                           passes=100, #num of passes thru corpus during training
                                           alpha=0.31,
                                           eta='auto')

In [None]:
topics = lda_model_final_insta.print_topics(num_words=6)
for topic in topics:
    print(topic)

In [None]:
pyLDAvis.enable_notebook()

vis_insta = gensimvis.prepare(lda_model_final_insta, corpus_insta, dictionary_insta)
pyLDAvis.display(vis_insta)

In [None]:
pyldavis_html_path='./LDAresults/LDAvis_insta.html'
pyLDAvis.save_html(vis_insta, pyldavis_html_path) 

### 블로그

In [None]:
k=10
lda_model_final_blog = gensim.models.LdaMulticore(corpus=corpus_blog,
                                           id2word=dictionary_blog,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,#num of docs to be used each training
                                           passes=100, #num of passes thru corpus during training
                                           alpha=0.01,
                                           eta='auto')

In [None]:
topics = lda_model_final_blog.print_topics(num_words=6)
for topic in topics:
    print(topic)

In [None]:
pyLDAvis.enable_notebook()

vis_blog = gensimvis.prepare(lda_model_final_blog, corpus_blog, dictionary_blog)
pyLDAvis.display(vis_blog)

In [None]:
pyldavis_html_path='./LDAresults/LDAvis_blog.html'
pyLDAvis.save_html(vis_blog, pyldavis_html_path) 

### 카페

In [None]:
k=9
lda_model_final_cafe = gensim.models.LdaMulticore(corpus=corpus_cafe,
                                           id2word=dictionary_cafe,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,#num of docs to be used each training
                                           passes=100, #num of passes thru corpus during training
                                           alpha=0.61,
                                           eta='auto')

In [None]:
topics = lda_model_final_cafe.print_topics(num_words=6)
for topic in topics:
    print(topic)

In [None]:
pyLDAvis.enable_notebook()

vis_cafe = gensimvis.prepare(lda_model_final_cafe, corpus_cafe, dictionary_cafe)
pyLDAvis.display(vis_cafe)

In [None]:
pyldavis_html_path='./LDAresults/LDAvis_cafe.html'
pyLDAvis.save_html(vis_cafe, pyldavis_html_path) 