## 1. 사용할 패키지 불러오기

In [None]:
import pandas as pd
import numpy as np
import collections
import itertools
import pyLDAvis.gensim_models
import gensim.corpora as corpora
import gensim
from gensim import models
from gensim.models.ldamodel import LdaModel
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.tag import Okt
import matplotlib.pyplot as plt
from nltk import bigrams

## 2. 데이터 불러오기

### (1) 파일 불러오기
- data_dir: 파일 경로

In [None]:
# Path of the input file; the loading cell below picks the pandas reader
# from this path's extension.
data_dir = './data/data.xlsx'

In [None]:
# Choose the pandas reader from the file extension (matched case-insensitively
# so that e.g. 'DATA.XLSX' is accepted as well).
extension = data_dir.rsplit('.', 1)[-1].lower()
if extension in ('txt', 'csv'):
    data = pd.read_csv(data_dir)
elif extension in ('xlsx', 'xls'):
    data = pd.read_excel(data_dir)
else:
    # Fail here with a clear message instead of leaving `data` undefined and
    # hitting a NameError in a later cell.
    raise ValueError(
        "Unsupported file extension for data_dir: {!r} "
        "(expected txt, csv, xlsx or xls)".format(data_dir)
    )

In [None]:
# Preview the first rows of the loaded table.
data.head()

### (2) 사용할 Text가 저장된 Column 지정
text_col: Column 이름

In [None]:
# Name of the column in `data` that holds the text to analyse.
text_col = '사람문장1'

## 3. 텍스트 전처리

### (1) Tokenize
stopwords: 불용어

In [None]:
# Words dropped by Tokenizer before the POS-tag filter is applied.
# Entries must not carry surrounding whitespace: they are compared against
# whole tokens, so a padded entry (previously ' 같고') could never match.
stopwords = [
    '입니다', '완전', '전부', '같아요', '대한', '있어서', '있는', '약간', '있어요', '살짝',
    '적당히', '그냥', '있어', '역시', '모두', '있습니다', '다만', '보고', '같은', '있고',
    '편이', '같습니다', '좀더', 'jpg', '첨부파일', '때문', '일단', '리뷰', '이서', '그런지',
    '이상', '가장', '바로', '이건', '사서', '충분히', '안나', '없는', '같네요', '없어서',
    '있네요', '있었어요', '기도', '없고', '있는데', '거의', '무엇', '하니', '있으면', '없네요',
    '없어요', '전혀', '추합니다', '이런', '같아서', '아니라', '워낙', '같아', '여기', '뭔가',
    '해도', '있을', '있던', '비주', '위해', '우선', '불리', '있으니', '있지만', '대로',
    '사실', '같은데', 'jpeg', '없을', '있었습니다', '있게', '있었으면', '기지', '있음', '같고',
    '자꾸', '있다니', '없어', '그런', '그것', '짐해', '수도', '미가', '다가', '인지',
    '있었는데', '듭니', '만해', '없습니다', '이기', '없었어요', '그닥', '그게', '내기', '편입',
    '런가', '끼리', '기고', '혹시', '그랬어요', '그거', '질도', '서도', '있었지만', '처럼',
    '이예', '문해', '고요', '이면', '이나', '알도', '그럴', '이구', '한수', '입니당',
    '그렇고', '거리', '있었네요', '만하', '가요', '있구요', '어요', '있는거', '주시', '수가',
    '없는데', '나니', '번은', '는걸', '요건', '어떤', '야해요', '있을걸', '단지', '아예',
    '성하게', '있다면', '있다는', '고해', '드네', '양장', '없었는데', '로만', '있었음', '있다',
    '리오', '시기', '어도', '비도', '점점', '없지만', '보이', '차라리', '하라', '그래요',
    '그렇지', '같기도', '거나', '등등', '더더', '나', '것', '내', '그', '후',
    '다른', '안', '우리', '이', '오늘', '내일', '다음', '이전',
]


In [None]:
# Create the Okt tagger once. The instance is bound to the same name as the
# imported class (Tokenizer calls `Okt.pos`), so only instantiate while the
# name still refers to the class: re-running this cell would otherwise try to
# call the existing instance and raise TypeError.
if isinstance(Okt, type):
    Okt = Okt()

In [None]:
# Alternative setting: nouns plus 'Alpha' tokens
# (presumably Latin-alphabet words in Okt's tag set; confirm against KoNLPy docs)
#tags = ['Noun', 'Alpha']

# POS tags kept by Tokenizer for the first pass: nouns only
tags = ['Noun']


def Tokenizer(text):
    """Return the words of ``text`` whose POS tag is listed in ``tags``.

    The text is tagged with the module-level ``Okt`` instance. Words found in
    ``stopwords`` are dropped before the tag filter is applied.

    Args:
        text: The text to tokenize. Non-string values are converted with
            ``str()`` first.

    Returns:
        A list of the kept words in their original order, or ``['nan']`` when
        the tagger produces no morphemes at all (e.g. for an empty string).
    """
    # Coerce to str so that non-string cells (e.g. missing values that pandas
    # reads as float NaN) are not handed to the tagger as-is; Word_transform
    # applies the same str() coercion before the second tokenization pass.
    morphs = Okt.pos(str(text))
    if len(morphs) == 0:
        return ['nan']

    pos = []
    for morph in morphs:
        # Skip anything that is not at least a (word, tag) pair.
        if len(morph) < 2:
            continue
        word, tag = morph
        if word not in stopwords and tag in tags:
            pos.append(word)
    return pos

In [None]:
# Tokenize every row of the selected text column, then preview three results.
Token = list(map(Tokenizer, data[text_col]))
Token[:3]

### (2) Bigram

In [None]:
# Build the adjacent-token pairs of every document, flatten them into one
# list and count how often each pair occurs across the corpus.
bigram = [list(bigrams(doc_tokens)) for doc_tokens in Token]
merged = list(itertools.chain.from_iterable(bigram))
bigram_count = collections.Counter(merged)

In [None]:
# Show the 50 most frequent bigrams with their counts.
bigram_count.most_common(50)

### (3) 합성어 전처리
N: bigram 합성어 선택 개수  
comb_words: 추가 합성어 리스트

In [None]:
# Number of most frequent bigrams to treat as compound words.
N = 30
# Extra compound words supplied by the user; the next cell appends this list
# after the bigram-derived entries, each of which is a list of surface forms.
comb_words = []

In [None]:
# Keep only the token pairs (not the counts) of the N most frequent bigrams.
selected_bigrams = [pair for pair, _ in bigram_count.most_common(N)]
# For each pair store [joined form, space-separated form], followed by any
# compound words that were already present in comb_words.
bigram_compounds = [[''.join(pair), ' '.join(pair)] for pair in selected_bigrams]
comb_words = bigram_compounds + comb_words

In [None]:
# One space-padded placeholder per compound word: 'TRSF' repeated 1, 2, 3, ...
# times, in the same order as comb_words.
Transform_words = [
    ' {} '.format('TRSF' * repeat)
    for repeat in range(1, len(comb_words) + 1)
]
Transform_words[:3]

In [None]:
def Word_transform(Text, word_list, to_word):
    """Replace every occurrence of each entry of ``word_list`` in ``Text``.

    Args:
        Text: Value to process; converted with ``str()`` before each
            replacement.
        word_list: Iterable of substrings to replace, applied in order.
        to_word: Replacement string.

    Returns:
        The text after all replacements. When ``word_list`` is empty the
        input is returned unchanged (and unconverted).
    """
    result = Text
    for target in word_list:
        result = str(result).replace(target, to_word)
    return result

In [None]:
# Replace every compound word with its placeholder. The replacements are
# applied cumulatively to the same texts: restarting from data[text_col] on
# each iteration would keep only the last compound word's replacement.
# Starting from the stringified texts also keeps transformed_text defined
# when comb_words is empty.
transformed_text = [str(txt) for txt in data[text_col]]
for word_forms, placeholder in zip(comb_words, Transform_words):
    transformed_text = [
        Word_transform(txt, word_forms, placeholder) for txt in transformed_text
    ]

### (4) 재 토큰화

In [None]:
# For the second pass also keep 'Alpha' tokens, so that the 'TRSF...'
# placeholders inserted above are not filtered out by Tokenizer.
tags = ['Noun', 'Alpha']

In [None]:
# Tokenize the placeholder-substituted texts, then preview three results.
Token = list(map(Tokenizer, transformed_text))
Token[:3]

### (5) 합성어 복원

In [None]:
def Word_restore(Text, word1, word2):
    """Return a copy of the token list ``Text`` with ``word1`` replaced by ``word2``.

    Args:
        Text: Iterable of tokens.
        word1: Placeholder token to look for.
        word2: Token that replaces every match.

    Returns:
        A new list with the same length and order as ``Text``.
    """
    # Match whole tokens only. The placeholders are 'TRSF' repeated k times,
    # so a substring test would let a shorter placeholder match every longer
    # one and restore all compounds as the first compound word.
    return [word2 if word == word1 else word for word in Text]

In [None]:
# The first surface form of each compound word is the form written back.
restore_words = [forms[0] for forms in comb_words]
for i, restored in enumerate(restore_words):
    # Strip the padding spaces to get the bare placeholder token.
    placeholder = Transform_words[i][1:-1]
    Token = [Word_restore(token, placeholder, restored) for token in Token]
Token[:3]

## 4.1 LDA

### (1) Option 설정
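- min_count: min_count개 미만의 문서에만 등장하는 단어는 제거 (gensim `filter_extremes`의 `no_below`; 기본값 `no_above=0.5`에 따라 전체 문서의 50%를 초과해 등장하는 단어도 함께 제거됨)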
- N_grams: N_gram 수 (1: Uni-gram)

In [None]:
# Passed as `no_below` to the dictionary's filter_extremes call below.
min_count = 3
# Size of the n-grams fed to LDA; 1 uses the tokens as they are.
N_grams = 1

### (2) LDA 구성 요소 생성

In [None]:
def get_ngrams(token, n):
    """Return the space-joined n-grams of a token sequence.

    Implemented with the standard library only: the file imports ``bigrams``
    from nltk but never ``ngrams``, so the previous call raised NameError.

    Args:
        token: Iterable of string tokens.
        n: Number of consecutive tokens per n-gram.

    Returns:
        A list of strings, one per window of ``n`` consecutive tokens, in
        order. Empty when the sequence has fewer than ``n`` tokens.
    """
    items = list(token)
    # zip over n progressively shifted views yields each sliding window once
    # and stops at the shortest view, i.e. after the last full window.
    windows = zip(*(items[offset:] for offset in range(n)))
    return [' '.join(grams) for grams in windows]

In [None]:
# Use n-gram strings when N_grams > 1, otherwise the tokens as they are.
gram_tokens = (
    [get_ngrams(token, N_grams) for token in Token]
    if N_grams > 1
    else Token
)

In [None]:
# Documents as token lists; also passed to the coherence computation later.
texts = gram_tokens
# Token <-> id mapping, pruned with min_count as the `no_below` threshold.
id2word = corpora.Dictionary(texts)
id2word.filter_extremes(no_below=min_count)
# Bag-of-words representation of every document.
corpus = list(map(id2word.doc2bow, texts))

### (3) 최적의 Topic 개수 확인

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=4, step=2, random_state=None):
    """Train one LDA model per candidate topic count and plot its coherence.

    Args:
        dictionary: gensim dictionary passed to the models as ``id2word``.
        corpus: Bag-of-words corpus used to train each model.
        texts: Tokenized documents used for the 'c_v' coherence score.
        limit: Exclusive upper bound of the topic counts to try.
        start: First topic count to try.
        step: Increment between topic counts.
        random_state: Optional seed forwarded to ``LdaModel``; the default
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        A tuple ``(model_list, coherence_values)`` with one entry per topic
        count in ``range(start, limit, step)``, in that order.
    """
    topic_counts = range(start, limit, step)
    coherence_values = []
    model_list = []
    for num_topics in topic_counts:
        model = LdaModel(
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=random_state,
        )
        model_list.append(model)
        coherencemodel = gensim.models.coherencemodel.CoherenceModel(
            model=model, texts=texts, dictionary=dictionary, coherence='c_v'
        )
        coherence_values.append(coherencemodel.get_coherence())

    # Label the line itself: passing the bare string ("coherence_values") to
    # legend() is iterated character by character and labels the line "c".
    plt.plot(topic_counts, coherence_values, label="coherence_values")
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(loc='best')
    plt.show()

    return model_list, coherence_values

In [None]:
# Topic counts to evaluate: start_topic_num .. end_topic_num (inclusive, the
# call below passes end_topic_num + 1 as the exclusive limit), in steps of `step`.
start_topic_num = 3
end_topic_num = 10
step = 1

In [None]:
# Train and score one model per candidate topic count; `limit` is exclusive,
# hence the + 1 so that end_topic_num itself is evaluated.
model_list, coherence_values = compute_coherence_values(
    dictionary=id2word,
    corpus=corpus,
    texts=texts,
    start=start_topic_num,
    limit=end_topic_num + 1,
    step=step,
)

In [None]:
# Position of the highest coherence score, mapped back to its topic count.
candidate_topic_nums = range(start_topic_num, end_topic_num + 1, step)
best_position = coherence_values.index(max(coherence_values))
best_topic_num = candidate_topic_nums[best_position]
print("Best topic number: {}".format(best_topic_num))

### (4) 최적의 LDA 모델 저장

In [None]:
# Pick the model with the highest coherence score and list its topics with
# eight words each.
best_position = coherence_values.index(max(coherence_values))
best_model = model_list[best_position]
topics = best_model.print_topics(num_words=8)
topics

### (5) pyLDAvis 시각화

In [None]:
# Render the interactive pyLDAvis view of the selected model inline in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(best_model, corpus, id2word)
pyLDAvis.display(vis)

### (6) 토픽 배정
- prob: 각 토픽에 대한 확률을 배정 (True) vs 가장 확률이 높은 토픽 이름만 배정 (False)
- save_file: 토픽 배정 결과를 csv파일로 저장

In [None]:
# True: build a per-document table of topic probabilities in the next cell.
prob = False
# True: additionally write that table to a CSV file (only used when prob is True).
save_file = False

In [None]:
if prob:
    # Per-document topic distributions of the selected model.
    doc_topic_dist = best_model[corpus]
    # One row per document, one column per topic; topics the model does not
    # report for a document keep probability 0.
    probability = np.zeros((len(doc_topic_dist), best_topic_num))
    for doc in range(len(doc_topic_dist)):
        for topic, proba in doc_topic_dist[doc]:
            probability[doc, topic] = proba

    probability = pd.DataFrame(
        probability,
        columns=['Topic' + str(num) for num in range(1, best_topic_num + 1)],
    )
    if save_file:
        # NOTE(review): the file name looks like a typo of 'topic_prob.csv';
        # kept as-is so existing consumers of the file are not broken.
        probability.to_csv('topic_porb.csv', index=False)

In [None]:
# For every document take the (topic, probability) pair with the highest
# probability, then store the topic as a 1-based number.
dominant_pairs = (
    max(doc_topics, key=lambda pair: pair[1]) for doc_topics in best_model[corpus]
)
lda_topic_assignment = [topic_id + 1 for topic_id, _ in dominant_pairs]

In [None]:
# Bar chart of how many documents were assigned to each topic, ordered by
# topic number.
plt.figure(figsize=(10, 6))
w = sorted(collections.Counter(lda_topic_assignment).items())
keys = [topic_num for topic_num, _ in w]
values = [doc_count for _, doc_count in w]
plt.bar(['Topic-{}'.format(key) for key in keys], values)

## 4.2 BERTopic

### (1) Option 설정
- nr_topics: 토픽 수 ('auto': 최적)
- language: ['english', 'multilingual']

In [None]:
# Options passed to the second BERTopic model built below (nr_topics, language).
nr_topics = 'auto'
language = 'multilingual'

### (2) BERTopic 생성

In [None]:
# CountVectorizer's `tokenizer` must be a callable that maps a string to its
# tokens, so pass the Tokenizer function rather than `Token`, which is the
# already-tokenized corpus (a list of lists).
vectorizer = CountVectorizer(tokenizer=Tokenizer, max_features=3000)

In [None]:

# Import here as well: this cell runs before the later cell that imports
# BERTopic, so the name would otherwise be undefined on a top-to-bottom run.
from bertopic import BERTopic

# BERTopic with an explicit sentence-transformers embedding model and the
# CountVectorizer configured above.
# NOTE(review): nr_topics is fixed at 50 here and does not use the
# `nr_topics` option set earlier; confirm that is intended.
model = BERTopic(
    embedding_model="sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens",
    vectorizer_model=vectorizer,
    nr_topics=50,
    top_n_words=10,
    calculate_probabilities=True,
)

In [None]:
# Build the input documents here: this cell runs before the later cell that
# defines preprocessed_text, so the name would otherwise be undefined.
# Each document is its token list joined with single spaces.
preprocessed_text = [' '.join(token) for token in Token]
topics, probs = model.fit_transform(preprocessed_text)

In [None]:
from bertopic import BERTopic

# Each document is its token list joined with single spaces.
preprocessed_text = list(map(' '.join, Token))
# BERTopic configured only through the notebook options; this rebinds
# `model` and `topics` from the earlier cells.
model = BERTopic(language=language, nr_topics=nr_topics)
topics, probabilities = model.fit_transform(preprocessed_text)

In [None]:
# Preview the first rows of the fitted model's topic summary table.
model.get_topic_info().head()

### (3) 시각화

In [None]:
# Bar charts for the fitted model, limited to 10 topics via top_n_topics.
model.visualize_barchart(top_n_topics=10)

In [None]:
# Topic visualization provided by the fitted BERTopic model.
model.visualize_topics()