In [None]:
from gensim import corpora, models
from os import listdir, path
from sudachipy import tokenizer, dictionary
from collections import Counter
from wordcloud import WordCloud
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.gensim


In [None]:
class SudachiAnalizer():
    """Tokenize Japanese text with SudachiPy and keep only content words.

    The Sudachi tokenizer is created lazily on first use and then reused for
    every later call, instead of being rebuilt for each document.
    """

    # Top-level part-of-speech tags that are kept: noun, verb, adjective.
    TARGET_POS = ("名詞", "動詞", "形容詞")

    def __init__(self, tokenizer_obj=None, mode=None):
        """Create the analyzer.

        Args:
            tokenizer_obj: Optional ready-made tokenizer exposing
                ``tokenize(text, mode)``. When omitted, a SudachiPy tokenizer
                is created on first use.
            mode: Optional split mode passed to ``tokenize``. When omitted,
                SudachiPy's ``SplitMode.C`` is used.
        """
        self._tokenizer_obj = tokenizer_obj
        self._mode = mode

    def _ensure_tokenizer(self):
        """Return the cached ``(tokenizer, mode)`` pair, creating missing parts once."""
        if self._tokenizer_obj is None or self._mode is None:
            # Imported locally under private aliases: a later cell rebinds the
            # global name `dictionary` to a gensim Dictionary, which would
            # otherwise break tokenizer creation after that cell has run.
            from sudachipy import dictionary as sudachi_dictionary
            from sudachipy import tokenizer as sudachi_tokenizer

            if self._tokenizer_obj is None:
                self._tokenizer_obj = sudachi_dictionary.Dictionary().create()
            if self._mode is None:
                self._mode = sudachi_tokenizer.Tokenizer.SplitMode.C
        return self._tokenizer_obj, self._mode

    def get_token(self, source) :
        """Return the dictionary forms of the nouns, verbs and adjectives in ``source``.

        Each morpheme is normalized first; the normalized form is tokenized
        again and the dictionary form of its first morpheme is used.

        Args:
            source: Text to tokenize.

        Returns:
            list of str, in order of appearance. Empty for empty input.
        """
        if not source:
            return []

        tokenizer_obj, mode = self._ensure_tokenizer()

        word_list = []
        for mrph in tokenizer_obj.tokenize(source, mode):
            # Only adopt words whose part of speech is one of the targets.
            if mrph.part_of_speech()[0] not in self.TARGET_POS:
                continue

            norm_word = mrph.normalized_form()
            retokenized = tokenizer_obj.tokenize(norm_word, mode) if norm_word else []
            if retokenized:
                word = retokenized[0].dictionary_form()
            else:
                # Re-tokenizing produced nothing; fall back to the morpheme's
                # own dictionary form instead of raising IndexError.
                word = mrph.dictionary_form()
            word_list.append(word)

        return word_list

In [None]:
# Root directory of the corpus; the loading loop below reads its sub-directories.
PATH = "text/"

# Tokenizer wrapper used for every document.
sudachi = SudachiAnalizer()

# Collects one record per document.
docs = list()

def corpus_subdirs(path):
    """Return the names of the sub-directories located directly inside ``path``.

    Entries whose name ends with ``.txt`` are skipped, as are entries that are
    not directories (e.g. a stray README or ``.DS_Store``), because callers
    pass every returned name to ``listdir``.

    Args:
        path: Directory to scan.

    Returns:
        Sorted list of sub-directory names, so the order is reproducible
        across runs and platforms.
    """
    # Imported locally: the `path` parameter shadows the `os.path` module that
    # the notebook imports under the same name.
    import os

    return sorted(
        entry
        for entry in os.listdir(path)
        if not entry.endswith('.txt') and os.path.isdir(os.path.join(path, entry))
    )

def corpus_filenames(path):
    """Return the entries of ``path`` whose name does not start with ``LICENSE``."""
    return [entry for entry in listdir(path) if not entry.startswith('LICENSE')]

# Tokenize every corpus file; the sub-directory name becomes the document's tag.
for tag_dir in corpus_subdirs(PATH):
    tag_path = path.join(PATH, tag_dir)
    for file_name in corpus_filenames(tag_path):
        # `with` closes the handle even if reading fails.
        # NOTE(review): the file is decoded with the platform default encoding;
        # pass an explicit `encoding=` once the corpus encoding is confirmed.
        with open(path.join(tag_path, file_name), "r") as corpus_file:
            source = corpus_file.read()

        docs.append({
            "name" : file_name,
            "tag" : tag_dir,
            "token" : sudachi.get_token(source),
        })

In [None]:
# Persist the tokenized documents to disk.
with open('model/corpus.pkl', 'wb') as corpus_cache:
    pickle.dump(docs, corpus_cache)

In [None]:
# Reload the tokenized documents from disk; `with` makes sure the file is closed.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('model/corpus.pkl', 'rb') as f:
    docs = pickle.load(f)

In [None]:
# Check how many documents carry each tag.
tag_list = [doc["tag"] for doc in docs]

df = pd.DataFrame(tag_list)
# NOTE: value_counts() orders its result by descending frequency by default,
# which is not necessarily the order in which the tags occur in `docs`.
tag_counts = df[0].value_counts()
tag_counts

In [None]:
# Check the order in which tags occur in docs: print a tag each time it changes.
previous_tag = ""
for current_tag in tag_list:
    if current_tag == previous_tag:
        continue
    print(current_tag)
    previous_tag = current_tag


In [None]:
# Token list of every document, in the same order as docs.
text_list = [item["token"] for item in docs]

In [None]:
# Build the gensim vocabulary from the tokenized documents.
# NOTE: this rebinds the name `dictionary`, which the import cell bound to
# sudachipy's `dictionary` module.
dictionary = corpora.Dictionary(text_list)

# Drop tokens found in fewer than 2 documents or in more than 20% of them.
dictionary.filter_extremes(
    no_below=2,
    no_above=0.2,
)

In [None]:
# Convert every tokenized document to its bag-of-words form.
corpus = list(map(dictionary.doc2bow, text_list))

In [None]:
# Number of topics
num_topics = 20

# Train the model with a fixed random seed, then write it to disk.
model = models.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    random_state=5,
)
model.save('model/lda.model')

In [None]:
# Topic distribution of every document: index 1 of the last axis selects the
# probability out of each (topic id, probability) pair.
doc_topics = model.get_document_topics(corpus, minimum_probability=0)
prob_doc = np.array(doc_topics)[:, :, 1]

# Store in a DataFrame with one "Prob_<k>" column per topic, numbered from 1.
L = list(range(1, num_topics + 1))
col_name = ["Prob_" + str(topic_no) for topic_no in L]
df_prob = pd.DataFrame(prob_doc, columns=col_name)


def del_Prob(x):
    """Return the integer that follows the first underscore in ``x`` (e.g. ``"Prob_3"`` -> 3)."""
    pieces = x.split("_")
    topic_number = pieces[1]
    return int(topic_number)

# For each document, store the number of its most probable topic.
df_prob["Max"] = df_prob.idxmax(axis=1).apply(del_Prob)

In [None]:
# Plot, for every tag, the normalized sum of its documents' topic probabilities.
#
# Rows are grouped by each document's own tag. Slicing by `tag_counts` is not
# reliable: value_counts() orders tags by descending frequency, which need not
# match the order of the documents, so slices could mix tags and get the
# wrong title.

# "Max" holds a topic number, not a probability, so it is excluded.
topic_probs = df_prob.drop("Max", axis=1)

# One tag per row; raises if tag_list and df_prob have different lengths.
doc_tags = pd.Series(tag_list, index=topic_probs.index, name="tag")

# sort=False keeps the tags in order of first appearance.
for tag_name, tag_probs in topic_probs.groupby(doc_tags, sort=False):
    topic_mass = tag_probs.sum()
    df_topic = pd.DataFrame({
        "Topic": range(1, topic_probs.shape[1] + 1),
        "Prob": (topic_mass / topic_mass.sum()).to_numpy(),
    })

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x="Topic", y="Prob", data=df_topic, color="darkblue", ax=ax)
    ax.set_xlabel("Topic", fontsize=10)
    ax.set_ylabel("Prob", fontsize=10)
    ax.tick_params(axis='x', labelsize=10)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title(tag_name)
    plt.show()
    # Release the figure so a long loop does not accumulate open figures.
    plt.close(fig)


In [None]:
# Prepare the pyLDAvis data; sort_topics=False is passed through to prepare().
vis_pcoa = pyLDAvis.gensim.prepare(
    model,
    corpus,
    dictionary,
    sort_topics=False,
)

# Render the visualization in the notebook (nothing is written to disk here).
pyLDAvis.display(vis_pcoa)

In [None]:
# Train one LDA model per topic count (1..30) and record its coherence and perplexity.
# CoherenceModel is taken from the `gensim.models` namespace imported at the
# top of the notebook, rather than from `gensim.models.ldamodel`, which only
# happens to re-export it.
array = []
for n_topics in range(1, 31):

    lda = models.LdaModel(corpus = corpus, id2word = dictionary, num_topics = n_topics, random_state = 5)

    cm = models.CoherenceModel(model = lda, corpus = corpus, coherence = 'u_mass')
    coherence = cm.get_coherence()

    # Perplexity is computed as 2 ** (-bound) from the value log_perplexity returns.
    perwordbound = lda.log_perplexity(corpus)
    perplexity = np.exp2(-perwordbound)

    array.append([n_topics, coherence, perplexity])

    print(f"num_topics = {n_topics}, coherence = {coherence}, perplexity = {perplexity}")

In [None]:
# Plot coherence (left axis) and perplexity (right axis) against the topic count.
evaluation = pd.DataFrame(array, columns=["num_topics", "coherence", "perplexity"])

# Index by the real topic count so the x-axis shows num_topics itself rather
# than the 0-based row number, which is off by one.
evaluation_ = evaluation.set_index("num_topics")

ax = evaluation_.plot(secondary_y=["perplexity"], figsize=(16,4), alpha=0.5, legend=True)
ax.set_ylabel('coherence ', fontsize=10)
ax.right_ax.set_ylabel('perplexity ', fontsize=10)
ax.set_xlabel('num_topics', fontsize=10);


In [None]:
# Number of topics
num_topics = 9

# Train the model with a fixed random seed, then write it to disk.
# NOTE: this writes to the same path as the earlier 20-topic model.
model = models.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    random_state=5,
)
model.save('model/lda.model')

In [None]:
# Topic distribution of every document: index 1 of the last axis selects the
# probability out of each (topic id, probability) pair.
doc_topics = model.get_document_topics(corpus, minimum_probability=0)
prob_doc = np.array(doc_topics)[:, :, 1]

# Store in a DataFrame with one "Prob_<k>" column per topic, numbered from 1.
L = list(range(1, num_topics + 1))
col_name = ["Prob_" + str(topic_no) for topic_no in L]
df_prob = pd.DataFrame(prob_doc, columns=col_name)


def del_Prob(x):
    """Return the integer that follows the first underscore in ``x`` (e.g. ``"Prob_3"`` -> 3)."""
    pieces = x.split("_")
    topic_number = pieces[1]
    return int(topic_number)

# For each document, store the number of its most probable topic.
df_prob["Max"] = df_prob.idxmax(axis=1).apply(del_Prob)

In [None]:
# Plot, for every tag, the normalized sum of its documents' topic probabilities.
#
# Rows are grouped by each document's own tag. Slicing by `tag_counts` is not
# reliable: value_counts() orders tags by descending frequency, which need not
# match the order of the documents, so slices could mix tags and get the
# wrong title.

# "Max" holds a topic number, not a probability, so it is excluded.
topic_probs = df_prob.drop("Max", axis=1)

# One tag per row; raises if tag_list and df_prob have different lengths.
doc_tags = pd.Series(tag_list, index=topic_probs.index, name="tag")

# sort=False keeps the tags in order of first appearance.
for tag_name, tag_probs in topic_probs.groupby(doc_tags, sort=False):
    topic_mass = tag_probs.sum()
    df_topic = pd.DataFrame({
        "Topic": range(1, topic_probs.shape[1] + 1),
        "Prob": (topic_mass / topic_mass.sum()).to_numpy(),
    })

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x="Topic", y="Prob", data=df_topic, color="darkblue", ax=ax)
    ax.set_xlabel("Topic", fontsize=10)
    ax.set_ylabel("Prob", fontsize=10)
    ax.tick_params(axis='x', labelsize=10)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title(tag_name)
    plt.show()
    # Release the figure so a long loop does not accumulate open figures.
    plt.close(fig)


In [None]:
# Prepare the pyLDAvis data; sort_topics=False is passed through to prepare().
vis_pcoa = pyLDAvis.gensim.prepare(
    model,
    corpus,
    dictionary,
    sort_topics=False,
)

# Render the visualization in the notebook (nothing is written to disk here).
pyLDAvis.display(vis_pcoa)