In [12]:
import glob
import json
tweet_path_list = glob.glob('../data/tweet/*')
texts = dict()
for path in tweet_path_list[:]:
    with open(path) as f:
        tweet = json.load(f)
    for t in tweet['tweets']:
        texts[t['id']] = t['text']

In [13]:
# from janome.tokenizer import Tokenizer
# from janome.charfilter import UnicodeNormalizeCharFilter, RegexReplaceCharFilter
# from janome.tokenfilter import POSKeepFilter, LowerCaseFilter, ExtractAttributeFilter
# from janome.analyzer import Analyzer

            
# char_filters = [UnicodeNormalizeCharFilter(), # UnicodeをNFKCで正規化
#                 RegexReplaceCharFilter('\d+', '0')] # 数字を全て0に置換

# tokenizer = Tokenizer(mmap=True) # NEologdを使う場合、mmap=Trueとする

# token_filters = [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']), # 名詞、形容詞、副詞、動詞のみを抽出する
#                  LowerCaseFilter(), # 英字は小文字にする
#                  ExtractAttributeFilter('base_form')] # 原型のみを取得する

# analyzer = Analyzer(char_filters=char_filters, tokenizer=tokenizer, token_filters=token_filters)

In [14]:
import urllib.request
import re
import MeCab

In [15]:
stopwords = []
url = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'

# ストップワードの取得
with urllib.request.urlopen(url) as response:
    stopwords = [w for w in response.read().decode().split('\r\n') if w != '']

stopwords += ['ReTweet', '*']

path = "-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd"
mecab = MeCab.Tagger(path)

#Neologdによるトーカナイザー(リストで返す関数・名詞のみ)
def mecab_tokenizer(text):
    replaced_text = text.lower()
    replaced_text = re.sub(r'[【】]', ' ', replaced_text)       # 【】の除去
    replaced_text = re.sub(r'[（）()]', ' ', replaced_text)     # （）の除去
    replaced_text = re.sub(r'[［］\[\]]', ' ', replaced_text)   # ［］の除去
    replaced_text = re.sub(r'[@＠]\w+', '', replaced_text)  # メンションの除去
    replaced_text = re.sub(r'\d+\.*\d*', '', replaced_text) #数字を0にする
    replaced_text = re.sub(r'[#＃]', '', replaced_text)

    # ノイズとして取り除くパターン
    rt = re.compile(r'^RT\s*')
    mention = re.compile(r'\s*@\w+:\s*')
    url = re.compile(r'\s*https?://[\w/:%#\$&\?\(\)~\.=\+\-]+\s*')

    # ノイズ除去
    replaced_text = rt.sub('', replaced_text)
    replaced_text = mention.sub(' ', replaced_text)
    replaced_text = url.sub(' ', replaced_text)

    parsed_lines = mecab.parse(replaced_text).split("\n")[:-2]
    
    # #表層形を取得
    # surfaces = [l.split('\t')[0] for l in parsed_lines]
    #原形を取得
    token_list = [l.split("\t")[1].split(",")[6] for l in parsed_lines]
    #品詞を取得
    pos = [l.split('\t')[1].split(",")[0] for l in parsed_lines]
    # 名詞,動詞,形容詞のみに絞り込み
    target_pos = ["名詞", '形容詞', '副詞', '動詞']
    token_list = [t for t, p in zip(token_list, pos) if p in target_pos]
    
    # stopwordsの除去
    token_list = [t for t in token_list if t  not in stopwords]
    
    # ひらがなのみの単語を除く
    kana_re = re.compile("^[ぁ-ゖ]+$")
    token_list = [t for t in token_list if not kana_re.match(t)]
    
    return token_list

In [16]:
texts_words = {}

for k, v in texts.items():
    texts_words[k] = mecab_tokenizer(v)

print('words count: {}'.format(sum([len(t) for t in texts_words.values()])))

words count: 800057


In [17]:
# import urllib.request


# stopwords = []
# url = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'

# # ストップワードの取得
# with urllib.request.urlopen(url) as response:
#     stopwords = [w for w in response.read().decode().split('\r\n') if w != '']

# # print('Stopwords: {}'.format(stopwords))
# # Stopwords: ['あそこ', 'あたり', 'あちら', 'あっち', 'あと', ...

# texts_words = {}

# for k, v in texts.items():
#     texts_words[k] = [w for w in analyzer.analyze(v)]

# print('words count: {}'.format(sum([len(t) for t in texts_words.values()])))
# # words count: 28104

In [18]:
# pip install gensim
import gensim


# 辞書の作成
dictionary = gensim.corpora.Dictionary(texts_words.values())
dictionary.filter_extremes(no_below=3, no_above=0.4)
# 辞書をテキストファイルで保存する場合
# dictionary.save_as_text('blog_dictionary.txt')
# dictionary = gensim.corpora.Dictionary.load_from_text('blog_dictionary.txt')

# print('dictionary: {}'.format(dictionary.token2id))
# dictionary: {'行く': 0, 'くる': 1, 'おいしい': 2, '庭': 3, '町屋': 4, '風': 5, '店内': 6, ...

# コーパスの作成(ベクトル化)
corpus = [dictionary.doc2bow(words) for words in texts_words.values()]
# コーパスをテキストファイルで保存する場合
# gensim.corpora.MmCorpus.serialize('blog_corpus.mm', corpus)
# corpus = gensim.corpora.MmCorpus('blog_corpus.mm')

# print('corpus: {}'.format(corpus))
# corpus: [[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), ...

In [19]:
len(dictionary)

16298

In [20]:
# LDAモデルの構築
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                      num_topics=100, 
                                      id2word=dictionary, 
                                      random_state=1)

In [21]:
lda.show_topics()

[(88,
  '0.291*"お願い" + 0.129*"セネガル" + 0.096*"致す" + 0.091*"姫野先輩" + 0.073*"可能" + 0.044*"悪魔" + 0.036*"頂く" + 0.031*"完璧" + 0.024*"検索" + 0.024*"送料"'),
 (53,
  '0.322*"交換" + 0.131*"人人" + 0.104*"レベル" + 0.076*"カード" + 0.060*"個人的" + 0.047*"辛い" + 0.041*"地味" + 0.034*"量" + 0.030*"会える" + 0.028*"ドロップ"'),
 (9,
  '0.340*"今日" + 0.132*"SS" + 0.098*"メンテ" + 0.084*"貼る" + 0.059*"入り" + 0.050*"仕事" + 0.047*"軸" + 0.036*"涙。" + 0.022*"掲載" + 0.018*"学舎"'),
 (95,
  '0.191*"作品" + 0.137*"大丈夫" + 0.118*"衝撃" + 0.095*"姫" + 0.067*"作曲" + 0.067*"作詞" + 0.045*"パイ" + 0.040*"最終回" + 0.030*"脚気" + 0.022*"いつの間にか"'),
 (51,
  '0.347*"歌詞" + 0.111*"冒険" + 0.017*"WILLOW" + 0.009*"妖精" + 0.005*"theme" + 0.002*"再始動" + 0.002*"開催決定" + 0.001*"女王" + 0.000*"公開" + 0.000*"チェンソーマン"'),
 (68,
  '0.396*"子供" + 0.153*"最大" + 0.099*"流れ" + 0.059*"編" + 0.052*"罪" + 0.026*"重視" + 0.015*"途端" + 0.009*"バイト" + 0.005*"大型" + 0.004*"前者"'),
 (64,
  '0.131*"明日" + 0.093*"バトル" + 0.093*"申し上げる" + 0.090*"お知らせ" + 0.073*"重要" + 0.067*"突然" + 0.063*"発言" + 0.047*"限" + 0.039*"気づく" + 

In [22]:
topic_counts = {
    'Kyoto': [0, 0, 0, 0],
    'Gourmet': [0, 0, 0, 0],
    'Keitai': [0, 0, 0, 0],
    'Sports': [0, 0, 0, 0]
}

for k, v in texts_words.items():
    category = k.split('_')[1]
    bow = dictionary.doc2bow(v)
    topics = lda.get_document_topics(bow)
    
    top_topic = sorted(topics, key=lambda topic:topic[1], reverse=True)[0][0]
    topic_counts[category][top_topic] += 1

print('Topic counts:\n{}'.format(topic_counts))

AttributeError: 'int' object has no attribute 'split'