In [1]:
import glob
import json
tweet_path_list = glob.glob('../data/tweet/*')
texts = dict()
for path in tweet_path_list[:]:
    with open(path) as f:
        tweet = json.load(f)
    for t in tweet['tweets']:
        texts[t['id']] = t['text']

In [2]:
# from janome.tokenizer import Tokenizer
# from janome.charfilter import UnicodeNormalizeCharFilter, RegexReplaceCharFilter
# from janome.tokenfilter import POSKeepFilter, LowerCaseFilter, ExtractAttributeFilter
# from janome.analyzer import Analyzer

            
# char_filters = [UnicodeNormalizeCharFilter(), # UnicodeをNFKCで正規化
#                 RegexReplaceCharFilter('\d+', '0')] # 数字を全て0に置換

# tokenizer = Tokenizer(mmap=True) # NEologdを使う場合、mmap=Trueとする

# token_filters = [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']), # 名詞、形容詞、副詞、動詞のみを抽出する
#                  LowerCaseFilter(), # 英字は小文字にする
#                  ExtractAttributeFilter('base_form')] # 原型のみを取得する

# analyzer = Analyzer(char_filters=char_filters, tokenizer=tokenizer, token_filters=token_filters)

In [3]:
import urllib.request
import re
import MeCab

In [4]:
stopwords = []
url = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'

# ストップワードの取得
with urllib.request.urlopen(url) as response:
    stopwords = [w for w in response.read().decode().split('\r\n') if w != '']

stopwords += ['ReTweet', '*']

path = "-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd"
mecab = MeCab.Tagger(path)

#Neologdによるトーカナイザー(リストで返す関数・名詞のみ)
def mecab_tokenizer(text):
    replaced_text = text.lower()
    replaced_text = re.sub(r'[【】]', ' ', replaced_text)       # 【】の除去
    replaced_text = re.sub(r'[（）()]', ' ', replaced_text)     # （）の除去
    replaced_text = re.sub(r'[［］\[\]]', ' ', replaced_text)   # ［］の除去
    replaced_text = re.sub(r'[@＠]\w+', '', replaced_text)  # メンションの除去
    replaced_text = re.sub(r'\d+\.*\d*', '', replaced_text) #数字を0にする
    replaced_text = re.sub(r'[#＃]', '', replaced_text)

    # ノイズとして取り除くパターン
    rt = re.compile(r'^RT\s*')
    mention = re.compile(r'\s*@\w+:\s*')
    url = re.compile(r'\s*https?://[\w/:%#\$&\?\(\)~\.=\+\-]+\s*')

    # ノイズ除去
    replaced_text = rt.sub('', replaced_text)
    replaced_text = mention.sub(' ', replaced_text)
    replaced_text = url.sub(' ', replaced_text)

    parsed_lines = mecab.parse(replaced_text).split("\n")[:-2]
    
    # #表層形を取得
    # surfaces = [l.split('\t')[0] for l in parsed_lines]
    #原形を取得
    token_list = [l.split("\t")[1].split(",")[6] for l in parsed_lines]
    #品詞を取得
    pos = [l.split('\t')[1].split(",")[0] for l in parsed_lines]
    # 名詞,動詞,形容詞のみに絞り込み
    target_pos = ["名詞", '形容詞', '副詞', '動詞']
    token_list = [t for t, p in zip(token_list, pos) if p in target_pos]
    
    # stopwordsの除去
    token_list = [t for t in token_list if t  not in stopwords]
    
    # ひらがなのみの単語を除く
    kana_re = re.compile("^[ぁ-ゖ]+$")
    token_list = [t for t in token_list if not kana_re.match(t)]
    
    return token_list

In [5]:
texts_words = {}

for k, v in texts.items():
    texts_words[k] = mecab_tokenizer(v)

print('words count: {}'.format(sum([len(t) for t in texts_words.values()])))

words count: 351262


In [6]:
# import urllib.request


# stopwords = []
# url = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'

# # ストップワードの取得
# with urllib.request.urlopen(url) as response:
#     stopwords = [w for w in response.read().decode().split('\r\n') if w != '']

# # print('Stopwords: {}'.format(stopwords))
# # Stopwords: ['あそこ', 'あたり', 'あちら', 'あっち', 'あと', ...

# texts_words = {}

# for k, v in texts.items():
#     texts_words[k] = [w for w in analyzer.analyze(v)]

# print('words count: {}'.format(sum([len(t) for t in texts_words.values()])))
# # words count: 28104

In [7]:
# pip install gensim
import gensim


# 辞書の作成
dictionary = gensim.corpora.Dictionary(texts_words.values())
dictionary.filter_extremes(no_below=3, no_above=0.4)
# 辞書をテキストファイルで保存する場合
# dictionary.save_as_text('blog_dictionary.txt')
# dictionary = gensim.corpora.Dictionary.load_from_text('blog_dictionary.txt')

# print('dictionary: {}'.format(dictionary.token2id))
# dictionary: {'行く': 0, 'くる': 1, 'おいしい': 2, '庭': 3, '町屋': 4, '風': 5, '店内': 6, ...

# コーパスの作成(ベクトル化)
corpus = [dictionary.doc2bow(words) for words in texts_words.values()]
# コーパスをテキストファイルで保存する場合
# gensim.corpora.MmCorpus.serialize('blog_corpus.mm', corpus)
# corpus = gensim.corpora.MmCorpus('blog_corpus.mm')

# print('corpus: {}'.format(corpus))
# corpus: [[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), ...

In [11]:
len(dictionary)

9196

In [8]:
# LDAモデルの構築
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                      num_topics=100, 
                                      id2word=dictionary, 
                                      random_state=1)

In [9]:
lda.show_topics()

[(52,
  '0.320*"イベント" + 0.192*"信じる" + 0.166*"エクアドル" + 0.078*"狙う" + 0.044*"思う" + 0.030*"リリース" + 0.009*"ST☆RISH" + 0.006*"分ける" + 0.002*"見る" + 0.002*"新宿"'),
 (24,
  '0.133*"女性" + 0.096*"現実" + 0.092*"クラ" + 0.066*"恐喝" + 0.050*"逮捕" + 0.049*"料" + 0.047*"売れる" + 0.044*"甘い" + 0.034*"容疑" + 0.031*"被告"'),
 (28,
  '0.517*"大学" + 0.119*"子供" + 0.070*"地元民" + 0.047*"昔" + 0.027*"お母さん" + 0.023*"ネタ" + 0.020*"可哀想" + 0.014*"乗れる" + 0.013*"EN" + 0.013*"券"'),
 (20,
  '0.255*"可愛い" + 0.167*"大好き" + 0.118*"楽しい" + 0.088*"楽屋" + 0.080*"幸せ" + 0.041*"続ける" + 0.034*"漫画" + 0.022*"ストーリー" + 0.019*"憧れる" + 0.018*"罪"'),
 (40,
  '0.165*"取る" + 0.163*"戻る" + 0.111*"午後" + 0.094*"減る" + 0.089*"圧倒的" + 0.063*"男性" + 0.055*"キレ" + 0.001*"現在" + 0.000*"病院" + 0.000*"女性"'),
 (8,
  '0.505*"ショック" + 0.122*"素晴らしい" + 0.086*"全区" + 0.044*"デカ" + 0.037*"セリフ" + 0.034*"刃物" + 0.023*"枠" + 0.022*"モデル" + 0.020*"先輩" + 0.017*"引く"'),
 (55,
  '0.277*"良い" + 0.083*"特急券" + 0.078*"乗る" + 0.067*"特急" + 0.066*"交換" + 0.043*"思う" + 0.039*"譲" + 0.035*"読む" + 0.034*"求" + 0.026

In [10]:
topic_counts = {
    'Kyoto': [0, 0, 0, 0],
    'Gourmet': [0, 0, 0, 0],
    'Keitai': [0, 0, 0, 0],
    'Sports': [0, 0, 0, 0]
}

for k, v in texts_words.items():
    category = k.split('_')[1]
    bow = dictionary.doc2bow(v)
    topics = lda.get_document_topics(bow)
    
    top_topic = sorted(topics, key=lambda topic:topic[1], reverse=True)[0][0]
    topic_counts[category][top_topic] += 1

print('Topic counts:\n{}'.format(topic_counts))

AttributeError: 'int' object has no attribute 'split'