# 社畜丼ワードクラウド

http://qiita.com/kenmatsu4/items/9b6ac74f831443d29074

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import seaborn as sns
sns.set_style("whitegrid", {'grid.linestyle': '--'})
%matplotlib inline

from wordcloud import WordCloud
import MeCab as mc

In [2]:
def mecab_analysis(text):
    """Tokenise Japanese text with MeCab and keep only selected word classes.

    Parameters
    ----------
    text : str
        Text to analyse. Leading and trailing whitespace is stripped first.

    Returns
    -------
    list of str
        Surface forms, in order of appearance, of every token whose first
        feature field is one of: adjective, verb, noun, adverb.
    """
    # First feature field values to keep: adjective, verb, noun, adverb.
    wanted_classes = ("形容詞", "動詞", "名詞", "副詞")

    tagger = mc.Tagger('-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/')
    # The string handed to MeCab must stay bound to a variable while the
    # nodes are in use: https://shogo82148.github.io/blog/2012/12/15/mecab-python/
    stripped = text.strip()
    # Parsing an empty string first works around a UnicodeDecodeError:
    # http://taka-say.hateblo.jp/entry/2015/06/24/183748
    tagger.parse('')

    surfaces = []
    current = tagger.parseToNode(stripped)
    while current:
        surface = current.surface
        # Nodes with an empty surface (the header and footer nodes) are skipped.
        if surface != "" and current.feature.split(",")[0] in wanted_classes:
            surfaces.append(surface)
        current = current.next
    return surfaces

In [3]:
def create_wordcloud(text):
    """Build a word cloud from *text* and display it with matplotlib.

    Parameters
    ----------
    text : str
        Text handed to ``WordCloud.generate``; the caller in this notebook
        passes tokens joined by single spaces.

    Returns
    -------
    None
        The image is shown with ``plt.show()``; nothing is returned.
    """
    # Font used for rendering; set the path to suit the local environment.
    # Other paths that were tried previously:
    #   /System/Library/Fonts/HelveticaNeue-UltraLight.otf
    #   /Library/Fonts/ヒラギノ角ゴ Pro W3.otf
    font_file = "/usr/share/fonts/opentype/noto/NotoSansCJK-Medium.ttc"

    # Stop words: tokens that must not appear in the cloud.
    excluded_words = {
        u'てる', u'いる', u'なる', u'れる', u'する', u'ある', u'こと', u'これ',
        u'さん', u'して', u'くれる', u'やる', u'くださる', u'そう', u'せる',
        u'した', u'思う', u'それ', u'ここ', u'ちゃん', u'くん', u'', u'て',
        u'に', u'を', u'は', u'の', u'が', u'と', u'た', u'し', u'で', u'ない',
        u'も', u'な', u'い', u'か', u'ので', u'よう',
    }

    cloud = WordCloud(
        background_color="white",
        font_path=font_file,
        width=900,
        height=500,
        stopwords=excluded_words,
    )
    rendered = cloud.generate(text)

    # A grey-scale colouring variant was experimented with here, using
    # ``recolor(color_func=..., random_state=3)`` and bilinear interpolation;
    # see https://amueller.github.io/word_cloud/auto_examples/a_new_hope.html

    plt.figure(figsize=(15, 12))
    plt.imshow(rendered)
    plt.axis("off")
    plt.show()

In [4]:
# Column-name constants shared by the cells below.
col_id, col_username, col_toot = 'inner_id', 'username', 'toot'

In [5]:
import re

# Load the toot dump: a tab-separated file without a header row, so the four
# column names are supplied here.
df_raw = pd.read_csv("toot_data.current.tsv",
                     delimiter='\t',
                     header=None,
                     names=['created_at', 'username', 'userid', 'toot']
                    )

# Remove HTML tags from every toot body.
# pandas reads an empty field as NaN (a float), and re.sub() raises TypeError
# on a non-string, so missing bodies are replaced by '' before the tags are
# stripped. Rows that already held a string are unaffected.
df_raw['toot'] = df_raw['toot'].fillna('').map(
    lambda s: re.sub('<[^>]*>', '', s)
)

In [6]:
# Work on a copy so that df_raw keeps its original created_at strings.
df_time = df_raw.copy()

# Drop the first and last character of each raw timestamp (presumably a pair
# of surrounding quotes — TODO confirm against the TSV), parse what is left,
# then label the values as GMT and convert them to Asia/Tokyo.
_trimmed_stamps = df_time['created_at'].map(lambda raw: raw[1:-1])
_parsed_stamps = pd.to_datetime(_trimmed_stamps)
df_time['created_at'] = _parsed_stamps.map(
    lambda stamp: stamp.tz_localize('GMT').tz_convert('Asia/Tokyo')
)

# Select the rows inside the half-open window [time_begin, time_end).
time_begin = '12:30 +9:00'
time_end = '13:30 +9:00'
_at_or_after_begin = df_time['created_at'] >= time_begin
_before_end = df_time['created_at'] < time_end
df_ranged = df_time.loc[_at_or_after_begin & _before_end]

# NOTE(review): df_ranged is not used below; the text is built from every row
# of df_raw — confirm that this is intended.
# Remove URLs (http/https up to the next space) from each toot.
_url_pattern = re.compile(r'https?://[^ ]+')
url_filtered = df_raw[col_toot].map(lambda body: _url_pattern.sub("", body))
# Join all toots into one space-separated string.
all_toot = ' '.join(url_filtered)

In [7]:
# Only the last 1,000,000 characters of the joined text are analysed
# (presumably to bound MeCab's processing time — TODO confirm).
_tail_length = 10 ** 6
wordlist = mecab_analysis(all_toot[-_tail_length:])
# The word cloud is built from the kept tokens, joined by single spaces.
create_wordcloud(' '.join(wordlist))