In [1]:
import MeCab
import psycopg2 
from psycopg2.sql import SQL
import pandas as pd
import numpy as np 

import emoji

from collections import defaultdict
from pprint import pprint

In [2]:
# Tagger built with MeCab's default settings; used here only to probe emoji handling.
tagger1 = MeCab.Tagger()
# Run of emoji (with one embedded space) used as the segmentation probe.
em = "😗😙😚🙂🤗🤩🤔🤨😐😑😶🙄😏😣😥😮🤐😯😪😫🥱😴😌😛😜😝🤤😒😓😔😕🙃🤑😲☹🙁😖 😞😟😤😢😭😦😧😨"
# Walk the node list and print each surface. The recorded output shows that
# several emoji get merged into a single node, which is why the counting
# function strips emoji before handing text to MeCab.
n = tagger1.parseToNode(em)
while n:
    print(n.surface)
    n = n.next

# Tagger options pointing at the mecab-ipadic-neologd dictionary directory
# (absolute, machine-specific path).
dicdir = '-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd'
# tagger2 is the tagger that count_word_frequency_in_question() parses with.
tagger2 = MeCab.Tagger(dicdir)

# Disabled comparison of the two taggers on a sample sentence:
# sample_txt = '鬼滅の刃もいいけれど、約束のネバーランドもね'
# print(tagger1.parse(sample_txt))
# print(tagger2.parse(sample_txt))
# Two sample question posts kept around for manual parsing experiments.
txt = "愛知県の方！ 今高島屋でアムールやってますよね🙌 来月行く予定なのですが､皆さんコロナウイルスとか気にせずに行ってますか？💦 子供もいるし､あんな人混みにもし感染した人がいて感染したらとか思うと躊躇してしまいます😭 でも去年は下の子妊娠中で行けなかったのですごく楽しみにしていました😭 行く人は行くと思うのですが⋯私が気にしすぎですか？"
txt2  = "今旦那と離れて実家に住んでます！ 今週末か来週末に車で千葉から愛知に旦那に迎えにきてもらって千葉の家に行こうと思っていたのですが、コロナウイルスが少し心配です😢 皆さんならどうしますか？ 車移動なので危ないのはサービスエリアだけなのですが、距離的に遠くて寄らないわけには行かない感じです😭"
#print(tagger2.parse(txt))


😗
😙
😚
🙂
🤗
🤩
🤔
🤨
😐
😑😶🙄😏😣😥😮🤐😯😪😫🥱😴😌😛😜😝🤤😒😓😔😕🙃🤑😲
☹
🙁😖
😞😟😤😢😭😦😧😨



## Fetching documents from the database

In [27]:
# Open the connection to the "datacomp" PostgreSQL database and run a first
# sample query (at most 100 posts created on start_date).
#
# The password is deliberately NOT stored in the notebook: it is taken from the
# PGPASSWORD environment variable and, if that is unset, asked for interactively.
import os
from getpass import getpass
from datetime import datetime, timedelta

conn = psycopg2.connect(
    dbname="datacomp",
    user="datacomp",
    password=os.environ.get("PGPASSWORD") or getpass("Password for DB user 'datacomp': "),
)
cursor = conn.cursor()

table = "questions" # ["questions", "answers", "search"]

# Inclusive date window of the whole analysis.
start_date = datetime(2019, 1, 1)
end_date = datetime(2021, 7, 31)

# `table` is a notebook constant (not user input), so it is formatted into the
# statement text; the date itself is passed as a bound parameter.
query = SQL("SELECT category_id, content FROM {} WHERE created::date= (%s) LIMIT 100;".format(table))
cursor.execute(query, (str(start_date), ))

In [4]:
# Analysis window, inclusive on both ends.
start_date, end_date = datetime(2019, 1, 1), datetime(2021, 7, 31)
# One timestamp per calendar day in the window (shown as the cell output).
pd.date_range(start_date, end_date, freq="D")

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06', '2019-01-07', '2019-01-08',
               '2019-01-09', '2019-01-10',
               ...
               '2021-07-22', '2021-07-23', '2021-07-24', '2021-07-25',
               '2021-07-26', '2021-07-27', '2021-07-28', '2021-07-29',
               '2021-07-30', '2021-07-31'],
              dtype='datetime64[ns]', length=943, freq='D')

## Processing text

In [28]:
# Materialise every row produced by the most recent cursor.execute() call.
# Each row is a (category_id, content) tuple, as the recorded output shows.
documents = list(cursor.fetchall())
# Peek at the first row as a sanity check (raises IndexError if nothing was fetched).
documents[0]

(11, '3歳の子供がまだ起きてます😫 夫と二人でカウントダウン予定が寝かしつけー。 幼児で、こんな夜中におきてるおうちありますか？')

In [29]:
def count_word_frequency_in_question(text, category_id, freq_dict, skip=range(13, 26)):
    '''Count the emoji and words of one question post into ``freq_dict``.

    Args:
        text: one question post to process.
        category_id: ``category_id`` of the question the post belongs to.
        freq_dict: ``defaultdict(int)`` that accumulates the counts. Keys are
            ``(word, posid, category_id)`` tuples, values are frequencies.
        skip: MeCab POS ids to leave out of the count (run posid.py for
            reference). The default covers ids 13-25, which start at the
            particle entries of the POS-id table printed at the end of this
            notebook.

    Returns:
        None. ``freq_dict`` is updated in place.

    Notes:
        * Emoji are removed from the text before parsing and are always
          counted under POS id 4, regardless of ``skip``.
        * For POS ids 31-35 the dictionary base form (7th feature field) is
          counted instead of the surface form; when no base form is available
          the surface form is used.
        * Uses the module-level ``tagger2`` and ``emoji``.
    '''
    skip_ids = frozenset(skip)  # O(1) membership test; also accepts any iterable

    ######## MeCab does not segment emoji reliably, so pull them out with a regex first ########
    emolist = []

    def _collect_emoji(match):
        # Remember the emoji and delete it from the text. Returning "" makes
        # the deletion explicit instead of relying on re.sub() tolerating None.
        emolist.append(match.group(0))
        return ""

    # NOTE(review): emoji.get_emoji_regexp() only exists in emoji < 2.0 - confirm the pinned version.
    emofree_text = emoji.get_emoji_regexp().sub(_collect_emoji, text)
    for emoj in emolist:
        freq_dict[(emoj, 4, category_id)] += 1

    ######## Parse the rest ########
    node = tagger2.parseToNode(emofree_text)
    while node:
        posid = node.posid
        surface = node.surface
        # An empty surface marks MeCab's sentence-boundary (BOS/EOS) nodes,
        # which are not words and must never be counted.
        if surface and posid not in skip_ids:
            word = surface
            if 31 <= posid <= 35:
                fields = node.feature.split(",")
                base = fields[6] if len(fields) > 6 else ""
                # "*" means the dictionary has no base form (e.g. unknown word).
                if base and base != "*":
                    word = base
            freq_dict[(word, posid, category_id)] += 1
        node = node.next

In [40]:
def word_frequency_of_questions_in_one_day(documents, date):
    '''Build the word-frequency table for the question posts of one day.

    Args:
        documents: iterable of ``(category_id, content)`` rows, i.e. the
            question posts of a single day.
        date: value stored in the ``date`` index level of every row.

    Returns:
        pandas.DataFrame indexed by ``["date", "category_id", "node_posid"]``
        (sorted) with the columns ``["base", "count"]``. An empty
        ``documents`` yields an empty frame with the same layout.
    '''

    freq_dict = defaultdict(int) # DS to hold word freq counts in a single day

    # Define pos of words to skip from frequency counting
    skip = list(range(13, 26))
    skip.extend([0, 5, 6, 7, 8, 9])# skip function words, symbols (except 4), and 0

    for (category_id, content) in documents:
        count_word_frequency_in_question(content, category_id, freq_dict, skip)

    # Build the frame from plain tuples with explicit column names. This needs
    # no dataclass helper (whose import lives in a later cell) and keeps the
    # columns present even when nothing was counted.
    columns = ["base", "node_posid", "category_id", "count", "date"]
    rows = [
        (base, node_posid, cat_id, count, date)
        for (base, node_posid, cat_id), count in freq_dict.items()
    ]
    freq_table = pd.DataFrame(rows, columns=columns)

    freq_table = freq_table.sort_values(by=["count"], ascending=False)
    freq_table = freq_table.set_index(["date", "category_id", "node_posid"]) # multi-indexing
    freq_table = freq_table.sort_index()

    return freq_table

### Test word_frequency_of_questions_in_one_day()

In [36]:
# Two sample days used to exercise word_frequency_of_questions_in_one_day().
date1 = datetime(2019, 1, 1)
date2 = datetime(2020, 3, 2)

# Same statement as the first fetch cell: at most 100 rows of `table` created
# on the bound date. `table` is formatted into the text; the date is a parameter.
query = SQL("SELECT category_id, content FROM {} WHERE created::date= (%s) LIMIT 100;".format(table))


# Fetch the posts of the first sample day.
cursor.execute(query, (str(date1), ))
documents1 = list(cursor.fetchall())
# Peek at the first row (raises IndexError if the day has no posts).
documents1[0]

In [42]:
# Word-frequency table for the first sample day; displayed as the cell output.
wf1 = word_frequency_of_questions_in_one_day(documents1, date1)
wf1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,base,count
date,category_id,node_posid,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,1,4,？,8
2019-01-01,1,4,(,4
2019-01-01,1,4,),4
2019-01-01,1,4,…,3
2019-01-01,1,4,！,3
2019-01-01,1,4,〜,2
2019-01-01,1,4,˙̦꒳˙̦,2
2019-01-01,1,4,☹️,1
2019-01-01,1,4,🌅,1
2019-01-01,1,4,(T_T),1


In [44]:
# Fetch and count the second sample day with the same query.
cursor.execute(query, (str(date2), ))
documents2 = list(cursor.fetchall())
wf2 = word_frequency_of_questions_in_one_day(documents2, date2)

# Stack both days and persist them for inspection.
word_frequency_dataframe = pd.concat([wf1, wf2])
# Hand the path to pandas with an explicit encoding: a handle from a bare
# open(..., 'w') uses the locale encoding and newline translation, which can
# fail on Japanese text / emoji or insert blank rows on some platforms.
word_frequency_dataframe.to_csv('./test_count_word_in_questions.csv', sep=',', encoding='utf-8')


In [30]:
# Step-by-step walkthrough of the one-day counting pipeline on the first 40
# fetched questions (the same steps word_frequency_of_questions_in_one_day() performs).
from dataclasses import make_dataclass

# Sample input for a quick manual check:
#documents = [(11, "遠足弁当をインスタで調べようとして、数分で閉じた。。普通の！！お弁当はないんかい！ キャラ弁とかオシャレなお弁当じゃ、参考にならない〜😭インスタ映えしないお弁当しか作れない母でごめん、息子。")]
#print(documents)
freq_dict = defaultdict(int) # DS to hold word freq counts in a single day

skip = list(range(13, 26))
skip.extend([0, 5, 6, 7, 8, 9])# skip function words, symbols (except 4), and 0

for (category_id, content) in documents[:40]:
    count_word_frequency_in_question(content, category_id, freq_dict, skip)

#pprint(freq_dict)

WordFreq = make_dataclass("WordFreq", [("base", str), ("node_posid", int), ("category_id", int), ("count", int), ("date", object)])

freq_data = [WordFreq(key[0], key[1], key[2], item, start_date) for key, item in freq_dict.items()]
freq_table = pd.DataFrame(freq_data)

freq_table = freq_table.sort_values(by=["count"], ascending=False)
# sort_index(inplace=True) returns None, so the sorted frame must be taken from
# the non-inplace call rather than from the result of an inplace one.
freq_table_mulinx = freq_table.set_index(["date", "category_id", "node_posid"]).sort_index()
# Legacy (misspelled) name; previously always None, now an alias of the sorted frame.
freq_table_mulin = freq_table_mulinx

# Show long tables in full for manual inspection.
pd.options.display.min_rows = 500
pd.options.display.max_rows = 700
freq_table_mulinx



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,base,count
date,category_id,node_posid,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,1,4,？,7
2019-01-01,1,4,(,4
2019-01-01,1,4,),4
2019-01-01,1,4,！,3
2019-01-01,1,4,˙̦꒳˙̦,2
2019-01-01,1,4,〜,2
2019-01-01,1,4,❤️,1
2019-01-01,1,4,💦,1
2019-01-01,1,4,🌅,1
2019-01-01,1,4,😂,1


## Reference: POS ID definitions (参考：品詞IDの定義)

In [None]:
import codecs
from pprint import pprint
from collections import OrderedDict

In [2]:
def load_category(path):
    """Load a MeCab POS-id definition file into an ordered mapping.

    Each non-empty line is expected to hold a comma-separated POS description
    followed by whitespace and an integer id. Blank or incomplete lines are
    skipped.

    Args:
        path: path to the definition file, read as EUC-JP.

    Returns:
        OrderedDict mapping ``id`` (int) to the list of POS fields, in file
        order. On any failure (unreadable file, bad encoding, non-integer id)
        the error is printed and the legacy sentinel ``1`` is returned.
    """
    # Announce which POS list is being loaded.
    print('Category list from ', path)

    cat_od = OrderedDict()

    try:
        with open(path, 'r', encoding='euc_jp') as f:
            for line in f:
                fields = line.split()
                if len(fields) < 2:
                    # Blank or malformed line: ignore it instead of aborting the load.
                    continue
                pos_id = int(fields[1])
                cat_od[pos_id] = fields[0].split(',')

    except Exception as e:
        print(e)
        return 1

    return cat_od

def get_category_id(cat_od, features, level=2):
    """Return the POS id whose definition matches a node's feature fields.

    Args:
        cat_od: mapping of POS id -> list of POS fields, as produced by
            ``load_category``.
        features: feature fields of an analysed word (POS fields first).
        level: number of leading POS fields that must match (default 2).

    Returns:
        The first id (in ``cat_od`` order) whose leading ``level`` fields equal
        those of ``features``, or -1 when nothing matches (e.g. BOS/EOS nodes).
    """
    # Iterate over the mapping itself: no assumption that ids are 0..n-1.
    for cat_id, cat_l in cat_od.items():
        if is_target_category(cat_l, features, level=level):
            return cat_id

    # BOS/EOS nodes have no matching id.
    return -1 # Should avoid reaching here

def is_target_category(cat_l, features, level):
    """Check whether a word's POS fields match a target POS definition.

    Args:
        cat_l: POS fields of the target category.
        features: feature fields of the analysed word.
        level: how many leading fields of ``cat_l`` to compare.

    Returns:
        True when each of the first ``level`` fields of ``cat_l`` equals the
        field at the same position in ``features``; False otherwise, including
        when ``features`` has too few fields to compare.
    """
    wanted = cat_l[0:level]
    if len(features) < len(wanted):
        return False
    return all(cat == feat for cat, feat in zip(wanted, features))

In [3]:
# Location of the IPADIC POS-id definition file (absolute, machine-specific path).
pos_id_path = "/usr/local/lib/mecab/dic/ipadic/pos-id.def"
# Mapping of POS id -> POS fields; load_category() returns 1 instead if loading fails.
cat_od = load_category(pos_id_path)

# Print the full table as the reference for the POS ids used in the counting code.
pprint(cat_od)

Category list from  /usr/local/lib/mecab/dic/ipadic/pos-id.def
OrderedDict([(0, ['その他', '間投', '*', '*']),
             (1, ['フィラー', '*', '*', '*']),
             (2, ['感動詞', '*', '*', '*']),
             (3, ['記号', 'アルファベット', '*', '*']),
             (4, ['記号', '一般', '*', '*']),
             (5, ['記号', '括弧開', '*', '*']),
             (6, ['記号', '括弧閉', '*', '*']),
             (7, ['記号', '句点', '*', '*']),
             (8, ['記号', '空白', '*', '*']),
             (9, ['記号', '読点', '*', '*']),
             (10, ['形容詞', '自立', '*', '*']),
             (11, ['形容詞', '接尾', '*', '*']),
             (12, ['形容詞', '非自立', '*', '*']),
             (13, ['助詞', '格助詞', '一般', '*']),
             (14, ['助詞', '格助詞', '引用', '*']),
             (15, ['助詞', '格助詞', '連語', '*']),
             (16, ['助詞', '係助詞', '*', '*']),
             (17, ['助詞', '終助詞', '*', '*']),
             (18, ['助詞', '接続助詞', '*', '*']),
             (19, ['助詞', '特殊', '*', '*']),
             (20, ['助詞', '副詞化', '*', '*']),
             (21, ['