In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import sys
import os
import glob
import pathlib
import re
import MeCab
import matplotlib
import matplotlib.pyplot as plt
import japanize_matplotlib
import datetime as dt
import pyperclip
%matplotlib inline 

plt.style.use('ggplot') # Use the ggplot style sheet so plots look nicer
# Raise the pandas display limits so that long / wide frames are rendered
# without truncation (up to 25000 rows and 500 columns).
pd.set_option('display.max_rows', 25000)
pd.set_option('display.max_columns', 500)

In [6]:
# Tagger subclass that returns the parse result as a pandas DataFrame
class CustomMeCabTagger(MeCab.Tagger):
    """MeCab tagger whose parse output can be obtained as a DataFrame.

    Each output line of ``parse`` is expected to look like
    ``surface<TAB>feature1,feature2,...``; the surface form goes into the
    first column of ``COLUMNS`` and the comma-separated features into the
    remaining ones.
    """

    # Column names of the resulting frame: surface form followed by the features.
    COLUMNS = ['単語', '品詞', '品詞細分類1', '品詞細分類2', '品詞細分類3', '活用型', '活用形', 'word', 'yomi', '発音']

    def parseToDataFrame(self, text: str) -> pd.DataFrame:
        """Parse ``text`` and return one row per token.

        Parameters
        ----------
        text : str
            Raw text to analyse.

        Returns
        -------
        pd.DataFrame
            Frame with the columns listed in ``COLUMNS``. Feature values equal
            to ``'*'`` are stored as ``None``. Tokens that come with fewer
            features than there are columns are padded with ``None`` so that
            the frame can always be built.

        Raises
        ------
        ValueError
            Raised by pandas if a token carries more features than ``COLUMNS``
            has room for (the row is not silently truncated).
        """
        columns = type(self).COLUMNS
        n_features = len(columns) - 1
        rows = []
        for line in self.parse(text).split('\n'):
            if line == 'EOS':
                break
            # Skip blank or malformed lines instead of failing on tuple unpacking.
            if '\t' not in line:
                continue
            # Split on the first tab only, so a tab inside the feature part cannot break unpacking.
            surface, feature_str = line.split('\t', 1)
            features = [None if f == '*' else f for f in feature_str.split(',')]
            # Pad short feature lists so every row has the same width even when
            # no full-length row is present in the parse result.
            if len(features) < n_features:
                features.extend([None] * (n_features - len(features)))
            rows.append([surface, *features])
        return pd.DataFrame(rows, columns=columns)
# Tagger instance used by the analysis cells.
# NOTE(review): MeCab.Tagger usually takes an option string such as
# '-d <dictionary directory>'; confirm that passing the bare name
# 'mecab-ipadic-neologd' really selects the NEologd dictionary.
tagger = CustomMeCabTagger('mecab-ipadic-neologd')

In [187]:
# Path of the text file to analyse
fname = "bad/natsu.txt"

# Load the polarity dictionary; column names are supplied explicitly via `names`
features = ['word', 'p', 'n', 'cate1', 'cate2']
posi_nega_dic = pd.read_csv("stable_posi_nega_dic_ver1.1.1.csv", names=features)

# Read the whole input file as UTF-8 text
text = pathlib.Path(fname).read_text(encoding='utf-8')

# Run morphological analysis on the full text, left-join the polarity
# dictionary on the base form ('word'), and keep the row number as an
# 'index' column via reset_index().
morpho_analysed_all = (
    tagger.parseToDataFrame(text)
    .merge(posi_nega_dic, on='word', how='left')
    .reset_index()
)

# Remove the words governed by a negation word (provisionally: the two rows
# immediately before every 'ない').
# A row is removed when the row one or two positions after it is 'ない'.
# Building the mask in one pass avoids the KeyError the row-by-row drop raised
# when 'ない' was among the first two rows or when two 'ない' rows were close
# enough that the same label would be dropped twice.
_is_negation = morpho_analysed_all['word'] == 'ない'
_precedes_negation = (
    _is_negation.shift(-1, fill_value=False)
    | _is_negation.shift(-2, fill_value=False)
)
morpho_analysed_all = morpho_analysed_all[~_precedes_negation]
        
# Keep only the rows that matched a dictionary entry (non-null 'p').
# reset_index() renumbers the rows and keeps the previous index as a column.
analysed = morpho_analysed_all.dropna(subset=['p']).reset_index()
# Alternative filter kept for reference (rows whose POS sub-class is "一般" or "自立"):
#morpho_analysed_all[(morpho_analysed_all['品詞細分類1']== "一般") |(morpho_analysed_all['品詞細分類1']== "自立")]

In [188]:
# Number of consecutive segments the text is split into
slice_num = 4

# Segment boundaries: quantiles of the token numbers stored in the 'index'
# column, rounded to the nearest integer. The first boundary is always 0.
_token_no = morpho_analysed_all['index']
bins = [0] + [
    int(np.rint(_token_no.quantile((1 / slice_num) * k)))
    for k in range(1, slice_num)
]
# Token number of the last row (.iloc[-1] avoids the deprecated int(Series)).
end_index = int(_token_no.iloc[-1])
split_morpho_df = []

# Split the frame into segments.
# The boundaries are values of the 'index' column, so rows are selected by
# comparing that column rather than by positional slicing: once rows have been
# removed from the frame, positions and 'index' values no longer agree.
# The last segment is open-ended so the final row is always included.
for _seg_no, _lower in enumerate(bins):
    _in_segment = _token_no >= _lower
    if _seg_no < len(bins) - 1:
        _in_segment = _in_segment & (_token_no < bins[_seg_no + 1])
    _segment = morpho_analysed_all[_in_segment]
    # Keep only dictionary matches and drop the helper 'index' column.
    split_morpho_df.append(
        _segment.dropna(subset=['p']).drop("index", axis=1)
    )


# Last segment of the split text (dictionary-matched rows only)
_last_segment = split_morpho_df[slice_num - 1]

# Row counts: last segment, its "感情" rows, all matched rows, and all "イベント" rows
last_df_len = len(_last_segment)
last_emo_len = _last_segment['cate1'].eq("感情").sum()
analysed_all_len = len(analysed)
analysed_all_eve_len = analysed['cate1'].eq("イベント").sum()

# Boolean masks for positive / negative polarity in the last segment
split_df_p = _last_segment['p'] == 1
split_df_n = _last_segment['n'] == 1

# Upper / lower semantic category columns of the last segment and of all matched rows
last_upper = _last_segment['cate1']
last_lower = _last_segment['cate2']
analysed_upper = analysed['cate1']
analysed_lower = analysed['cate2']


# Upper categories and the lower categories reported for each of them
emo = "感情"
emo_low_attribute = ["喜","安","好","厭","悲","恐","怒"]
event = "イベント"
event_low_attribute = ["災", "争い", "死亡", "病気"]

# Share of each emotion sub-category among the rows of the last segment
_emo_ratios = [
    str(((last_upper == emo) & (last_lower == sub)).sum() / last_df_len)
    for sub in emo_low_attribute
]
# Share of each event sub-category among all dictionary-matched rows
_event_ratios = [
    str(((analysed_upper == event) & (analysed_lower == sub)).sum() / analysed_all_len)
    for sub in event_low_attribute
]

# Tab-terminated values, emotions first and then events, copied to the clipboard
result_data = "".join(value + "\t" for value in _emo_ratios + _event_ratios)
pyperclip.copy(result_data)
print(fname, "finish")

bad/natsu.txt finish


In [189]:
# Rows of segment 3 whose lower category is "悲"
split_morpho_df[3].loc[lambda seg: seg['cate2'] == "悲"]

Unnamed: 0,単語,品詞,品詞細分類1,品詞細分類2,品詞細分類3,活用型,活用形,word,yomi,発音,p,n,cate1,cate2
28172,涙,名詞,一般,,,,,涙,ナミダ,ナミダ,0.0,1.0,感情,悲
29235,絶望,名詞,サ変接続,,,,,絶望,ゼツボウ,ゼツボー,0.0,1.0,感情,悲
29245,涙,名詞,一般,,,,,涙,ナミダ,ナミダ,0.0,1.0,感情,悲
29299,泣く,動詞,自立,,,五段・カ行イ音便,基本形,泣く,ナク,ナク,0.0,1.0,感情,悲
29443,悲しみ,名詞,一般,,,,,悲しみ,カナシミ,カナシミ,0.0,1.0,感情,悲
29510,あきらめ,動詞,自立,,,一段,連用形,あきらめる,アキラメ,アキラメ,0.0,1.0,感情,悲
29556,沈痛,名詞,形容動詞語幹,,,,,沈痛,チンツウ,チンツー,0.0,1.0,感情,悲
30011,涙,名詞,一般,,,,,涙,ナミダ,ナミダ,0.0,1.0,感情,悲
30019,泣い,動詞,自立,,,五段・カ行イ音便,連用タ接続,泣く,ナイ,ナイ,0.0,1.0,感情,悲
30108,泣き,動詞,自立,,,五段・カ行イ音便,連用形,泣く,ナキ,ナキ,0.0,1.0,感情,悲


In [198]:
# Compact view of the matched rows with Japanese column labels
test = (
    analysed[['単語', 'word', 'p', 'n', 'cate1', 'cate2']]
    .rename(columns={
        'word': '原型',
        'p': 'P',
        'n': 'N',
        'cate1': '上位意味カテゴリ',
        'cate2': '下位意味カテゴリ',
    })
)
test

Unnamed: 0,単語,原型,P,N,上位意味カテゴリ,下位意味カテゴリ
0,慌て,慌てる,0.0,1.0,感情,恐
1,死ん,死ぬ,0.0,1.0,イベント,死亡
2,地震,地震,0.0,1.0,イベント,災
3,楽し,楽しい,1.0,0.0,感情,喜
4,笑っ,笑う,1.0,0.0,感情,喜
5,失敗,失敗,0.0,1.0,感情,厭
6,地震,地震,0.0,1.0,イベント,災
7,汚,汚い,0.0,1.0,様相,姿
8,苦労,苦労,0.0,1.0,感情,厭
9,死ん,死ぬ,0.0,1.0,イベント,死亡


In [182]:
# Matched rows whose lower category is "災"
analysed.loc[analysed['cate2'].eq("災")]

Unnamed: 0,level_0,index,単語,品詞,品詞細分類1,品詞細分類2,品詞細分類3,活用型,活用形,word,yomi,発音,p,n,cate1,cate2
6,921,921,事件,名詞,一般,,,,,事件,ジケン,ジケン,0.0,1.0,イベント,災
15,2870,2870,事件,名詞,一般,,,,,事件,ジケン,ジケン,0.0,1.0,イベント,災
22,3435,3435,事件,名詞,一般,,,,,事件,ジケン,ジケン,0.0,1.0,イベント,災
23,3467,3467,事件,名詞,一般,,,,,事件,ジケン,ジケン,0.0,1.0,イベント,災
24,3484,3484,事件,名詞,一般,,,,,事件,ジケン,ジケン,0.0,1.0,イベント,災
25,3493,3493,被害,名詞,一般,,,,,被害,ヒガイ,ヒガイ,0.0,1.0,イベント,災
27,3741,3741,事件,名詞,一般,,,,,事件,ジケン,ジケン,0.0,1.0,イベント,災
34,6771,6771,被害,名詞,一般,,,,,被害,ヒガイ,ヒガイ,0.0,1.0,イベント,災
50,10544,10544,犯行,名詞,一般,,,,,犯行,ハンコウ,ハンコー,0.0,1.0,イベント,災
51,10679,10679,被害,名詞,一般,,,,,被害,ヒガイ,ヒガイ,0.0,1.0,イベント,災


In [258]:
# Show 10 dictionary entries with Japanese column labels.
# random_state is fixed so that re-running the notebook shows the same rows.
posi_nega_dic.sample(n=10, random_state=0).rename(
    columns={
        "word": "単語",
        "p": "P",
        "n": "N",
        "cate1": "上位意味カテゴリ",
        "cate2": "下位意味カテゴリ",
    }
)

Unnamed: 0,単語,P,N,上位意味カテゴリ,下位意味カテゴリ
1181,ほほえむ,1,0,感情,好
67,気疲れ,0,1,感情,厭
182,悲観,0,1,感情,悲
18,殴打,0,1,イベント,争い
1276,争議,0,1,イベント,争い
404,絶望感,0,1,感情,悲
611,傷害,0,1,イベント,病気
13,苦手,0,1,感情,厭
961,うるうる,0,1,感情,悲
809,疎む,0,1,感情,厭


In [152]:
# Number of positive / negative entries per lower category.
# Only the numeric 'p' and 'n' columns are summed; without the selection,
# pandas >= 2.0 would also "sum" the string columns by concatenating them.
posi_nega_dic.groupby('cate2')[['p', 'n']].sum()

Unnamed: 0_level_0,p,n
cate2,Unnamed: 1_level_1,Unnamed: 2_level_1
争い,0,187
厭,0,158
喜,84,0
好,167,0
姿,71,22
安,19,0
怒,0,50
性質,46,36
恐,0,79
悲,0,111
