In [2]:
# !pip install mecab-python3

# !pip install unidic-lite
# !pip install japanize-matplotlib

In [3]:
import MeCab
import unidic
import sys
import re
import unicodedata
import matplotlib.pyplot as plt
import tqdm
import japanize_matplotlib
import os
import collections
import glob
import pandas as pd

In [4]:
# Check if MeCab works
# t = MeCab.Tagger('-r/nul -d"C:/Program Files (x86)/MeCab/dic/ipadic"')

t = MeCab.Tagger('-r/nul -d"C:/Program Files (x86)/MeCab/dic/ipadic"')

sentence = 'すもももももももものうち'
print(t.parse(sentence))

すもも	名詞,一般,*,*,*,*,すもも,スモモ,スモモ
も	助詞,係助詞,*,*,*,*,も,モ,モ
もも	名詞,一般,*,*,*,*,もも,モモ,モモ
も	助詞,係助詞,*,*,*,*,も,モ,モ
もも	名詞,一般,*,*,*,*,もも,モモ,モモ
の	助詞,連体化,*,*,*,*,の,ノ,ノ
うち	名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
EOS



### Here is a translation 
- すもも	名詞 (= noun),一般 (=general),*,*,*,*,すもも,スモモ,スモモ
- も	助詞 (=particle) ,係助詞 (=binding particle),*,*,*,*,も,モ,モ
- もも	名詞 (= noun),一般 (=general),*,*,*,*,もも,モモ,モモ
- も	助詞 (=particle) ,係助詞 (=binding particle),*,*,*,*,も,モ,モ
- もも	名詞 (= noun),一般 (=general),*,*,*,*,もも,モモ,モモ
- の	助詞 (=particle) ,連体化(=attributive),*,*,*,*,の,ノ,ノ
- うち	名詞(=noun),非自立(=non-indipendent),副詞可能(adverb)=,*,*,*,うち,ウチ,ウチ

In [5]:
#create the list of meaningless words such as "that" "year" "what" etc... 
with open("stopwords.txt", encoding='UTF-8') as f:
    stopwords = f.read().split("\n")
print(stopwords)

['あそこ', 'あたり', 'あちら', 'あっち', 'あと', 'あな', 'あなた', 'あれ', 'いくつ', 'いつ', 'いま', 'いや', 'いろいろ', 'うち', 'おおまか', 'おまえ', 'おれ', 'がい', 'かく', 'かたち', 'かやの', 'から', 'がら', 'きた', 'くせ', 'ここ', 'こっち', 'こと', 'ごと', 'こちら', 'ごっちゃ', 'これ', 'これら', 'ごろ', 'さまざま', 'さらい', 'さん', 'しかた', 'しよう', 'すか', 'ずつ', 'すね', 'すべて', 'ぜんぶ', 'そう', 'そこ', 'そちら', 'そっち', 'そで', 'それ', 'それぞれ', 'それなり', 'たくさん', 'たち', 'たび', 'ため', 'だめ', 'ちゃ', 'てん', 'とおり', 'とき', 'どこ', 'どこか', 'ところ', 'どちら', 'どっか', 'どっち', 'どれ', 'なか', 'なかば', 'なに', 'など', 'なん', 'はじめ', 'はず', 'はるか', 'ひと', 'ひとつ', 'ふく', 'ぶり', 'べつ', 'へん', 'ぺん', 'ほう', 'ほか', 'まさ', 'まし', 'まとも', 'まま', 'みたい', 'みつ', 'みなさん', 'みんな', 'もと', 'もの', 'もん', 'やつ', 'よう', 'よそ', 'わけ', 'わたし', '', 'ハイ', '', '', '上', '中', '下', '字', '', '', '年', '月', '日', '時', '分', '秒', '週', '火', '水', '木', '金', '土', '国', '都', '道', '府', '県', '市', '区', '町', '村', '', '', '各', '第', '方', '何', '的', '度', '文', '者', '体', '人', '他', '今', '部', '課', '係', '外', '類', '達', '気', '室', '口', '誰', '用', '界', '会', '首', '別', '話', '私', '屋', '店', '家', '場', '等', '見', '際', '観', 

In [37]:
def count_word(text, pos_tags, stopwords):
    
    words_list = []
    tagger = MeCab.Tagger('-r/nul -d"C:/Program Files (x86)/MeCab/dic/ipadic"')
    
    node = tagger.parseToNode(text)

    while node:
        features = node.feature.split(",")
        base = features[6]
        pos_tag = features[0]
        
        # remove "stopwords"
        if pos_tag in pos_tags and base not in stopwords:
            words_list.append((base, pos_tag))
            

        node = node.next
    count = collections.Counter(words_list)

    return count


In [38]:
file_list = glob.glob("F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseFemaleIdol/*") 

In [39]:
file_list[:5]

['F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseFemaleIdol\\松田聖子',
 'F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseFemaleIdol\\内田有紀',
 'F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseFemaleIdol\\後藤久美子',
 'F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseFemaleIdol\\吉田里深',
 'F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseFemaleIdol\\吉岡美穂']

In [40]:
# read stopwords
with open("stopwords.txt",encoding='UTF-8') as f:
    stopwords = f.read().split("\n")

pos_tags = ["名詞", "動詞", "形容詞"] 
#  ["名詞", "動詞", "形容詞"]=  ["noun", "verb", "adj"]

In [35]:
dfs = []

In [47]:
for f in file_list:
    with open(f, 'r', encoding='UTF-8') as file:
        text = file.read()
    
    f_name = f.split("\\")[-1]
    
    count = count_word(text, pos_tags, stopwords)
    
    #top100_words = count.most_common(100)
    all_words = count.items()
    
    all_words_df = (
        pd
        .DataFrame(all_words, columns = ['word_tuple', "count"])
        .assign(
            word = lambda x: x["word_tuple"].apply(lambda wt: wt[0]),
            pos = lambda x: x["word_tuple"].apply(lambda wt: wt[1]),
            file_name = f_name
        )
        .sort_values("count", ascending=False)
    )
    #CHANGE PATH!!!!!!!
    full_csv_file = "F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseFemaleIdol/new_csvs/" + f_name + "_words.csv"
    #print(f_name)
    all_words_df.to_csv(full_csv_file, index=None)
    dfs.append(all_words_df)


PermissionError: [Errno 13] Permission denied: 'F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseFemaleIdol\\csv'

In [48]:
overall = (
    pd
    .concat(dfs)
    .groupby(["word", "pos"])
    .pipe(
        lambda grp: pd.DataFrame({
            "total": grp["count"].sum(),
            "articles": grp["file_name"].nunique()
        }))
    .reset_index()
    .sort_values("total", ascending=False)
)

In [49]:
overall[overall.pos == "名詞"].head(10)

Unnamed: 0,word,pos,total,articles
2810,アイドル,名詞,15624,3716
19051,活動,名詞,12470,2334
15519,所属,名詞,10798,3322
9715,出演,名詞,10362,1838
16591,日本,名詞,10072,3966
6626,メンバー,名詞,9336,1920
9734,出身,名詞,8246,3706
11092,同年,名詞,6944,1219
5035,デビュー,名詞,6716,1594
3795,グラビア,名詞,6646,1987


In [50]:
overall[overall.pos == "動詞"].head(10)

Unnamed: 0,word,pos,total,articles
23858,行う,動詞,3982,1027
10176,務める,動詞,3002,841
168,いう,動詞,1952,510
10801,受ける,動詞,1838,576
25668,選ぶ,動詞,1454,533
2470,みる,動詞,1402,492
14972,思う,動詞,1382,358
12782,始める,動詞,1378,503
18298,歌う,動詞,1376,341
24362,言う,動詞,1246,375


In [51]:
overall[overall.pos == "形容詞"].head(10)

Unnamed: 0,word,pos,total,articles
1773,ない,形容詞,1814,503
12182,多い,形容詞,1060,362
23258,良い,形容詞,642,229
167,いい,形容詞,594,214
14551,強い,形容詞,438,157
27187,高い,形容詞,428,164
12245,大きい,形容詞,296,115
2669,よい,形容詞,288,114
2255,ほしい,形容詞,254,98
1272,すごい,形容詞,248,81


In [52]:
overall.to_csv("japeseFemaleIdol_text_overall.csv", index= False)

In [53]:
df= pd.read_csv("japeseFemaleIdol_text_overall.csv")
df

Unnamed: 0,word,pos,total,articles
0,アイドル,名詞,15624,3716
1,活動,名詞,12470,2334
2,所属,名詞,10798,3322
3,出演,名詞,10362,1838
4,日本,名詞,10072,3966
...,...,...,...,...
27572,島尻,名詞,2,1
27573,峻,名詞,2,1
27574,崇高,名詞,2,1
27575,崩れ,名詞,2,1
