In [1]:
# !pip install mecab-python3

# !pip install unidic-lite
# !pip install japanize-matplotlib

In [2]:
import MeCab
import unidic
import sys
import re
import unicodedata
import matplotlib.pyplot as plt
import tqdm
import japanize_matplotlib
import os
import collections
import glob
import pandas as pd

In [3]:
# Check if MeCab works
# t = MeCab.Tagger('-r/nul -d"C:/Program Files (x86)/MeCab/dic/ipadic"')

t = MeCab.Tagger('-r/nul -d"C:/Program Files (x86)/MeCab/dic/ipadic"')

sentence = 'すもももももももものうち'
print(t.parse(sentence))

すもも	名詞,一般,*,*,*,*,すもも,スモモ,スモモ
も	助詞,係助詞,*,*,*,*,も,モ,モ
もも	名詞,一般,*,*,*,*,もも,モモ,モモ
も	助詞,係助詞,*,*,*,*,も,モ,モ
もも	名詞,一般,*,*,*,*,もも,モモ,モモ
の	助詞,連体化,*,*,*,*,の,ノ,ノ
うち	名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
EOS



### Here is a translation 
- すもも	名詞 (= noun),一般 (=general),*,*,*,*,すもも,スモモ,スモモ
- も	助詞 (=particle) ,係助詞 (=binding particle),*,*,*,*,も,モ,モ
- もも	名詞 (= noun),一般 (=general),*,*,*,*,もも,モモ,モモ
- も	助詞 (=particle) ,係助詞 (=binding particle),*,*,*,*,も,モ,モ
- もも	名詞 (= noun),一般 (=general),*,*,*,*,もも,モモ,モモ
- の	助詞 (=particle) ,連体化(=attributive),*,*,*,*,の,ノ,ノ
- うち	名詞(=noun),非自立(=non-indipendent),副詞可能(adverb)=,*,*,*,うち,ウチ,ウチ

In [8]:
#create the list of meaningless words such as "that" "year" "what" etc... 
with open("stopwords.txt", encoding='UTF-8') as f:
    stopwords = f.read().split("\n")
print(stopwords)

['あそこ', 'あたり', 'あちら', 'あっち', 'あと', 'あな', 'あなた', 'あれ', 'いくつ', 'いつ', 'いま', 'いや', 'いろいろ', 'うち', 'おおまか', 'おまえ', 'おれ', 'がい', 'かく', 'かたち', 'かやの', 'から', 'がら', 'きた', 'くせ', 'ここ', 'こっち', 'こと', 'ごと', 'こちら', 'ごっちゃ', 'これ', 'これら', 'ごろ', 'さまざま', 'さらい', 'さん', 'しかた', 'しよう', 'すか', 'ずつ', 'すね', 'すべて', 'ぜんぶ', 'そう', 'そこ', 'そちら', 'そっち', 'そで', 'それ', 'それぞれ', 'それなり', 'たくさん', 'たち', 'たび', 'ため', 'だめ', 'ちゃ', 'てん', 'とおり', 'とき', 'どこ', 'どこか', 'ところ', 'どちら', 'どっか', 'どっち', 'どれ', 'なか', 'なかば', 'なに', 'など', 'なん', 'はじめ', 'はず', 'はるか', 'ひと', 'ひとつ', 'ふく', 'ぶり', 'べつ', 'へん', 'ぺん', 'ほう', 'ほか', 'まさ', 'まし', 'まとも', 'まま', 'みたい', 'みつ', 'みなさん', 'みんな', 'もと', 'もの', 'もん', 'やつ', 'よう', 'よそ', 'わけ', 'わたし', '', 'ハイ', '', '', '上', '中', '下', '字', '', '', '年', '月', '日', '時', '分', '秒', '週', '火', '水', '木', '金', '土', '国', '都', '道', '府', '県', '市', '区', '町', '村', '', '', '各', '第', '方', '何', '的', '度', '文', '者', '体', '人', '他', '今', '部', '課', '係', '外', '類', '達', '気', '室', '口', '誰', '用', '界', '会', '首', '別', '話', '私', '屋', '店', '家', '場', '等', '見', '際', '観', 

In [38]:

def count_word(text, pos_tags, stopwords):
    
    words_list = []
    tagger = MeCab.Tagger('-r/nul -d"C:/Program Files (x86)/MeCab/dic/ipadic"')
    
    node = tagger.parseToNode(text)

    while node:
        features = node.feature.split(",")
        base = features[6]
        pos_tag = features[0]
        
        # remove "stopwords"
        if pos_tag in pos_tags and base not in stopwords:
            words_list.append((base, pos_tag))
            

        node = node.next
    count = collections.Counter(words_list)

    return count

In [39]:
file_list = glob.glob("F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseMaleIdol/*") 

In [40]:
file_list[:5]

['F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseMaleIdol\\稲垣吾郎',
 'F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseMaleIdol\\森且行',
 'F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseMaleIdol\\野口五郎',
 'F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseMaleIdol\\沢田研二',
 'F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseMaleIdol\\水野晴郎']

In [41]:
# read stopwords
with open("stopwords.txt",encoding='UTF-8') as f:
    stopwords = f.read().split("\n")

pos_tags = ["名詞", "動詞", "形容詞"] 
#  ["名詞", "動詞", "形容詞"]=  ["noun", "verb", "adj"]

In [42]:
dfs = []

In [43]:
for f in file_list:
    with open(f, 'r', encoding='UTF-8') as file:
        text = file.read()
    
    f_name = f.split("\\")[-1]
    
    count = count_word(text, pos_tags, stopwords)
    
   
    all_words = count.items()
    
    all_words_df = (
        pd
        .DataFrame(all_words, columns = ['word_tuple', "count"])
        .assign(
            word = lambda x: x["word_tuple"].apply(lambda wt: wt[0]),
            pos = lambda x: x["word_tuple"].apply(lambda wt: wt[1]),
            file_name = f_name
        )
        .sort_values("count", ascending=False)
    )
    # CHANGE PATH!!!!!!!!
    full_csv_file = "F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseMaleIdol/new_csvs/" + f_name + "_words.csv"
    
    all_words_df.to_csv(full_csv_file, index=None)
    dfs.append(all_words_df)

PermissionError: [Errno 13] Permission denied: 'F:/Wikipedia/wikiextractor-master/wikiextractor/japaneseMaleIdol\\csvs'

In [44]:
overall = (
    pd
    .concat(dfs)
    .groupby(["word", "pos"])
    .pipe(
        lambda grp: pd.DataFrame({
            "total": grp["count"].sum(),
            "articles": grp["file_name"].nunique()
        }))
    .reset_index()
    .sort_values("total", ascending=False)
)

In [45]:
overall[overall.pos == "名詞"].head(60)

Unnamed: 0,word,pos,total,articles
3407,出演,名詞,506,109
6021,日本,名詞,408,161
7325,番組,名詞,382,87
6841,活動,名詞,357,105
8324,草,名詞,344,5
2221,メンバー,名詞,326,117
1627,デビュー,名詞,315,97
2750,事務所,名詞,314,97
6112,映画,名詞,310,59
799,アイドル,名詞,289,118


In [46]:
overall[overall.pos == "動詞"].head(60)

Unnamed: 0,word,pos,total,articles
8429,行う,動詞,157,57
3615,務める,動詞,145,69
8718,語る,動詞,128,38
49,いう,動詞,112,40
8621,言う,動詞,104,41
3818,受ける,動詞,99,48
5395,思う,動詞,96,27
5727,持つ,動詞,79,39
8540,見る,動詞,79,32
9009,述べる,動詞,79,18


In [50]:
overall[overall.pos == "形容詞"].head(60)

Unnamed: 0,word,pos,total,articles
520,ない,形容詞,128,50
4334,多い,形容詞,74,37
48,いい,形容詞,58,22
9765,高い,形容詞,53,24
5211,強い,形容詞,37,19
361,すごい,形容詞,30,10
8247,良い,形容詞,23,14
4363,大きい,形容詞,23,8
9349,長い,形容詞,19,14
6905,深い,形容詞,18,10


In [51]:
overall.to_csv("japeseMaleIdol_text_overall.csv", index= False)
df= pd.read_csv("japeseMaleIdol_text_overall.csv")
df

Unnamed: 0,word,pos,total,articles
0,出演,名詞,506,109
1,日本,名詞,408,161
2,番組,名詞,382,87
3,活動,名詞,357,105
4,草,名詞,344,5
...,...,...,...,...
9881,二塁手,名詞,1,1
9882,二女,名詞,1,1
9883,二男,名詞,1,1
9884,互い,名詞,1,1
