In [1]:
import pandas as pd
import re
import unicodedata
import matplotlib.pyplot as plt
import tqdm
import japanize_matplotlib
import os
import collections
import glob
import math
from googletrans import Translator
from pathlib import Path 

In [2]:
# NOUN
# Read every file matching *top20_noun.csv in the working directory and stack them.
# The paths are sorted because glob.glob() returns them in filesystem-dependent
# order, which would make the row order differ between machines.
noun_frames = [pd.read_csv(path) for path in sorted(glob.glob("*top20_noun.csv"))]

# Concatenate once instead of growing a frame inside the loop. pd.concat() raises
# on an empty list, so fall back to an empty frame when no file matches.
# Row labels of the individual files are kept, so index values repeat.
noun_df = pd.concat(noun_frames) if noun_frames else pd.DataFrame()

# Tag every row with the English part-of-speech label.
noun_df["pos_en"] = "noun"
noun_df.head()

Unnamed: 0,word,pos,total,articles,per_article,gender,occupation,pos_en
0,研究,名詞,4261,966,0.15,female,scientist,noun
1,大学,名詞,2918,849,0.13,female,scientist,noun
2,学,名詞,2900,878,0.14,female,scientist,noun
3,教授,名詞,2275,867,0.13,female,scientist,noun
4,日本,名詞,2220,804,0.12,female,scientist,noun


In [3]:
# VERB
# Read every file matching *top20_verb.csv in the working directory and stack them.
# The paths are sorted because glob.glob() returns them in filesystem-dependent
# order, which would make the row order differ between machines.
verb_frames = [pd.read_csv(path) for path in sorted(glob.glob("*top20_verb.csv"))]

# Concatenate once instead of growing a frame inside the loop. pd.concat() raises
# on an empty list, so fall back to an empty frame when no file matches.
# Row labels of the individual files are kept, so index values repeat.
verb_df = pd.concat(verb_frames) if verb_frames else pd.DataFrame()

# Tag every row with the English part-of-speech label.
verb_df["pos_en"] = "verb"

verb_df.head()

Unnamed: 0,word,pos,total,articles,per_article,gender,occupation,pos_en
0,行う,動詞,910,401,0.06,female,scientist,verb
1,受ける,動詞,757,316,0.05,female,scientist,verb
2,務める,動詞,748,420,0.07,female,scientist,verb
3,できる,動詞,634,216,0.03,female,scientist,verb
4,生まれる,動詞,547,439,0.07,female,scientist,verb


In [4]:
# ADJECTIVE
# Read every file matching *top20_adj.csv in the working directory and stack them.
# The paths are sorted because glob.glob() returns them in filesystem-dependent
# order, which would make the row order differ between machines.
adj_frames = [pd.read_csv(path) for path in sorted(glob.glob("*top20_adj.csv"))]

# Concatenate once instead of growing a frame inside the loop. pd.concat() raises
# on an empty list, so fall back to an empty frame when no file matches.
# Row labels of the individual files are kept, so index values repeat.
adj_df = pd.concat(adj_frames) if adj_frames else pd.DataFrame()

# Tag every row with the English part-of-speech label.
adj_df["pos_en"] = "adj"
adj_df

Unnamed: 0,word,pos,total,articles,per_article,gender,occupation,pos_en
0,ない,形容詞,586,218,0.03,female,scientist,adj
1,強い,形容詞,166,108,0.02,female,scientist,adj
2,多い,形容詞,158,93,0.01,female,scientist,adj
3,高い,形容詞,147,97,0.02,female,scientist,adj
4,新しい,形容詞,142,90,0.01,female,scientist,adj
...,...,...,...,...,...,...,...,...
15,いい,形容詞,10,7,0.01,male,figure skater,adj
16,少ない,形容詞,10,8,0.01,male,figure skater,adj
17,深い,形容詞,10,9,0.01,male,figure skater,adj
18,悪い,形容詞,9,7,0.01,male,figure skater,adj


In [5]:
# Stack the noun, verb and adjective tables into one long table.
# Each input keeps its own row labels, so index values repeat in the result.
pos_frames = [noun_df, verb_df, adj_df]
all_df = pd.concat(pos_frames, axis=0)
all_df

Unnamed: 0,word,pos,total,articles,per_article,gender,occupation,pos_en
0,研究,名詞,4261,966,0.15,female,scientist,noun
1,大学,名詞,2918,849,0.13,female,scientist,noun
2,学,名詞,2900,878,0.14,female,scientist,noun
3,教授,名詞,2275,867,0.13,female,scientist,noun
4,日本,名詞,2220,804,0.12,female,scientist,noun
...,...,...,...,...,...,...,...,...
15,いい,形容詞,10,7,0.01,male,figure skater,adj
16,少ない,形容詞,10,8,0.01,male,figure skater,adj
17,深い,形容詞,10,9,0.01,male,figure skater,adj
18,悪い,形容詞,9,7,0.01,male,figure skater,adj


In [6]:
# Number of distinct entries in the 'word' column across all part-of-speech tables.
len(pd.unique(all_df["word"]))

204

In [7]:
# Write each distinct word once, in order of first appearance, to a one-column CSV.
# NOTE(review): presumably this file is the input for the translation step — confirm.
lst = pd.unique(all_df["word"])
df = pd.DataFrame({"word": lst})
df.to_csv("top20_trans.csv", index=False)

In [11]:
# Load the word -> English lookup table.
# NOTE(review): this reads "top20_words_translated.csv", not the "top20_trans.csv"
# written earlier; presumably it is produced outside this notebook — confirm.
# NOTE(review): the recorded output shows misspellings in word_en (e.g. "reserch",
# "difficut"); they would carry through to anything built from this table.
translated = pd.read_csv(filepath_or_buffer="top20_words_translated.csv")
translated

Unnamed: 0,word,word_en
0,研究,reserch
1,大学,university
2,学,learning; scholarship; study
3,教授,professor
4,日本,Japan
...,...,...
199,ひどい,cruel; heartless
200,難しい,difficut
201,珍しい,rare
202,低い,low


In [13]:
# Attach the English translation to every row by joining on 'word'.
# validate="many_to_one" makes pandas raise MergeError if a word occurs more than
# once in `translated`; without it, duplicate translations would silently
# multiply the matching rows of all_df.
all_df_merge = pd.merge(
    all_df,
    translated,
    on="word",
    how="inner",
    validate="many_to_one",
)

# An inner join discards rows whose word is missing from the translation table.
# Report those words so the loss is visible instead of silent.
untranslated = sorted(set(all_df["word"].dropna()) - set(translated["word"]), key=str)
if untranslated:
    print(f"WARNING: {len(untranslated)} word(s) without a translation were dropped: {untranslated}")

all_df_merge

Unnamed: 0,word,pos,total,articles,per_article,gender,occupation,pos_en,word_en
0,研究,名詞,4261,966,0.15,female,scientist,noun,reserch
1,研究,名詞,62473,17011,0.69,male,scientist,noun,reserch
2,大学,名詞,2918,849,0.13,female,scientist,noun,university
3,大学,名詞,45604,15050,0.61,male,scientist,noun,university
4,学,名詞,2900,878,0.14,female,scientist,noun,learning; scholarship; study
...,...,...,...,...,...,...,...,...,...
475,難しい,形容詞,13,11,0.01,male,figure skater,adj,difficut
476,珍しい,形容詞,17,16,0.01,female,figure skater,adj,rare
477,珍しい,形容詞,11,10,0.01,male,figure skater,adj,rare
478,低い,形容詞,17,13,0.01,female,figure skater,adj,low


In [15]:
# Inspect the column labels of the merged table.
all_df_merge.columns


Index(['word', 'pos', 'total', 'articles', 'per_article', 'gender',
       'occupation', 'pos_en', 'word_en'],
      dtype='object')

In [23]:
# Keep only the listed columns, in this order (the Japanese 'pos' column is dropped).
# Plain label selection is used instead of reindex(columns=...): reindex silently
# creates all-NaN columns for labels that do not exist (for example when this cell
# is re-run after the columns have been renamed), whereas selection raises KeyError.
# .copy() gives an independent frame so later column assignments are unambiguous.
all_df_merge = all_df_merge[
    ['word', 'word_en', 'pos_en', 'total', 'articles', 'per_article', 'gender', 'occupation']
].copy()

In [24]:
# Inspect the column labels after reordering.
all_df_merge.columns

Index(['word', 'word_en', 'pos_en', 'total', 'articles', 'per_article',
       'gender', 'occupation'],
      dtype='object')

In [25]:
# Swap the internal column names for presentation labels.
# Labels are assigned by position, so this list must stay aligned with the
# current column order of all_df_merge.
all_df_merge = all_df_merge.set_axis(
    [
        'word',
        'word in English',
        'parts of speech',
        'total',
        'the number of articles',
        'frequency (%)',
        'gender',
        'occupation',
    ],
    axis="columns",
)

In [28]:
# Scale the column by 100 so the values match its "(%)" label.
# Assumes the incoming values are 0-1 fractions — TODO confirm against the source CSVs.
# Rounding removes binary floating-point noise (e.g. 0.07 * 100 == 7.000000000000001)
# that would otherwise be written verbatim to the exported CSV. Two decimals keep
# four decimal digits of the original fraction; the recorded values carry two.
# NOTE: this cell is not idempotent — running it again multiplies by 100 a second
# time. Re-run from the merge cell before re-running this one.
all_df_merge["frequency (%)"] = (all_df_merge["frequency (%)"] * 100).round(2)

In [29]:
# Display the final table before exporting it.
all_df_merge

Unnamed: 0,word,word in English,parts of speech,total,the number of articles,frequency (%),gender,occupation
0,研究,reserch,noun,4261,966,15.0,female,scientist
1,研究,reserch,noun,62473,17011,69.0,male,scientist
2,大学,university,noun,2918,849,13.0,female,scientist
3,大学,university,noun,45604,15050,61.0,male,scientist
4,学,learning; scholarship; study,noun,2900,878,14.0,female,scientist
...,...,...,...,...,...,...,...,...
475,難しい,difficut,adj,13,11,1.0,male,figure skater
476,珍しい,rare,adj,17,16,1.0,female,figure skater
477,珍しい,rare,adj,11,10,1.0,male,figure skater
478,低い,low,adj,17,13,1.0,female,figure skater


In [30]:
# Save the final table to disk; index=False leaves the row labels out of the file.
all_df_merge.to_csv(path_or_buf="top20_all.csv", index=False)