# Installations!

In [None]:
!pip install mecab-python3

In [None]:
#These wheels include a copy of the MeCab library, but not a dictionary. 
#In order to use MeCab you'll need to install a dictionary. unidic-lite is a good one to start with:
!pip install unidic-lite

In [None]:
# normalization tool
!pip install neologdn

In [None]:
!pip install openpyxl

In [None]:
# To be able to see Japanese!
!pip install japanize_matplotlib

# Libraries

In [24]:
import os
import pandas as pd

# Preprocessing
import MeCab
import neologdn
import collections
from nltk import FreqDist
from nltk.corpus import stopwords

# Visualization
import matplotlib.pyplot as plt
import japanize_matplotlib
#import seaborn as sns # REMINDER: make sure to remove if not using!

# Just having fun

In [None]:
# Read the SNOW_T15_150 workbook from the raw-data folder and preview it.
path = "/root/code/mochiyam/simply-japanese/data/2_RawData"
sample_workbook = os.path.join(path, 'SNOW_T15_150.xlsx')
df = pd.read_excel(sample_workbook)
df.head()

In [None]:
# Drop the English source column; the cells below only use the two Japanese columns.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second run no longer raises KeyError for the missing column.
df = df.drop(columns=['#英語(原文)'], errors='ignore')
df.head()

In [None]:
# Give the two Japanese columns short ASCII names used by the rest of the notebook.
short_column_names = {
    "#日本語(原文)": "original",
    "#やさしい日本語": "simplified",
}
df.rename(columns=short_column_names, inplace=True)
df.head()

In [None]:
# Run a default MeCab tagger over the first original sentence and print its analysis.
tagger = MeCab.Tagger()
text = df.loc[0, 'original']
parsed = tagger.parse(text)
print(parsed)

名詞 - noun
助詞 - particle
連体詞 - adnominal (pre-noun modifier)
動詞 - verb
補助記号 - supplementary symbol (punctuation etc.)

In [None]:
# Just testing stuff out:
# normalize the current sentence with neologdn (repeat=2), parse it with a
# tagger created with the "-O wakati" option, and print the parse result
# split on whitespace.
test = MeCab.Tagger("-O wakati")
text = neologdn.normalize(text, repeat=2)
parsed = test.parse(text)
print(parsed.split())

In [None]:
!pip show unidic-lite

In [None]:
# Point MeCab explicitly at a dictionary with the -d option.
# The raw-string prefix belongs outside the quotes; placed inside them it is
# passed to MeCab as part of the option text and no valid -d flag is seen.
# NOTE(review): -d needs the dictionary directory itself, not site-packages.
# unidic-lite is expected to keep it in <site-packages>/unidic_lite/dicdir --
# confirm against the location reported by `pip show unidic-lite`.
test = MeCab.Tagger(r"-d /root/.pyenv/versions/3.8.12/envs/simply-japanese/lib/python3.8/site-packages/unidic_lite/dicdir")
text = neologdn.normalize(text, repeat=2)
parsed = test.parse(text)
print(parsed)

In [None]:
# Naive baseline: every token is counted, including particles and punctuation.
def count_all_word_frequency():
    """Count how often each token occurs in the ``original`` column of ``df``.

    Reads the notebook-level DataFrame ``df`` and tokenizes every sentence with
    a default ``MeCab.Tagger``. No part-of-speech or stop-word filtering is
    applied.

    Returns:
        collections.Counter: token surface string -> number of occurrences.
    """
    all_words = collections.Counter()
    t = MeCab.Tagger()
    for text in df['original']:
        node = t.parseToNode(text)
        while node:
            # The node list starts and ends with sentinel nodes (BOS/EOS) whose
            # surface is empty; counting them would add a meaningless '' token.
            if node.surface:
                all_words[node.surface] += 1
            node = node.next
    return all_words
# Build the frequency table for the whole corpus.
all_words = count_all_word_frequency()
# most_common(25) returns a list of (token, count) tuples, most frequent first
print(all_words.most_common(25))

In [None]:
!pip install nltk

In [None]:
def plot_word_frequency(word_freq, most_common_num):
    """Plot the counts of the most frequent entries of ``word_freq``.

    Args:
        word_freq: word counts, passed as-is to ``FreqDist``.
        most_common_num: how many of the most frequent words to plot.

    The plot is non-cumulative; nothing is returned.
    """
    distribution = FreqDist(word_freq)
    distribution.plot(most_common_num, cumulative=False)
# Plot the 25 most frequent tokens of the unfiltered count.
plot_word_frequency(all_words, most_common_num=25)

In [None]:
# Naive baseline: every token is counted, including particles and punctuation.
# NOTE: a function with this name is also defined in an earlier cell; once this
# cell has run, this definition is the one in effect.
def count_all_word_frequency():
    """Count how often each token occurs in the ``original`` column of ``df``.

    Reads the notebook-level DataFrame ``df`` and tokenizes every sentence with
    a default ``MeCab.Tagger``. No part-of-speech or stop-word filtering is
    applied.

    Returns:
        collections.Counter: token surface string -> number of occurrences.
    """
    all_words = collections.Counter()
    t = MeCab.Tagger()
    for text in df['original']:
        node = t.parseToNode(text)
        while node:
            # The node list starts and ends with sentinel nodes (BOS/EOS) whose
            # surface is empty; counting them would add a meaningless '' token.
            if node.surface:
                all_words[node.surface] += 1
            node = node.next
    return all_words
# Rebuild the frequency table for the whole corpus.
all_words = count_all_word_frequency()
# most_common(25) returns a list of (token, count) tuples, most frequent first
print(all_words.most_common(25))

In [None]:
# Print every token of a sample sentence next to its part of speech.
test = MeCab.Tagger("-O wakati")
print(text)  # the sentence carried over from the earlier cells, before it is replaced
text = "あなたは何を見つめているのですか。"
parsed = test.parse(text)
# Start from the second node and stop at the node that has no successor, so
# the first and the last node of the list are never printed.
node = test.parseToNode(text).next
while node.next:
    # The part of speech is the first comma-separated field of the feature string.
    pos_tag = node.feature.split(',')[0]
    print(node.surface, pos_tag)
    node = node.next


In [None]:
#               助詞 (particle)
#              /
# Remove 付属語 (dependent words)
#　　　　　　　 \
#             　 助動詞 (auxiliary verb)

# Example: 月 が｜きれいな｜晩 でし た 。
# 付属語 in the example: が　・　でした

# With 10_000 Data!


In [None]:
# Switch to the larger SNOW_T15_10000 workbook and preview the first rows.
path = "/root/code/mochiyam/simply-japanese/data/2_RawData"
full_workbook = os.path.join(path, 'SNOW_T15_10000.xlsx')
df = pd.read_excel(full_workbook)
df.head()

In [None]:
# Drop the English column and give the Japanese columns short names.
# Written as a reassigned chain with errors='ignore' (instead of inplace=True)
# so that re-running this cell does not raise KeyError on the already-dropped column.
df = (
    df
    .drop(columns=['#英語(原文)'], errors='ignore')
    .rename(columns={"#日本語(原文)": "original", "#やさしい日本語": "simplified"})
)
df.head()

In [None]:
# Stop-word list consulted by count_all_words (defined below in this cell),
# which counts all the independent words (自立語).
# NOTE(review): assumes a 'japanese' stop-word list is available in the local
# nltk_data -- confirm; the stock NLTK stopwords corpus may not ship one.
_stopwords = stopwords.words('japanese')

def count_all_words(docs, col='original'):
    """Count the independent words (自立語) in one text column of ``docs``.

    Every sentence is tokenized with MeCab. A token is left out when the first
    field of its feature string (the part of speech) is 助動詞 (auxiliary
    verb), 助詞 (particle) or 補助記号 (supplementary symbol), or when its
    surface appears in the notebook-level ``_stopwords``.

    Args:
        docs: DataFrame holding the sentences.
        col: name of the column to tokenize. Defaults to ``'original'``.

    Returns:
        collections.Counter: token surface string -> number of occurrences.
    """
    skipped_pos = {"助動詞", "助詞", "補助記号"}
    # Built once per call: set membership is constant-time, whereas testing
    # against the stop-word list would scan it for every single token.
    stopword_set = set(_stopwords)
    all_words = collections.Counter()
    t = MeCab.Tagger("-O wakati")
    # Only one column is needed, so iterate over it directly rather than over rows.
    for text in docs[col]:
        # Start after the first node and stop at the node without a successor:
        # the sentinel nodes at both ends of the list are never counted.
        node = t.parseToNode(text).next
        while node.next:
            part_of_speech = node.feature.split(',')[0]
            if part_of_speech not in skipped_pos and node.surface not in stopword_set:
                all_words[node.surface] += 1
            node = node.next
    return all_words
# Count the independent words of the original sentences and plot the top 25.
ind_word_freq = count_all_words(df, col='original')
plot_word_frequency(ind_word_freq, most_common_num=25)

In [None]:
# Keep the 2000 most frequent independent words as a list of (word, count)
# tuples, then display the last 25 entries of that list (its least frequent ones).
top_2000_word_freq = ind_word_freq.most_common(2000)
top_2000_word_freq[-25:]

In [None]:
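# 1. Find the sentence pairs that are exactly the same and set them aside (only the pairs that differ are compared)
# 2. Build a temp_list of tokens for the original sentences and one for the simplified sentences
# 3. Compare the two temp_lists
# 4. Produce two global lists: deleted words and added (simplified) words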

In [None]:
# temp = df.head(10)
# temp

In [None]:
# Step 1. Keep only the sentence pairs whose simplified text differs from the original.
is_changed = df['original'].ne(df['simplified'])
diff_corpus_df = df[is_changed]
diff_corpus_df

In [None]:
# 2. Count the tokens of the changed pairs, once per column.
#    Despite the *_temp_list names, both results are Counter objects.
original_temp_list = count_all_words(diff_corpus_df, col='original')
simplified_temp_list = count_all_words(diff_corpus_df, col='simplified')

In [None]:
# simplified_temp_list

In [None]:
# original_temp_list

In [None]:
# pd.DataFrame(dict(original_temp_list).items(), columns=['word', 'count'])

In [None]:
# 3. Compare the two counters.
#
# Counter.subtract() lowers the counts in place (like dict.update(), but
# subtracting instead of replacing) and keeps zero and negative results.
# It is applied to a copy, so simplified_temp_list stays intact and re-running
# this cell does not subtract the original counts a second time.
# Result: count > 0 -> more frequent in the simplified text,
#         count < 0 -> less frequent in the simplified text.
diff_temp = collections.Counter(simplified_temp_list)
diff_temp.subtract(original_temp_list)

In [None]:
# The ten words whose count dropped the most after simplification.
# The frame is built here because this cell comes before the one that
# otherwise defines diff_temp_df; without it a top-to-bottom run on a fresh
# kernel fails with NameError.
diff_temp_df = pd.DataFrame(list(diff_temp.items()), columns=['word', 'count'])
diff_temp_df[diff_temp_df['count'] < 0].sort_values(by='count').head(10)

In [None]:
# 4. Two global lists: words deleted by, and words added through, simplification.
diff_temp_df = pd.DataFrame(list(diff_temp.items()), columns=['word', 'count'])

# Negative net count: the word occurs less often in the simplified text.
deleted = diff_temp_df.loc[diff_temp_df['count'] < 0, 'word'].tolist()
# Strictly positive net count only: a count of 0 means the word is used equally
# often on both sides, so it was neither added nor deleted.
added = diff_temp_df.loc[diff_temp_df['count'] > 0, 'word'].tolist()

In [None]:
# added

In [None]:
# Number of deleted words vs. number of added words.
(len(deleted), len(added))

In [25]:
# Reload the SNOW_T15_10000 workbook, drop the English column, give the
# Japanese columns short names, and display the resulting frame.
path = "/root/code/mochiyam/simply-japanese/data/2_RawData"
df = (
    pd.read_excel(os.path.join(path, 'SNOW_T15_10000.xlsx'))
    .drop(columns=['#英語(原文)'])
    .rename(columns={"#日本語(原文)": "original", "#やさしい日本語": "simplified"})
)
df

Unnamed: 0,original,simplified
0,父は私が外国へ行くことを承知した。,父は私が外国へ行くことを許した。
1,卑屈な奴。,自分のことをダメだと考える人。
2,それは本当のはずはない。,それは本当のはずはない。
3,車がそんなに混んでなければ問題ないでしょう。,車がそんなに混んでなければ問題ないでしょう。
4,２時間も待たされた。,２時間も待った。
...,...,...
9995,彼女は服にたくさん金を使う。,彼女は服に多くの金を使う。
9996,彼は長年、腰痛で困っている。,彼は長い間、腰が痛くて困っている。
9997,彼は公園のどこかにいる。,彼は公園のどこかにいる。
9998,子供は大人の父なり。,子供は大人の父なり。


Ignored unknown kwargs option unl_token
