# Import data and general packages

#### Install MeCab and dependencies, if necessary

#### Imports

In [1]:
import numpy as np
import os
import pandas as pd
import MeCab
import neologdn
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec

#### Get data

In [10]:
# Work from the raw-data folder so the files below can be opened by bare file name.
# "~" expands to the current user's home directory, so the path no longer has to be
# edited per machine (assumes the repository is checked out under ~/code/mochiyam).
os.chdir(os.path.expanduser("~/code/mochiyam/simply-japanese/data/2_RawData"))
os.getcwd()

'/home/andi/code/mochiyam/simply-japanese/data/2_RawData'

In [48]:
!ls

Combined_85K_10000.xlsx  SNOW_T15_10000.xlsx  SNOW_T23_10000.xlsx
Combined_85K_1000.xlsx	 SNOW_T15_1000.xlsx   SNOW_T23_1000.xlsx
Combined_85K_150.xlsx	 SNOW_T15_150.xlsx    SNOW_T23_150.xlsx
Combined_85K.xlsx	 SNOW_T15.xlsx	      SNOW_T23.xlsx


In [49]:
# Load the 150-row sample of the SNOW T15 data from the current working directory.
data150 = "SNOW_T15_150.xlsx"
# Read via `data150`; the previous `read_excel(data)` referenced a name that is never
# defined in this notebook and fails with NameError on a fresh kernel.
df150 = pd.read_excel(data150)

In [50]:
# Preview the first rows to check that the sheet loaded with the expected columns.
df150.head()

Unnamed: 0,#日本語(原文),#やさしい日本語,#英語(原文)
0,彼女は通りを横切った。,彼女は通りを横に通っていった。,she came across the street .
1,私が知るかぎり彼女は大変よい人だ。,私が知る限り彼女は大変よい人だ。,"as far as i know , she is a very good person ."
2,私のクラスの少女たちはみんな親切だ。,私のクラスの少女たちはみんな親切だ。,all the girls in my class are kind .
3,彼は試験に合格できなかった。,彼は試験に合格できなかった。,he couldn 't pass the examination .
4,彼女はあなたにあえて喜ぶでしょうね。,彼女はあなたに会うことができて喜ぶでしょうね。,"she 'll be glad to see you , won 't she ?"


In [51]:
# Select source and target sentences by column position.
# Per the preview above: column 0 is "#日本語(原文)" (original Japanese) and
# column 1 is "#やさしい日本語" (simplified Japanese).
X = df150[df150.columns[0]]
y = df150[df150.columns[1]]

In [52]:
# Show the first original / simplified sentence pair.
X[0], y[0]

('彼女は通りを横切った。', '彼女は通りを横に通っていった。')

# Janome

In [1]:
from janome.tokenizer import Tokenizer

Simple Tokenizer

In [27]:
# Full analysis of the first source sentence: each token prints as its
# surface form followed by its part-of-speech and reading features.
t = Tokenizer()
for morpheme in t.tokenize(X[0]):
    print(morpheme)

彼女	名詞,代名詞,一般,*,*,*,彼女,カノジョ,カノジョ
は	助詞,係助詞,*,*,*,*,は,ハ,ワ
通り	名詞,一般,*,*,*,*,通り,トオリ,トーリ
を	助詞,格助詞,一般,*,*,*,を,ヲ,ヲ
横切っ	動詞,自立,*,*,五段・ラ行,連用タ接続,横切る,ヨコギッ,ヨコギッ
た	助動詞,*,*,*,特殊・タ,基本形,た,タ,タ
。	記号,句点,*,*,*,*,。,。,。


Even simpler tokenizer<br>
`wakati=True` (分かち書き, word-segmented writing) returns only the surface form of each token

In [38]:
# Same sentence with wakati=True: only the surface form of each token is printed.
for surface in t.tokenize(X[0], wakati=True):
    print(surface)

彼女
は
通り
を
横切っ
た
。


# Identify words to be replaced / ignored

## Data

Our DataFrame (`df150`) holds 150 random segments from the SNOW T15 dataset.

In [64]:
# Reminder of what the loaded sample looks like.
df150.head()

Unnamed: 0,#日本語(原文),#やさしい日本語,#英語(原文)
0,彼女は通りを横切った。,彼女は通りを横に通っていった。,she came across the street .
1,私が知るかぎり彼女は大変よい人だ。,私が知る限り彼女は大変よい人だ。,"as far as i know , she is a very good person ."
2,私のクラスの少女たちはみんな親切だ。,私のクラスの少女たちはみんな親切だ。,all the girls in my class are kind .
3,彼は試験に合格できなかった。,彼は試験に合格できなかった。,he couldn 't pass the examination .
4,彼女はあなたにあえて喜ぶでしょうね。,彼女はあなたに会うことができて喜ぶでしょうね。,"she 'll be glad to see you , won 't she ?"


`X` and `y` are the 0th and 1st columns of `df150`:

In [66]:
# Spot-check the first sentence pair and confirm both columns have the same length.
X[0], y[0], len(X), len(y)

('彼女は通りを横切った。', '彼女は通りを横に通っていった。', 150, 150)

## Playing around with MeCab

In [117]:
# Run MeCab on the first source sentence.
text = X[0]
tagger = MeCab.Tagger()
# Result of the parse, kept for inspection; use print(parsed) to view it.
parsed = tagger.parse(text)

In [118]:
# Walk MeCab's chain of nodes (including the BOS/EOS markers) and print, for each one,
# its surface form, its full feature string, and field 6 of that feature string.
# NOTE(review): in the output below, field 6 is a katakana reading (e.g. ヨコギル) while the
# dictionary form (横切る) appears at field 7 — confirm which index "original" should use.
node = tagger.parseToNode(text)
while node:
    surface, feature = node.surface, node.feature
    features = feature.split(",")
    original = features[6]
    print('node.surface={},\n node.feature={} => original = {}, \n\n'.format(surface, feature, original))
    node = node.next

node.surface=,
 node.feature=BOS/EOS,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,* => original = *, 


node.surface=彼女,
 node.feature=代名詞,*,*,*,*,*,カノジョ,彼女,彼女,カノジョ,彼女,カノジョ,混,*,*,*,*,カノジョ,カノジョ,カノジョ,カノジョ,*,*,1,*,* => original = カノジョ, 


node.surface=は,
 node.feature=助詞,係助詞,*,*,*,*,ハ,は,は,ワ,は,ワ,和,*,*,*,*,ハ,ハ,ハ,ハ,*,*,*,"動詞%F2@0,名詞%F1,形容詞%F2@-1",* => original = ハ, 


node.surface=通り,
 node.feature=名詞,普通名詞,助数詞可能,*,*,*,トオリ,通り,通り,トーリ,通り,トーリ,和,ト濁,基本形,*,*,トオリ,トオリ,トオリ,トオリ,*,*,3,C2,* => original = トオリ, 


node.surface=を,
 node.feature=助詞,格助詞,*,*,*,*,ヲ,を,を,オ,を,オ,和,*,*,*,*,ヲ,ヲ,ヲ,ヲ,*,*,*,"動詞%F2@0,名詞%F1,形容詞%F2@-1",* => original = ヲ, 


node.surface=横切っ,
 node.feature=動詞,一般,*,*,五段-ラ行,連用形-促音便,ヨコギル,横切る,横切っ,ヨコギッ,横切る,ヨコギル,和,*,*,*,*,ヨコギッ,ヨコギル,ヨコギッ,ヨコギル,*,*,3,C1,* => original = ヨコギル, 


node.surface=た,
 node.feature=助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,た,タ,た,タ,和,*,*,*,*,タ,タ,タ,タ,*,*,*,"動詞%F2@1,形容詞%F4@-2",* => original = タ, 


node.surface=。,
 node.feature=補助記号,句点,*,*,*,*,,。,。,,。,,記号,*,*,*,*,,,,,*,*,*,*,* => original

## Tokenizing sentences

In [96]:
# Placeholder list of simplified terms.
# Moana is preparing the real list; until it is ready we work with this pseudo list.
simp_terms = [
    "限り",
    "通う",
    "親切",
    "会う",
    "食べる",
]

In [114]:
# Define stop-words
# Source list: https://github.com/stopwords-iso/stopwords-ja/blob/master/stopwords-ja.txt
# The file must be in folder "/home/<user>/nltk_data/corpora/stopwords"
# (presumably saved there as "japanese.txt", the file id requested below).
stop_words = stopwords.words('japanese.txt')

In [120]:
def replace(sentence):
    """
    Work in progress: intended to take a Japanese sentence (source), identify terms
    to replace, and swap them for terms from a list of simplified terms (simp_terms).

    For now the function only parses `sentence` with MeCab. Nothing is replaced yet
    and the function implicitly returns None.
    """
    # Part-of-speech tags (noun, verb, adjective, adverb, unknown word);
    # presumably the tags whose words may be replaced. TBD
    REPLACE_WORD_POS = ("名詞", "動詞", "形容詞", "副詞", "未知語")
    # Tags (suffix, dependent, pronoun); presumably the ones to leave untouched. TBD
    IGNORE = ("接尾", "非自立", "代名詞")

    # Keep the tagger in a local variable while working with its nodes.
    mecab_tagger = MeCab.Tagger()
    first_node = mecab_tagger.parseToNode(sentence)

In [122]:
def find_nearest_term(term, term_list):
    """
    Return the entry of `term_list` that is closest to `term`.

    Placeholder implementation: `term_list` is not consulted yet and `term`
    is handed back unchanged.
    """
    return term

In [5]:
# Saved gensim Word2Vec model, stored outside this repository.
# expanduser() resolves "~" to the current user's home directory, so no user name is
# hard-coded (assumes the model lives under ~/code/PlayGround/ShiroYagi).
model_path = os.path.expanduser('~/code/PlayGround/ShiroYagi/word2vec.gensim.model')
model = Word2Vec.load(model_path)

In [8]:
# Sanity check of the loaded model: list the words it ranks as most similar to 車両,
# returned as (word, score) pairs.
model.wv.most_similar("車両")

[('車', 0.8590342998504639),
 ('車輌', 0.8511245846748352),
 ('一般車', 0.8471918106079102),
 ('電車', 0.8361836075782776),
 ('客車', 0.8352526426315308),
 ('改造車', 0.8315220475196838),
 ('気動車', 0.830657422542572),
 ('旧型', 0.8224653601646423),
 ('2000系', 0.8223382830619812),
 ('大型車', 0.8181092143058777)]