# Installations!

In [None]:
#!pip install mecab-python3

In [None]:
#These wheels include a copy of the MeCab library, but not a dictionary. 
#In order to use MeCab you'll need to install a dictionary. unidic-lite is a good one to start with:
# !pip install unidic-lite

In [None]:
# normalization tool
# !pip install neologdn

In [None]:
# !pip install openpyxl

In [None]:
# To be able to see Japanese!
# !pip install japanize_matplotlib

# Libraries

In [2]:
import os
import pandas as pd
import numpy as np

# Preprocessing
import MeCab
import neologdn
import collections
from nltk import FreqDist
from nltk.corpus import stopwords

# Visualization
import matplotlib.pyplot as plt
import japanize_matplotlib
#import seaborn as sns # REMINDER: make sure to remove if not using!

# Just having fun

In [None]:
path = "/root/code/mochiyam/simply-japanese/data/2_RawData"
df = pd.read_excel(os.path.join(path, 'SNOW_T15_150.xlsx'))
df.head()

In [None]:
df.drop(columns=['#英語(原文)'], inplace=True)
df.head()

In [None]:
df.rename(columns={"#日本語(原文)": "original", "#やさしい日本語": "simplified"}, inplace=True)
df.head()

In [None]:
tagger = MeCab.Tagger()
text = df['original'][0]
parsed = tagger.parse(text)
print(parsed)

名詞 - noun
助詞 - particle
連体詞 - 
動詞
補助記号

In [None]:
# Just testing stuff out
test = MeCab.Tagger("-O wakati") 
text = neologdn.normalize(text, repeat=2)
parsed = test.parse(text)
print(parsed.split())

In [None]:
!pip show unidic-lite

In [None]:
test = MeCab.Tagger("r'-d /root/.pyenv/versions/3.8.12/envs/simply-japanese/lib/python3.8/site-packages'")
text = neologdn.normalize(text, repeat=2)
parsed = test.parse(text)
print(parsed)

In [None]:
# Super dumb dumb method
def count_all_word_frequency():
    all_words = collections.Counter()
    t = MeCab.Tagger()
    for idx, row in df.iterrows():
        text = row['original']
        node = t.parseToNode(text)
        while node:
            all_words[node.surface] += 1
            node = node.next
    return all_words
all_words = count_all_word_frequency()
# tuples in a list
print(all_words.most_common(25))

In [None]:
!pip install nltk

In [None]:
def plot_word_frequency(word_freq, most_common_num):
    freq_dist = FreqDist(word_freq)
    freq_dist.plot(most_common_num,cumulative=False)
#plot_word_frequency(all_words, 25)

In [None]:
# Super dumb dumb method
def count_all_word_frequency():
    all_words = collections.Counter()
    t = MeCab.Tagger()
    for idx, row in df.iterrows():
        text = row['original']
        node = t.parseToNode(text)
        while node:
            all_words[node.surface] += 1
            node = node.next
    return all_words
all_words = count_all_word_frequency()
# tuples in a list
print(all_words.most_common(25))

In [None]:
test = MeCab.Tagger("-O wakati")
print(text)
text = "あなたは何を見つめているのですか。"
parsed = test.parse(text)
node = test.parseToNode(text).next
while node.next:
    print(node.surface, node.feature.split(',')[0])
    node = node.next
#node.surface.decode("utf-8", "ignore")


In [None]:
#               助詞           
#              /
# Remove 付属語 
#　　　　　　　 \
#             　 助動詞

#月 が｜きれいな｜晩 でし た 。
#付属語 : が　・　でした

# With 10_000 Data!


In [None]:
path = "/root/code/mochiyam/simply-japanese/data/2_RawData"
df = pd.read_excel(os.path.join(path, 'SNOW_T15_10000.xlsx'))
df.head()

In [None]:
df.drop(columns=['#英語(原文)'], inplace=True)
df.rename(columns={"#日本語(原文)": "original", "#やさしい日本語": "simplified"}, inplace=True)
df.head()

In [None]:
# Counts all the independent word 自立語
_stopwords = stopwords.words('japanese')

def count_all_words(docs, col='original'):
    all_words = collections.Counter()
    t = MeCab.Tagger("-O wakati")
    for idx, row in docs.iterrows():
        text = row[col]
        node = t.parseToNode(text).next
        while node.next:
            part_of_speech = node.feature.split(',')[0]
            # REPLACE_WORD_POS = ("名詞", "動詞", "形容詞", "副詞", "未知語") # TBD
            # IGNORE = ("接尾", "非自立", "代名詞")    
            if part_of_speech in ["助動詞", "助詞", "補助記号"] or node.surface in _stopwords:
                node = node.next
                continue
            all_words[node.surface] += 1
            node = node.next
    return all_words
ind_word_freq = count_all_words(df)
plot_word_frequency(ind_word_freq, 25)

In [None]:
top_2000_word_freq = ind_word_freq.most_common(2000)
top_2000_word_freq[-25:]

In [None]:
# 1. Find sentences that are exactly the same 
# 2. temp_list of tokens for sentence original and simplified
# 3. Compare the two temp_list
# 4. two global_lists of deleted and added(simplified)

In [None]:
# temp = df.head(10)
# temp

In [None]:
# Step 1.  Get the corpuses that are different from original and simplified 
diff_corpus_df = df[df['original'] != df['simplified']]
diff_corpus_df

In [None]:
# 2. Create a temp_list of tokens for sentence original and simplified
original_temp_list = count_all_words(diff_corpus_df, 'original')
simplified_temp_list = count_all_words(diff_corpus_df, 'simplified')

In [None]:
# simplified_temp_list

In [None]:
# original_temp_list

In [None]:
# pd.DataFrame(dict(original_temp_list).items(), columns=['word', 'count'])

In [None]:
# 3. Compare the two temp_list

# Collections library
# Elements are subtracted from an iterable or from another mapping (or counter). 
# Like dict.update() but subtracts counts instead of replacing them. Both inputs and outputs may be zero or negative.
diff_temp_df = simplified_temp_list
diff_temp_df.subtract(original_temp_list)

In [None]:
diff_temp_df[diff_temp_df['count'] < 0].sort_values(by='count').head(10)

In [None]:
# 4. two global_lists of deleted and added(simplified)
deleted = []
added = []

diff_temp_df = pd.DataFrame(dict(diff_temp).items(), columns=['word', 'count'])
deleted =  diff_temp_df[diff_temp_df['count'] < 0]['word'].tolist()
added = diff_temp_df[diff_temp_df['count'] >= 0]['word'].tolist()

In [None]:
# added

In [None]:
len(deleted), len(added)

# Exploring DNN

In [3]:
path = "/root/code/mochiyam/simply-japanese/data/2_RawData"
df = pd.read_excel(os.path.join(path, 'SNOW_T15_10000.xlsx'))
df.drop(columns=['#英語(原文)'], inplace=True)
df.rename(columns={"#日本語(原文)": "original", "#やさしい日本語": "simplified"}, inplace=True)

In [4]:
DATA_LENGTH = len(df)
BATCH_SIZE = 64
EPOCHS = 20
LSTM_NODES =256
NUM_SENTENCES = DATA_LENGTH
MAX_SENTENCE_LENGTH = 50
MAX_NUM_WORDS = DATA_LENGTH
EMBEDDING_SIZE = 0

In [6]:
#Seq2Seq : Encoder LSTM -Decoder LSTM architecture

original_sentence = df['original'].to_list()
sos_simplified_sentence = [f'<sos> {sentence}' for sentence in df['simplified'].to_list()]
eof_simplified_sentence = df['simplified'].str.cat(['<eof>' for _ in range(DATA_LENGTH)], sep =' ').to_list()

In [None]:
! pip freeze | grep gensim

In [5]:
from gensim.models.word2vec import Word2Vec

In [None]:
# please work
model = word2vec.Word2Vec.load('word2vec.gensim.model')

In [None]:
model.wv['なまえ']

In [None]:
model.wv.most_similar('ただいま', topn=30)

In [None]:
v1 = model.wv['ただいま']
v2 = model.wv['本日']
res = v1 - v2
model.wv.similarity('いま', '今')

In [None]:
list[:10]

In [None]:
list = diff_temp_df['word'].to_list()
w2v = word2vec.Word2Vec(list, vector_size=10,
                        window=5,
                        min_count=5)

In [None]:
# w2v.wv.key_to_index

In [None]:
#代名詞、名詞、動詞

test = MeCab.Tagger()
text = "ただいま話し中です。"
parsed = test.parse(text)
node = test.parseToNode(text).next
while node.next:
    print(node.surface, node.feature)
    node = node.next
#node.surface.decode("utf-8", "ignore")

## LSTM Encoder Decoder Transformation Model... attempt

In [2]:
from gensim.models.word2vec import Word2Vec

In [13]:
original_sentences = df['original'].to_list()
simplified_sentences = df['simplified'].to_list()

In [15]:
model = Word2Vec.load("word2vec.gensim.model")
# len(words) = 335477
words = ["<PAD>"] + model.wv.index_to_key
# embedding.shape = (335477, 50)
embedding = np.insert(model.wv.vectors, 0, 0, axis=0)
# Dictionary of word and its index
input_token = target_token = dict((w, i) for i, w in enumerate(words))
encoder_tokens = decoder_tokens = embedding.shape[0] # for Masking > 335477
max_encoder_seq_length = max(len(sentence) for sentence in original_sentence) # > 28

# Input and output vocabulary sizes (types of words)
# Prepend BOS (Beginning Of Sentence) at the beginning of the sentence  
max_decoder_seq_length = max_encoder_seq_length + 1 # <BOS> 29
output_dim = embedding.shape[1] # > 50

NameError: name 'original_sentence' is not defined

In [9]:
# !pip list | grep tensorflow
# !pip install tensorflow

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, Dense, Activation, LSTM
from tensorflow.keras.optimizers import RMSprop
# from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

2023-02-18 16:21:16.874562: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-18 16:21:17.289025: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-02-18 16:21:17.304671: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-18 16:21:17.304695: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

In [12]:
hidden_dimension = 64

# Embedding
layer_emb = Embedding(input_dim=encoder_tokens,
                      output_dim=output_dim,
                      trainable=False,
                      mask_zero=True)
# Encoder
# Input() is used to instantiate a Keras tensor
encoder_inputs = Input(shape=(None,), dtype=tf.int32)
x = layer_emb(encoder_inputs)
# Takes the hidden state and internal state of this Embedding layer
# state_h : hidden state in a cell, state_c : memory cell internal state
_, state_h, state_c = LSTM(hidden_dimension, return_sequences=True, return_state=True)(x)
encoder_states = [state_h, state_c]
encoder_states

2023-02-18 13:41:43.205937: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-02-18 13:41:43.206083: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-02-18 13:41:43.206147: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (DESKTOP-3193HLS): /proc/driver/nvidia/version does not exist
2023-02-18 13:41:43.206922: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[<KerasTensor: shape=(None, 64) dtype=float32 (created by layer 'lstm')>,
 <KerasTensor: shape=(None, 64) dtype=float32 (created by layer 'lstm')>]

In [13]:
# Decoder
decoder_inputs = Input(shape=(None,), dtype=tf.int32)
x = layer_emb(decoder_inputs)
x, _, _ = LSTM(hidden_dimension, return_sequences=True, return_state=True)(x, initial_state=encoder_states)
decoder_outputs = Dense(decoder_tokens)(x)

In [14]:
def accuracy_masking(y_true, y_pred):
    return tf.keras.metrics.sparse_categorical_accuracy(tf.gather_nd(y_true, tf.where(y_pred._keras_mask)), tf.gather_nd(y_pred, tf.where(y_pred._keras_mask)))

In [15]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
opt = RMSprop(learning_rate=0.01)
model.compile(optimizer=opt, loss=lambda y_true, y_pred: tf.nn.softmax_cross_entropy_with_logits(tf.one_hot(tf.cast(y_true, tf.int32), num_decoder_tokens), y_pred),
              metrics=[accuracy_masking])
# set embedding matrix
layer_emb.set_weights([embedding])

In [16]:
layer_emb

<keras.layers.core.embedding.Embedding at 0x7fb7b4b5a970>

In [17]:
encoder_input_data = np.zeros(
    (len(original_sentences), max_encoder_seq_length))
decoder_input_data = np.zeros(
    (len(original_sentences), max_decoder_seq_length))
decoder_target_data = np.zeros(
    (len(original_sentences), max_decoder_seq_length))

len(encoder_input_data), len(decoder_input_data), len(decoder_target_data)

(10000, 10000, 10000)

In [22]:
input_token['<BOS>'] = len(input_token)
input_token['<EOS>'] = len(input_token) + 1
input_token['<BOS>'], input_token['<EOS>']

(335479, 335480)

In [18]:
target_token['<BOS>'] = len(target_token)
target_token['<EOS>'] = len(target_token) + 1
target_token['<BOS>'], target_token['<EOS>']

(335477, 335479)

In [23]:
for i, (original_sentence, simplified_sentence) in enumerate(zip(original_sentences, simplified_sentences)):
    for t, w in enumerate(original_sentence):
        encoder_input_data[i, t] = input_token[w]

    decoder_input_data[i, 0] = target_token['<BOS>'] # BOS
    for t, w in enumerate(simplified_sentence):
        decoder_input_data[i, t + 1] = target_token[w]
        decoder_target_data[i, t] = target_token[w]
    decoder_target_data[i, t + 1:] = target_token['<EOS>'] # EOS

KeyError: '疲'

### Process the dataset

In [6]:

"""
Process the dataset
Append SOS and EOS
"""

df["simplified_w_marker"] = [f'<sos> {sentence} <eof>' for sentence in df['simplified']]
# df

In [7]:
def get_vocab(df, col):
    vocabulary = []
    t = MeCab.Tagger("-O wakati")
    for idx, row in df.iterrows():
        text = row['simplified_w_marker']
        node = t.parseToNode(text).next
        while node.next:
            vocabulary.append(node.surface)
            node = node.next
    vocabulary = sorted(set(vocabulary)) + ['<unk>']
    word2idx = dict((idx, vocab) for idx, vocab in enumerate(vocabulary))
    idx2word = dict((vocab, idx) for idx, vocab in enumerate(vocabulary))
    return word2idx, idx2word

### Encode Decode

In [8]:
# X : original sentence
# y : simplified sentence
X_word2idx, X_idx2word = get_vocab(df, 'original')
y_word2idx, y_idx2word = get_vocab(df, 'simpflied_w_marker')

X_train = df['original'].to_list()
y_train = df["simplified_w_marker"].to_list()

X_vocab_size = len(X_word2idx)
y_vocab_size = len(y_word2idx)

hidden_dimension = 1000

In [16]:
# Embedding
layer_emb = Embedding(input_dim=encoder_tokens,
                      output_dim=output_dim,
                      trainable=False,
                      mask_zero=True)
# Encoder
# Input() is used to instantiate a Keras tensor
encoder_inputs = Input(shape=(None,), dtype=tf.int32)
# x = layer_emb(encoder_inputs)

# Takes the hidden state and internal state of this Embedding layer
# state_h : hidden state in a cell, state_c : memory cell internal state
encoder_lstm = LSTM(hidden_dimension, return_sequences=True, return_state=True)


#Decoder
decoder_inputs = Input(shape=(None,), dtype=tf.int32)
# x = layer_emb(decoder_inputs)
decoder_lstm = LSTM(hidden_dimension, return_sequences=True, return_state=True)
decoder_dense_layer = Dense(y_vocab_size)

NameError: name 'output_dim' is not defined

In [17]:
X = np.array(original_sentences)
y = np.array(simplified_sentences)

In [20]:
BUFFER_SIZE = len(df)
BATCH_SIZE = 64

is_train = np.random.uniform(size=(len(df),)) < 0.8

train_raw = (
    tf.data.Dataset
    .from_tensor_slices((X[is_train], y[is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE))
val_raw = (
    tf.data.Dataset
    .from_tensor_slices((X[~is_train], y[~is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE))

In [76]:
# tf.strings.unicode_decode(X, 'UTF-8').to_list()

In [21]:
for example_context_strings, example_target_strings in train_raw.take(1):
  print(example_context_strings[:5])
  print()
  print(example_target_strings[:5])
  break

tf.Tensor(
[b'\xe5\xbd\xbc\xe3\x81\x8c\xe9\x83\xa8\xe5\xb1\x8b\xe3\x81\xab\xe3\x81\xaf\xe3\x81\x84\xe3\x82\x8b\xe3\x81\xae\xe3\x82\x92\xe8\xa6\x8b\xe3\x81\x9f\xe3\x80\x82'
 b'\xe7\xa7\x81\xe9\x81\x94\xe3\x81\xaf\xe3\x81\x9d\xe3\x81\xae\xe3\x83\x93\xe3\x83\xab\xe5\x85\xa8\xe9\x83\xa8\xe3\x82\x92\xe8\x87\xaa\xe7\x94\xb1\xe3\x81\xab\xe4\xbd\xbf\xe3\x81\xa3\xe3\x81\xa6\xe3\x81\x8d\xe3\x81\x9f\xe3\x80\x82'
 b'\xe3\x81\x9d\xe3\x82\x8c\xe3\x81\xa3\xe3\x81\xa6\xe6\xad\xbb\xe8\xaa\x9e\xe3\x81\x98\xe3\x82\x83\xe3\x81\xaa\xe3\x81\x84\xe3\x81\xae\xe3\x80\x82'
 b'\xe3\x81\xa7\xe3\x81\xaf\xe3\x80\x81\xe3\x81\x93\xe3\x81\x93\xe3\x81\xa7\xe3\x81\x94\xe8\xaa\xac\xe6\x98\x8e\xe3\x81\x97\xe3\x81\xbe\xe3\x81\x97\xe3\x82\x87\xe3\x81\x86\xe3\x80\x82'
 b'\xe4\xbb\x8a\xe6\x99\xa9\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x86\xe3\x82\xa3\xe3\x83\xbc\xe3\x82\x92\xe3\x81\x97\xe3\x81\xbe\xe3\x81\x99\xe3\x80\x82'], shape=(5,), dtype=string)

tf.Tensor(
[b'\xe5\xbd\xbc\xe3\x81\x8c\xe9\x83\xa8\xe5\xb1\x8b\xe3\x81\xab\xe3\x81\

In [87]:
df.head()

Unnamed: 0,original,simplified,simplified_w_marker
0,父は私が外国へ行くことを承知した。,父は私が外国へ行くことを許した。,<sos> 父は私が外国へ行くことを許した。 <eof>
1,卑屈な奴。,自分のことをダメだと考える人。,<sos> 自分のことをダメだと考える人。 <eof>
2,それは本当のはずはない。,それは本当のはずはない。,<sos> それは本当のはずはない。 <eof>
3,車がそんなに混んでなければ問題ないでしょう。,車がそんなに混んでなければ問題ないでしょう。,<sos> 車がそんなに混んでなければ問題ないでしょう。 <eof>
4,２時間も待たされた。,２時間も待った。,<sos> ２時間も待った。 <eof>


In [88]:

import tensorflow_text as tf_text
example_text = tf.constant('父は私が外国へ行くことを承知した。')

print(example_text.numpy())
print(tf_text.normalize_utf8(example_text, 'NFKD').numpy())

b'\xe7\x88\xb6\xe3\x81\xaf\xe7\xa7\x81\xe3\x81\x8c\xe5\xa4\x96\xe5\x9b\xbd\xe3\x81\xb8\xe8\xa1\x8c\xe3\x81\x8f\xe3\x81\x93\xe3\x81\xa8\xe3\x82\x92\xe6\x89\xbf\xe7\x9f\xa5\xe3\x81\x97\xe3\x81\x9f\xe3\x80\x82'
b'\xe7\x88\xb6\xe3\x81\xaf\xe7\xa7\x81\xe3\x81\x8b\xe3\x82\x99\xe5\xa4\x96\xe5\x9b\xbd\xe3\x81\xb8\xe8\xa1\x8c\xe3\x81\x8f\xe3\x81\x93\xe3\x81\xa8\xe3\x82\x92\xe6\x89\xbf\xe7\x9f\xa5\xe3\x81\x97\xe3\x81\x9f\xe3\x80\x82'


In [89]:
def tf_lower_and_split_punct(text):
  # Split accented characters.
  text = tf_text.normalize_utf8(text, 'NFKD')
  text = tf.strings.lower(text)
  # Keep space, a to z, and select punctuation.
  text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
  # Add spaces around punctuation.
  text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
  # Strip whitespace.
  text = tf.strings.strip(text)

  text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
  return text

In [90]:
print(example_text.numpy().decode())
print(tf_lower_and_split_punct(example_text).numpy().decode())

父は私が外国へ行くことを承知した。
[START]  [END]


In [1]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("sonoisa/t5-base-japanese")
model = AutoModel.from_pretrained("sonoisa/t5-base-japanese")

  from .autonotebook import tqdm as notebook_tqdm
2023-02-18 17:59:07.451886: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-18 17:59:07.806393: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-02-18 17:59:07.827160: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-18 17:59:07.827200: I tensorflow/compiler/x