In [50]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
 
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text
 
# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('%%') for line in  lines]
    return pairs
 
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

# load dataset
filename = 'data/jap_dataset_2.txt'
doc = load_doc(filename)
# split into english-jap pairs
pairs = to_pairs(doc)
print(type(pairs))

clean_pairs = array(pairs)
# clean sentences
# clean_pairs = clean_pairs(pairs)
# print(clean_pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'pkl/english-japanese_3.pkl')
# spot check
for i in range(50):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

<class 'list'>
Saved: pkl/english-japanese_2.pkl
[go] => [行けx動]
[go] => [行きx動 なさいx動]
[hi] => [こんにちはx感]
[hi] => [もしもしx感]
[hi] => [やっx動 ほx動 ーx名]
[hi] => [こんにちはx感]
[run] => [走れx動]
[run] => [走っx動 てx助]
[who] => [誰x名]
[wow] => [すごいx形]
[wow] => [ワォx名]
[wow] => [わx助 ぉx名]
[wow] => [おx接 ーx名]
[fire] => [火事x名 だx助]
[fire] => [火事x名]
[fire] => [撃てx動]
[help] => [助けx動 てx助]
[help] => [助けx動 てx助 くれx動]
[jump] => [飛び越えろx動]
[jump] => [跳べx動]
[jump] => [飛び降りろx動]
[jump] => [飛び跳ねx動 てx助]
[jump] => [ジャンプx名 しx動 てx助]
[jump] => [跳べx動]
[jump] => [飛び跳ねx動 てx助]
[jump] => [ジャンプx名 しx動 てx助]
[stop] => [やめろx動]
[stop] => [止まれx動]
[wait] => [待っx動 てx助]
[go on] => [続けx動 てx助]
[go on] => [進んx動 でx助]
[go on] => [進めx動]
[go on] => [続けろx動]
[hello] => [こんにちはx感]
[hello] => [もしもしx感]
[hello] => [こんにちはx感]
[hurry] => [急げx動]
[i see] => [なるほどx感]
[i see] => [なるほどx感 ねx助]
[i see] => [わかっx動 たx助]
[i see] => [わかりx動 ましx助 たx助]
[i see] => [そうx副 ですx助 かx助]
[i see] => [そうx副 なx助 んx名 だx助]
[i see] => [そっx名 かx助]
[i try] => [頑張っx動 てx助 みるx動]
[i try] => [やっx動 てx助 

In [51]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('pkl/english-japanese_3.pkl')
# print(raw_dataset)

# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# print(dataset)
# random shuffle
shuffle(dataset)
# print(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]
# save
save_clean_data(dataset, 'pkl/english-japanese_3-both.pkl')
save_clean_data(train, 'pkl/english-japanese_3-train.pkl')
save_clean_data(test, 'pkl/english-japanese_3-test.pkl')

Saved: pkl/english-japanese_2-both.pkl
Saved: pkl/english-japanese_2-train.pkl
Saved: pkl/english-japanese_2-test.pkl


In [28]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

import pydot

# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
#     y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

# load datasets
dataset = load_clean_sentences('pkl/english-japanese_3-both.pkl')
train = load_clean_sentences('pkl/english-japanese_3-train.pkl')
test = load_clean_sentences('pkl/english-japanese_3-test.pkl')

# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
jap_tokenizer = create_tokenizer(dataset[:, 1])
jap_vocab_size = len(jap_tokenizer.word_index) + 1
jap_length = max_length(dataset[:, 1])
print('Japanese Vocabulary Size: %d' % jap_vocab_size)
print('Japanese Max Length: %d' % (jap_length))

max_len = max(jap_length,eng_length)
print('Max Length: %d' % (max_len))

# prepare training data
trainX = encode_sequences(jap_tokenizer, max_len, train[:, 1])
trainY = encode_sequences(eng_tokenizer, max_len, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(jap_tokenizer, max_len, test[:, 1])
testY = encode_sequences(eng_tokenizer, max_len, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

# define model
model = define_model(jap_vocab_size, eng_vocab_size, max_len, max_len, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])
# summarize defined model
print(model.summary())
plot_model(model, to_file='japmodel_idk.png', show_shapes=True)
# fit model
filename = 'models/jap_eng_model3.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=20, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)


English Vocabulary Size: 2716
English Max Length: 7
Japanese Vocabulary Size: 4147
Japanese Max Length: 19
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 19, 256)           1061632   
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 7, 256)            0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 7, 256)            525312    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 7, 2716)           698012    
Total params: 2,810,268
Trainable params: 2,810,268
Non-trainable params: 0
_________________________________________________________________
None
Train on 9000 sample

<keras.callbacks.History at 0x2c10bb674e0>

In [29]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, eng_tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))

# load datasets
dataset = load_clean_sentences('pkl/english-japanese_3-both.pkl')
train = load_clean_sentences('pkl/english-japanese_3-train.pkl')
test = load_clean_sentences('pkl/english-japanese_3-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
jap_tokenizer = create_tokenizer(dataset[:, 1])
jap_vocab_size = len(jap_tokenizer.word_index) + 1
jap_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(jap_tokenizer, jap_length, train[:, 1])
testX = encode_sequences(jap_tokenizer, jap_length, test[:, 1])

# load model
model = load_model('models/jap_eng_model3.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)
# print(train)

train
src=[手伝おx動 うx助 かx助], target=[can i help], predicted=[can i help]
src=[大変x名 、x記 忙しいx形 ですx助], target=[im very busy], predicted=[im very busy]
src=[それx名 をx助 見つけx動 なくx助 ちゃx助], target=[i have to find it], predicted=[i want to it]
src=[トムx名 をx助 放せx動], target=[let go of tom], predicted=[is tom tom]
src=[ちょっとx副 考えx動 させx動 てx助], target=[let me think], predicted=[let me to]
src=[私x名 はx助 何x名 もx助 見x動 ませx助 んx助], target=[i dont see anything], predicted=[i cant see it]
src=[熱x名 はx助 ありx動 ますx助 かx助], target=[do you have a fever], predicted=[do you have a]
src=[話しx動 てx助 もx助 いいx形 ですx助 かx助], target=[may i speak to you], predicted=[may i sit you]
src=[間違えx動 ましx助 たx助], target=[ive made a mistake], predicted=[i was a mistake]
src=[トムx名 はx助 キャンセルx名 しx動 たx助], target=[tom canceled], predicted=[tom laughed]
test
src=[今x名 どこx名 にx助 いるx動 のx助], target=[where are you now], predicted=[where are you you]
src=[コンサートx名 どうx副 だっx助 たx助], target=[how was the concert], predicted=[how was your trip]
src=[まあx副 試しx動 てx助 ごらんx

In [38]:
t  = Tokenizer()
fit_text = ["彼らx名 はx助 そこx名 にx助 すんx動 でx助 いるx動","私x名 はx助 何x名 もx助 見x動 ませx助 んx助"]
t.fit_on_texts(fit_text)

#fit_on_texts fits on sentences when list of sentences is passed to fit_on_texts() function. 
#ie - fit_on_texts( [ sent1, sent2, sent3,....sentN ] )

#Similarly, list of sentences/single sentence in a list must be passed into texts_to_sequences.
test_text1 = "見x動 たx助 ことx名 がx助 ありx動 ますx助"
test_text2 = "今x名 どこx名 にx助 いるx動 のx助"
sequences = t.texts_to_sequences([test_text1, test_text2])

print('sequences : ',sequences,'\n')

print('word_index : ',t.word_index)
#texts_to_sequences() returns list of list. ie - [ [] ]

sequences :  [[11], [4, 7]] 

word_index :  {'はx助': 1, '彼らx名': 2, 'そこx名': 3, 'にx助': 4, 'すんx動': 5, 'でx助': 6, 'いるx動': 7, '私x名': 8, '何x名': 9, 'もx助': 10, '見x動': 11, 'ませx助': 12, 'んx助': 13}


In [35]:
from kuromojipy.kuromoji_server import KuromojiServer

def jap_clean(text):
    a = []
    with KuromojiServer() as kuro_server:
        kuromoji = kuro_server.kuromoji
        tokenizer = kuromoji.Tokenizer.builder().build()
        tokens = tokenizer.tokenize(text)
        for token in tokens:
            x = token.getSurfaceForm()+"x"+token.getAllFeatures()[0]
            a.append(x)
    # print(a)
    return " ".join(a)


ip = input("Enter Japanese: ")

tokenized_ip = jap_clean(ip)                
# tokenized_ip = ' '.join(tokenized_ip)

print(tokenized_ip)

# ip = encode_sequences(ger_tokenizer, ger_length, tokenized_ip)
ip = encode_sequences(jap_tokenizer, jap_length, train[:, 1])

# load model
# test on some training sequences
print('op')

translation = predict_sequence(model, eng_tokenizer, ip)

print(translation)

Enter Japanese: トムに聞いて
トムx名 にx助 聞いx動 てx助
op
can i help


In [47]:
model = load_model('models/jap_eng_model2.h5')
print('Model Loaded')


ip = input("Enter Japanese: ")

tokenized_ip = jap_clean(ip)                
# tokenized_ip = ' '.join(tokenized_ip)

print(tokenized_ip)

# ip = encode_sequences(ger_tokenizer, ger_length, tokenized_ip)
ip = encode_sequences(jap_tokenizer, jap_length, [tokenized_ip])
print(ip)

p = model.predict(ip)[0]
print(p)
print(p[0])
print(type(p[0]))

prediction = [argmax(x) for x in p]

print(prediction)
print(len(prediction))

Model Loaded
Enter Japanese: トムに聞いて
トムx名 にx助 聞いx動 てx助
[[  9   6 173   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]]
[[1.8935242e-03 1.4097502e-03 8.2530268e-03 ... 5.3407922e-07
  7.8858506e-07 4.3934841e-07]
 [4.2369766e-03 1.2918802e-04 1.2500399e-03 ... 1.8689406e-08
  3.1016107e-08 1.3960496e-08]
 [5.5843288e-01 2.7029862e-05 5.1095767e-04 ... 9.0137483e-08
  1.4070852e-07 6.2224551e-08]
 ...
 [9.9892837e-01 5.4638928e-08 9.0995093e-07 ... 7.9754814e-11
  1.1659679e-10 6.7172073e-11]
 [9.9951053e-01 3.2290199e-08 2.9179233e-07 ... 3.5886346e-11
  4.9300026e-11 3.0020673e-11]
 [9.9969971e-01 3.3379774e-08 1.9557221e-07 ... 1.9006064e-11
  2.7746277e-11 1.6088081e-11]]
[1.8935242e-03 1.4097502e-03 8.2530268e-03 ... 5.3407922e-07 7.8858506e-07
 4.3934841e-07]
<class 'numpy.ndarray'>
[21, 3, 0, 0, 0, 0, 0]
7


In [49]:
m = max(0,1)
m

1