In [1]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
import tinysegmenter
 
# load doc into memory
def load_doc(filename):
	"""Read a whole UTF-8 text file and return its contents as one string.

	Args:
		filename: path of the text file to read.

	Returns:
		The complete file contents as a str.

	Raises:
		OSError: if the file cannot be opened.
		UnicodeDecodeError: if the file is not valid UTF-8.
	"""
	# the context manager closes the handle even when read() raises
	with open(filename, mode='rt', encoding='utf-8') as file:
		return file.read()
 
# split a loaded document into sentences
def to_pairs(doc):
	"""Break a loaded document into per-line lists of tab-separated fields.

	Leading and trailing whitespace of the whole document is stripped first,
	then the text is cut on newlines and every line is cut on tabs.

	Args:
		doc: the raw document text.

	Returns:
		A list with one entry per line; each entry is the list of that
		line's tab-separated fields.
	"""
	pairs = list()
	for line in doc.strip().split('\n'):
		pairs.append(line.split('\t'))
	return pairs
 
# clean a list of lines
def clean_pairs(lines):
	"""Normalise every [english, japanese] pair and return them as a numpy array.

	Only the first two fields of each pair are read.

	English side (pair[0]): NFD-normalised and reduced to ASCII, split on
	whitespace, lowercased, stripped of ASCII punctuation and non-printable
	characters; tokens that are not purely alphabetic are dropped and the
	survivors are re-joined with single spaces.

	Japanese side (pair[1]): the final character is dropped unconditionally,
	the remaining characters are lowercased one at a time and concatenated
	with no separator.

	NOTE(review): because the Japanese side is joined with '', a
	whitespace-based tokenizer will see each sentence as a single token
	unless the text already contains spaces -- confirm this is intended.

	Args:
		lines: iterable of sequences whose first two items are strings.

	Returns:
		numpy array built from the list of [english, japanese] string pairs.
	"""
	# matches every character that is not in string.printable
	non_printable = re.compile('[^%s]' % re.escape(string.printable))
	# translation table that deletes ASCII punctuation
	strip_punct = str.maketrans('', '', string.punctuation)

	def clean_english(text):
		# decompose accented characters, then discard everything non-ASCII
		ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
		kept = list()
		for token in ascii_text.split():
			token = non_printable.sub('', token.lower().translate(strip_punct))
			# drops empty tokens and tokens containing digits or other non-letters
			if token.isalpha():
				kept.append(token)
		return ' '.join(kept)

	def clean_japanese(text):
		# drop the last character; lowercase per character, not per string
		return ''.join(char.lower() for char in text[:-1])

	cleaned = [[clean_english(pair[0]), clean_japanese(pair[1])] for pair in lines]
	return array(cleaned)
 
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	"""Pickle `sentences` to `filename` and print a confirmation line.

	Args:
		sentences: any picklable object (here, the cleaned sentence pairs).
		filename: destination path; an existing file is overwritten.
	"""
	# the context manager flushes and closes the file before we report success
	with open(filename, 'wb') as handle:
		dump(sentences, handle)
	print('Saved: %s' % filename)
 
# load dataset
filename = 'jpn.txt'
doc = load_doc(filename)
# split into english-japanese pairs
pairs = to_pairs(doc)
# clean sentences; the result gets its own name so the clean_pairs() function
# is not overwritten and this cell can be re-run without restarting the kernel
cleaned_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cleaned_pairs, 'english-japanese.pkl')
# spot check (capped so a dataset shorter than 50 rows does not raise IndexError)
for i in range(min(50, len(cleaned_pairs))):
	print('[%s] => [%s]' % (cleaned_pairs[i,0], cleaned_pairs[i,1]))

Saved: english-japanese.pkl
[go] => [行け]
[go] => [行きなさい]
[hi] => [こんにちは]
[hi] => [もしもし]
[hi] => [やっほー]
[hi] => [こんにちは]
[run] => [走れ]
[run] => [走って]
[who] => [誰]
[wow] => [すごい]
[wow] => [ワォ]
[wow] => [わぉ]
[wow] => [おー]
[fire] => [火事だ]
[fire] => [火事]
[fire] => [撃て]
[help] => [助けて]
[help] => [助けてくれ]
[jump] => [飛び越えろ]
[jump] => [跳べ]
[jump] => [飛び降りろ]
[jump] => [飛び跳ねて]
[jump] => [ジャンプして]
[jump] => [跳べ]
[jump] => [飛び跳ねて]
[jump] => [ジャンプして]
[stop] => [やめろ]
[stop] => [止まれ]
[wait] => [待って]
[go on] => [続けて]
[go on] => [進んで]
[go on] => [進め]
[go on] => [続けろ]
[hello] => [こんにちは]
[hello] => [もしもし]
[hello] => [こんにちは]
[hurry] => [急げ]
[i see] => [なるほど]
[i see] => [なるほどね]
[i see] => [わかった]
[i see] => [わかりました]
[i see] => [そうですか]
[i see] => [そうなんだ]
[i see] => [そっか]
[i try] => [頑張ってみる]
[i try] => [やってみる]
[i try] => [試してみる]
[i try] => [やってみよう]
[i try] => [トライしてみる]
[i won] => [俺の勝ちー]


In [2]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
	"""Unpickle and return the object stored in `filename`.

	Args:
		filename: path of a pickle file.

	Returns:
		Whatever object was pickled into the file.
	"""
	# NOTE: unpickling can execute arbitrary code; only load files this
	# notebook produced itself.
	with open(filename, 'rb') as handle:
		return load(handle)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	"""Pickle `sentences` to `filename` and print a confirmation line.

	Args:
		sentences: any picklable object to store.
		filename: destination path; an existing file is overwritten.
	"""
	# close the file deterministically instead of leaving it to the garbage collector
	with open(filename, 'wb') as handle:
		dump(sentences, handle)
	print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-japanese.pkl')

# reduce dataset size
n_sentences = 10000
# copy the slice: a plain slice is a view, so shuffling it in place would also
# reorder the first n_sentences rows of raw_dataset
dataset = raw_dataset[:n_sentences, :].copy()
# random shuffle (rows are permuted in place)
shuffle(dataset)
# split into train/test: 90% / 10% of the rows actually available
# (9000 / 1000 when all 10000 sentences are present)
n_train = (len(dataset) * 9) // 10
train, test = dataset[:n_train], dataset[n_train:]
# save
save_clean_data(dataset, 'english-japanese-both.pkl')
save_clean_data(train, 'english-japanese-train.pkl')
save_clean_data(test, 'english-japanese-test.pkl')

Saved: english-japanese-both.pkl
Saved: english-japanese-train.pkl
Saved: english-japanese-test.pkl


In [13]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

import pydot

# load a clean dataset
def load_clean_sentences(filename):
	"""Unpickle and return the object stored in `filename`.

	Args:
		filename: path of a pickle file.

	Returns:
		Whatever object was pickled into the file.
	"""
	# NOTE: unpickling can execute arbitrary code; only load files this
	# notebook produced itself.
	with open(filename, 'rb') as handle:
		return load(handle)

# fit a tokenizer
def create_tokenizer(lines):
	"""Create a Keras Tokenizer with default settings and fit it on `lines`.

	Args:
		lines: the texts to fit the tokenizer on.

	Returns:
		The fitted Tokenizer instance.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

# max sentence length
def max_length(lines):
	"""Return the largest number of whitespace-separated tokens in any line.

	Args:
		lines: iterable of strings.

	Returns:
		The maximum token count as an int.

	Raises:
		ValueError: if `lines` is empty.
	"""
	token_counts = [len(sentence.split()) for sentence in lines]
	return max(token_counts)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Integer-encode `lines` with `tokenizer` and pad them to `length`.

	Args:
		tokenizer: object exposing texts_to_sequences().
		length: value passed to pad_sequences as maxlen.
		lines: the texts to encode.

	Returns:
		The result of pad_sequences(..., maxlen=length, padding='post').
	"""
	# turn every text into its list of integer ids
	encoded = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values, using padding='post'
	return pad_sequences(encoded, maxlen=length, padding='post')

# one hot encode target sequence
def encode_output(sequences, vocab_size):
	"""One-hot encode every integer sequence with to_categorical.

	Args:
		sequences: 2-D array of integer ids; its shape[0] and shape[1] are
			used for the final reshape.
		vocab_size: number of classes passed to to_categorical.

	Returns:
		Array reshaped to (sequences.shape[0], sequences.shape[1], vocab_size).
	"""
	stacked = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
	# make the (samples, timesteps, vocab) layout explicit
	return stacked.reshape(sequences.shape[0], sequences.shape[1], vocab_size)

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	"""Assemble the Sequential NMT network; the model is returned uncompiled.

	Layers, in order: Embedding (mask_zero=True) -> LSTM -> RepeatVector
	-> LSTM (return_sequences=True) -> TimeDistributed(Dense softmax).

	Args:
		src_vocab: first argument of the Embedding layer (source vocabulary size).
		tar_vocab: unit count of the softmax Dense layer (target vocabulary size).
		src_timesteps: input_length of the Embedding layer.
		tar_timesteps: repetition count given to RepeatVector.
		n_units: embedding width and unit count of both LSTM layers.

	Returns:
		The assembled Sequential model.
	"""
	model = Sequential()
	stack = [
		Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
		LSTM(n_units),
		# repeat the single LSTM output once per target timestep
		RepeatVector(tar_timesteps),
		LSTM(n_units, return_sequences=True),
		# the same softmax Dense layer is applied at every timestep
		TimeDistributed(Dense(tar_vocab, activation='softmax')),
	]
	for layer in stack:
		model.add(layer)
	return model

# load datasets
dataset = load_clean_sentences('english-japanese-both.pkl')
train = load_clean_sentences('english-japanese-train.pkl')
test = load_clean_sentences('english-japanese-test.pkl')

# prepare english tokenizer (column 0 of the dataset)
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare japanese tokenizer (column 1 of the dataset); the ger_* names are
# kept as they are, but they hold the Japanese side
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('Japanese Vocabulary Size: %d' % ger_vocab_size)
print('Japanese Max Length: %d' % (ger_length))

# prepare training data: Japanese is the model input, English the target
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model; summary() prints the table itself and returns None,
# so wrapping it in print() would emit a stray "None" line
model.summary()
plot_model(model, to_file='japmodel.png', show_shapes=True)
# fit model, writing a checkpoint whenever the validation loss improves
filename = 'japmodel.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=20, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)


English Vocabulary Size: 2716
English Max Length: 7
['トムは落っこちた' 'ボストンへようこそ' 'お腹いっぱいだ' ... '幸せって何なんだろう' '列車が入ってくるよ' 'トムは文章がうまい'] <class 'numpy.ndarray'>
<function tokenize at 0x000001FEB6026C80>
<function tokenize at 0x000001FEB6026C80>
Japanese Vocabulary Size: 7948
Japanese Max Length: 2
[[3, 350, 92], [538, 10, 248], [14, 843, 2, 286], [15, 67, 193], [249, 10, 11], [6, 2, 109, 118], [23, 180, 2, 123], [26, 185, 1221], [29, 734, 5], [3, 262, 652], [1, 30, 124], [15, 4, 45, 590], [2, 7, 170, 539, 1, 21], [1, 420, 5], [23, 389, 9, 993], [60, 1, 17, 2, 197], [35, 227, 24, 238], [158, 73, 7, 279], [16, 51, 118], [994, 1222], [165, 331], [12, 2, 110, 14, 351], [8, 4, 287], [2, 3, 352], [9, 1635, 40], [29, 153, 3], [55, 4, 457], [30, 5, 205, 458], [844, 36, 5], [98, 19, 5, 67, 99], [8, 37, 4, 653], [3, 735, 10, 78], [68, 24, 13, 459], [6, 2, 214], [52, 5, 654], [3, 33, 370], [14, 74, 159], [2, 6, 109, 40], [23, 2, 4, 332, 119], [1, 174, 21, 142], [12, 2, 1636, 64, 371], [16, 21, 93, 42], [6

[[29, 316, 4, 2597], [149, 142], [1, 20, 87], [2, 13, 1434], [6, 2, 41, 199], [12, 20, 60, 1283], [380], [3, 2, 2598], [319, 592], [221, 92], [3, 2, 195, 42], [80, 802], [35, 430, 4, 143], [327, 211], [1, 17, 131, 1519], [3, 59, 40], [223, 20, 328, 103], [15, 57, 348], [14, 398, 2, 303], [39, 150, 19, 5], [1274, 24], [1, 51, 97, 40], [3, 819, 46, 122], [3, 298, 24, 248], [59, 7, 140], [15, 176, 64, 905], [281, 23], [158, 30, 35, 104], [12, 701, 50, 284], [305, 22, 156], [6, 329, 29, 31, 1232], [3, 465, 11], [6, 182, 87], [1, 47, 114], [29, 18, 9, 343], [1, 25, 710, 4, 143], [12, 20, 1378], [1, 16, 72, 202], [8, 385], [1, 17, 220, 344], [28, 254], [1, 339], [6, 2, 274, 59, 6], [471, 152], [28, 18, 35, 811], [1, 17, 10, 21], [1, 162, 361], [83, 543, 10, 21], [1, 25, 7, 41, 216], [963, 137, 27], [1518, 794], [1, 49, 18, 7, 103], [29, 153, 3], [1, 53, 4, 2599], [1, 20, 54, 65, 4, 215], [26, 75, 11, 72], [3, 2, 661], [16, 84, 67, 427], [6, 444, 606, 5], [1, 81, 10, 21, 61], [1, 101, 72, 23]

In [3]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	"""Unpickle and return the object stored in `filename`.

	Args:
		filename: path of a pickle file.

	Returns:
		Whatever object was pickled into the file.
	"""
	# NOTE: unpickling can execute arbitrary code; only load files this
	# notebook produced itself.
	with open(filename, 'rb') as handle:
		return load(handle)

# fit a tokenizer
def create_tokenizer(lines):
	"""Return a default-configured Keras Tokenizer fitted on `lines`.

	Args:
		lines: the texts used to fit the tokenizer.

	Returns:
		The fitted Tokenizer instance.
	"""
	vocab_builder = Tokenizer()
	vocab_builder.fit_on_texts(lines)
	return vocab_builder

# max sentence length
def max_length(lines):
	"""Return the token count of the longest line, splitting on whitespace.

	Args:
		lines: iterable of strings.

	Returns:
		The maximum number of whitespace-separated tokens found.

	Raises:
		ValueError: if `lines` is empty.
	"""
	return max(map(lambda sentence: len(sentence.split()), lines))

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Convert `lines` to integer id sequences and pad them to `length`.

	Args:
		tokenizer: object exposing texts_to_sequences().
		length: value passed to pad_sequences as maxlen.
		lines: the texts to encode.

	Returns:
		The result of pad_sequences(..., maxlen=length, padding='post').
	"""
	# integer ids per text, then pad with 0 values (padding='post')
	id_sequences = tokenizer.texts_to_sequences(lines)
	padded = pad_sequences(id_sequences, maxlen=length, padding='post')
	return padded

# map an integer to a word
def word_for_id(integer, tokenizer):
	"""Reverse-lookup the word whose index in tokenizer.word_index equals `integer`.

	Args:
		integer: the id to look up.
		tokenizer: object exposing a word -> index mapping as `word_index`.

	Returns:
		The first word whose index matches, or None when no entry matches.
	"""
	matches = (word for word, index in tokenizer.word_index.items() if index == integer)
	return next(matches, None)

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	"""Translate one encoded source sequence into a space-joined string of words.

	Args:
		model: object whose predict(source) result is indexed at [0] to get
			one score vector per output position.
		tokenizer: passed to word_for_id to map ids back to words.
		source: the encoded input handed to model.predict.

	Returns:
		The predicted words joined by single spaces; decoding stops at the
		first position whose highest-scoring id has no word.
	"""
	# only the first item of the prediction is decoded
	per_step_scores = model.predict(source)[0]
	words = list()
	for scores in per_step_scores:
		word = word_for_id(argmax(scores), tokenizer)
		# an id without a vocabulary entry ends the sentence
		# (presumably the padding id 0 -- confirm)
		if word is None:
			break
		words.append(word)
	return ' '.join(words)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	"""Translate every encoded source and print the first ten results.

	Args:
		model: model handed to predict_sequence.
		tokenizer: tokenizer used to decode the predicted ids into words.
		sources: iterable of 1-D encoded source sequences (each must support
			reshape and shape).
		raw_dataset: indexable collection where item i unpacks into
			(target text, source text) for sources[i].

	Returns:
		None. The collected reference/prediction token lists only feed the
		BLEU calculation, which is currently disabled below.
	"""
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# reshape the 1-D sequence into a batch holding one sample
		source = source.reshape((1, source.shape[0]))
		# decode with the tokenizer passed in, not the notebook-level eng_tokenizer
		translation = predict_sequence(model, tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score (disabled)
	# print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	# print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	# print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	# print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets (the same three pickle files the training cell reads)
dataset = load_clean_sentences('english-japanese-both.pkl')
train = load_clean_sentences('english-japanese-train.pkl')
test = load_clean_sentences('english-japanese-test.pkl')
# prepare english tokenizer, fitted on column 0 of the combined dataset
eng_tokenizer = create_tokenizer(dataset[:, 0])
# +1 on top of the number of known words (presumably to reserve id 0 for padding -- confirm)
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare japanese tokenizer, fitted on column 1; the ger_* names hold the
# Japanese side of the english-japanese pairs
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data: only the Japanese inputs are encoded here
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model from the checkpoint file written by the training cell
model = load_model('japmodel.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)
# print(train)

train
src=[頑 張 っ て み る よ], target=[ill try my best], predicted=[im is]
src=[い つ そ れ を 買 っ た の], target=[when did you buy it], predicted=[i you you]
src=[あ な た は ど う], target=[how about you], predicted=[im is]
src=[ち ょ う ど 今 は 忙 し い], target=[im busy now], predicted=[im is]
src=[行 く よ], target=[ill go], predicted=[its it]
src=[こ こ で 少 し 休 も う], target=[lets rest here], predicted=[i you a]
src=[彼 女 に 優 し く し て あ げ な さ い よ], target=[be nice to her], predicted=[he is a]
src=[私 は タ バ コ を 吸 う の を や め た], target=[i stopped smoking], predicted=[i was a]
src=[彼 は 今 日 家 に い る], target=[he is at home today], predicted=[he is a]
src=[こ の こ と は 秘 密 だ よ], target=[this is a secret], predicted=[i is a]
test
src=[私 達 は ど う や っ て 勝 っ た の], target=[how did we win], predicted=[what you you]
src=[座 っ て も い い で す か], target=[can i sit down], predicted=[i you you]
src=[ト ム と 連 絡 が 取 れ な い], target=[i cant contact tom], predicted=[tom is tom]
src=[誰 か い る の], target=[is anybody there], predicted=[what you you