In [1]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
import tinysegmenter
 
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, mode='rt', encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text
 
# split a loaded document into sentences
def to_pairs(doc):
	lines = doc.strip().split('\n')
	pairs = [line.split('%%') for line in  lines]
	return pairs
 
# # clean a list of lines
# def clean_pairs(lines):
# 	cleaned = list()
# 	# prepare regex for char filtering
# 	re_print = re.compile('[^%s]' % re.escape(string.printable))
# 	# prepare translation table for removing punctuation
# 	table = str.maketrans('', '', string.punctuation)
# 	for pair in lines:
# 		clean_pair = list()
# 		for i in range(2):
# 			if i == 0:
# 				line = pair[0]
# 				# normalize unicode characters
# 				line = normalize('NFD', line).encode('ascii', 'ignore')
# 				line = line.decode('UTF-8')
# 				# tokenize on white space
# 				line = line.split()           
# 				# convert to lowercase
# 				line = [word.lower() for word in line]
# 				# remove punctuation from each token
# 				line = [word.translate(table) for word in line]
# 				# remove non-printable chars form each token
# 				line = [re_print.sub('', w) for w in line]
# 				# remove tokens with numbers in them
# 				line = [word for word in line if word.isalpha()]
# 				# store as string
# 				clean_pair.append(' '.join(line))
# 			else:
# 				line = pair[1]
# 				l = list(line)[:-1]
# 				line = [word.lower() for word in l]
# 				line1 = "".join(line)
# 				tokenized_statement = tinysegmenter.tokenize(line1)                
# 				clean_pair.append(' '.join(tokenized_statement))
# 		cleaned.append(clean_pair)
# 	return array(cleaned)
 
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

# load dataset
filename = 'jap_dataset2.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# print(pairs)
# clean sentences
clean_pairs = array(pairs)
# print(clean_pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'pkl1/english-japanese_5.pkl')
# spot check
for i in range(10):
	print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))
#     print(clean_pairs[i])

Saved: pkl1/english-japanese_5.pkl
[go] => [行け。動]
[go] => [行き。動 なさい。動]
[hi] => [こんにちは。感]
[hi] => [もしもし。感]
[hi] => [やっ。動 ほ。動 ー。名]
[hi] => [こんにちは。感]
[run] => [走れ。動]
[run] => [走っ。動 て。助]
[who] => [誰。名]
[wow] => [すごい。形]


In [2]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('pkl1/english-japanese_3.pkl')
# print(raw_dataset)

# reduce dataset size
n_sentences = 15000
dataset = raw_dataset[:n_sentences, :]
# print(dataset)
# random shuffle
shuffle(dataset)
# print(dataset)
# split into train/test
train, test = dataset[:13500], dataset[13500:]
# save
save_clean_data(dataset, 'pkl1/english-japanese_5-both.pkl')
save_clean_data(train, 'pkl1/english-japanese_5-train.pkl')
save_clean_data(test, 'pkl1/english-japanese_5-test.pkl')

Saved: pkl1/english-japanese_5-both.pkl
Saved: pkl1/english-japanese_5-train.pkl
Saved: pkl1/english-japanese_5-test.pkl


In [3]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

import pydot

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

# load datasets
dataset = load_clean_sentences('pkl1/english-japanese_5-both.pkl')
train = load_clean_sentences('pkl1/english-japanese_5-train.pkl')
test = load_clean_sentences('pkl1/english-japanese_5-test.pkl')

# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('Japanese Vocabulary Size: %d' % ger_vocab_size)
print('Japanese Max Length: %d' % (ger_length))

ml = max(eng_length, ger_length)

# prepare training data
trainX = encode_sequences(ger_tokenizer, ml, train[:, 1])
trainY = encode_sequences(eng_tokenizer, ml, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(ger_tokenizer, ml, test[:, 1])
testY = encode_sequences(eng_tokenizer, ml, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ml, ml, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])
# summarize defined model
print(model.summary())
plot_model(model, to_file='japmodel_simple2.png', show_shapes=True)
# fit model
filename = 'model1/japmodel_simple_5.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)


Using TensorFlow backend.


English Vocabulary Size: 3580
English Max Length: 7
Japanese Vocabulary Size: 5625
Japanese Max Length: 19
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 19, 256)           1440000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 19, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 19, 256)           525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 19, 3580)          920060    
Total params: 3,410,684
Trainable params: 3,410,684
Non-trainable params: 0
_________________________________________________________________
Non

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 13500 samples, validate on 1500 samples
Epoch 1/30
 - 119s - loss: 1.8216 - accuracy: 0.7937 - val_loss: 1.3768 - val_accuracy: 0.8015

Epoch 00001: val_accuracy improved from -inf to 0.80154, saving model to model1/japmodel_simple_5.h5
Epoch 2/30
 - 130s - loss: 1.3163 - accuracy: 0.8039 - val_loss: 1.2956 - val_accuracy: 0.8065

Epoch 00002: val_accuracy improved from 0.80154 to 0.80649, saving model to model1/japmodel_simple_5.h5
Epoch 3/30
 - 117s - loss: 1.2662 - accuracy: 0.8067 - val_loss: 1.2732 - val_accuracy: 0.8065

Epoch 00003: val_accuracy did not improve from 0.80649
Epoch 4/30
 - 118s - loss: 1.2425 - accuracy: 0.8075 - val_loss: 1.2641 - val_accuracy: 0.8060

Epoch 00004: val_accuracy did not improve from 0.80649
Epoch 5/30
 - 124s - loss: 1.2167 - accuracy: 0.8091 - val_loss: 1.2437 - val_accuracy: 0.8087

Epoch 00005: val_accuracy improved from 0.80649 to 0.80874, saving model to model1/japmodel_simple_5.h5
Epoch 6/30
 - 119s - loss: 1.1774 - accuracy: 0.8153

<keras.callbacks.callbacks.History at 0x257d479278>

In [4]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())

# load datasets
dataset = load_clean_sentences('pkl1/english-japanese_5-both.pkl')
train = load_clean_sentences('pkl1/english-japanese_5-train.pkl')
test = load_clean_sentences('pkl1/english-japanese_5-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])

ml = max(eng_length, ger_length)

# prepare data
trainX = encode_sequences(ger_tokenizer, ml, train[:, 1])
testX = encode_sequences(ger_tokenizer, ml, test[:, 1])

# load model
model = load_model('model1/japmodel_simple_5.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)
# print(train)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


train
src=[その。連 工場。名 は。助 玩具。名 を。助 製造。名 し。動 て。助 いる。動], target=[that factory makes toys], predicted=[the factory makes toys]
src=[撃て。動], target=[shoot], predicted=[welcome]
src=[彼。名 は。助 ぐっすり。副 眠っ。動 て。助 い。動 た。助], target=[he was fast asleep], predicted=[he was asleep]
src=[だから。接 黙れ。動 って。助 ば。助], target=[i said shut up], predicted=[i said up up]
src=[砂糖。名 が。助 ない。形 よ。助], target=[theres no sugar], predicted=[we have sugar sugar]
src=[うん。感 と。助 言っ。動 て。助 よ。助], target=[just say yes], predicted=[say say yes]
src=[よく。副 食べ。動 られる。動 ね。助 、。記 そんなに。副], target=[how can you eat so much], predicted=[why can you eat eat]
src=[ハンガー。名 を。助 ください。動], target=[i need some hangers], predicted=[i need some hangers]
src=[これ。名 使っ。動 て。助], target=[take this], predicted=[take this]
src=[私。名 は。助 もう。副 彼。名 に。助 会わ。動 ない。助 だろ。助 う。助], target=[i will never see him], predicted=[i wont see him him]
test
src=[トム。名 は。助 卵。名 を。助 1。名 つも。動 買わ。動 なかっ。助 た。助], target=[tom didnt buy any eggs], predicted=[tom needs to to visa]
src=[この。連 暗号。名 は。

In [11]:
testmodel = load_model('japmodel_simple.h5')
print("Model Loaded")

Model Loaded


In [5]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
import pandas as pd
import numpy as np

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

dataset = load_clean_sentences('pkl1/english-japanese_simple2-both.pkl')
train = load_clean_sentences('pkl1/english-japanese_simple2-train.pkl')
test = load_clean_sentences('pkl1/english-japanese_simple2-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

In [6]:
model = load_model('japmodel_simple.h5')
preds = model.predict_classes(testX.reshape((testX.shape[0],testX.shape[1])))

def get_word(n, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None

preds_text = []
for i in preds:
    temp = []
    for j in range(len(i)):
        t = get_word(i[j], eng_tokenizer)
        if j > 0:
            if (t == get_word(i[j-1], eng_tokenizer)) or (t == None):
                temp.append('')
            else:
                temp.append(t)
        else:
            if(t == None):
                temp.append('')
            else:
                temp.append(t) 

    preds_text.append(' '.join(temp))
    
pred_df = pd.DataFrame({'jap':test[:,1],'actual' : test[:,0], 'predicted' : preds_text})
pred_df.sample(15)

Unnamed: 0,jap,actual,predicted
665,トム は 決し て 笑わ ない,tom never laughs,i dont tom still
24,これ 誰 に もらっ た ん です か,who gave you this,i like goes
556,彼女 は と て も 賢い人 だ,she is very wise,i do good
48,おしゃれ を し ない といけ ない,i have to dress up,to made his
435,この ネクタイ は いくら です か,how much is this tie,every tom it
30,彼女 は 遅刻 魔 だ,she tends to be late,with tom sleepy
939,それ は 黒く あり ませ んか,is it not black,who it
19,とって も 好き よ,i like it a lot,i what im ill
462,トム は 反社会 的 人間 だ,tom is a sociopath,it is tom some
622,値段 が 高すぎ です,it is too expensive,to
