In [17]:
import os
import re
import sys
import string
import requests
import io
import numpy as np
import collections
import random
import pickle
import string
import matplotlib.pyplot as plt
import tensorflow as tf
from zipfile import ZipFile
from collections import Counter
from tensorflow.python.framework import ops
import shutil
ops.reset_default_graph()

In [None]:
local_repository = 'temp'
shutil.rmtree('temp')
if not os.path.exists(local_repository):
    from git import Repo
    tf_model_repository = 'https://github.com/tensorflow/models'
    Repo.clone_from(tf_model_repository, local_repository)
    
    sys.path.insert(0, 'temp/tutorials/rnn/translate/')
    import seq2seq_model as seq2seq_model
    import data_utils as data_utils

In [3]:
sess = tf.Session()

In [4]:
learning_rate = 0.1    # 学習率
lr_decay_rate = 0.99   # 減衰率
lr_decay_every = 100   # 減衰させる頻度
max_gradient = 5.0     # 最大勾配
batch_size = 50        # バッチサイズ
num_layers = 3         # RNNの層の数
rnn_size = 500         # RNNのモデルのサイズ
layer_size = 512       # 層のサイズ
generations = 10000    # データを処理する回数
vocab_size = 10000     # 語彙のサイズ
save_every = 1000      # モデルを保存する頻度
eval_every = 500       # モデルを評価する頻度
output_every = 50      # 出力の頻度
punct = string.punctuation  # 句読点を削除

# データのダウンロードと格納
data_dir = 'temp'
data_file = 'eng_ger.txt'
model_path = 'seq2seq_model'
full_model_dir = os.path.join(data_dir, model_path)

In [5]:
test_english = ['hello where is my computer',
                'the quick brown fox jumped over the lazy dog',
                'is it going to rain tomorrow']

In [6]:
if not os.path.isfile(os.path.join(data_dir, data_file)):
    print('Data not found, downloading Eng-Ger sentences from www.manythings.org')
    sentence_url = 'http://www.manythings.org/anki/deu-eng.zip'
    r = requests.get(sentence_url)
    z = ZipFile(io.BytesIO(r.content))
    file = z.read('deu.txt')
    # Format Data
    eng_ger_data = file.decode()
    eng_ger_data = eng_ger_data.encode('ascii',errors='ignore')
    eng_ger_data = eng_ger_data.decode().split('\n')
    # Write to file
    with open(os.path.join(data_dir, data_file), 'w') as out_conn:
        for sentence in eng_ger_data:
            out_conn.write(sentence + '\n')
else:
    eng_ger_data = []
    with open(os.path.join(data_dir, data_file), 'r') as in_conn:
        for row in in_conn:
            eng_ger_data.append(row[:-1])

Data not found, downloading Eng-Ger sentences from www.manythings.org


In [7]:
# 句読点を削除
eng_ger_data = [''.join(char for char in sent if char not in punct) for sent in eng_ger_data]

# 各文をタブで分割
eng_ger_data = [x.split('\t') for x in eng_ger_data if len(x)>=1]
[english_sentence, german_sentence] = [list(x) for x in zip(*eng_ger_data)]
english_sentence = [x.lower().split() for x in english_sentence]
german_sentence = [x.lower().split() for x in german_sentence]

In [8]:
# 英語の語彙を処理
all_english_words = [word for sentence in english_sentence for word in sentence]
all_english_counts = Counter(all_english_words)

# -1は0（unknown）の分を引くため
eng_word_keys = [x[0] for x in all_english_counts.most_common(vocab_size-1)]
eng_vocab2ix = dict(zip(eng_word_keys, range(1,vocab_size)))
eng_ix2vocab = {val:key for key, val in eng_vocab2ix.items()}
english_processed = []
for sent in english_sentence:
    temp_sentence = []
    for word in sent:
        try:
            temp_sentence.append(eng_vocab2ix[word])
        except:
            temp_sentence.append(0)
    english_processed.append(temp_sentence)


# ドイツ語の語彙を処理
all_german_words = [word for sentence in german_sentence for word in sentence]
all_german_counts = Counter(all_german_words)
ger_word_keys = [x[0] for x in all_german_counts.most_common(vocab_size-1)]
ger_vocab2ix = dict(zip(ger_word_keys, range(1,vocab_size)))
ger_ix2vocab = {val:key for key, val in ger_vocab2ix.items()}
german_processed = []
for sent in german_sentence:
    temp_sentence = []
    for word in sent:
        try:
            temp_sentence.append(ger_vocab2ix[word])
        except:
            temp_sentence.append(0)
    german_processed.append(temp_sentence)

In [9]:
# テスト用の英文を処理し、語彙に含まれていない単語には'0'を使用
test_data = []
for sentence in test_english:
    temp_sentence = []
    for word in sentence.split(' '):
        try:
            temp_sentence.append(eng_vocab2ix[word])
        except:
            # この単語が語彙に含まれていない場合は'0'を使用
            temp_sentence.append(0)
    test_data.append(temp_sentence)

In [10]:
# シーケンスの長さに基づいてバケットを定義し、
# データを対応するバケットに分割する
x_maxs = [5, 7, 11, 50]
y_maxs = [10, 12, 17, 60]
buckets = [x for x in zip(x_maxs, y_maxs)]
bucketed_data = [[] for _ in range(len(x_maxs))]
for eng, ger in zip(english_processed, german_processed):
    for ix, (x_max, y_max) in enumerate(zip(x_maxs, y_maxs)):
        if (len(eng) <= x_max) and (len(ger) <= y_max):
            bucketed_data[ix].append([eng, ger])
            break

In [11]:
def translation_model(sess, input_vocab_size, output_vocab_size,
                      buckets, rnn_size, num_layers, max_gradient,
                      learning_rate, lr_decay_rate, forward_only):
    model = seq2seq_model.Seq2SeqModel(
          input_vocab_size,
          output_vocab_size,
          buckets,
          rnn_size,
          num_layers,
          max_gradient,
          batch_size,
          learning_rate,
          lr_decay_rate,
          forward_only=forward_only,
          dtype=tf.float32)
    return(model)

In [12]:
input_vocab_size = vocab_size
output_vocab_size = vocab_size

translate_model = translation_model(sess=sess,
                                    input_vocab_size=vocab_size,
                                    output_vocab_size=vocab_size,
                                    buckets=buckets,
                                    rnn_size=rnn_size,
                                    num_layers=num_layers,
                                    max_gradient=max_gradient,
                                    learning_rate=learning_rate,
                                    lr_decay_rate=lr_decay_rate,
                                    forward_only=False)

# テストモデルで同じ変数を再利用
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    test_model = translation_model(sess=sess,
                                    input_vocab_size=vocab_size,
                                    output_vocab_size=vocab_size,
                                    buckets=buckets,
                                    rnn_size=rnn_size,
                                    num_layers=num_layers,
                                    max_gradient=max_gradient,
                                    learning_rate=learning_rate,
                                    lr_decay_rate=lr_decay_rate,
                                    forward_only=True)
    test_model.batch_size = 1

NameError: name 'seq2seq_model' is not defined