In [1]:
import numpy as np
import tensorflow as tf
from sklearn.utils import shuffle
import re
import time
import collections
import os
import itertools
from tqdm import tqdm

In [2]:
def build_dataset(words, n_words, atleast=1):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    counter = [i for i in counter if i[1] >= atleast]
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [3]:
import json

with open('pos-training.json') as fopen:
    texts = json.load(fopen)
    
before, after = [], []
    
for splitted in texts:
    if len(splitted) < 2:
        continue
    before.append(list(splitted[0]))
    after.append(list(splitted[1]))
    
assert len(before) == len(after)

In [4]:
concat_from = list(itertools.chain(*before))
vocabulary_size_from = len(list(set(concat_from)))
data_from, count_from, dictionary_from, rev_dictionary_from = build_dataset(concat_from, vocabulary_size_from)
print('vocab from size: %d'%(vocabulary_size_from))
print('Most common words', count_from[4:10])
print('Sample data', data_from[:10], [rev_dictionary_from[i] for i in data_from[:10]])
print('filtered vocab size:',len(dictionary_from))
print("% of vocab used: {}%".format(round(len(dictionary_from)/vocabulary_size_from,4)*100))

vocab from size: 27
Most common words [('a', 781600), ('e', 493676), ('i', 478435), ('n', 475565), ('r', 359020), ('o', 332268)]
Sample data [8, 4, 11, 5, 13, 4, 5, 11, 6, 21] ['r', 'a', 't', 'e', 'l', 'a', 'e', 't', 'i', 'p']
filtered vocab size: 31
% of vocab used: 114.80999999999999%


In [5]:
concat_to = list(itertools.chain(*after))
vocabulary_size_to = len(list(set(concat_to)))
data_to, count_to, dictionary_to, rev_dictionary_to = build_dataset(concat_to, vocabulary_size_to)
print('vocab from size: %d'%(vocabulary_size_to))
print('Most common words', count_to[4:10])
print('Sample data', data_to[:10], [rev_dictionary_to[i] for i in data_to[:10]])
print('filtered vocab size:',len(dictionary_to))
print("% of vocab used: {}%".format(round(len(dictionary_to)/vocabulary_size_to,4)*100))

vocab from size: 27
Most common words [('a', 747317), ('i', 464716), ('e', 459686), ('n', 433253), ('r', 348899), ('o', 332268)]
Sample data [8, 4, 11, 6, 13, 4, 6, 11, 5, 21] ['r', 'a', 't', 'e', 'l', 'a', 'e', 't', 'i', 'p']
filtered vocab size: 31
% of vocab used: 114.80999999999999%


In [6]:
GO = dictionary_from['GO']
PAD = dictionary_from['PAD']
EOS = dictionary_from['EOS']
UNK = dictionary_from['UNK']

In [7]:
for i in range(len(after)):
    after[i].append('EOS')

In [8]:
class Stemmer:
    def __init__(self, size_layer, num_layers, embedded_size, 
                 from_dict_size, to_dict_size, learning_rate, 
                 dropout = 0.8, beam_width = 15):
        
        def lstm_cell(reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size_layer, reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]

        encoder_embeddings = tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        encoder_cells = tf.nn.rnn_cell.MultiRNNCell([lstm_cell() for _ in range(num_layers)])
        self.encoder_out, self.encoder_state = tf.nn.dynamic_rnn(cell = encoder_cells, 
                                                                 inputs = encoder_embedded, 
                                                                 sequence_length = self.X_seq_len,
                                                                 dtype = tf.float32)
        
        self.encoder_state = tuple(self.encoder_state[-1] for _ in range(num_layers))
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        decoder_embeddings = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))
        dense_layer = tf.layers.Dense(to_dict_size)
        
        decoder_cells = tf.nn.rnn_cell.MultiRNNCell([lstm_cell() for _ in range(num_layers)])

        with tf.variable_scope('decode'):
            training_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
                    inputs = tf.nn.embedding_lookup(decoder_embeddings, decoder_input),
                    sequence_length = self.Y_seq_len,
                    embedding = decoder_embeddings,
                    sampling_probability = 0.5,
                    time_major = False)
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell = decoder_cells,
                    helper = training_helper,
                    initial_state = self.encoder_state,
                    output_layer = dense_layer)
            training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder = training_decoder,
                    impute_finished = True,
                    maximum_iterations = tf.reduce_max(self.Y_seq_len))
            
        # testing session
        with tf.variable_scope('decode', reuse=True):
            
            predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell = decoder_cells,
                    embedding = decoder_embeddings,
                    start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                    end_token = EOS,
                    initial_state = tf.contrib.seq2seq.tile_batch(self.encoder_state, beam_width),
                    beam_width = beam_width,
                    output_layer = dense_layer,
                    length_penalty_weight = 0.0)
            predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder = predicting_decoder,
                    impute_finished = False,
                    maximum_iterations = 2 * tf.reduce_max(self.X_seq_len))
            
        self.training_logits = training_decoder_output.rnn_output
        self.predicting_ids = tf.identity(predicting_decoder_output.predicted_ids[:, :, 0],name="logits")
        
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [9]:
size_layer = 256
num_layers = 2
embedded_size = 128
learning_rate = 1e-3
batch_size = 128
epoch = 10

In [10]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Stemmer(size_layer, num_layers, embedded_size, len(dictionary_from), 
                len(dictionary_to), learning_rate)
sess.run(tf.global_variables_initializer())

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [11]:
def str_idx(corpus, dic, UNK=3):
    X = []
    for i in corpus:
        ints = []
        for k in i:
            ints.append(dic.get(k, UNK))
        X.append(ints)
    return X

In [12]:
X = str_idx(before, dictionary_from)
Y = str_idx(after, dictionary_to)

In [13]:
from sklearn.cross_validation import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.1)



In [14]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

In [15]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break
    total_loss, total_accuracy, total_loss_test, total_accuracy_test = 0, 0, 0, 0
    train_X, train_Y = shuffle(train_X, train_Y)
    test_X, test_Y = shuffle(test_X, test_Y)
    pbar = tqdm(range(0, len(train_X), batch_size), desc='train minibatch loop')
    for k in pbar:
        batch_x, _ = pad_sentence_batch(train_X[k: min(k+batch_size,len(train_X))], PAD)
        batch_y, _ = pad_sentence_batch(train_Y[k: min(k+batch_size,len(train_X))], PAD)
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                                      feed_dict={model.X:batch_x,
                                                model.Y:batch_y})
        total_loss += loss
        total_accuracy += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for k in pbar:
        batch_x, _ = pad_sentence_batch(test_X[k: min(k+batch_size,len(test_X))], PAD)
        batch_y, _ = pad_sentence_batch(test_Y[k: min(k+batch_size,len(test_X))], PAD)
        acc, loss = sess.run([model.accuracy, model.cost], 
                                      feed_dict={model.X:batch_x,
                                                model.Y:batch_y})
        total_loss_test += loss
        total_accuracy_test += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
        
    total_loss /= (len(train_X) / batch_size)
    total_accuracy /= (len(train_X) / batch_size)
    total_loss_test /= (len(test_X) / batch_size)
    total_accuracy_test /= (len(test_X) / batch_size)
    
    if total_accuracy_test > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, total_accuracy_test)
        )
        CURRENT_ACC = total_accuracy_test
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('epoch: %d, avg loss: %f, avg accuracy: %f'%(EPOCH, total_loss, total_accuracy))
    print('epoch: %d, avg loss test: %f, avg accuracy test: %f'%(EPOCH, total_loss_test, total_accuracy_test))
    EPOCH += 1

train minibatch loop: 100%|██████████| 5207/5207 [05:59<00:00, 14.48it/s, accuracy=1, cost=0.0282]    
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.23it/s, accuracy=0.974, cost=0.0694]


epoch: 0, pass acc: 0.000000, current acc: 0.973307
epoch: 0, avg loss: 0.253140, avg accuracy: 0.917507
epoch: 0, avg loss test: 0.085634, avg accuracy test: 0.973307


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.57it/s, accuracy=1, cost=0.0104]    
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.62it/s, accuracy=0.98, cost=0.0639] 


epoch: 1, pass acc: 0.973307, current acc: 0.981620
epoch: 1, avg loss: 0.066930, avg accuracy: 0.978164
epoch: 1, avg loss test: 0.059490, avg accuracy test: 0.981620


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.56it/s, accuracy=1, cost=0.001]      
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.42it/s, accuracy=0.974, cost=0.0724]


epoch: 2, pass acc: 0.981620, current acc: 0.983380
epoch: 2, avg loss: 0.050763, avg accuracy: 0.983165
epoch: 2, avg loss test: 0.052117, avg accuracy test: 0.983380


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.56it/s, accuracy=1, cost=0.0201]     
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.72it/s, accuracy=0.985, cost=0.0467] 


epoch: 3, pass acc: 0.983380, current acc: 0.985942
epoch: 3, avg loss: 0.042299, avg accuracy: 0.985846
epoch: 3, avg loss test: 0.044006, avg accuracy test: 0.985942


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 17.40it/s, accuracy=0.982, cost=0.0378] 
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.59it/s, accuracy=0.998, cost=0.0114] 


epoch: 4, pass acc: 0.985942, current acc: 0.986297
epoch: 4, avg loss: 0.036827, avg accuracy: 0.987578
epoch: 4, avg loss test: 0.043699, avg accuracy test: 0.986297


train minibatch loop: 100%|██████████| 5207/5207 [05:56<00:00, 14.60it/s, accuracy=1, cost=0.0113]     
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.62it/s, accuracy=0.978, cost=0.0715] 


epoch: 5, pass acc: 0.986297, current acc: 0.986347
epoch: 5, avg loss: 0.033080, avg accuracy: 0.988899
epoch: 5, avg loss test: 0.043023, avg accuracy test: 0.986347


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.58it/s, accuracy=1, cost=0.0151]     
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.85it/s, accuracy=0.983, cost=0.0368] 


epoch: 6, pass acc: 0.986347, current acc: 0.986864
epoch: 6, avg loss: 0.029517, avg accuracy: 0.990101
epoch: 6, avg loss test: 0.041912, avg accuracy test: 0.986864


train minibatch loop: 100%|██████████| 5207/5207 [05:56<00:00, 16.16it/s, accuracy=1, cost=0.0198]     
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.73it/s, accuracy=0.988, cost=0.0439] 


epoch: 7, pass acc: 0.986864, current acc: 0.988453
epoch: 7, avg loss: 0.026783, avg accuracy: 0.991038
epoch: 7, avg loss test: 0.037975, avg accuracy test: 0.988453


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.56it/s, accuracy=1, cost=1.94e-5]    
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.61it/s, accuracy=0.992, cost=0.0321] 


epoch: 8, avg loss: 0.024509, avg accuracy: 0.991851
epoch: 8, avg loss test: 0.038301, avg accuracy test: 0.988439


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.57it/s, accuracy=1, cost=0.00792]    
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 38.21it/s, accuracy=0.985, cost=0.0527] 


epoch: 9, pass acc: 0.988453, current acc: 0.988892
epoch: 9, avg loss: 0.022305, avg accuracy: 0.992658
epoch: 9, avg loss test: 0.037014, avg accuracy test: 0.988892


train minibatch loop: 100%|██████████| 5207/5207 [05:56<00:00, 14.59it/s, accuracy=0.933, cost=0.226]  
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.60it/s, accuracy=0.988, cost=0.0324] 


epoch: 10, pass acc: 0.988892, current acc: 0.989389
epoch: 10, avg loss: 0.020188, avg accuracy: 0.993379
epoch: 10, avg loss test: 0.037161, avg accuracy test: 0.989389


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.58it/s, accuracy=0.949, cost=0.133]  
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.88it/s, accuracy=0.983, cost=0.0681] 


epoch: 11, pass acc: 0.989389, current acc: 0.989590
epoch: 11, avg loss: 0.019071, avg accuracy: 0.993807
epoch: 11, avg loss test: 0.035890, avg accuracy test: 0.989590


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 16.90it/s, accuracy=1, cost=0.00216]    
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.75it/s, accuracy=0.986, cost=0.0287] 


epoch: 12, avg loss: 0.016948, avg accuracy: 0.994548
epoch: 12, avg loss test: 0.037803, avg accuracy test: 0.989491


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.57it/s, accuracy=1, cost=0.00971]    
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.59it/s, accuracy=0.994, cost=0.0146] 


epoch: 13, pass acc: 0.989590, current acc: 0.989636
epoch: 13, avg loss: 0.016151, avg accuracy: 0.994886
epoch: 13, avg loss test: 0.037589, avg accuracy test: 0.989636


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.58it/s, accuracy=1, cost=0.000147]   
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.70it/s, accuracy=0.982, cost=0.054]  


epoch: 14, pass acc: 0.989636, current acc: 0.989694
epoch: 14, avg loss: 0.014807, avg accuracy: 0.995336
epoch: 14, avg loss test: 0.037445, avg accuracy test: 0.989694


train minibatch loop: 100%|██████████| 5207/5207 [05:56<00:00, 14.61it/s, accuracy=1, cost=0.00317]    
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.47it/s, accuracy=0.996, cost=0.0101] 


epoch: 15, pass acc: 0.989694, current acc: 0.990130
epoch: 15, avg loss: 0.014213, avg accuracy: 0.995541
epoch: 15, avg loss test: 0.037447, avg accuracy test: 0.990130


train minibatch loop: 100%|██████████| 5207/5207 [05:56<00:00, 14.60it/s, accuracy=0.98, cost=0.109]   
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.85it/s, accuracy=0.998, cost=0.00543]


epoch: 16, avg loss: 0.013178, avg accuracy: 0.995891
epoch: 16, avg loss test: 0.037000, avg accuracy test: 0.990038


train minibatch loop: 100%|██████████| 5207/5207 [05:56<00:00, 14.59it/s, accuracy=1, cost=3.12e-5]    
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 36.77it/s, accuracy=0.985, cost=0.044]  


epoch: 17, pass acc: 0.990130, current acc: 0.990191
epoch: 17, avg loss: 0.012415, avg accuracy: 0.996178
epoch: 17, avg loss test: 0.039594, avg accuracy test: 0.990191


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.58it/s, accuracy=1, cost=7.03e-5]    
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.67it/s, accuracy=0.992, cost=0.018]  


epoch: 18, avg loss: 0.011799, avg accuracy: 0.996451
epoch: 18, avg loss test: 0.039397, avg accuracy test: 0.989909


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.58it/s, accuracy=1, cost=2.33e-5]    
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.29it/s, accuracy=0.998, cost=0.00695]


epoch: 19, pass acc: 0.990191, current acc: 0.990219
epoch: 19, avg loss: 0.011475, avg accuracy: 0.996556
epoch: 19, avg loss test: 0.039024, avg accuracy test: 0.990219


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.56it/s, accuracy=1, cost=8.23e-5]    
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.80it/s, accuracy=0.994, cost=0.00775]


epoch: 20, pass acc: 0.990219, current acc: 0.990233
epoch: 20, avg loss: 0.010833, avg accuracy: 0.996737
epoch: 20, avg loss test: 0.039338, avg accuracy test: 0.990233


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.57it/s, accuracy=1, cost=7.65e-5]    
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.76it/s, accuracy=0.994, cost=0.014]  


epoch: 21, pass acc: 0.990233, current acc: 0.990311
epoch: 21, avg loss: 0.010448, avg accuracy: 0.996907
epoch: 21, avg loss test: 0.039332, avg accuracy test: 0.990311


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.57it/s, accuracy=1, cost=0.000198]   
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 39.11it/s, accuracy=0.985, cost=0.0427] 


epoch: 22, avg loss: 0.010113, avg accuracy: 0.997017
epoch: 22, avg loss test: 0.039674, avg accuracy test: 0.990233


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.28it/s, accuracy=1, cost=3.57e-5]    
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.54it/s, accuracy=0.994, cost=0.00926]


epoch: 23, pass acc: 0.990311, current acc: 0.990474
epoch: 23, avg loss: 0.009879, avg accuracy: 0.997085
epoch: 23, avg loss test: 0.040288, avg accuracy test: 0.990474


train minibatch loop: 100%|██████████| 5207/5207 [05:57<00:00, 14.58it/s, accuracy=0.983, cost=0.0513] 
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.96it/s, accuracy=0.996, cost=0.00405]


epoch: 24, avg loss: 0.009464, avg accuracy: 0.997238
epoch: 24, avg loss test: 0.040749, avg accuracy test: 0.990400


train minibatch loop: 100%|██████████| 5207/5207 [05:56<00:00, 17.01it/s, accuracy=1, cost=0.000132]   
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.75it/s, accuracy=0.996, cost=0.00774]


epoch: 25, pass acc: 0.990474, current acc: 0.990518
epoch: 25, avg loss: 0.009150, avg accuracy: 0.997362
epoch: 25, avg loss test: 0.041306, avg accuracy test: 0.990518


train minibatch loop: 100%|██████████| 5207/5207 [05:58<00:00, 14.51it/s, accuracy=0.984, cost=0.0516]  
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.76it/s, accuracy=0.998, cost=0.00463]


epoch: 26, avg loss: 0.008876, avg accuracy: 0.997449
epoch: 26, avg loss test: 0.041249, avg accuracy test: 0.990411


train minibatch loop: 100%|██████████| 5207/5207 [05:56<00:00, 14.59it/s, accuracy=1, cost=0.00824]    
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 37.48it/s, accuracy=0.988, cost=0.0581] 


epoch: 27, avg loss: 0.008687, avg accuracy: 0.997538
epoch: 27, avg loss test: 0.043180, avg accuracy test: 0.990174


train minibatch loop: 100%|██████████| 5207/5207 [05:56<00:00, 14.59it/s, accuracy=1, cost=0.000365]   
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.83it/s, accuracy=0.996, cost=0.00891]


epoch: 28, pass acc: 0.990518, current acc: 0.990690
epoch: 28, avg loss: 0.008398, avg accuracy: 0.997641
epoch: 28, avg loss test: 0.040713, avg accuracy test: 0.990690


train minibatch loop: 100%|██████████| 5207/5207 [05:56<00:00, 15.21it/s, accuracy=0.986, cost=0.0257] 
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 35.03it/s, accuracy=0.998, cost=0.0115] 


epoch: 29, avg loss: 0.008560, avg accuracy: 0.997583
epoch: 29, avg loss test: 0.043098, avg accuracy test: 0.990199


train minibatch loop: 100%|██████████| 5207/5207 [05:56<00:00, 14.59it/s, accuracy=0.986, cost=0.0254] 
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 35.05it/s, accuracy=0.996, cost=0.0246] 


epoch: 30, avg loss: 0.008275, avg accuracy: 0.997664
epoch: 30, avg loss test: 0.042799, avg accuracy test: 0.990372


train minibatch loop: 100%|██████████| 5207/5207 [05:56<00:00, 14.62it/s, accuracy=1, cost=0.00298]     
test minibatch loop: 100%|██████████| 579/579 [00:16<00:00, 34.85it/s, accuracy=0.978, cost=0.0727] 

epoch: 31, avg loss: 0.007877, avg accuracy: 0.997801
epoch: 31, avg loss test: 0.043277, avg accuracy test: 0.990342
break epoch:32






In [16]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, "beamsearch-lstm/model.ckpt")

'beamsearch-lstm/model.ckpt'

In [17]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'OptimizeLoss' not in n.name
        and 'Global_Step' not in n.name
    ]
)

In [18]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            "directory: %s" % model_dir)

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path
    
    absolute_model_dir = "/".join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + "/frozen_model.pb"
    clear_devices = True
    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(",")
        ) 
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph." % len(output_graph_def.node))

In [19]:
freeze_graph("beamsearch-lstm", strings)

INFO:tensorflow:Restoring parameters from beamsearch-lstm/model.ckpt
INFO:tensorflow:Froze 12 variables.
INFO:tensorflow:Converted 12 variables to const ops.
1128 ops in the final graph.


In [20]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [21]:
g=load_graph('beamsearch-lstm/frozen_model.pb')

In [22]:
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph=g)
predicted = test_sess.run(logits,feed_dict={x:str_idx(['kecomelan'],dictionary_from)})[0]
print('PREDICTED AFTER:',''.join([rev_dictionary_to[n] for n in predicted if n not in[0,1,2,3]]))

PREDICTED AFTER: comel




In [26]:
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph=g)
predicted = test_sess.run(logits,feed_dict={x:str_idx(['kecomelkan'],dictionary_from)})[0]
print('PREDICTED AFTER:',''.join([rev_dictionary_to[n] for n in predicted if n not in[0,1,2,3]]))

PREDICTED AFTER: comel


In [24]:
predicted = test_sess.run(logits,feed_dict={x:str_idx(['kejalanan'],dictionary_from)})[0]
print('PREDICTED AFTER:',''.join([rev_dictionary_to[n] for n in predicted if n not in[0,1,2,3]]))

PREDICTED AFTER: jalan


In [25]:
import json
with open('beamsearch-lstm-stem.json','w') as fopen:
    fopen.write(json.dumps({'dictionary_from':dictionary_from,
                'dictionary_to':dictionary_to,
                'rev_dictionary_to':rev_dictionary_to,
                'rev_dictionary_from':rev_dictionary_from}))