In [1]:
import numpy as np
import tensorflow as tf
from sklearn.utils import shuffle
import re
import time
import collections
import os
import itertools
from tqdm import tqdm

In [2]:
def build_dataset(words, n_words, atleast=1):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    counter = [i for i in counter if i[1] >= atleast]
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [3]:
with open('normalizer-data.txt','r') as fopen:
    texts = fopen.read().split('\n')

print('len before %d'%(len(texts)))
before, after, ins = [], [], []
    
for i in texts:
    splitted = i.split('\t')
    if (len(splitted) < 1) or (len(splitted[0])) == 0 or (len(splitted[1]) > len(splitted[0])*3):
        continue
    if splitted[0].lower() in ins:
        continue
    ins.append(splitted[0].lower())
    before.append(list(splitted[0].lower()))
    after.append(list(splitted[1].lower()))
    
assert len(before) == len(after)
print('len after %d'%(len(before)))

len before 206226
len after 114110


In [4]:
concat_from = list(itertools.chain(*before))
vocabulary_size_from = len(list(set(concat_from)))
data_from, count_from, dictionary_from, rev_dictionary_from = build_dataset(concat_from, vocabulary_size_from)
print('vocab from size: %d'%(vocabulary_size_from))
print('Most common words', count_from[4:10])
print('Sample data', data_from[:10], [rev_dictionary_from[i] for i in data_from[:10]])
print('filtered vocab size:',len(dictionary_from))
print("% of vocab used: {}%".format(round(len(dictionary_from)/vocabulary_size_from,4)*100))

vocab from size: 50
Most common words [('n', 97787), ('a', 87288), ('r', 57924), ('e', 51506), ('i', 47830), ('s', 47641)]
Sample data [22, 7, 6, 4, 5, 19, 12, 9, 5, 13] ['c', 'e', 'r', 'n', 'a', 'h', 'k', 's', 'a', 'l']
filtered vocab size: 54
% of vocab used: 108.0%


In [5]:
concat_to = list(itertools.chain(*after))
vocabulary_size_to = len(list(set(concat_to)))
data_to, count_to, dictionary_to, rev_dictionary_to = build_dataset(concat_to, vocabulary_size_to)
print('vocab from size: %d'%(vocabulary_size_to))
print('Most common words', count_to[4:10])
print('Sample data', data_to[:10], [rev_dictionary_to[i] for i in data_to[:10]])
print('filtered vocab size:',len(dictionary_to))
print("% of vocab used: {}%".format(round(len(dictionary_to)/vocabulary_size_to,4)*100))

vocab from size: 35
Most common words [('a', 153205), ('n', 98360), ('e', 90799), ('i', 82118), ('r', 58199), ('s', 47922)]
Sample data [22, 6, 8, 7, 10, 4, 5, 4, 20, 12] ['c', 'e', 'r', 'i', 't', 'a', 'n', 'a', 'h', 'k']
filtered vocab size: 39
% of vocab used: 111.43%


In [6]:
GO = dictionary_from['GO']
PAD = dictionary_from['PAD']
EOS = dictionary_from['EOS']
UNK = dictionary_from['UNK']

In [7]:
for i in range(len(after)):
    after[i].append('EOS')

In [8]:
class Stemmer:
    def __init__(self, size_layer, num_layers, embedded_size, 
                 from_dict_size, to_dict_size, learning_rate, 
                 dropout = 0.8, beam_width = 15):
        
        def lstm_cell(size,reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size, reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]

        encoder_embeddings = tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = lstm_cell(size_layer // 2),
                cell_bw = lstm_cell(size_layer // 2),
                inputs = encoder_embedded,
                sequence_length = self.X_seq_len,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d'%(n))
            encoder_embedded = tf.concat((out_fw, out_bw), 2)
        
        bi_state_c = tf.concat((state_fw.c, state_bw.c), -1)
        bi_state_h = tf.concat((state_fw.h, state_bw.h), -1)
        bi_lstm_state = tf.nn.rnn_cell.LSTMStateTuple(c=bi_state_c, h=bi_state_h)
        self.encoder_state = tuple([bi_lstm_state] * num_layers)
        self.encoder_state = tuple(self.encoder_state[-1] for _ in range(num_layers))
            
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        decoder_embeddings = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))
        dense_layer = tf.layers.Dense(to_dict_size)
        
        decoder_cells = tf.nn.rnn_cell.MultiRNNCell([lstm_cell(size_layer) for _ in range(num_layers)])
        
        # training session
        with tf.variable_scope('decode'):
            training_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
                    inputs = tf.nn.embedding_lookup(decoder_embeddings, decoder_input),
                    sequence_length = self.Y_seq_len,
                    embedding = decoder_embeddings,
                    sampling_probability = 0.2,
                    time_major = False)
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell = decoder_cells,
                    helper = training_helper,
                    initial_state = self.encoder_state,
                    output_layer = dense_layer)
            training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder = training_decoder,
                    impute_finished = True,
                    maximum_iterations = tf.reduce_max(self.Y_seq_len))
            
        # testing session
        with tf.variable_scope('decode', reuse=True):
            
            predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell = decoder_cells,
                    embedding = decoder_embeddings,
                    start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                    end_token = EOS,
                    initial_state = tf.contrib.seq2seq.tile_batch(self.encoder_state, beam_width),
                    beam_width = beam_width,
                    output_layer = dense_layer,
                    length_penalty_weight = 0.0)
            predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder = predicting_decoder,
                    impute_finished = False,
                    maximum_iterations = 3 * tf.reduce_max(self.X_seq_len))
            
        self.training_logits = training_decoder_output.rnn_output
        self.predicting_ids = tf.identity(predicting_decoder_output.predicted_ids[:, :, 0],name="logits")
        
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)

In [9]:
size_layer = 256
num_layers = 2
embedded_size = 128
learning_rate = 5e-4
batch_size = 128
epoch = 50

In [10]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Stemmer(size_layer, num_layers, embedded_size, len(dictionary_from), 
                len(dictionary_to), learning_rate)
sess.run(tf.global_variables_initializer())

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [11]:
def str_idx(corpus, dic, UNK=3):
    X = []
    for i in corpus:
        ints = []
        for k in i:
            try:
                ints.append(dic[k])
            except Exception as e:
                ints.append(UNK)
        X.append(ints)
    return X

In [12]:
X = str_idx(before, dictionary_from)
Y = str_idx(after, dictionary_to)

In [13]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

def check_accuracy(logits, Y):
    acc = 0
    for i in range(logits.shape[0]):
        internal_acc = 0
        count = 0
        for k in range(len(Y[i])):
            try:
                if Y[i][k] == logits[i][k]:
                    internal_acc += 1
                count += 1
                if Y[i][k] == EOS:
                    break
            except:
                break
        acc += (internal_acc / count)
    return acc / logits.shape[0]

In [14]:
for i in range(epoch):
    total_loss, total_accuracy = 0, 0
    X, Y = shuffle(X, Y)
    pbar = tqdm(range(0, len(before), batch_size), desc='train minibatch loop')
    for k in pbar:
        batch_x, _ = pad_sentence_batch(X[k: min(k+batch_size,len(before))], PAD)
        batch_y, _ = pad_sentence_batch(Y[k: min(k+batch_size,len(before))], PAD)
        predicted, loss, _ = sess.run([model.predicting_ids, model.cost, model.optimizer], 
                                      feed_dict={model.X:batch_x,
                                                model.Y:batch_y})
        acc = check_accuracy(predicted,batch_y)
        total_loss += loss
        total_accuracy += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
        
    total_loss /= (len(before) / batch_size)
    total_accuracy /= (len(before) / batch_size)
    print('epoch: %d, avg loss: %f, avg accuracy: %f'%(i+1, total_loss, total_accuracy))

train minibatch loop: 100%|██████████| 892/892 [05:06<00:00,  2.91it/s, accuracy=0.594, cost=0.483]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 1, avg loss: 0.695482, avg accuracy: 0.516531


train minibatch loop: 100%|██████████| 892/892 [05:43<00:00,  2.60it/s, accuracy=0.716, cost=0.428]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 2, avg loss: 0.356663, avg accuracy: 0.716523


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.732, cost=0.238]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 3, avg loss: 0.281400, avg accuracy: 0.760839


train minibatch loop: 100%|██████████| 892/892 [05:43<00:00,  2.60it/s, accuracy=0.726, cost=0.311] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 4, avg loss: 0.242011, avg accuracy: 0.782182


train minibatch loop: 100%|██████████| 892/892 [05:49<00:00,  2.56it/s, accuracy=0.85, cost=0.24]   
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 5, avg loss: 0.217453, avg accuracy: 0.797168


train minibatch loop: 100%|██████████| 892/892 [05:43<00:00,  2.59it/s, accuracy=0.842, cost=0.206] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 6, avg loss: 0.198011, avg accuracy: 0.809455


train minibatch loop: 100%|██████████| 892/892 [05:43<00:00,  2.59it/s, accuracy=0.77, cost=0.251]  
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 7, avg loss: 0.181745, avg accuracy: 0.820828


train minibatch loop: 100%|██████████| 892/892 [05:45<00:00,  2.58it/s, accuracy=0.846, cost=0.15]  
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 8, avg loss: 0.165816, avg accuracy: 0.831306


train minibatch loop: 100%|██████████| 892/892 [05:45<00:00,  2.58it/s, accuracy=0.806, cost=0.167] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 9, avg loss: 0.152959, avg accuracy: 0.842309


train minibatch loop: 100%|██████████| 892/892 [05:43<00:00,  2.60it/s, accuracy=0.825, cost=0.202] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 10, avg loss: 0.140761, avg accuracy: 0.852545


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.844, cost=0.152] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 11, avg loss: 0.128879, avg accuracy: 0.863080


train minibatch loop: 100%|██████████| 892/892 [05:43<00:00,  2.60it/s, accuracy=0.841, cost=0.121] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 12, avg loss: 0.119069, avg accuracy: 0.872432


train minibatch loop: 100%|██████████| 892/892 [05:45<00:00,  2.58it/s, accuracy=0.871, cost=0.0944]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 13, avg loss: 0.109670, avg accuracy: 0.882296


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.919, cost=0.13]  
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 14, avg loss: 0.101104, avg accuracy: 0.891272


train minibatch loop: 100%|██████████| 892/892 [05:43<00:00,  2.59it/s, accuracy=0.877, cost=0.129] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 15, avg loss: 0.092002, avg accuracy: 0.898951


train minibatch loop: 100%|██████████| 892/892 [05:46<00:00,  2.57it/s, accuracy=0.844, cost=0.129] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 16, avg loss: 0.084631, avg accuracy: 0.908217


train minibatch loop: 100%|██████████| 892/892 [05:38<00:00,  2.63it/s, accuracy=0.894, cost=0.109] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 17, avg loss: 0.078061, avg accuracy: 0.914839


train minibatch loop: 100%|██████████| 892/892 [05:39<00:00,  2.62it/s, accuracy=0.919, cost=0.0776]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 18, avg loss: 0.071318, avg accuracy: 0.922429


train minibatch loop: 100%|██████████| 892/892 [05:45<00:00,  2.58it/s, accuracy=0.933, cost=0.0522]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 19, avg loss: 0.065093, avg accuracy: 0.929155


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.957, cost=0.0473]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 20, avg loss: 0.060398, avg accuracy: 0.935381


train minibatch loop: 100%|██████████| 892/892 [05:45<00:00,  2.59it/s, accuracy=0.926, cost=0.0515]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 21, avg loss: 0.055276, avg accuracy: 0.940644


train minibatch loop: 100%|██████████| 892/892 [05:45<00:00,  2.58it/s, accuracy=0.954, cost=0.0693]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 22, avg loss: 0.051781, avg accuracy: 0.945184


train minibatch loop: 100%|██████████| 892/892 [05:43<00:00,  2.60it/s, accuracy=0.945, cost=0.0392]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 23, avg loss: 0.046855, avg accuracy: 0.951056


train minibatch loop: 100%|██████████| 892/892 [05:45<00:00,  2.58it/s, accuracy=0.982, cost=0.0285]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 24, avg loss: 0.043199, avg accuracy: 0.955613


train minibatch loop: 100%|██████████| 892/892 [05:45<00:00,  2.59it/s, accuracy=0.989, cost=0.0535]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 25, avg loss: 0.039693, avg accuracy: 0.959653


train minibatch loop: 100%|██████████| 892/892 [05:52<00:00,  2.53it/s, accuracy=0.973, cost=0.0299]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 26, avg loss: 0.036768, avg accuracy: 0.963155


train minibatch loop: 100%|██████████| 892/892 [05:45<00:00,  2.58it/s, accuracy=0.951, cost=0.0358]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 27, avg loss: 0.033757, avg accuracy: 0.967068


train minibatch loop: 100%|██████████| 892/892 [05:45<00:00,  2.58it/s, accuracy=0.993, cost=0.0221] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 28, avg loss: 0.031591, avg accuracy: 0.970121


train minibatch loop: 100%|██████████| 892/892 [05:42<00:00,  2.60it/s, accuracy=0.992, cost=0.0494]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 29, avg loss: 0.030114, avg accuracy: 0.972151


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.979, cost=0.0135] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 30, avg loss: 0.027564, avg accuracy: 0.975314


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.975, cost=0.0498] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 31, avg loss: 0.025892, avg accuracy: 0.977682


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.918, cost=0.0698] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 32, avg loss: 0.024069, avg accuracy: 0.979250


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.959, cost=0.0744] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 33, avg loss: 0.023140, avg accuracy: 0.981469


train minibatch loop: 100%|██████████| 892/892 [05:45<00:00,  2.58it/s, accuracy=0.959, cost=0.0458] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 34, avg loss: 0.020938, avg accuracy: 0.982726


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.995, cost=0.0348] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 35, avg loss: 0.020584, avg accuracy: 0.983847


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.98, cost=0.0404]  
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 36, avg loss: 0.018204, avg accuracy: 0.985815


train minibatch loop: 100%|██████████| 892/892 [05:51<00:00,  2.54it/s, accuracy=0.997, cost=0.00758]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 37, avg loss: 0.019663, avg accuracy: 0.985068


train minibatch loop: 100%|██████████| 892/892 [05:46<00:00,  2.57it/s, accuracy=0.989, cost=0.0161] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 38, avg loss: 0.017290, avg accuracy: 0.987544


train minibatch loop: 100%|██████████| 892/892 [05:45<00:00,  2.58it/s, accuracy=0.988, cost=0.0235] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 39, avg loss: 0.016680, avg accuracy: 0.988373


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.985, cost=0.0138] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 40, avg loss: 0.017076, avg accuracy: 0.988090


train minibatch loop: 100%|██████████| 892/892 [05:45<00:00,  2.58it/s, accuracy=0.967, cost=0.0166] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 41, avg loss: 0.015203, avg accuracy: 0.990019


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.987, cost=0.00969]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 42, avg loss: 0.014750, avg accuracy: 0.990250


train minibatch loop: 100%|██████████| 892/892 [05:43<00:00,  2.60it/s, accuracy=0.966, cost=0.0154] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 43, avg loss: 0.013783, avg accuracy: 0.990331


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.996, cost=0.00563]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 44, avg loss: 0.013778, avg accuracy: 0.990722


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.977, cost=0.0176] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 45, avg loss: 0.013877, avg accuracy: 0.990801


train minibatch loop: 100%|██████████| 892/892 [05:50<00:00,  2.54it/s, accuracy=0.981, cost=0.0237] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 46, avg loss: 0.013370, avg accuracy: 0.991398


train minibatch loop: 100%|██████████| 892/892 [05:52<00:00,  2.53it/s, accuracy=0.976, cost=0.014]  
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 47, avg loss: 0.012350, avg accuracy: 0.991903


train minibatch loop: 100%|██████████| 892/892 [05:46<00:00,  2.57it/s, accuracy=0.991, cost=0.0097] 
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 48, avg loss: 0.011992, avg accuracy: 0.992415


train minibatch loop: 100%|██████████| 892/892 [05:45<00:00,  2.58it/s, accuracy=0.998, cost=0.00385]
train minibatch loop:   0%|          | 0/892 [00:00<?, ?it/s]

epoch: 49, avg loss: 0.012817, avg accuracy: 0.991546


train minibatch loop: 100%|██████████| 892/892 [05:44<00:00,  2.59it/s, accuracy=0.976, cost=0.00929]

epoch: 50, avg loss: 0.012196, avg accuracy: 0.992369





In [16]:
predicted = sess.run(model.predicting_ids,feed_dict={model.X:batch_x})

In [17]:
for i in range(len(batch_x)):
    print('row %d'%(i+1))
    print('BEFORE:',''.join([rev_dictionary_from[n] for n in batch_x[i] if n not in [0,1,2,3]]))
    print('REAL AFTER:',''.join([rev_dictionary_to[n] for n in batch_y[i] if n not in[0,1,2,3]]))
    print('PREDICTED AFTER:',''.join([rev_dictionary_to[n] for n in predicted[i] if n not in[0,1,2,3]]),'\n')

row 1
BEFORE: mlysin
REAL AFTER: malaysian
PREDICTED AFTER: malaysian 

row 2
BEFORE: l-duny
REAL AFTER: al-dunya
PREDICTED AFTER: al-dunya 

row 3
BEFORE: misri
REAL AFTER: misri
PREDICTED AFTER: misri 

row 4
BEFORE: nteraks
REAL AFTER: interaksi
PREDICTED AFTER: interaksi 

row 5
BEFORE: arlngtn
REAL AFTER: arlington
PREDICTED AFTER: arlington 

row 6
BEFORE: knsng
REAL AFTER: kuansing
PREDICTED AFTER: kuansing 

row 7
BEFORE: bn-sekijng
REAL AFTER: bn-sekijang
PREDICTED AFTER: bn-sekijang 

row 8
BEFORE: ysr
REAL AFTER: yasir
PREDICTED AFTER: yasir 

row 9
BEFORE: mmnul
REAL AFTER: emmanuel
PREDICTED AFTER: emmanuel 

row 10
BEFORE: msladng
REAL AFTER: misleading
PREDICTED AFTER: misleading 

row 11
BEFORE: inflate
REAL AFTER: inflate
PREDICTED AFTER: inflate 

row 12
BEFORE: disebalik
REAL AFTER: disebalik
PREDICTED AFTER: disebalik 

row 13
BEFORE: perjdin
REAL AFTER: perjudian
PREDICTED AFTER: perjudian 

row 14
BEFORE: abandn
REAL AFTER: abandon
PREDICTED AFTER: abandon 

row 1

In [55]:
predicted = sess.run(model.predicting_ids,feed_dict={model.X:str_idx(['mly'],dictionary_from)})[0]
print('PREDICTED AFTER:',''.join([rev_dictionary_to[n] for n in predicted if n not in[0,1,2,3]]))

PREDICTED AFTER: milyu


In [29]:
predicted = sess.run(model.predicting_ids,feed_dict={model.X:str_idx(['nikmt'],dictionary_from)})[0]
print('PREDICTED AFTER:',''.join([rev_dictionary_to[n] for n in predicted if n not in[0,1,2,3]]))

PREDICTED AFTER: nikomati


In [27]:
predicted = sess.run(model.predicting_ids,feed_dict={model.X:str_idx(['pmsukan'],dictionary_from)})[0]
print('PREDICTED AFTER:',''.join([rev_dictionary_to[n] for n in predicted if n not in[0,1,2,3]]))

PREDICTED AFTER: pemusukan


In [21]:
predicted = sess.run(model.predicting_ids,feed_dict={model.X:str_idx(['bsuk'],dictionary_from)})[0]
print('PREDICTED AFTER:',''.join([rev_dictionary_to[n] for n in predicted if n not in[0,1,2,3]]))

PREDICTED AFTER: bsekuk


In [22]:
predicted = sess.run(model.predicting_ids,feed_dict={model.X:str_idx(['cmel'],dictionary_from)})[0]
print('PREDICTED AFTER:',''.join([rev_dictionary_to[n] for n in predicted if n not in[0,1,2,3]]))

PREDICTED AFTER: comelio


In [56]:
saver = tf.train.Saver(tf.global_variables())
saver.save(sess, "normalizer/model.ckpt")

'normalizer/model.ckpt'

In [57]:
strings=','.join([n.name for n in tf.get_default_graph().as_graph_def().node if "Variable" in n.op or n.name.find('Placeholder') >= 0 or n.name.find('logits') == 0])

In [58]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            "directory: %s" % model_dir)

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path
    
    absolute_model_dir = "/".join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + "/frozen_model.pb"
    clear_devices = True
    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(",")
        ) 
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph." % len(output_graph_def.node))

In [59]:
freeze_graph("normalizer", strings)

INFO:tensorflow:Restoring parameters from normalizer/model.ckpt
INFO:tensorflow:Froze 50 variables.
Converted 50 variables to const ops.
1295 ops in the final graph.


In [60]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [61]:
from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
g=load_graph('normalizer/frozen_model.pb')

In [63]:
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph=g)
predicted = test_sess.run(logits,feed_dict={x:str_idx(['bjalan'],dictionary_from)})[0]
print('PREDICTED AFTER:',''.join([rev_dictionary_to[n] for n in predicted if n not in[0,1,2,3]]))

PREDICTED AFTER: bujalan


In [64]:
import json
with open('normalizer-deep.json','w') as fopen:
    fopen.write(json.dumps({'dictionary_from':dictionary_from,
                'dictionary_to':dictionary_to,
                'rev_dictionary_to':rev_dictionary_to,
                'rev_dictionary_from':rev_dictionary_from}))