In [1]:
import numpy as np
import tensorflow as tf
from sklearn.utils import shuffle
import re
import time
import collections
import os
import itertools
from tqdm import tqdm

In [2]:
def build_dataset(words, n_words, atleast=1):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    counter = [i for i in counter if i[1] >= atleast]
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [3]:
import json

with open('augment-normalizer-v4.json') as fopen:
    texts = json.load(fopen)
    
before, after = [], []
    
for splitted in texts:
    if len(splitted) < 2:
        continue
    if not len(splitted[0]):
        continue
    before.append(list(splitted[0]))
    after.append(list(splitted[1]))
    
assert len(before) == len(after)

In [4]:
concat_from = list(itertools.chain(*before))
vocabulary_size_from = len(list(set(concat_from)))
data_from, count_from, dictionary_from, rev_dictionary_from = build_dataset(concat_from, vocabulary_size_from)
print('vocab from size: %d'%(vocabulary_size_from))
print('Most common words', count_from[4:10])
print('Sample data', data_from[:10], [rev_dictionary_from[i] for i in data_from[:10]])
print('filtered vocab size:',len(dictionary_from))
print("% of vocab used: {}%".format(round(len(dictionary_from)/vocabulary_size_from,4)*100))

vocab from size: 28
Most common words [('a', 1090958), ('l', 943383), ('e', 773153), ('n', 623036), ('r', 499905), ('x', 439435)]
Sample data [4, 19, 4, 20, 9, 19, 4, 20, 9, 19] ['a', 'b', 'a', 'd', 'x', 'b', 'a', 'd', 'x', 'b']
filtered vocab size: 32
% of vocab used: 114.29%


In [5]:
concat_to = list(itertools.chain(*after))
vocabulary_size_to = len(list(set(concat_to)))
data_to, count_to, dictionary_to, rev_dictionary_to = build_dataset(concat_to, vocabulary_size_to)
print('vocab from size: %d'%(vocabulary_size_to))
print('Most common words', count_to[4:10])
print('Sample data', data_to[:10], [rev_dictionary_to[i] for i in data_to[:10]])
print('filtered vocab size:',len(dictionary_to))
print("% of vocab used: {}%".format(round(len(dictionary_to)/vocabulary_size_to,4)*100))

vocab from size: 29
Most common words [('a', 2164890), (' ', 1131495), ('l', 943383), ('k', 843343), ('h', 828089), ('t', 729459)]
Sample data [4, 19, 4, 21, 9, 4, 7, 5, 4, 19] ['a', 'b', 'a', 'd', 't', 'a', 'k', ' ', 'a', 'b']
filtered vocab size: 33
% of vocab used: 113.78999999999999%


In [6]:
GO = dictionary_from['GO']
PAD = dictionary_from['PAD']
EOS = dictionary_from['EOS']
UNK = dictionary_from['UNK']

In [7]:
for i in range(len(after)):
    after[i].append('EOS')

In [8]:
class Stemmer:
    def __init__(self, size_layer, num_layers, embedded_size, 
                 from_dict_size, to_dict_size, learning_rate, 
                 dropout = 0.8, beam_width = 15, force_teaching_ratio=0.5):
        
        def lstm_cell(reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size_layer, reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]

        encoder_embeddings = tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        encoder_cells = tf.nn.rnn_cell.MultiRNNCell([lstm_cell() for _ in range(num_layers)])
        self.encoder_out, self.encoder_state = tf.nn.dynamic_rnn(cell = encoder_cells, 
                                                                 inputs = encoder_embedded, 
                                                                 sequence_length = self.X_seq_len,
                                                                 dtype = tf.float32)
        
        encoder_state = tuple(self.encoder_state[-1] for _ in range(num_layers))
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        decoder_embeddings = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))
        dense_layer = tf.layers.Dense(to_dict_size)
        
        with tf.variable_scope('decode'):
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
            num_units = size_layer, 
            memory = encoder_embedded,
            memory_sequence_length = self.X_seq_len)
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell() for _ in range(num_layers)]),
                attention_mechanism = attention_mechanism,
                attention_layer_size = size_layer)
            main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
            decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
            training_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            inputs = tf.nn.embedding_lookup(decoder_embeddings, decoder_input),
                sequence_length = self.Y_seq_len,
                embedding = decoder_embeddings,
                sampling_probability = 1 - force_teaching_ratio,
                time_major = False)
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cell,
                helper = training_helper,
                initial_state = decoder_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state),
                output_layer = dense_layer)
            training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(self.Y_seq_len))
            
        with tf.variable_scope('decode', reuse=True):
            encoder_out_tiled = tf.contrib.seq2seq.tile_batch(encoder_embedded, beam_width)
            encoder_state_tiled = tf.contrib.seq2seq.tile_batch(encoder_state, beam_width)
            X_seq_len_tiled = tf.contrib.seq2seq.tile_batch(self.X_seq_len, beam_width)
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                num_units = size_layer, 
                memory = encoder_out_tiled,
                memory_sequence_length = X_seq_len_tiled)
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell(reuse=True) for _ in range(num_layers)]),
                attention_mechanism = attention_mechanism,
                attention_layer_size = size_layer)
            predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                cell = decoder_cell,
                embedding = decoder_embeddings,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS,
                initial_state = decoder_cell.zero_state(batch_size * beam_width, tf.float32).clone(cell_state = encoder_state_tiled),
                beam_width = beam_width,
                output_layer = dense_layer,
                length_penalty_weight = 0.0)
            predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = False,
                maximum_iterations = 2 * tf.reduce_max(self.X_seq_len))
            
            
        self.training_logits = training_decoder_output.rnn_output
        self.predicting_ids = tf.identity(predicting_decoder_output.predicted_ids[:, :, 0],name="logits")
        
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [9]:
size_layer = 256
num_layers = 2
embedded_size = 128
learning_rate = 1e-3
batch_size = 128
epoch = 10

In [10]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Stemmer(size_layer, num_layers, embedded_size, len(dictionary_from), 
                len(dictionary_to), learning_rate)
sess.run(tf.global_variables_initializer())

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [11]:
def str_idx(corpus, dic, UNK=3):
    X = []
    for i in corpus:
        ints = []
        for k in i:
            ints.append(dic.get(k, UNK))
        X.append(ints)
    return X

In [12]:
X = str_idx(before, dictionary_from)
Y = str_idx(after, dictionary_to)

In [13]:
from sklearn.cross_validation import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.1)



In [14]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

In [15]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break
    total_loss, total_accuracy, total_loss_test, total_accuracy_test = 0, 0, 0, 0
    train_X, train_Y = shuffle(train_X, train_Y)
    test_X, test_Y = shuffle(test_X, test_Y)
    pbar = tqdm(range(0, len(train_X), batch_size), desc='train minibatch loop')
    for k in pbar:
        batch_x, _ = pad_sentence_batch(train_X[k: min(k+batch_size,len(train_X))], PAD)
        batch_y, _ = pad_sentence_batch(train_Y[k: min(k+batch_size,len(train_X))], PAD)
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                                      feed_dict={model.X:batch_x,
                                                model.Y:batch_y})
        total_loss += loss
        total_accuracy += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for k in pbar:
        batch_x, _ = pad_sentence_batch(test_X[k: min(k+batch_size,len(test_X))], PAD)
        batch_y, _ = pad_sentence_batch(test_Y[k: min(k+batch_size,len(test_X))], PAD)
        acc, loss = sess.run([model.accuracy, model.cost], 
                                      feed_dict={model.X:batch_x,
                                                model.Y:batch_y})
        total_loss_test += loss
        total_accuracy_test += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
        
    total_loss /= (len(train_X) / batch_size)
    total_accuracy /= (len(train_X) / batch_size)
    total_loss_test /= (len(test_X) / batch_size)
    total_accuracy_test /= (len(test_X) / batch_size)
    
    if total_accuracy_test > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, total_accuracy_test)
        )
        CURRENT_ACC = total_accuracy_test
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('epoch: %d, avg loss: %f, avg accuracy: %f'%(EPOCH, total_loss, total_accuracy))
    print('epoch: %d, avg loss test: %f, avg accuracy test: %f'%(EPOCH, total_loss_test, total_accuracy_test))
    EPOCH += 1

train minibatch loop: 100%|██████████| 5984/5984 [09:41<00:00, 10.29it/s, accuracy=0.988, cost=0.0326]
test minibatch loop: 100%|██████████| 665/665 [00:28<00:00, 23.28it/s, accuracy=0.969, cost=0.0738]


epoch: 0, pass acc: 0.000000, current acc: 0.970278
epoch: 0, avg loss: 0.196654, avg accuracy: 0.939013
epoch: 0, avg loss test: 0.094238, avg accuracy test: 0.970278


train minibatch loop: 100%|██████████| 5984/5984 [09:38<00:00, 10.35it/s, accuracy=0.986, cost=0.0476]
test minibatch loop: 100%|██████████| 665/665 [00:28<00:00, 23.39it/s, accuracy=0.979, cost=0.0654]


epoch: 1, pass acc: 0.970278, current acc: 0.977899
epoch: 1, avg loss: 0.079812, avg accuracy: 0.974141
epoch: 1, avg loss test: 0.067784, avg accuracy test: 0.977899


train minibatch loop: 100%|██████████| 5984/5984 [09:37<00:00, 10.78it/s, accuracy=0.987, cost=0.0371]
test minibatch loop: 100%|██████████| 665/665 [00:28<00:00, 23.38it/s, accuracy=0.986, cost=0.0392]


epoch: 2, pass acc: 0.977899, current acc: 0.983517
epoch: 2, avg loss: 0.057337, avg accuracy: 0.980940
epoch: 2, avg loss test: 0.049763, avg accuracy test: 0.983517


train minibatch loop: 100%|██████████| 5984/5984 [09:38<00:00, 11.04it/s, accuracy=0.987, cost=0.0374]
test minibatch loop: 100%|██████████| 665/665 [00:28<00:00, 23.38it/s, accuracy=0.992, cost=0.0214]


epoch: 3, pass acc: 0.983517, current acc: 0.984213
epoch: 3, avg loss: 0.045526, avg accuracy: 0.984753
epoch: 3, avg loss test: 0.048175, avg accuracy test: 0.984213


train minibatch loop: 100%|██████████| 5984/5984 [09:38<00:00, 10.87it/s, accuracy=0.985, cost=0.0351]
test minibatch loop: 100%|██████████| 665/665 [00:28<00:00, 23.31it/s, accuracy=0.988, cost=0.026] 


epoch: 4, pass acc: 0.984213, current acc: 0.985816
epoch: 4, avg loss: 0.038852, avg accuracy: 0.986804
epoch: 4, avg loss test: 0.042522, avg accuracy test: 0.985816


train minibatch loop: 100%|██████████| 5984/5984 [09:38<00:00, 10.85it/s, accuracy=0.992, cost=0.022] 
test minibatch loop: 100%|██████████| 665/665 [00:28<00:00, 23.34it/s, accuracy=0.991, cost=0.0312]


epoch: 5, pass acc: 0.985816, current acc: 0.986477
epoch: 5, avg loss: 0.035392, avg accuracy: 0.987867
epoch: 5, avg loss test: 0.040669, avg accuracy test: 0.986477


train minibatch loop: 100%|██████████| 5984/5984 [09:38<00:00, 10.34it/s, accuracy=0.985, cost=0.0442] 
test minibatch loop: 100%|██████████| 665/665 [00:28<00:00, 23.36it/s, accuracy=0.98, cost=0.0506] 


epoch: 6, pass acc: 0.986477, current acc: 0.987883
epoch: 6, avg loss: 0.032731, avg accuracy: 0.988687
epoch: 6, avg loss test: 0.035074, avg accuracy test: 0.987883


train minibatch loop: 100%|██████████| 5984/5984 [09:37<00:00, 10.54it/s, accuracy=0.997, cost=0.0127] 
test minibatch loop: 100%|██████████| 665/665 [00:28<00:00, 23.35it/s, accuracy=0.99, cost=0.0258]  


epoch: 7, pass acc: 0.987883, current acc: 0.988393
epoch: 7, avg loss: 0.031217, avg accuracy: 0.989066
epoch: 7, avg loss test: 0.033957, avg accuracy test: 0.988393


train minibatch loop: 100%|██████████| 5984/5984 [09:37<00:00, 11.08it/s, accuracy=0.985, cost=0.0565] 
test minibatch loop: 100%|██████████| 665/665 [00:28<00:00, 23.31it/s, accuracy=0.991, cost=0.028]  


epoch: 8, pass acc: 0.988393, current acc: 0.989235
epoch: 8, avg loss: 0.029715, avg accuracy: 0.989519
epoch: 8, avg loss test: 0.031453, avg accuracy test: 0.989235


train minibatch loop:   8%|▊         | 508/5984 [00:49<08:45, 10.41it/s, accuracy=0.994, cost=0.0164]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

test minibatch loop: 100%|██████████| 665/665 [00:28<00:00, 23.34it/s, accuracy=0.987, cost=0.0427] 


epoch: 9, avg loss: 0.028808, avg accuracy: 0.989746
epoch: 9, avg loss test: 0.033073, avg accuracy test: 0.988883


train minibatch loop:  32%|███▏      | 1885/5984 [03:01<06:26, 10.60it/s, accuracy=0.991, cost=0.0232] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop:  51%|█████     | 3058/5984 [04:55<04:45, 10.25it/s, accuracy=0.99, cost=0.0261]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop:  73%|███████▎  | 4380/5984 [07:03<02:42,  9.89it/s, accuracy=0.987, cost=0.0376] IOPub message rate exceeded.
The notebook

In [16]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, "beamsearch-luong-normalize/model.ckpt")

'beamsearch-luong-normalize/model.ckpt'

In [17]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'OptimizeLoss' not in n.name
        and 'Global_Step' not in n.name
    ]
)

In [18]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            "directory: %s" % model_dir)

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path
    
    absolute_model_dir = "/".join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + "/frozen_model.pb"
    clear_devices = True
    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(",")
        ) 
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph." % len(output_graph_def.node))

In [19]:
freeze_graph("beamsearch-luong-normalize", strings)

INFO:tensorflow:Restoring parameters from beamsearch-luong-normalize/model.ckpt
INFO:tensorflow:Froze 14 variables.
INFO:tensorflow:Converted 14 variables to const ops.
1739 ops in the final graph.


In [20]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [21]:
g=load_graph('beamsearch-luong-normalize/frozen_model.pb')

In [22]:
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph=g)
predicted = test_sess.run(logits,feed_dict={x:str_idx(['makn'],dictionary_from)})[0]
print('PREDICTED AFTER:',''.join([rev_dictionary_to[n] for n in predicted if n not in[0,1,2,3]]))

PREDICTED AFTER: makin




In [23]:
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph=g)
predicted = test_sess.run(logits,feed_dict={x:str_idx(['kecomelkn'],dictionary_from)})[0]
print('PREDICTED AFTER:',''.join([rev_dictionary_to[n] for n in predicted if n not in[0,1,2,3]]))

PREDICTED AFTER: kecomelkan


In [24]:
predicted = test_sess.run(logits,feed_dict={x:str_idx(['xjdi'],dictionary_from)})[0]
print('PREDICTED AFTER:',''.join([rev_dictionary_to[n] for n in predicted if n not in[0,1,2,3]]))

PREDICTED AFTER: tak jadi


In [25]:
import json
with open('beamsearch-luong-normalize.json','w') as fopen:
    fopen.write(json.dumps({'dictionary_from':dictionary_from,
                'dictionary_to':dictionary_to,
                'rev_dictionary_to':rev_dictionary_to,
                'rev_dictionary_from':rev_dictionary_from}))