In [2]:
import numpy as np
import tensorflow as tf
import jamo
import os

In [3]:
# Control symbols of the character vocabulary. They are taken from outside the
# Hangul blocks, presumably so that they cannot collide with real Korean text.
ROOT = 'ᐁ' # Root of sentence symbol
MS = 'ᑌ' # morpheme separator symbol
WS = 'ᐯ' # word separator symbol
EOS = 'ᕒ' # end of sentence symbol
ESC_BEGIN = 'ᐸ' # beginning of escape sequence symbol
ESC_END = 'ᐳ' # end of escape sequence symbol
PADDING = 'ᒣ' # padding after end-of-sentence
MASK = 'ᗰ' # masking symbol

# Symbol table: the position of a symbol in this list is its integer code.
#   0-9    decimal digits (also used to spell code points inside escape sequences)
#   10-17  the control symbols above (ESC_BEGIN is 10 and ESC_END is 11; the
#          encode/decode helpers rely on those two positions)
#   then   conjoining jamo: leading consonants U+1100-U+1112, vowels
#          U+1161-U+1175, trailing consonants U+11A8-U+11C2
#   then   space and basic punctuation
# Every entry must be exactly one character. The double-quote entry used to be
# the two-character string '""', which never matched any input character, so
# '"' was always emitted as an escape sequence. The table length and all other
# positions are unchanged by the fix.
hangul = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] + \
    [ESC_BEGIN, ESC_END, ROOT, MS, WS, EOS, PADDING, MASK] + \
    [chr(i) for i in range(0x1100, 0x1113)] + \
    [chr(i) for i in range(0x1161, 0x1176)] + \
    [chr(i) for i in range(0x11A8, 0x11C3)] + \
    [' ', '(', ')', '.', ',', '?', '"', '\'']
PADDING_IDX = hangul.index(PADDING)
MASK_IDX = hangul.index(MASK)
def encode_string(s):
    """Turn a string into a list of integer codes (positions in `hangul`).

    The text is first decomposed with jamo.h2j, and any Hangul compatibility
    consonant (U+3131-U+314E) is mapped to its trailing-consonant form with
    jamo.hcj2j. Symbols present in `hangul` become their list position; every
    other symbol becomes an escape sequence: code 10, the decimal digits of
    its code point (one code per digit), then code 11.
    """
    esc_open, esc_close = 10, 11  # positions of ESC_BEGIN / ESC_END in hangul
    compat_consonants = {chr(code_point) for code_point in range(0x3131, 0x314f)}

    decomposed = jamo.h2j(s)
    normalized = "".join(
        jamo.hcj2j(symbol, "tail") if symbol in compat_consonants else symbol
        for symbol in decomposed)

    codes = []
    for symbol in normalized:
        if symbol in hangul:
            codes.append(hangul.index(symbol))
            continue
        # out-of-vocabulary symbol: spell out its code point between the escape markers
        codes.append(esc_open)
        codes.extend(hangul.index(digit) for digit in str(ord(symbol)))
        codes.append(esc_close)
    return codes

def decode_string(s):
    """Inverse of encode_string: turn a sequence of integer codes into text.

    Code 10 opens an escape sequence; the codes up to the next 11 are read as
    decimal digits of a Unicode code point. Any other code is looked up in
    `hangul`. An escape sequence without a closing 11 raises IndexError.
    """
    pieces = []
    pos, total = 0, len(s)
    while pos < total:
        code = s[pos]
        pos += 1
        if code != 10:
            pieces.append(hangul[code])
            continue
        # inside an escape sequence: gather digits until the closing marker
        digits = []
        while s[pos] != 11:
            digits.append(str(s[pos]))
            pos += 1
        pos += 1  # step over the closing marker
        pieces.append(chr(int("".join(digits))))
    return "".join(pieces)

In [4]:
# Fixed widths that read_conllu pads every sentence to; the placeholders
# further down are declared with the same widths.
max_in_length = 600  # encoded input (surface text) symbols per sentence
max_out_length = 700  # encoded output (ROOT + morphemes + EOS) symbols per sentence
max_dep_length = 90  # words per sentence in the dependency arrays

"""
CoNLL-U Format
ID: Word index, integer starting at 1 for each new sentence; may be a range for multiword tokens; may be a decimal number for empty nodes (decimal numbers can be lower than 1 but must be greater than 0).
FORM: Word form or punctuation symbol.
LEMMA: Lemma or stem of word form.
UPOS: Universal part-of-speech tag.
XPOS: Language-specific part-of-speech tag; underscore if not available.
FEATS: List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available.
HEAD: Head of the current word, which is either a value of ID or zero (0).
DEPREL: Universal dependency relation to the HEAD (root iff HEAD = 0) or a defined language-specific subtype of one.
DEPS: Enhanced dependency graph in the form of a list of head-deprel pairs.
MISC: Any other annotation.
"""
def read_conllu(filenames):
    """Parse CoNLL-U files into fixed-width, padded int32 arrays.

    A new sentence starts at every '# text = ' comment line. Rows that do not
    have exactly 10 tab-separated fields are ignored, and so are rows whose ID
    is not a plain integer (multiword-token ranges such as '1-2' and empty
    nodes such as '1.1'), whose HEAD column is not a number.

    Args:
        filenames: iterable of paths to UTF-8 encoded CoNLL-U files.

    Returns:
        dict of numpy int32 arrays, one row per sentence:
          'inputs'      encoded surface forms, padded with PADDING_IDX to max_in_length
          'in_lengths'  unpadded length of each 'inputs' row
          'morphs'      encoded ROOT + lemmas (morphemes joined by MS, each word
                        closed by WS) + EOS, padded with PADDING_IDX to max_out_length
          'out_lengths' unpadded length of each 'morphs' row
          'depend_idxs' for each word, the position in 'morphs' of the last symbol
                        of that word's lemma string; zero-padded to max_dep_length
          'depends'     for each word, the position in 'inputs' of the last symbol
                        of that word's surface form; zero-padded to max_dep_length
          'dep_lengths' number of words in each sentence

    Raises:
        ValueError: if a sentence is longer than max_in_length, max_out_length
            or max_dep_length.
    """
    texts = []    # per sentence: list of encoded surface forms, one per word
    morphs = []   # per sentence: [ROOT, lemma string per word, ...]
    depends = []  # per sentence: HEAD - 1 for each word
    for filename in filenames:
        # CoNLL-U is UTF-8 by specification; do not depend on the platform default.
        with open(filename, encoding='utf-8') as fp:
            for line in fp:
                if line.startswith('#'):
                    if line.startswith('# text = '):
                        texts.append([])
                        morphs.append([ROOT])
                        depends.append([])
                    continue

                split = line.split('\t')
                if len(split) != 10:
                    continue
                idx, form, lemma, upos, xpos, feats, head, deprel, deps, misc = split

                # Multiword-token ranges and empty nodes carry '_' as HEAD and
                # would crash int(head) below; they are not words of their own.
                if not idx.isdigit():
                    continue

                if "SpaceAfter=No" not in misc:
                    form += ' '
                texts[-1].append(encode_string(form))

                lemma = MS.join(lemma.split('+')) + WS
                morphs[-1].append(lemma)
                depends[-1].append(int(head) - 1)

    # close every sentence with EOS, then encode word by word
    for sentence in morphs:
        sentence[-1] += EOS
    morphs = [[encode_string(w) for w in m] for m in morphs]

    dep_lengths = []
    depend_idxs = []
    depend_aligns = []
    for n, (text, morph, depend) in enumerate(zip(texts, morphs, depends)):
        word_cum_lengths = np.cumsum([len(w) for w in text])
        morph_cum_lengths = np.cumsum([len(w) for w in morph])

        # morph[0] is ROOT, so entry k of `indices` is the output position of
        # the last symbol of word k
        indices = morph_cum_lengths[1:] - 1
        if len(indices) > max_dep_length:
            raise ValueError("sentence {} has {} words, more than max_dep_length={}"
                             .format(n, len(indices), max_dep_length))
        pad_size = max_dep_length - len(indices)
        indices = np.pad(indices, (0, pad_size), 'constant')
        depend_idxs.append(indices)

        # NOTE(review): this replaces the parsed HEAD with the input position of
        # the word's *own* last symbol, so the HEAD column never reaches the
        # output. If the head word was intended, this would presumably be
        # word_cum_lengths[depend[i]] - 1 (with a decision for HEAD = 0).
        # Left unchanged pending confirmation.
        for i in range(len(depend)):
            depend[i] = word_cum_lengths[i] - 1
        dep_lengths.append(len(depend))
        depend_aligns.append(np.pad(depend, (0, pad_size), 'constant'))

    text_lengths = []
    padded_texts = []
    for n, words in enumerate(texts):
        flat = [code for word in words for code in word]
        if len(flat) > max_in_length:
            raise ValueError("sentence {} has {} input symbols, more than max_in_length={}"
                             .format(n, len(flat), max_in_length))
        text_lengths.append(len(flat))
        padded_texts.append(flat + [PADDING_IDX] * (max_in_length - len(flat)))

    out_lengths = []
    padded_morphs = []
    for n, words in enumerate(morphs):
        flat = [code for word in words for code in word]
        if len(flat) > max_out_length:
            raise ValueError("sentence {} has {} output symbols, more than max_out_length={}"
                             .format(n, len(flat), max_out_length))
        out_lengths.append(len(flat))
        padded_morphs.append(flat + [PADDING_IDX] * (max_out_length - len(flat)))

    return {'inputs': np.array(padded_texts, dtype=np.int32),
            'in_lengths': np.array(text_lengths, dtype=np.int32),
            'depend_idxs': np.array(depend_idxs, dtype=np.int32),
            'depends': np.array(depend_aligns, dtype=np.int32),
            'dep_lengths': np.array(dep_lengths, dtype=np.int32),
            'morphs': np.array(padded_morphs, dtype=np.int32),
            'out_lengths': np.array(out_lengths, dtype=np.int32)}

In [5]:
# Set to True to ignore the cached .npy files and re-parse the CoNLL-U sources.
reparse = False

print("Reading input files...")
# Both caches are loaded below, so both must be present to take the fast path.
if not reparse and os.path.exists('train.npy') and os.path.exists('test.npy'):
    print("Reading from .npy...")
    # np.save stores each dict as a pickled 0-d object array. NumPy >= 1.16.3
    # refuses to unpickle unless allow_pickle=True is given. Unpickling can run
    # arbitrary code, so only load cache files this notebook wrote itself.
    train = np.load('train.npy', allow_pickle=True).item()
    test = np.load('test.npy', allow_pickle=True).item()
else:
    print("Parsing ConLLU database...")
    train = read_conllu([
        'UD_Korean-GSD/ko_gsd-ud-train.conllu',
        'UD_Korean-Kaist/ko_kaist-ud-train.conllu'])
    test  = read_conllu([
        'UD_Korean-GSD/ko_gsd-ud-test.conllu',
        'UD_Korean-Kaist/ko_kaist-ud-test.conllu'])

    # Save to file for later
    np.save('train.npy', train)
    np.save('test.npy', test)

print("Done.")
print("Training set size:", len(train['inputs']))
print("Test set size:", len(test['inputs']))

Reading input files...
Reading from .npy...
Done.
Training set size: 27410
Test set size: 3276


In [6]:
class ConcatOutputAndAttentionWrapper(tf.contrib.rnn.RNNCell):
    '''RNN cell adapter that appends the attention context to the wrapped cell's output.

    The wrapped cell is expected to be an AttentionWrapper built with
    attention_layer_size=None and output_attention=False, so that its state
    exposes the context vector through an "attention" field.
    '''

    def __init__(self, cell):
        super().__init__()
        self._cell = cell

    @property
    def state_size(self):
        '''State layout is that of the wrapped cell, unchanged.'''
        return self._cell.state_size

    @property
    def output_size(self):
        '''Wrapped cell's output width plus the width of its attention vector.'''
        wrapped = self._cell
        return wrapped.output_size + wrapped.state_size.attention

    def call(self, inputs, state):
        '''Run one step of the wrapped cell; return (output ++ attention, new state).'''
        cell_output, next_state = self._cell(inputs, state)
        combined = tf.concat([cell_output, next_state.attention], axis=-1)
        return combined, next_state

    def zero_state(self, batch_size, dtype):
        '''Initial state is delegated to the wrapped cell.'''
        return self._cell.zero_state(batch_size, dtype)

In [7]:
# Character embedding table used by both Encoder and Decoder below: one
# 256-wide row per entry of `hangul`. It lives in the 'root' scope with
# AUTO_REUSE, so re-running this cell returns the existing variable instead
# of failing.
with tf.variable_scope('root', reuse=tf.AUTO_REUSE):
    char_embed_table = tf.get_variable('embedding', 
                            [len(hangul), 256], # number of symbols, embedding vector size
                            dtype=tf.float32,
                            initializer=tf.truncated_normal_initializer(stddev=0.5))

In [8]:
class Encoder:
    """Character encoder: embedding lookup -> 3 conv layers -> bidirectional LSTM.

    Args:
        inputs: int tensor of symbol codes, (batch, input_length).
        lengths: int tensor with the unpadded length of each row of `inputs`.
        is_training: Python bool; enables dropout after each conv layer.

    Attributes:
        output: forward and backward LSTM outputs concatenated on the last
            axis, i.e. (batch, input_length, 2 * 128).
    """
    # inputs: (batch, input_length)
    def __init__(self, inputs, lengths, is_training):

        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        # 3 convolution layers
        x = char_embedded_inputs
        with tf.variable_scope('prenet'):
            layer_sizes = [256, 256, 256]
            drop_rate = 0.1 if is_training else 0.0
            for i, size in enumerate(layer_sizes):
                conv_layer = tf.layers.Conv1D(filters=size, # number of output channels
                                              kernel_size=5,
                                              padding="same",
                                              activation=tf.nn.relu,
                                              name="conv_{}".format(i))
                x = conv_layer.apply(x)
                # The dropout result used to be thrown away, and without
                # training=True tf.layers.dropout is the identity, so dropout
                # was never applied. Keep the result and pass the flag.
                x = tf.layers.dropout(x,
                                      rate=drop_rate,
                                      training=is_training,
                                      name="dropout_{}".format(i))
        conv_result = x

        num_hidden = 128
        lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(num_hidden, forget_bias=1.0)
        lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(num_hidden, forget_bias=1.0)
        # sequence_length stops both directions at each row's unpadded length
        outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=lstm_fw_cell,
            cell_bw=lstm_bw_cell,
            inputs=conv_result,
            sequence_length=lengths,
            dtype=tf.float32)
        output_concat = tf.concat(list(outputs), -1)

        self.output = output_concat

In [9]:
def batch_scatter(indices, updates, shape):
    """Scatter rows of `updates` into a zero tensor of 3-D `shape`, batch by batch.

    `indices[b, k]` names the position along axis 1 of batch element `b` that
    receives `updates[b, k, :]`. The work is done on a flattened
    (shape[0] * shape[1], shape[2]) view with tf.scatter_nd and reshaped back.
    """
    batch, length, depth = shape[0], shape[1], shape[2]

    # row index in the flattened view: b * length + position
    row_offsets = tf.expand_dims(tf.range(0, batch) * length, 1)
    flat_indices = tf.reshape(indices + row_offsets, [-1, 1])
    flat_updates = tf.reshape(updates, [-1, depth])

    flat_result = tf.scatter_nd(flat_indices, flat_updates, [batch * length, depth])
    return tf.reshape(flat_result, shape)

In [10]:
class Decoder:
    """Attention decoder that emits morpheme symbols and per-word dependency vectors.

    Args:
        encoder_outputs: encoder states, (batch, input_length, 256).
        depend_targets: int tensor of input positions, one per word; the encoder
            state at each position is that word's dependency context.
        depend_idxs: int tensor of output positions, one per word, at which the
            dependency context is fed in and read back out.
        morph_targets: int tensor of output symbol codes, (batch, max_out_len).
        out_lengths: int tensor with the unpadded length of each output row.
        is_training: Python bool; enables input dropout and random masking.

    Attributes:
        depend_contexts: encoder states gathered at `depend_targets`.
        decoder_inputs: tensor fed to the RNN (after masking when training).
        char_output: logits over `hangul` for every output position.
        depend_output: attention context part of the RNN output gathered at
            `depend_idxs`.
    """
    # encoder_outputs: (batch, input_length, 256)
    # depend_targets: (batch, input_length)
    def __init__(self, encoder_outputs, depend_targets, depend_idxs, morph_targets, out_lengths, is_training):

        cells = []
        num_hidden = 256
        # Keep probability must be 1.0 (keep everything) outside training. The
        # previous value of 0.0 asked DropoutWrapper to drop every input.
        keep_rate = 0.9 if is_training else 1.0
        for layer_index in range(2):
            lstm_cell = tf.nn.rnn_cell.LSTMCell(num_hidden, forget_bias=1.0)
            cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, input_keep_prob=keep_rate)
            cells.append(cell)
        prenet = tf.contrib.rnn.MultiRNNCell(cells)

        attention_size = 256
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            attention_size,
            encoder_outputs,
            normalize=True)

        # output_attention=False keeps the raw cell output; the wrapper then
        # appends the attention context so both can be sliced apart below
        attention_cell = ConcatOutputAndAttentionWrapper(
            tf.contrib.seq2seq.AttentionWrapper(
                prenet,
                attention_mechanism,
                output_attention=False))

        # lookup encoder outputs from dependency indices
        self.depend_contexts = tf.batch_gather(encoder_outputs, depend_targets)

        # place each word's context at its output position; all other positions stay zero
        # NOTE(review): padded entries of depend_idxs are 0, so they all land on
        # output position 0, and tf.scatter_nd sums duplicate indices - confirm
        # that this accumulation at position 0 is acceptable.
        shape = [tf.shape(encoder_outputs)[0], tf.shape(morph_targets)[1], encoder_outputs.shape[2]]
        depend_contexts_sparse = batch_scatter(depend_idxs, self.depend_contexts, shape)

        # lookup char embeddings
        morph_embedded = tf.nn.embedding_lookup(char_embed_table, morph_targets)

        decoder_inputs = tf.concat([morph_embedded, depend_contexts_sparse], axis=-1) # (batch, max_out_len, 512)

        # mask 15% of the lengths of the decoder inputs randomly
        if is_training:

            batch_size = tf.shape(decoder_inputs)[0]
            max_out_len = tf.shape(decoder_inputs)[1]

            # one contiguous span per row: 15% of the row's length, starting at
            # a random offset in [0, out_length - span - 1)
            mask_lengths = tf.to_int32(tf.to_float(out_lengths) * 0.15)
            offset_bounds = out_lengths - mask_lengths - 1
            offsets = tf.to_int32(tf.random.uniform([batch_size]) * tf.to_float(offset_bounds))

            rng = tf.range(max_out_len)
            mask = tf.math.logical_and(
                tf.expand_dims(rng, 0) >= tf.expand_dims(offsets, -1),
                tf.expand_dims(rng, 0) < tf.expand_dims(offsets + mask_lengths, -1))
            mask = tf.broadcast_to(tf.expand_dims(mask, -1), tf.shape(decoder_inputs))

            # masked positions get the MASK embedding and an all-zero context half;
            # the zero half is sized by num_hidden, which matches the 256-wide
            # encoder output noted above
            mask_symbol = tf.concat([char_embed_table[MASK_IDX], tf.zeros([num_hidden])], -1)
            mask_symbol = tf.expand_dims(tf.expand_dims(mask_symbol, 0), 0)
            mask_symbol = tf.broadcast_to(mask_symbol, tf.shape(decoder_inputs))

            decoder_inputs = tf.where(mask, mask_symbol, decoder_inputs)

        self.decoder_inputs = decoder_inputs

        output, rnn_states = tf.nn.dynamic_rnn(
            cell=attention_cell,
            inputs=decoder_inputs,
            sequence_length=out_lengths,
            dtype=tf.float32)

        # first num_hidden channels: LSTM output -> symbol logits;
        # remaining channels: attention context -> dependency prediction
        self.char_output = tf.layers.dense(output[:, :, :num_hidden], len(hangul))
        depend_output = output[:, :, num_hidden:]

        self.depend_output = tf.batch_gather(depend_output, depend_idxs)

In [19]:
class Model:
    """Builds the training graph: encoder, decoder, both losses and the optimizer step.

    Args:
        batch: dict of tensors with the keys 'inputs', 'in_lengths', 'depends',
            'depend_idxs', 'dep_lengths', 'morphs' and 'out_lengths'.

    Attributes:
        depend_loss: mean L2 distance between gathered encoder states and the
            decoder's dependency outputs, averaged over real (unpadded) words.
        char_loss: mean cross-entropy over real (unpadded) output symbols.
        total_loss: depend_loss + char_loss.
        global_step: int32 step counter, incremented by `optimize`.
        optimize: op applying one Adam step with globally clipped gradients.
        training_summary / validation_summary: merged scalar summaries of the
            three losses under training and validation tag names.
    """
    def __init__(self, batch):
        with tf.variable_scope('root', reuse=tf.AUTO_REUSE):
            encoder = Encoder(batch['inputs'], batch['in_lengths'], True)
            decoder = Decoder(encoder.output, batch['depends'], batch['depend_idxs'],
                              batch['morphs'], batch['out_lengths'], True)

            # dependency analyzer loss
            depend_seq_loss = tf.norm(decoder.depend_contexts - decoder.depend_output, axis=-1)

            # True marks padding. Derive it from dep_lengths rather than from
            # `depends == 0`: 0 is also a valid target position, and the
            # denominator below counts dep_lengths entries.
            depend_mask = tf.logical_not(
                tf.sequence_mask(batch['dep_lengths'], tf.shape(batch['depends'])[1]))
            depend_count = tf.to_float(tf.reduce_sum(batch['dep_lengths']))

            depend_masked_loss = tf.where(depend_mask, tf.zeros_like(depend_seq_loss), depend_seq_loss)
            self.depend_loss = tf.reduce_sum(depend_masked_loss) / depend_count

            # morpheme analyzer loss
            char_seq_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=batch['morphs'],
                logits=decoder.char_output)

            # True marks padding symbols, which are excluded from the loss
            mask = tf.math.equal(batch['morphs'], PADDING_IDX)
            total_length = tf.to_float(tf.reduce_sum(batch['out_lengths']))

            char_masked_loss = tf.where(mask, tf.zeros_like(char_seq_loss), char_seq_loss)
            self.char_loss = tf.reduce_sum(char_masked_loss) / total_length

            # sum of the losses
            self.total_loss = self.depend_loss + self.char_loss

            # training-specific
            self.global_step = tf.get_variable("global_step", shape=[], trainable=False,
                                      initializer=tf.zeros_initializer, dtype=tf.int32)

            step = tf.cast(self.global_step + 1, dtype=tf.float32)

            # 1e-4 base rate, multiplied by 0.95 for every 3000 steps
            learning_rate = 1e-4 * tf.train.exponential_decay(1., step, 3000, 0.95)

            optimizer = tf.train.AdamOptimizer(learning_rate, 0.9, 0.999)
            gradients, variables = zip(*optimizer.compute_gradients(self.total_loss))
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)

            self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables), global_step=self.global_step)

            self.training_summary = tf.summary.merge([
                tf.summary.scalar("total_loss", self.total_loss),
                tf.summary.scalar("char_loss", self.char_loss),
                tf.summary.scalar("depend_loss", self.depend_loss)
            ])

            self.validation_summary = tf.summary.merge([
                tf.summary.scalar("validation_total_loss", self.total_loss),
                tf.summary.scalar("validation_char_loss", self.char_loss),
                tf.summary.scalar("validation_depend_loss", self.depend_loss)
            ])

In [20]:
batch_size = 8

# One placeholder per array returned by read_conllu. The data is fed once, when
# the iterator is initialised, instead of being baked into the graph.
# The per-sentence length vectors are rank 1. `shape=(None)` is just `None`
# (unknown rank), so the one-element tuple `(None,)` is spelled out.
inputs_placeholder = tf.placeholder(name='inputs_placeholder',
                                    shape=(None, max_in_length),
                                    dtype=tf.int32)
in_lengths_placeholder = tf.placeholder(name='in_lengths_placeholder',
                                        shape=(None,),
                                        dtype=tf.int32)
depend_idxs_placeholder = tf.placeholder(name='depend_idxs_placeholder',
                                         shape=(None, max_dep_length),
                                         dtype=tf.int32)
depends_placeholder = tf.placeholder(name='depends_placeholder',
                                     shape=(None, max_dep_length),
                                     dtype=tf.int32)
dep_lengths_placeholder = tf.placeholder(name='dep_lengths_placeholder',
                                         shape=(None,),
                                         dtype=tf.int32)
morphs_placeholder = tf.placeholder(name='morphs_placeholder',
                                    shape=(None, max_out_length),
                                    dtype=tf.int32)
out_lengths_placeholder = tf.placeholder(name='out_lengths_placeholder',
                                         shape=(None,),
                                         dtype=tf.int32)

# slice along the sentence axis, shuffle, repeat forever, then batch
dataset = tf.data.Dataset.from_tensor_slices({
    'inputs': inputs_placeholder,
    'in_lengths': in_lengths_placeholder,
    'depend_idxs': depend_idxs_placeholder,
    'depends': depends_placeholder,
    'dep_lengths': dep_lengths_placeholder,
    'morphs': morphs_placeholder,
    'out_lengths': out_lengths_placeholder
})
dataset = dataset.shuffle(buffer_size=10000)
dataset = dataset.repeat()
dataset = dataset.batch(batch_size)

iterator = dataset.make_initializable_iterator()

model = Model(iterator.get_next())

In [21]:
# run
# Create the saver and session, initialise all variables, and bind the
# training arrays to the dataset iterator through the placeholders.
saver = tf.train.Saver()
sess = tf.Session()

# summaries (and the graph itself) are written to the "logs" directory
train_writer = tf.summary.FileWriter("logs", sess.graph)
sess.run(tf.global_variables_initializer())
# feeds the training split only; `test` is not used by this cell
sess.run(iterator.initializer, feed_dict={
    inputs_placeholder: train['inputs'],
    in_lengths_placeholder: train['in_lengths'],
    depend_idxs_placeholder: train['depend_idxs'],
    depends_placeholder: train['depends'],
    dep_lengths_placeholder: train['dep_lengths'],
    morphs_placeholder: train['morphs'],
    out_lengths_placeholder: train['out_lengths']
})

In [None]:
# tf.train.Saver.save expects the checkpoint's parent directory to exist;
# create it up front so a fresh checkout does not fail at the first save.
os.makedirs("models", exist_ok=True)

# Runs until the kernel is interrupted: one optimizer step per iteration,
# a summary record every step, and a progress line plus checkpoint every 10 steps.
while True:
    _, loss, step, log = sess.run((model.optimize, model.total_loss, model.global_step, model.training_summary))

    train_writer.add_summary(log, step)

    if step % 10 == 0:
        print("{}: {}".format(step, loss))

        save_path = saver.save(sess, "models/charrnn", step)
        print('Saved to', save_path)

40: 3.534392833709717
Saved to models/charrnn-40
50: 3.5312891006469727
Saved to models/charrnn-50
60: 3.4096686840057373
Saved to models/charrnn-60
70: 3.3832569122314453
Saved to models/charrnn-70
80: 3.2854151725769043
Saved to models/charrnn-80
