In [1]:
import os
import sys
import random
import math
import multiprocessing
import h5py
import tqdm
import json
import numpy as np
import tensorflow as tf
import keras as K
import keras.backend.tensorflow_backend as KTF
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Embedding, LSTM, Dense, Lambda

Using TensorFlow backend.


In [2]:
# Reset the Keras backend state before creating a new session.
K.backend.clear_session()
# Run on CPU only. The keys of device_count are device type names that
# TensorFlow matches case-sensitively ("CPU" / "GPU"); the lowercase 'gpu'
# key used before was ignored, so GPUs were still visible to the session.
sess = tf.Session( config = tf.ConfigProto( device_count = {'GPU': 0} ) )
# Make Keras use this session for everything built below.
KTF.set_session( sess )

In [4]:
"""Get data from path

Args:
    path: a string giving the root directory of the corpus; the files of
          each language are read from the sub-directory named after it.
    language_list: a list of strings naming the languages.
    encoding_list: a list of strings giving the file encoding of each
                   language, in the same order as language_list.
    shuffle: a boolean value. If True, the sentences are shuffled.

Returns:
    lan_data: a dictionary containing the sentences of each language.
              Its structure is:
              
              {language A: [[word1, word2, ...], [...], ...],
               language B: ...}
"""
def get_data( path = "../Data/", language_list = ["chinese", "english"],
              encoding_list = ["UTF-8", "UTF-8"], shuffle = True ):
    """Read a parallel corpus from disk, one sub-directory per language.

    Args:
        path: a string, the corpus root directory. The files of a language
              are read from the sub-directory `path/<language>/`.
        language_list: a list of strings naming the languages.
        encoding_list: a list of strings giving the file encoding of each
                       language, in the same order as language_list.
        shuffle: a boolean. If True, the sentences are shuffled with one
                 shared permutation, so sentence i of every language still
                 belongs together afterwards.

    Returns:
        lan_data: a dictionary {language: [[word1, word2, ...], ...]}.
                  Every sentence is a list of whitespace-separated tokens
                  wrapped in "<S>" and "</S>". Blank lines are skipped.

    Raises:
        AssertionError: if language_list and encoding_list differ in length.
        ValueError: if shuffle is requested but the languages do not have
                    the same number of sentences.
    """
    assert len( language_list ) == len( encoding_list ), \
        "language_list and encoding_list must have the same length"

    # Read parallel corpus
    lan_data = {}
    for lan, enc in zip( language_list, encoding_list ):
        print( "Reading " + lan + " language corpus..." )
        sentences = lan_data.setdefault( lan, [] )
        lan_dir = os.path.join( path, lan )
        # os.listdir() returns names in arbitrary order; sorting makes the
        # reading order deterministic. NOTE(review): this assumes that
        # corresponding files sort the same way in every language directory.
        for file_name in sorted( os.listdir( lan_dir ) ):
            file_path = os.path.join( lan_dir, file_name )
            # Skip sub-directories (e.g. .ipynb_checkpoints)
            if not os.path.isfile( file_path ):
                continue
            with open( file_path, "r", encoding = enc ) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    sentences.append( ["<S>"] + line.split() + ["</S>"] )

    if shuffle and lan_data:
        print( "Shuffling..." )

        # Decide shuffle order; it is only meaningful if all languages
        # contain the same number of sentences.
        length = len( lan_data[language_list[0]] )
        for lan, sentences in lan_data.items():
            if len( sentences ) != length:
                raise ValueError(
                    "Cannot shuffle: '" + lan + "' has " + str( len( sentences ) )
                    + " sentences but '" + language_list[0] + "' has "
                    + str( length ) )
        shuf_list = list( range( length ) )
        random.shuffle( shuf_list )

        # Shuffle corpus. Plain list indexing keeps the ragged sentences
        # intact without a round trip through a NumPy array.
        for lan in lan_data:
            sentences = lan_data[lan]
            lan_data[lan] = [sentences[i] for i in shuf_list]

    return lan_data

In [5]:
"""Build dictionary for each language

Args:
    language_data: a dictionary containing the sentences of each language.
                   Its structure is:
                   
                   {language A: [[word1, word2, ...], [...], ...],
                    language B: ...}
    threshold: an integer. A word whose count does not exceed the threshold
               gets no dictionary entry of its own; it is covered by <UNK>.

Returns:
    word_to_idx_dict: a dictionary converts word to index. Its structure is:
                      
                      {language A: {word A: index A, word B: ..., ...},
                       language B: ...}.
    idx_to_word_dict: a dictionary converts index to word. Its structure is:
                      
                      {language A: {index A: word A, index B: ..., ...},
                       language B: ...}.
"""
def build_dictionary( language_data, threshold = 0 ):
    """Create word <-> index lookup tables for every language.

    Indices 0-3 are reserved for "<PAD>", "<S>", "</S>" and "<UNK>".
    Every other word that occurs more than `threshold` times receives the
    next free index, in order of first appearance in the corpus. Words
    occurring `threshold` times or fewer get no entry.

    Args:
        language_data: a dictionary {language: [[word1, word2, ...], ...]}.
        threshold: an integer, the highest count that is still left out.

    Returns:
        word_to_idx_dict: a dictionary {language: {word: index, ...}}.
        idx_to_word_dict: a dictionary {language: {index: word, ...}}.
    """
    reserved_tokens = ["<PAD>", "<S>", "</S>", "<UNK>"]
    word_to_idx_dict = {}
    idx_to_word_dict = {}

    for lan, sentences in language_data.items():
        # Both tables start with the reserved tokens
        to_idx = {token: idx for idx, token in enumerate( reserved_tokens )}
        to_word = {idx: token for idx, token in enumerate( reserved_tokens )}

        # Tally how often every token occurs
        frequency = {}
        for sentence in sentences:
            for word in sentence:
                frequency[word] = frequency.get( word, 0 ) + 1

        # Register each sufficiently frequent word that is not known yet
        for word, count in frequency.items():
            if count <= threshold or word in to_idx:
                continue
            idx = len( to_idx )
            to_idx[word] = idx
            to_word[idx] = word

        word_to_idx_dict[lan] = to_idx
        idx_to_word_dict[lan] = to_word

    return word_to_idx_dict, idx_to_word_dict

In [7]:
def simple_seq2seq( output_vocab_size, input_vocab_size,
                    hidden_dim = 128, word_vec_dim = 300,
                    name = "baseline" ):
    """Build an LSTM encoder-decoder for training and models for generation.

    The training model and the two generation models are assembled from
    the same layer objects, so they share their weights. Both embeddings
    use mask_zero = True, i.e. index 0 is treated as padding.

    Args:
        output_vocab_size: an integer, vocabulary size of the decoder side
                           (decoder embedding input and softmax output).
        input_vocab_size: an integer, vocabulary size of the encoder side
                          (encoder embedding input).
        hidden_dim: an integer, number of units of both LSTM layers.
        word_vec_dim: an integer, dimension of the word embeddings.
        name: a string used as model name and as prefix of all layer names.

    Returns:
        model: the training model. It takes [encoder_input, decoder_input]
               and outputs a softmax over the output vocabulary for every
               decoder time step. It is compiled with the adam optimizer
               and categorical cross-entropy loss.
        encoder_model: maps encoder_input to the final encoder state
                       [state_h, state_c].
        decoder_model: maps [decoder_input, state_h, state_c] to
                       [decoder_output, state_h, state_c]; the returned
                       states can be fed back in for the next step.
    """
    ### Encoder-Decoder for train ###

    # Encoder
    encoder_embedding = Embedding( output_dim = word_vec_dim,
                                   input_dim = input_vocab_size,
                                   mask_zero = True,
                                   name = name + "_encoder_embedding" )
    encoder = LSTM( hidden_dim, return_state = True,
                    name = name + "_encoder_lstm" )
    encoder_input = Input( shape = ( None, ),
                           name = name + "_encoder_input" )

    encoder_input_emb   = encoder_embedding( encoder_input )
    # Only the final state is needed; the encoder output itself is dropped
    _, state_h, state_c = encoder( encoder_input_emb )
    encoder_state       = [state_h, state_c]

    # Decoder
    decoder_embedding = Embedding( output_dim = word_vec_dim,
                                   input_dim = output_vocab_size,
                                   mask_zero = True,
                                   name = name + "_decoder_embedding" )
    # return_sequences gives one output per time step for the dense layer
    decoder = LSTM( hidden_dim, return_state = True, return_sequences = True,
                    name = name + "_decoder_lstm" )
    decoder_dense = Dense( output_vocab_size, activation = "softmax",
                           name = name + "_decoder_output" )
    decoder_input = Input( shape = ( None, ),
                           name = name + "_decoder_input" )

    decoder_input_emb = decoder_embedding( decoder_input )
    # During training the decoder starts from the encoder's final state;
    # its own final state is not used here.
    train_output, _, _ = decoder( decoder_input_emb,
                                  initial_state = encoder_state )
    train_output = decoder_dense( train_output )

    # Model
    model = Model( inputs = [encoder_input, decoder_input],
                   outputs = train_output,
                   name = name )
    model.compile( optimizer = 'adam', loss = "categorical_crossentropy" )

    ### Encoder-Decoder for generation

    # Encoder Model
    encoder_model   = Model( inputs  = encoder_input,
                             outputs = encoder_state,
                             name = name + "_encoder" )

    # Decoder Model
    # The state is an explicit input so the caller can feed the state
    # returned by one call into the next one.
    decoder_state_h = Input( shape = ( hidden_dim, ), name = name + "_state_h" )
    decoder_state_c = Input( shape = ( hidden_dim, ), name = name + "_state_c" )
    decoder_state_input = [decoder_state_h, decoder_state_c]
    decoder_output, state_h, state_c = decoder( decoder_input_emb,
                                                initial_state = decoder_state_input )
    decoder_state   = [state_h, state_c]
    decoder_output  = decoder_dense( decoder_output )
    decoder_model   = Model( inputs  = [decoder_input] + decoder_state_input,
                             outputs = [decoder_output] + decoder_state,
                             name = name + "_decoder" )

    return model, encoder_model, decoder_model