In [None]:
""" Abstract

This demo is a prototype of the project model. The code and structure here may change in the future.
The main purpose of the code below is to run on a local computer and test whether the model works on a small amount of data.

Now:
    Prove that the original model is inaccessible. Start to implement the original RNN-LSTM and treat it as the baseline.

TODO:
    1. Try new way to approach the unsupervised method.
    2. Try to understand how to control gradient in Tensorflow.
"""

In [None]:
from keras.layers import Input, Embedding, LSTM, Dense, Lambda, concatenate, Multiply
from keras.models import Model
import keras as K
import os
import sys
import random
import numpy as np
import tensorflow as tf
import keras.backend.tensorflow_backend as KTF
# Reset the Keras backend state before (re)building any model in this notebook.
K.backend.clear_session()
# Run on CPU only: this notebook is meant for small-scale local tests.
# TensorFlow looks the device type up by its upper-case name ("CPU" / "GPU");
# a lower-case 'gpu' key is not recognised, so the limit would not be applied.
KTF.set_session( tf.Session( config = tf.ConfigProto( device_count = {'GPU':0} ) ) )

In [None]:
# print( "dogeA" )
# lmA = lm.loadLM( loadPath = "./lm/chinese", encoding = "utf-8" )
# print( "dogeB" )
# lmB = lm.loadLM( loadPath = "./lm/english", encoding = "utf-8" )

In [None]:
def getTrainData( dataPath = "../Data/train/", lanList = ["chinese", "english"],
                  encoding = "UTF-8", ratio = 0.98, sort = True ):
    """ Get train data from path

    Read the data files of each language and split them into train and dev sets.

    Every line of every file becomes one sentence: the line is split on
    whitespace and wrapped as ["<S>", word1, ..., "</S>"]. Files of a language
    are read in sorted file-name order so that the sentence order is
    deterministic and identical across languages that use the same file names.

    When all languages contain the same number of sentences, one shared random
    permutation is applied to all of them, so sentence i of one language still
    corresponds to sentence i of the others after shuffling and the train/dev
    split contains the same sentences in every language. Otherwise each
    language is shuffled on its own.

    Args:
        dataPath: directory that holds one sub-directory per language.
                  Default value is "../Data/train/".
        lanList: language list. Indicating which language data will be included
                 in train data. Default value is ["chinese", "english"].
        encoding: encoding of each file in the language directories.
        ratio: proportion of train data, the rest is treated as dev data.
               Default value is 0.98.
        sort: boolean value. If True, the train and dev sentences of every
              language are sorted by length from short to long. Each language is
              sorted independently, so index i no longer refers to the same
              sentence in different languages. Otherwise the shuffled order is
              kept. Default value is True.

    Returns:
        trainData: a dictionary of train data sentences of each language. Its structure is:

                   {language A: [[word1, word2, ...], [...], ...], language B: ...}.

        devData: a dictionary of dev data sentences of each language. Its structure is:

                   {language A: [[word1, word2, ...], [...], ...], language B: ...}.
    """
    # Load every language completely before shuffling, so that one permutation
    # can be shared by all of them.
    corpus = {}
    for lan in lanList:
        print( lan )
        lanDir = os.path.join( dataPath, lan )
        sentences = []
        # os.listdir() gives no ordering guarantee; sort for a stable order.
        for fileName in sorted( os.listdir( lanDir ) ):
            with open( os.path.join( lanDir, fileName ), encoding = encoding ) as f:
                for line in f:
                    sentences.append( ["<S>"] + line.split() + ["</S>"] )
        corpus[lan] = sentences

    # Shuffle so that the train/dev split is randomly distributed.
    sizes = set( len( sentences ) for sentences in corpus.values() )
    if len( sizes ) == 1:
        order = list( range( sizes.pop() ) )
        random.shuffle( order )
        for lan in corpus:
            corpus[lan] = [corpus[lan][k] for k in order]
    else:
        # Different sizes: the languages cannot be aligned by index anyway.
        for sentences in corpus.values():
            random.shuffle( sentences )

    trainData = {}
    devData   = {}
    for lan, sentences in corpus.items():
        noOfTrainData  = int( len( sentences ) * ratio )
        trainData[lan] = sentences[:noOfTrainData]
        devData[lan]   = sentences[noOfTrainData:]
        if sort:
            trainData[lan].sort( key = len )
            devData[lan].sort( key = len )
    return trainData, devData

In [None]:
def generateDict( data, threshold = 0 ):
    """ Generate dictionary and preprocess sentences for each language

    Generate a dictionary for each language and convert every word to its
    corresponding index. Two dictionaries (word -> index and index -> word) are
    kept to speed up the whole program.

    The sentences in `data` are modified IN PLACE: after the call every word
    has been replaced by its integer index.

    The special tokens <PAD>, <S>, </S> and <UNK> always get the indices
    0, 1, 2 and 3 and are never replaced by <UNK>, even when they occur less
    than `threshold` times (e.g. in a corpus with very few sentences).

    Args:
        data: a dictionary contains sentences of each language.  Its structure is:

              {language A: [[word1, word2, ...], [...], ...], language B: ...}.

        threshold: a word will be replaced with <UNK> if the frequency of the
                   word is less than threshold. If the value is less than 1, it
                   means no word is replaced by <UNK>. Default value is 0.

    Returns:
        wordNumDict: a dictionary which can convert words to index in each language.
                     Its structure is:

                     {language A: {word A: index, word B: ..., ...}, language B: ..., ...}.

        numWordDict: a dictionary which can convert index to word in each language.
                     Its structure is:

                     {language A: {index: word A, index: word B, ...}, language B: ..., ...}.
    """
    specialWords = ( "<PAD>", "<S>", "</S>", "<UNK>" )
    wordNumDict = {}
    numWordDict = {}
    for lan, sentences in data.items():
        # Special words always occupy the first indices.
        word2num = { word: number for number, word in enumerate( specialWords ) }
        num2word = { number: word for number, word in enumerate( specialWords ) }
        wordNumDict[lan] = word2num
        numWordDict[lan] = num2word

        # Count word frequency
        wordCount = {}
        for sentence in sentences:
            for word in sentence:
                wordCount[word] = wordCount.get( word, 0 ) + 1

        # Replace rare words with <UNK> and convert every word to its index
        for sentence in sentences:
            for i, word in enumerate( sentence ):
                if wordCount[word] < threshold and word not in specialWords:
                    word = "<UNK>"
                if word not in word2num:
                    number = len( word2num )
                    word2num[word] = number
                    num2word[number] = word
                sentence[i] = word2num[word]
    return wordNumDict, numWordDict

In [None]:
def toCategory( data, wordNumDict, left, right, maxLength = 30 ):
    """Number to One-hot

    Build one zero-padded training batch from the sentence pairs with index in
    [left, right). Only pairs whose chinese sentence has at most `maxLength`
    tokens are converted. `right` is clipped to the number of available
    sentence pairs, so a last, incomplete batch is handled without error.

    Args:
        data: a dictionary contains index-encoded sentences of each language.
              Its structure is:

              {"chinese": [[index1, index2, ...], [...], ...], "english": ...}.

        wordNumDict: a dictionary which can convert words to index in each language.
                     Only the size of wordNumDict["english"] is used (width of
                     the one-hot vectors). Its structure is:

                     {language A: {word A: index, word B: ..., ...}, language B: ..., ...}.

        left: index of the first sentence pair of the batch (inclusive).
        right: index of the last sentence pair of the batch (exclusive).
        maxLength: longest chinese sentence (in tokens) that is kept.
                   Default value is 30.

    Returns:
        status: True if at least one sentence pair was converted, else False.
        ndata: {"chinese": array of shape (n, longest chinese sentence),
                "english": array of shape (n, longest english sentence)},
               both padded with 0. An empty list if status is False.
        td: one-hot array of shape (n, longest english sentence, english
            vocabulary size). Position j holds the one-hot vector of english
            token j + 1, i.e. the english sentence shifted left by one token.
            An empty list if status is False.
    """
    source = data["chinese"]
    target = data["english"]
    # Never read past the end of either language.
    right = min( right, len( source ), len( target ) )
    rows = [i for i in range( left, right ) if len( source[i] ) <= maxLength]
    if not rows:
        return False, [], []

    maxlch = max( len( source[i] ) for i in rows )
    maxlen = max( len( target[i] ) for i in rows )
    zh = np.zeros( ( len( rows ), maxlch ) )
    en = np.zeros( ( len( rows ), maxlen ) )
    td = np.zeros( ( len( rows ), maxlen, len( wordNumDict["english"] ) ) )
    for n, i in enumerate( rows ):
        zh[n, :len( source[i] )] = source[i]
        en[n, :len( target[i] )] = target[i]
        # Target at step j is the next english token (token j + 1).
        for j, w in enumerate( target[i][1:] ):
            td[n, j, w] = 1
    ndata = {}
    ndata["chinese"] = zh
    ndata["english"] = en
    return True, ndata, td

In [None]:
def simpleSeq2Seq( output_vocab_size, input_vocab_size, hidden_dim = 256,
                   word_vec_dim = 512, name = "demo" ):
    """ Simple Sequence to Sequence Implementation

    A plain encoder-decoder LSTM model that serves as the baseline. The final
    hidden and cell states of the encoder LSTM initialise the decoder LSTM, and
    a softmax layer over the output vocabulary is applied to every decoder step.

    Args:
        output_vocab_size: size of output language vocabulary.
        input_vocab_size:  size of input  language vocabulary.
        hidden_dim: dimension of hidden states vector. Default value is 256.
        word_vec_dim: dimension of word-vector. Default value is 512.
        name: name of the model, also used as prefix of the layer names.

    Returns:
        model: the compiled Seq2Seq model (adam, categorical cross-entropy)
               with inputs [encoder tokens, decoder tokens].
    """
    # Word-vector tables of both sides; index 0 is masked as padding.
    # (set trainable = False here when pre-trained embeddings are used)
    source_embedding = Embedding( input_dim = input_vocab_size, output_dim = word_vec_dim,
                                  mask_zero = True, name = name + "_encoder_embedding" )
    target_embedding = Embedding( input_dim = output_vocab_size, output_dim = word_vec_dim,
                                  mask_zero = True, name = name + "_decoder_embedding" )

    # Encoder: only the final states are kept, the output itself is dropped.
    source_tokens = Input( shape = ( None, ), name = name + "_encoder_input" )
    encoder_lstm  = LSTM( hidden_dim, return_state = True )
    source_vectors = source_embedding( source_tokens )
    _, final_h, final_c = encoder_lstm( source_vectors )

    # Decoder: starts from the encoder states and emits one vector per step.
    decoder_lstm  = LSTM( hidden_dim, return_sequences = True )
    target_tokens = Input( shape = ( None, ), name = name + "_decoder_input" )
    target_vectors = target_embedding( target_tokens )
    decoder_states = decoder_lstm( target_vectors, initial_state = [final_h, final_c] )
    to_vocabulary  = Dense( output_vocab_size, activation = "softmax",
                            name = name + "_decoder_output" )
    word_probabilities = to_vocabulary( decoder_states )

    # Assemble and compile
    seq2seq = Model( inputs = [source_tokens, target_tokens],
                     outputs = word_probabilities, name = name )
    seq2seq.compile( optimizer = 'adam', loss = "categorical_crossentropy" )
    return seq2seq

In [None]:
# Load the training corpus, then build the vocabularies; generateDict also
# replaces the words in trainData by their indices in place.
trainData, devData = getTrainData( dataPath = "../Data/train/" )
wordNumDict, numWordDict = generateDict( data = trainData, threshold = 5 )
# Vocabulary sizes of the input (chinese) and output (english) side.
ivs, ovs = ( len( wordNumDict[lan] ) for lan in ( "chinese", "english" ) )
print( ivs, ovs )

In [None]:
# getTrainData sorted every list from short to long; flip them so that the
# longest sentences come first.
trainData["chinese"] = list( reversed( trainData["chinese"] ) )
trainData["english"] = list( reversed( trainData["english"] ) )
devData["chinese"]   = list( reversed( devData["chinese"] ) )
devData["english"]   = list( reversed( devData["english"] ) )

In [None]:
# Build the baseline model and train it for one pass over the training data.
model = simpleSeq2Seq( output_vocab_size = ovs, input_vocab_size = ivs, name = "demo" )
model.summary()
batch_size = 64

# A batch pairs chinese sentence i with english sentence i, so only indices
# that exist in both languages can be used.
noOfPairs = min( len( trainData["chinese"] ), len( trainData["english"] ) )

losses = []
n = 0
for i in range( 0, noOfPairs, batch_size ):
    # Clip the upper bound: the last batch may hold fewer than batch_size pairs,
    # and toCategory must not be asked for indices past the end of the data.
    status, newTrainData, td = toCategory( trainData, wordNumDict, i,
                                           min( i + batch_size, noOfPairs ) )
    if not status:
        # No sentence of this batch passed the length filter.
        continue
    loss = model.train_on_batch( [newTrainData["chinese"], newTrainData["english"]], td )
    n += 1
    print( n, loss )
    losses.append( loss )

In [None]:
def loss( y_true, y_pred ):
    """ Element-wise product of the two parts of the prediction.

    y_pred is cut along its second axis at column 256; the first 256 columns
    are multiplied element-wise with the remaining columns. y_true is ignored.
    """
    split = 256
    first_part, second_part = y_pred[:, :split], y_pred[:, split:]
    return first_part * second_part
#     y_rev = K.backend.reverse( y_pred, axes = -1 )
#     loss = K.backend.sum( y_pred * y_rev / 2, axis = -1, keepdims = True )
#     return y_pred

In [None]:
def dualNMTModel( langA_vocab_size = 100000, langB_vocab_size = 100000,
                  langA_dim = 1024, langB_dim = 1024, word_vec_dim = 256, hidden_dim = 256,
                  name = "demo" ):
    """ Dual Neural Machine Translation Model

    The structure of the model is:

    Input A ---> RNN1 ---> Output B ---> RNN2 ---> Output A
                              |                       |
                              V                       V
                            LM B                    LM A

    The language-model branches (LM A / LM B) are not wired in yet. The model
    output is the element-wise product of the averaged word vectors of the
    input sentence and of the reconstructed sentence, and that output is used
    directly as the loss.

    Args:
        langA_vocab_size: vocabulary size of language A.
        langB_vocab_size: vocabulary size of language B.
        langA_dim:  dimension of language A word vector (currently unused).
        langB_dim:  dimension of language B word vector (currently unused).
        word_vec_dim: dimension of the word vectors of both embeddings.
        hidden_dim: dimension of hidden state vector.
        name: prefix of the layer names.

    Returns:
        model: the whole model of Dual Neural Machine Translation Model, with
               inputs [encoder input A2B (language A tokens),
                       decoder input A2B (language B tokens),
                       decoder input B2A (language A tokens)].
    """
    # One word-vector table per language. embedding_A2B is indexed with
    # language A tokens, embedding_B2A with language B tokens. Every token
    # sequence below is embedded with the table of its own language, so no
    # index can exceed the size of the table it is looked up in.
    embedding_A2B = Embedding( output_dim = word_vec_dim, input_dim = langA_vocab_size,
                               name = name + "_embedding_A2B", mask_zero = True )
    embedding_B2A = Embedding( output_dim = word_vec_dim, input_dim = langB_vocab_size,
                               name = name + "_embedding_B2A", mask_zero = True )
    # Pick the most probable word of every step.
    # NOTE(review): argmax provides no gradient, so the two translation parts
    # are presumably not trained jointly through this path -- confirm.
    to_label = Lambda( lambda x: K.backend.argmax( x, axis = 2 ) )
    # Average the word vectors of a sentence over the time axis.
    avg_emb  = Lambda( lambda x: tf.math.reduce_mean( x, axis = 1 ) )

    # first part of translation model from language A to language B
    # Encoder
    encoder_input_A2B     = Input( shape = ( None, ), name = name + "_encoder_input_A2B" )
    # change when using pre-trained embedding trainable= False
    encoder_A2B           = LSTM( hidden_dim, return_state = True )
    encoder_input_emb_A2B = embedding_A2B( encoder_input_A2B )
    _, state_h, state_c   = encoder_A2B( encoder_input_emb_A2B )
    state_encoder_A2B     = [state_h, state_c]
    # Decoder
    decoder_A2B = LSTM( hidden_dim, return_sequences = True )

    decoder_input_A2B     = Input( shape = ( None, ), name = name + "_decoder_input_A2B" )
    # The A2B decoder consumes language B tokens -> language B table.
    decoder_input_emb_A2B = embedding_B2A( decoder_input_A2B )
    decoder_outputs_A2B   = decoder_A2B( decoder_input_emb_A2B, initial_state = state_encoder_A2B )
    decoder_dense_A2B     = Dense( langB_vocab_size, activation = "softmax", name = name + "_decoder_output_A2B" )
    decoder_outputs_A2B   = decoder_dense_A2B( decoder_outputs_A2B )

    # language model <- langB_output
    # lossB = Lambda( perplexity )( langB_output )

    # second part of translation model from language B back to language A
    # Encoder: reads the words predicted by the first part
    encoder_input_B2A     = to_label( decoder_outputs_A2B )
    # change when using pre-trained embedding trainable= False
    encoder_B2A           = LSTM( hidden_dim, return_state = True )
    encoder_input_emb_B2A = embedding_B2A( encoder_input_B2A )
    _, state_h, state_c   = encoder_B2A( encoder_input_emb_B2A )
    state_encoder_B2A     = [state_h, state_c]
    # Decoder
    decoder_B2A = LSTM( hidden_dim, return_sequences = True )

    decoder_input_B2A     = Input( shape = ( None, ), name = name + "_decoder_input_B2A" )
    # The B2A decoder consumes language A tokens -> language A table.
    decoder_input_emb_B2A = embedding_A2B( decoder_input_B2A )
    decoder_outputs_B2A   = decoder_B2A( decoder_input_emb_B2A, initial_state = state_encoder_B2A )
    decoder_dense_B2A     = Dense( langA_vocab_size, activation = "softmax", name = name + "_decoder_output_B2A" )
    decoder_outputs_B2A   = decoder_dense_B2A( decoder_outputs_B2A )

    # Compare the original sentence with its reconstruction in word-vector space
    avg_input_emb  = avg_emb( encoder_input_emb_A2B )
    decoder_outputs_label = to_label( decoder_outputs_B2A )
    decoder_outputs_emb   = embedding_A2B( decoder_outputs_label )
    avg_output_emb = avg_emb( decoder_outputs_emb )

    output = Multiply()( [avg_input_emb, avg_output_emb] )

    # language model <- langA_output
    # lossA = Lambda( perplexity )( langA_output )

    # Build model
    model = Model( inputs = [encoder_input_A2B, decoder_input_A2B, decoder_input_B2A],
                   outputs = output ) #[decoder_outputs_B2A, decoder_outputs_A2B]
    # The model output itself is the loss; y_true is ignored.
    model.compile( optimizer = 'adam', loss = lambda y_true, y_pred: y_pred ) #, loss_weights = [0.5, 1.]
    return model

In [None]:
# Marker printed before the dual NMT model is built.
print( "dogeLM" )
# Rebinds `model`: the seq2seq baseline created in an earlier cell is replaced.
# NOTE(review): the vocabulary sizes are hard-coded; presumably they match the
# dictionaries of an earlier run -- confirm against len( wordNumDict[...] ).
model = dualNMTModel( langA_vocab_size = 37100, langB_vocab_size = 27869 )

In [None]:
# Show the architecture of whichever model is currently bound to `model`.
model.summary()

In [None]:
# Load the small test corpus and index it (no <UNK> replacement).
trainData, devData = getTrainData( "../Data/test/" )
wordNumDict, numWordDict = generateDict( trainData )
# NOTE(review): `model` was built with fixed vocabulary sizes in an earlier
# cell; the dictionaries built here must not be larger -- confirm.
maxLenA = max( len( sentence ) for sentence in trainData["chinese"] )
maxLenB = max( len( sentence ) for sentence in trainData["english"] )
print( maxLenA )
print( maxLenB )

# Keras needs rectangular arrays: pad the variable-length chinese sentences
# with 0 (the <PAD> index) up to the longest sentence.
noOfSamples  = len( trainData["chinese"] )
encoderInput = np.zeros( ( noOfSamples, maxLenA ) )
for row, sentence in enumerate( trainData["chinese"] ):
    encoderInput[row, :len( sentence )] = sentence

# The decoder inputs are all-zero placeholders. The loss of the dual model
# ignores y_true, so the padded input is passed only as a dummy target.
model.fit( [encoderInput,
            np.zeros( ( noOfSamples, maxLenB ) ),
            np.zeros( ( noOfSamples, maxLenA ) )],
            encoderInput )