In [None]:
""" Abstract

This demo is a prototype of the project model. The code and structure here may change in the future.
The main purpose of the code below is to run on a local computer and test whether it works on a small amount of data.

"""

In [None]:
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model
from elmoformanylangs import Embedder
import os
import sys
import random
import numpy
import lm

In [None]:
def getTrainData( dataPath = "../../Data/train/", lanList = ("Chinese", "English"),
                  encoding = "UTF-8", ratio = 0.98, sort = True ):
    """Read sentences for each language and split them into train and dev sets.

    Every file found in ``dataPath/<language>/`` is read line by line; each
    line is split on whitespace into a list of words (one sentence). The
    sentences of a language are shuffled, then the first ``ratio`` share is
    used as train data and the remainder as dev data.

    Args:
        dataPath: directory that contains one sub-directory per language.
                  Default value is "../../Data/train/".
        lanList: languages to load; each name must be a sub-directory of
                 dataPath. Default value is ("Chinese", "English").
        encoding: text encoding used to open every data file.
                  Default value is "UTF-8".
        ratio: proportion of sentences used as train data; the rest is
               treated as dev data. Default value is 0.98.
        sort: if True, the train and dev sentences are sorted by length,
              from short to long. Otherwise they stay in shuffled order.
              Default value is True.

    Returns:
        trainData: a dictionary of train sentences of each language:

                   {language A: [[word1, word2, ...], [...], ...], language B: ...}.

        devData: a dictionary of dev sentences of each language, with the
                 same structure as trainData.
    """
    trainData = {}
    devData   = {}
    for lan in lanList:
        lanDir = os.path.join( dataPath, lan )
        sentences = []
        # Visit files in a fixed order so that a seeded shuffle is reproducible
        for fileName in sorted( os.listdir( lanDir ) ):
            with open( os.path.join( lanDir, fileName ), encoding = encoding ) as f:
                for line in f:
                    sentences.append( line.split() )
        # Shuffle before splitting so that train and dev are drawn at random
        random.shuffle( sentences )
        noOfTrainData = int( len( sentences ) * ratio )
        trainData[lan] = sentences[:noOfTrainData]
        devData[lan]   = sentences[noOfTrainData:]
        if sort:
            # Stable sort: sentences of equal length keep their shuffled order
            trainData[lan].sort( key = len )
            devData[lan].sort( key = len )
    return trainData, devData

In [None]:
def generateDict( data, threshold = 0 ):
    """Build per-language vocabularies and convert sentences to word indices.

    Two dictionaries are built for every language so that lookups are fast in
    both directions. The sentences inside ``data`` are modified in place:
    every word is replaced by its index.

    Index 0 is left free for <PAD>; indices 1, 2 and 3 are reserved for the
    special words <S>, </S> and <UNK>; ordinary words are numbered from 4 in
    order of first appearance.

    Args:
        data: a dictionary that contains the sentences of each language:

              {language A: [[word1, word2, ...], [...], ...], language B: ...}.

        threshold: a word is replaced with <UNK> if its frequency within the
                   language is less than threshold. A value less than or
                   equal to 1 replaces nothing. Default value is 0.

    Returns:
        wordNumDict: a dictionary that converts words to indices:

                     {language A: {word A: index, word B: ..., ...}, language B: ..., ...}.

        numWordDict: a dictionary that converts indices back to words:

                     {language A: {index: word A, ...: word B, ...}, language B: ..., ...}.
    """
    wordNumDict = {}
    numWordDict = {}
    for lan, sentences in data.items():
        # Every language starts with the special words
        word2num = {"<S>": 1, "</S>": 2, "<UNK>": 3}
        num2word = {1: "<S>", 2: "</S>", 3: "<UNK>"}
        wordNumDict[lan] = word2num
        numWordDict[lan] = num2word
        # Count word frequency
        wordCount = {}
        for sentence in sentences:
            for word in sentence:
                wordCount[word] = wordCount.get( word, 0 ) + 1
        # Replace rare words with <UNK> and convert every word to its index
        for sentence in sentences:
            for i, word in enumerate( sentence ):
                if wordCount[word] < threshold:
                    word = "<UNK>"
                if word not in word2num:
                    # +1 keeps index 0 free for <PAD>
                    number = len( word2num ) + 1
                    word2num[word] = number
                    num2word[number] = word
                sentence[i] = word2num[word]
    return wordNumDict, numWordDict

In [None]:
def lossB( y_true, y_pred ):
    """Placeholder loss for the "langB_output" output; not implemented yet.

    Args:
        y_true: target values (currently ignored).
        y_pred: predicted values (currently ignored).

    Returns:
        None, because no loss is computed yet.
    """
    return None

def lossA( y_true, y_pred ):
    """Placeholder loss for the "langA_output" output; not implemented yet.

    Args:
        y_true: target values (currently ignored).
        y_pred: predicted values (currently ignored).

    Returns:
        None, because no loss is computed yet.
    """
    return None

In [None]:
def simpleSeq2Seq( output_dim = 1024, input_dim = 1024, hidden_dim = 256,
                   output_vocab_size = None, input_vocab_size = None,
                   word_vec_dim = 512, name = "" ):
    """Simple Sequence to Sequence implementation, used as a baseline.

    An LSTM encoder reads the input sequence and its final hidden and cell
    states initialise an LSTM decoder. The decoder output at every time step
    goes through a softmax Dense layer of width output_dim.

    Args:
        output_dim: dimension of output word vector; also the feature size of
                    the decoder input and the width of the output layer.
        input_dim:  dimension of input word vector. Only used when
                    input_vocab_size is None.
        hidden_dim: dimension of hidden states vector.
        output_vocab_size: size of output language vocabulary.
                    NOTE(review): accepted but not used yet; the output layer
                    width is output_dim. Confirm whether the softmax should
                    range over the vocabulary instead.
        input_vocab_size: size of input language vocabulary. If given, the
                    encoder input is a sequence of word indices that is
                    embedded by a trainable Embedding layer (index 0 is
                    treated as padding). If None, the encoder input is a
                    sequence of input_dim-dimensional word vectors that is
                    fed to the LSTM directly.
        word_vec_dim: dimension of the word vectors produced by the Embedding
                    layer. Only used when input_vocab_size is given.
        name: name of the model; also used as prefix of the layer names.

    Returns:
        model: the whole simple Seq2Seq model. Its inputs are
               [encoder input, decoder input] and its output is the decoder
               output sequence.
    """
    # Encoder
    if input_vocab_size is None:
        # Inputs are already word vectors, so no embedding layer is needed
        encoder_input    = Input( shape = ( None, input_dim ), name = name + "_encoder_input" )
        encoder_features = encoder_input
    else:
        # Inputs are word indices. An Embedding layer expects a 2-D integer
        # input (batch, time) and returns the 3-D tensor the LSTM needs.
        encoder_input    = Input( shape = ( None, ), name = name + "_encoder_input" )
        # Set trainable = False here when switching to a pre-trained embedding
        embedding        = Embedding( input_vocab_size, word_vec_dim, mask_zero = True )
        encoder_features = embedding( encoder_input )
    encoder = LSTM( hidden_dim, return_state = True )
    _, state_h, state_c = encoder( encoder_features )
    state_encoder = [state_h, state_c]
    # Decoder
    decoder_input   = Input( shape = ( None, output_dim ), name = name + "_decoder_input" )
    decoder         = LSTM( hidden_dim, return_sequences = True )
    decoder_outputs = decoder( decoder_input, initial_state = state_encoder )
    decoder_dense   = Dense( output_dim, activation = "softmax", name = name + "_decoder_output" )
    decoder_outputs = decoder_dense( decoder_outputs )
    # Build model. The inputs must be the Input placeholders themselves, not
    # tensors derived from them.
    model = Model( inputs = [encoder_input, decoder_input], outputs = decoder_outputs, name = name )
    return model

def dualNMTModel( langA_dim = 1024, langB_dim = 1024, hidden_dim = 256, lmA = None, lmB = None ):
    """Dual Neural Machine Translation Model.

    The structure of the model is:

    Input A ---> RNN1 ---> Output B ---> RNN2 ---> Output A
                              |                       |
                              V                       V
                            LM B                    LM A

    Two simpleSeq2Seq models are chained: "A2B" translates language A to
    language B, and "B2A" translates the output of "A2B" back to language A.

    Args:
        langA_dim:  dimension of language A word vector.
        langB_dim:  dimension of language B word vector.
        hidden_dim: dimension of hidden state vector.
        lmA: language model of language A. Not used yet.
        lmB: language model of language B. Not used yet.

    Returns:
        model: the whole, compiled Dual Neural Machine Translation Model. Its
               inputs are [language A input, language B decoder input,
               language A decoder input] and its outputs are
               [language B output, language A output].
    """
    # First part: translation model from language A to language B
    langA_input         = Input( shape = ( None, langA_dim ), name = "langA_input" )
    langB_decoder_input = Input( shape = ( None, langB_dim ) )
    A2B_model    = simpleSeq2Seq( langB_dim, langA_dim, hidden_dim, name = "A2B" )
    langB_output = A2B_model( [langA_input, langB_decoder_input] )
    # TODO: language model <- langB_output
    # Second part: translation model from language B back to language A
    langA_decoder_input = Input( shape = ( None, langA_dim ) )
    B2A_model    = simpleSeq2Seq( langA_dim, langB_dim, hidden_dim, name = "B2A" )
    langA_output = B2A_model( [langB_output, langA_decoder_input] )
    # TODO: language model <- langA_output
    # Build model. The decoder Input placeholders keep their own names so
    # that they, and not the output tensors, are registered as model inputs.
    model = Model( inputs = [langA_input, langB_decoder_input, langA_decoder_input],
                   outputs = [langB_output, langA_output] )
    # Losses and weights are given as lists in the order of the outputs,
    # so they do not depend on how the output layers are named.
    model.compile( optimizer = 'adam', loss = [lossB, lossA], loss_weights = [0.5, 1.] )
    return model