In [1]:
# coding: utf-8

""" Abstract

The demo is a prototype of the project model. Codes and structure here could change in the future.
The main point of codes below is to run on local computer and test whether it works on small scale of data.

Now:
    Prove that original model is inaccessible. Start to implemente original RNN-LSTM and treat it as baseline.

TODO:
    1. Try new way to approach the unsupervised method.
    2. Try to understand how to control gradient in Tensorflow.
"""

import os
import sys
import random
import threading
import math
import queue
import multiprocessing
import h5py
import tqdm
import numpy as np
import tensorflow as tf
import keras as K
import keras.backend.tensorflow_backend as KTF
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Embedding, LSTM, Dense, Lambda, concatenate, Multiply

K.backend.clear_session()
KTF.set_session( tf.Session( config = tf.ConfigProto( device_count = {'gpu':0} ) ) )

Using TensorFlow backend.


In [2]:
""" Get train data from path

Read paralelled train data from files for each language and save to a dictionary.

Args:
    dataPath: file path of train data. Default value is "../../Data/train/".
    langList: language list. Indicating which language data will be included in train data.
              Default value is ["Chinese", "English"].
    encoding: encoding of each file in train directory.
    ratio: propotion of train data, others will be treat as dev data. Default value is 0.98.

Returns:
    trainData: a dictionary of train data sentences of each language. Its structure is:
    
               {language A: [[word1, word2, ...], [...], ...], language B: ...}.
    
    devData: a dictionary of dev data sentences of each language. Its structure is:
    
               {language A: [[word1, word2, ...], [...], ...], language B: ...}.
"""
def getTrainData( dataPath = "../../Data/train/", lanList = ["chinese", "english"],
                  encoding = "UTF-8", ratio = 0.98 ):
    trainData = {}
    devData   = {}
    data = {}
    for lan in lanList:
        if lan not in data:
            data[lan] = []
        print( "Reading " + lan + " files..." )
        files = os.listdir( dataPath + lan + "/" )
        for file in files:
            with open( dataPath + lan + "/" + file, encoding = encoding ) as f:
                line = f.readline()
                while line:
                    wordList = line.split()
                    data[lan].append( ["<S>"] + wordList + ["</S>"] )
                    line = f.readline()
    noOfSentences = len( data[lanList[0]] )
    arr = [i for i in range( noOfSentences )]
    random.shuffle( arr )
    for lan in lanList:
        if lan not in trainData:
            trainData[lan] = []
        if lan not in devData:
            devData[lan] = []
        data[lan] = np.array( data[lan] )[arr].tolist()
        noOfTrainData = int( noOfSentences * ratio )
        devData[lan]   = data[lan][noOfTrainData:]
        trainData[lan] = data[lan][:noOfTrainData]
        print( trainData[lan][:5] )
    return trainData, devData

""" Generate dictionary and preprocess setences for each language

Generate dictionary for each language and convert word to corresponding index.
Here we set two dictionaries to speed up the whole program.

Moreover, the function will replace word in sentences to index automatically.

Args:
    data: a dictionary contains sentences of each language.  Its structure is:
    
          {language A: [[word1, word2, ...], [...], ...], language B: ...}.
    
    threshold: a word will be replace with <UNK> if frequency of a word is
               less than threshold. If the value is less than 1, it means
               no need to replace any word to <UNK>. Default value is 0.

Returns:
    wordNumDict: a dictionary which can convert words to index in each language.
                 Its structure is:
                 
                 {language A: {word A: index A, word B: ..., ...}, language B: ..., ...}.
    
    numWordDict: a dictionary which can convert index to word in each language.
                 Its structure is:
                 
                 {language A: {index A: word A, index B: ..., ...}, language B: ..., ...}.
"""
def generateDict( data, threshold = 0 ):
    wordNumDict = {}
    numWordDict = {}
    for lan, sentences in data.items():
        print( "Generating " + lan + " dictionary..." )
        wordCount = {}
        if lan not in wordNumDict:
            # Add special word to dictionary
            wordNumDict[lan] = {"<PAD>": 0, "<S>": 1, "</S>": 2, "<UNK>": 3}
        if lan not in numWordDict:
            # Add special word to dictionary
            numWordDict[lan] = {0: "<PAD>", 1: "<S>", 2: "</S>", 3: "<UNK>"}
        
        # Count word frequency
        for sentence in sentences:
            for word in sentence:
                if word not in wordCount:
                    wordCount[word] = 0
                wordCount[word] += 1
        
        # Find and replace with <UNK>
        for sentence in sentences:
            for i in range( len( sentence ) ):
                word = sentence[i]
                if wordCount[word] < threshold:
                    word = "<UNK>"
                if word not in wordNumDict[lan]:
                    number = len( wordNumDict[lan] )
                    wordNumDict[lan][word] = number
                    numWordDict[lan][number] = word
                sentence[i] = wordNumDict[lan][word]
    return wordNumDict, numWordDict

"""Sort dictionary by length of original language

Args:
    data: a dictionary contains sentences of each language.  Its structure is:
    
          {language A: [[word1, word2, ...], [...], ...], language B: ...}.
    
    lan: a list of language. The first one is the original language, the second
         one is the target language. For example:

         [language A, language B].

Returns:
    None.
"""
def sortByOriLan( data, lan = ["chinese", "english"] ):
    print( "Sorting data..." )
    tmp = list( zip( data[lan[0]], data[lan[1]] ) )
    tmp.sort( key = lambda x: len( x[0] ) )
    data[lan[0]], data[lan[1]] = zip( *tmp )

"""Number to One-hot

Only convert sentences which length are small than 30.

Args:
    data: a dictionary contains sentences of each language.  Its structure is:
    
          {language A: [[word1, word2, ...], [...], ...], language B: ...}.
    
    lan: a list of language. The first one is the original language, the second
         one is the target language. For example:

         [language A, language B].

Returns:
    status: boolean value. True represents successfully extract batch data. False
            represents extract nothing from original data.
    ndata: data like dictionary.
"""
def toCategory( data, lan ):
    maxlLan0 = 0
    maxlLan1 = 0
    n = 0
    for i in range( len( data[lan[0]] ) ):
        if len( data[lan[0]][i] ) <= 32:
            n += 1
            maxlLan0 = max( maxlLan0, len( data[lan[0]][i] ) )
            maxlLan1 = max( maxlLan1, len( data[lan[1]][i] ) )
    if n == 0:
        return False, [], []
    lan0 = np.zeros( ( n, maxlLan0 ) )
    lan1 = np.zeros( ( n, maxlLan1 ) )
    n = 0
    for i in range( len( data[lan[0]] ) ):
        if len( data[lan[0]][i] ) <= 32:
            for j in range( len( data[lan[0]][i] ) ):
                lan0[n, j] = data[lan[0]][i][j]
            for j in range( len( data[lan[1]][i] ) ):
                lan1[n, j] = data[lan[1]][i][j]
            n += 1
    data[lan[0]] = lan0
    data[lan[1]] = lan1
    return True, data

def toCategoryWrap( args ):
    return toCategory( *args )

""" Simple Seqence to Sequence Implementation

A simple implementation of Sequence to Sequence model. It works as baseline

Args:
    input_dim:  dimension of input word vector.
    output_dim: dimension of output word vector.
    hidden_dim: dimension of hidden states vector.
    output_vocab_size: size of output language vocabulary size.
    input_vocab_size:  size of input  language vocabulary size.
    word_vec_dim: dimension of word-vector.
    name: name of the model.

Returns:
    model: the whole model of simple Seq2Seq model.

"""
def simpleSeq2Seq( output_vocab_size, input_vocab_size, hidden_dim = 128,
                   word_vec_dim = 300, name = "demo" ):
    embedding_encoder  = Embedding( output_dim = word_vec_dim, input_dim = input_vocab_size,
                                 name = name + "_encoder_embedding", mask_zero = True ) # 
    embedding_decoder = Embedding( output_dim = word_vec_dim, input_dim = output_vocab_size,
                                 name = name + "_decoder_embedding", mask_zero = True ) # 
    # Encoder
    encoder_input     = Input( shape = ( None, ), name = name + "_encoder_input" )
    # change when using pre-trained embedding trainable= False
    encoder           = LSTM( hidden_dim, return_state = True )
    encoder_input_emb = embedding_encoder( encoder_input )
    _, state_h, state_c = encoder( encoder_input_emb )
    state_encoder     = [state_h, state_c]
    # Decoder
    decoder = LSTM( hidden_dim, return_sequences = True )

    decoder_input     = Input( shape = ( None, ), name = name + "_decoder_input" )
    decoder_input_emb = embedding_decoder( decoder_input )
    decoder_outputs   = decoder( decoder_input_emb, initial_state = state_encoder )
    decoder_dense     = Dense( output_vocab_size, activation = "softmax", name = name + "_decoder_output" )
    decoder_outputs   = decoder_dense( decoder_outputs )

    # Build model
    model = Model( inputs = [encoder_input, decoder_input], outputs = decoder_outputs, name = name )
    model.compile( optimizer = 'adam', loss = "categorical_crossentropy" )
    return model

In [3]:
trainData, devData = getTrainData( "../../Data/test/" )
wordNumDict, numWordDict = generateDict( trainData, threshold = 5 )
sortByOriLan( trainData, ["chinese", "english"] )
ivs = len( wordNumDict["chinese"] )
ovs = len( wordNumDict["english"] )
print( ivs, ovs )

#trainData["chinese"] = trainData["chinese"][::-1]
#devData["chinese"] = devData["chinese"][::-1]
#trainData["english"] = trainData["english"][::-1]
#devData["english"] = devData["english"][::-1]

Reading chinese files...
Reading english files...
[['<S>', '上海', '信息', '交互', '网', '采用', '的', '网络', '技术', '与', '当今', '国际', '互联', '技术', '的', '发展', '同步', ',', '是', '目前', '世界', '先进', '网络', '技术', '在', '上海', '的', '成功', '运用', '</S>'], ['<S>', '章启月', '说', ',', '中国', '在', '国际', '反恐', '斗争', '中', '发挥', '了', '建设性', '作用', ',', '坚决', '反对', '一切', '形式', '的', '恐怖主义', ',', '积极', '参与', '国际', '反恐', '合作', ',', '受到', '国际', '社会', '的', '广泛', '好评', '</S>'], ['<S>', '李鹏', '说', ':', '"', '马里', '是', '最', '早', '同', '中国', '建交', '的', '非洲', '国家', '之一', '中', '马', '友好', '关系', '始终', '健康', '顺利', '地', '发展', ',', '双方', '的', '合作', '卓有', '成效', ',', '我们', '希望', '两', '国', '的', '友好', '合作', '关系', '长期', '稳定', '地', '发展', '下去', '"', '</S>'], ['<S>', '据', '统计', ',', '19', '月', '全', '国', '乡', '及', '乡', '以上', '工业', '企业', '累计', '完成', '工业', '增加值', '11558', '亿', '元', ',', '比', '上', '年', '同', '期', '增长', '百分之十六点六', '</S>'], ['<S>', '这', '位', '不', '愿', '透露', '姓名', '的', '官员', '表示', ',', '在', '最', '坏', '的', '情况', '下', ',', '短期', '「', '将', '造成

In [4]:
print( "Building model..." )
model = simpleSeq2Seq( output_vocab_size = ovs, input_vocab_size = ivs, name = "demo" )
model.summary()

batch_size = 64
losses = []
n = 0
total = len( trainData["chinese"] )
length = len( wordNumDict["english"] )

Building model...
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
demo_encoder_input (InputLayer) (None, None)         0                                            
__________________________________________________________________________________________________
demo_decoder_input (InputLayer) (None, None)         0                                            
__________________________________________________________________________________________________
demo_encoder_embedding (Embeddi (None, None, 300)    3080100     demo_encoder_input[0][0]         
__________________________________________________________________________________________________
demo_decoder_embedding (Embeddi (None, None, 300)    2848200     demo_decoder_input[0][0]         
___________________________________________________________________________________________

In [None]:
print( "Parallel processing training data" )
# cores = multiprocessing.cpu_count()
# pool = multiprocessing.Pool( processes = cores )
params = []
for i in range( 0, total + batch_size, batch_size ):
    # Divide data into batch
    tdata = {}
    tdata["chinese"] = trainData["chinese"][i:i + batch_size]
    tdata["english"] = trainData["english"][i:i + batch_size]
    # Combine all params
    params.append( [tdata, ["chinese", "english"]] )
print( "MAP" )
rets = []
for param in params:
    rets.append( toCategoryWrap( param ) )
# for ret in pool.imap_unordered( toCategoryWrap, params ):
#     rets.append( ret )
# rets = pool.map( toCategoryWrap, params )
# pool.close()
# pool.join()

Parallel processing training data
MAP


In [None]:
print( rets[1][1]["chinese"] )

[[1.000e+00 1.100e+02 1.435e+03 1.120e+02 2.000e+00]
 [1.000e+00 3.140e+02 2.950e+02 3.000e+00 2.000e+00]
 [1.000e+00 1.100e+02 1.435e+03 1.120e+02 2.000e+00]
 [1.000e+00 2.221e+03 3.525e+03 3.000e+00 2.000e+00]
 [1.000e+00 1.100e+02 1.435e+03 1.120e+02 2.000e+00]
 [1.000e+00 1.100e+02 1.435e+03 1.120e+02 2.000e+00]
 [1.000e+00 1.100e+02 1.435e+03 1.120e+02 2.000e+00]
 [1.000e+00 2.336e+03 8.520e+02 8.225e+03 2.000e+00]
 [1.000e+00 1.100e+02 1.435e+03 1.120e+02 2.000e+00]
 [1.000e+00 2.010e+03 2.950e+02 9.170e+02 2.000e+00]
 [1.000e+00 3.000e+00 8.520e+02 3.103e+03 2.000e+00]
 [1.000e+00 1.100e+02 1.435e+03 1.120e+02 2.000e+00]
 [1.000e+00 1.100e+02 1.435e+03 1.120e+02 2.000e+00]
 [1.000e+00 3.000e+00 1.740e+02 3.000e+00 2.000e+00]
 [1.000e+00 1.100e+02 1.435e+03 1.120e+02 2.000e+00]
 [1.000e+00 1.100e+02 1.435e+03 1.120e+02 2.000e+00]
 [1.000e+00 1.100e+02 1.435e+03 1.120e+02 2.000e+00]
 [1.000e+00 1.100e+02 1.435e+03 1.120e+02 2.000e+00]
 [1.000e+00 1.100e+02 1.435e+03 1.120e+02 2.00

In [None]:
#model.load_weights( "Models/old/model_weights_18000.h5" )
#n = 18000

# Multi-thread
que = queue.Queue( 5 )
i = 0
lrets = len( rets )

inputs = [rets[1][1]["chinese"], rets[1][1]["english"]]
outputs = K.utils.to_categorical( rets[1][1]["english"], length )
loss = model.train_on_batch( inputs, outputs )
print( loss )

def putToQueue():
    global i, rets, lrets, que, length
    print( i, lrets, length )
    while i < lrets:
        if rets[i][0] == False:
            i += 1
            continue
        if not que.full():
            print( "2-", i )
            que.put( [i, K.utils.to_categorical( rets[i][1]["english"], length )] )
            i += 1

def run():
    global i, rets, lrets, que, model, losses, length
    print( i, lrets )
    while i < lrets or not que.empty():
        if not que.empty():
            tmp = que.get()
            idx = tmp[0]
            tmp = tmp[1]
            que.task_done()
            if rets[idx][0] == False:
                continue
            ret = rets[idx]
            inputs = [ret[1]["chinese"], ret[1]["english"]]
            outputs = tmp
            loss = model.train_on_batch( inputs, outputs )
            losses.append( loss )
            if idx and idx % 3000 == 0:
                model.save_weights("Models/model_weights_" + str( idx ) + ".h5" ) 

print( "Training..." )
t1 = threading.Thread( target = putToQueue )
t2 = threading.Thread( target = run )
t1.start()
t2.start()
t1.join()
t2.join()

W0315 05:48:33.125375 11084 deprecation.py:323] From C:\Users\Ruosen\Anaconda3\lib\site-packages\tensorflow\python\ops\array_grad.py:425: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.


9.157951
Training...
00  512512 9494

2- 0
2- 1
2- 2
2- 3
2- 4
2- 5
2- 6
2- 7
2- 8
2- 9
2- 10
2- 11
2- 12
2- 13
2- 14
2- 15
2- 16
2- 17
2- 18
2- 19
2- 20
2- 21
2- 22
2- 23
2- 24
2- 25
2- 26
2- 27
2- 28
2- 29
2- 30
2- 31
2- 32
2- 33
2- 34
2- 35
2- 36
2- 37
2- 38
2- 39
2- 40
2- 41


In [None]:
## for ret in rets:
# #    status, newTrainData, td = toCategory( trainData, ["chinese", "english"], length, i, min( i + batch_size, total ) )
#     if ret[0] == False:
#         continue
#     label = K.utils.to_categorical( ret[1]["english"], length )
#     loss = model.train_on_batch( [ret[1]["chinese"], ret[1]["english"]], label )
#     n += 1
#     print( n, loss )
#     if n and n % 3000 == 0:
#         model.save_weights("Models/model_weights_" + str( n ) + ".h5" ) 
#     losses.append( loss )

In [None]:
model.save_weights("Models/model_weights_" + str( lrets ) + ".h5" ) 
with open( "losses", "w" ) as f:
    for loss in losses:
        f.write( str( loss ) + "\n" )