In [17]:
import os
import nltk
#import trax #google neuralnetwork builder now can only be implemented in ios or linux
#from trax import layers as tl
#from trax.supervised import training
#from trax.fastmath import numpy as fastnp #make np calculate faster
import numpy as np
import pandas as pd
import random
import jieba #Chinese character segmentation tools



# Preprocessing the data and creating the vocabulary

In [15]:
data = pd.read_csv("train.txt",sep='\\t',header=None,engine="python",names=["sent1","sent2","is_duplicate"])
n_len=len(data)
print('Number of sents for total: ', n_len)
data.head(10)

Number of sents for total:  100000


Unnamed: 0,sent1,sent2,is_duplicate
0,用微信都6年，微信没有微粒贷功能,4。 号码来微粒贷,0
1,微信消费算吗,还有多少钱没还,0
2,交易密码忘记了找回密码绑定的手机卡也掉了,怎么最近安全老是要改密码呢好麻烦,0
3,你好 我昨天晚上申请的没有打电话给我 今天之内一定会打吗？,什么时候可以到账,0
4,"“微粒贷开通""",你好，我的微粒贷怎么没有开通呢,0
5,为什么借款后一直没有给我回拨电话,怎么申请借款后没有打电话过来呢！,1
6,为什么我每次都提前还款了最后却不给我贷款了,30号我一次性还清可以不,0
7,请问一天是否都是限定只能转入或转出都是五万。,微众多少可以赎回短期理财,0
8,微粒咨询电话号码多少,你们的人工客服电话是多少,1
9,已经在银行换了新预留号码。,我现在换了电话号码，这个需要更换吗,1


In [16]:
#slit dataframe data
data_train = data.sample(frac=0.9,random_state=0,axis=0) # make my each random sample same
data_test  = data[~data.index.isin(data_train.index)]
data_train["sent1+sent2"]=data_train["sent1"]+data_train["sent2"]
print("Data_train:", len(data_train), "Data_test:", len(data_test))
del(data) # free the memory

Data_train: 90000 Data_test: 10000


In [79]:
data_train.head()

Unnamed: 0,sent1,sent2,is_duplicate,sent1+sent2
3582,何时可以受邀？,何时会激请我？,1,何时可以受邀？何时会激请我？
60498,如何更改还款卡号？,如何更换还款联系电话,0,如何更改还款卡号？如何更换还款联系电话
53227,23号凌晨没存进去，白天存进去算逾期吗,。我今天早上还款算逾期吗？今天凌晨到期我晕,1,23号凌晨没存进去，白天存进去算逾期吗。我今天早上还款算逾期吗？今天凌晨到期我晕
21333,如何查看合同,可以帮我看一下合同真假吗,0,如何查看合同可以帮我看一下合同真假吗
3885,为什么我还款了不能再借？我每次还款都没预期啊,提前还款为何不能再借了,0,为什么我还款了不能再借？我每次还款都没预期啊提前还款为何不能再借了


In [18]:
data_duplicate = data_train[data_train['is_duplicate'] == 1] #filter the duplicated data
td_index = list(data_duplicate.index)
print('number of duplicate sents: ', len(td_index))
print('indexes of first ten duplicate sents:', td_index[:10])

number of duplicate sents:  44924
indexes of first ten duplicate sents: [3582, 53227, 51521, 10685, 41032, 49392, 65942, 2216, 81976, 85471]


In [19]:
print(data_train['sent1'][3582])  #  Example of similar sentences
print(data_train['sent2'][3582])
print('is_duplicate: ', data_train['is_duplicate'][5])

何时可以受邀？
何时会激请我？
is_duplicate:  1


In [20]:
S1_train_words = np.array(data_train['sent1'][td_index])
S2_train_words = np.array(data_train['sent2'][td_index])

S1_test_words = np.array(data_test['sent1'])
S2_test_words = np.array(data_test['sent2'])
y_test  = np.array(data_test['is_duplicate'])
data_train["sent1+sent2"]=data_train["sent1+sent2"][td_index]

In [21]:
#create empty arrays
S1_train=np.asarray([None]*S1_train_words.shape[0])
S2_train = np.asarray([None]*S2_train_words.shape[0])

S1_test = np.asarray([None]*S1_test_words.shape[0])
S2_test = np.asarray([None]*S2_test_words.shape[0])

In [22]:
#Constructing the vocabulary with the train set      
from collections import defaultdict

vocab = defaultdict(lambda: 0)
vocab['<PAD>'] = 1 #set pad as the first vocab

for idx in td_index:                          #cut_all=False means the sentence was tokenized by semantic unit not character
    word_list = " ".join(jieba.cut(data_train.at[idx,"sent1+sent2"], cut_all=False)).split() #jieba tokenize will return an object
    for w in word_list:
        if w in vocab:
            continue
        else:
            vocab[w] = len(vocab) + 1
print('The length of the vocabulary is: ', len(vocab))

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\Dell\AppData\Local\Temp\jieba.cache
Loading model cost 1.728 seconds.
Prefix dict has been built successfully.


The length of the vocabulary is:  4815


In [23]:
print(vocab['<PAD>'])
print(vocab['喝'])
print(vocab['微'])  #if word not in vocabulary, returns 0

1
0
483


In [24]:
print('Train set was downsized to: ', len(S1_train) ) 
print('Test set length: ', len(S1_test) )

Train set was downsized to:  44924
Test set length:  10000


In [25]:
# Converting sents to array of integers
for i in range(len(S1_train)):
    S1_train[i] = [vocab[word] for word in S1_train_words[i]]
    S2_train[i] = [vocab[word] for word in S2_train_words[i]]

        
for i in range(len(S1_test)):
    S1_test[i] = [vocab[word] for word in S1_test_words[i]]
    S2_test[i] = [vocab[word] for word in S2_test_words[i]]

In [26]:
# Splitting the data to train and validation
# shuffle the dataset
np.random.shuffle(S1_train)
np.random.shuffle(S2_train)
split_n = int(len(S1_train)*.9)
train_S1, train_S2 = S1_train[:split_n], S2_train[:split_n]
val_S1, val_S2 = S1_train[split_n: ], S2_train[split_n:]
print('Number of duplicate sents: ', len(S1_train))
print("The length of the training set is:  ", len(train_S1))
print("The length of the validation set is: ", len(val_S1))

Number of duplicate sents:  44924
The length of the training set is:   40431
The length of the validation set is:  4493


In [27]:
def data_generator(S1, S2, batch_size, pad=1, shuffle=True): #"""Generator function that yields batches of data
    input1 = []
    input2 = []
    idx = 0
    len_s = len(S1)
    sents_indexes = [i for i in range(len_s)]
    
    if shuffle:
        random.shuffle(sents_indexes)
    
    while True:
        if idx >= len_s:
            # if idx is greater than or equal to len_q, set idx to 0 
            idx = 0
            # shuffle to get random batches if shuffle is set to True
            if shuffle:
                random.shuffle(sents_indexes)
        
        # get sents at the sents[idx] position in sent1 and sent2
        s1 = S1[sents_indexes[idx]]
        s2 = S2[sents_indexes[idx]]
        
        # increment idx by 1
        idx += 1
        # append q1
        input1.append(s1)
        # append q2
        input2.append(s2)
        if len(input1) == batch_size:
            # take max of input1 & input2 and then max out of the two of them.
            max_len = max(max([len(q) for q in input1]),
                          max([len(q) for q in input2]))
            max_len = 2**int(np.ceil(np.log2(max_len)))# for the computation
            for idx,(s1, s2) in enumerate(zip(input1, input2)):
                # add [pad] to q1 until it reaches max_len
                input1[idx] = s1 + [pad] * (max_len - len(s1))
                input2[idx] = s2 + [pad] * (max_len - len(s2))
            yield np.array(input1), np.array(input2) # will stop the while loop
            # reset the batches
            input1, input2 = [], []  # reset the batches the next operation will start from the first line after yield

In [28]:
batch_size = 3
batch1, batch2 = next(data_generator(train_S1, train_S2, batch_size))
print("First sents  : ",'\n', batch1, '\n')
print("Second sents : ",'\n', batch2)

First sents  :  
 [[ 910 2397  755 1039 1938  171    1    1    1    1    1    1    1    1
     1    1]
 [1071 4259  281   33 2231 1015 1001    1    1    1    1    1    1    1
     1    1]
 [ 473 2143 1358   19    1    1    1    1    1    1    1    1    1    1
     1    1]] 

Second sents :  
 [[1071 4259 2052  910    0  861  208    1    1    1    1    1    1    1
     1    1]
 [1131 4802  755 1039 1201   10   78  959   14  473 3366 1938   74    1
     1    1]
 [  53  405    0    0  104  105    0    0  133    1    1    1    1    1
     1    1]]


# Define the model and loss function

In [91]:
def Siamese(vocab_size=len(vocab), d_model=128, mode='train'):

    def normalize(x):  # normalizes the vectors to have L2 norm which is better than L1 norm
        return x / fastnp.power(fastnp.sum(x**2, axis=-1, keepdims=True),0.5)
    
    
    s_processor = tl.Serial(                      
        tl.Embedding(vocab_size, d_model),
        # Run GRU. If this is not dim d_model it raises an error
        tl.GRU(d_model),
        tl.Mean(axis=1),  #calculate the mean for normalization
        tl.Fn('Normalize', lambda x: normalize(x)))
        # Apply normalize function  # Returns one vector of shape [batch_size, d_model].
    

    
    # Run on S1 and S2 in parallel.
    model = tl.Parallel(s_processor, s_processor)# Processor will run on both S1 and S2.
    return model

In [92]:
# check the model architecture
model = Siamese()
print(model)

Parallel_in2_out2[
  Serial[
    Embedding_5605_128
    GRU_128
    Mean
    Normalize
  ]
  Serial[
    Embedding_5605_128
    GRU_128
    Mean
    Normalize
  ]
]


## Mean negative examples will accelerate the training and closest negative examples contribute to penalizing the cost more.

In [93]:
def TripletLossFn(v1, v2, margin=0.25):
    
    # use fastnp to take the dot product of the two batches and the second one needs to be transposed
    scores = fastnp.dot(v1, v2.T)  # pairwise cosine similarity
    # calculate new batch size
    batch_size = len(scores)
    # use fastnp to get all postive `diagonal` entries in `scores`
    positive = fastnp.diagonal(scores)  # the positive ones (duplicates)
    empty=np.zeros([batch_size,batch_size])
    np.fill_diagonal(empty,1)
    # multiply `empty` with 2.0 and subtract it out of `scores`
    negative_without_positive = scores - 2.0 * empty
    # take the row by row `max` of `negative_without_positive`. 
    closest_negative = negative_without_positive.max(axis=1) #  means cloest negative examples
    # subtract `fastnp.eye(batch_size)` out of 1.0 and do element-wise multiplication with `scores`
    negative_zero_on_duplicate = scores * (1.0 - empty)
    # use `fastnp.sum` on `negative_zero_on_duplicate` for `axis=1` and divide it by `(batch_size - 1)` 
    mean_negative = np.sum(negative_zero_on_duplicate, axis=1) / (batch_size-1) # get the mean value of negative examples
    # compute `fastnp.maximum` among 0.0 and `A`
    triplet_loss1 = fastnp.maximum(0.0, margin - positive + closest_negative)
    # compute `fastnp.maximum` among 0.0 and `B`
    triplet_loss2 = fastnp.maximum(0.0, margin - positive + mean_negative)
    # add the two losses together and take the `fastnp.mean` of it
    triplet_loss = fastnp.mean(triplet_loss1 + triplet_loss2) # make sure the model will learn more neagtive examples
    
    
    return triplet_loss

In [94]:
from functools import partial # fix the margin, so we don't have call margin everytime
def TripletLoss(margin=0.25):
    triplet_loss_fn = partial(TripletLossFn, margin=margin)
    return tl.Fn('TripletLoss', triplet_loss_fn)

In [95]:
batch_size = 256
train_generator = data_generator(train_S1, train_S2, batch_size, vocab['<PAD>'])
val_generator = data_generator(val_S1, val_S2, batch_size, vocab['<PAD>'])
print('train_S1.shape ', train_S1.shape)
print('val_S1.shape   ', val_S1.shape)

train_S1.shape  (40431,)
val_S1.shape    (4493,)


# Train the model and test the model

In [96]:
lr_schedule = trax.lr.warmup_and_rsqrt_decay(400, 0.01) # apply the warmup strategy #Value for learning rate after warm-up has finished

def train_model(lr_schedule, train_generator=train_generator, val_generator=val_generator, output_dir='model/'):
    
    output_dir = os.path.dirname(output_dir)


    train_task = training.TrainTask(
        labeled_data=train_generator,         # Use generator (train)
        loss_layer=TripletLoss(),             # Use triplet loss. 
        optimizer=trax.optimizers.Adafactor(0.01), # chosse the Adafactor as the optimizer
        lr_schedule=lr_schedule,              # Use Trax multifactor schedule function
    )

    eval_task = training.EvalTask(
        labeled_data=val_generator,       # Use generator (val)
        metrics=[TripletLoss()],          # Use triplet loss. 
    )
    

    training_loop = training.Loop(Siamese(),
                                  train_task,
                                  eval_tasks=[eval_task],
                                  output_dir=output_dir,
                                 random_seed=34) #set the random seed to keep the same result each time

    return training_loop

In [97]:
def test(test_S1, test_S2, y, threshold, model, vocab, data_generator=data_generator, batch_size=64):
    accuracy = 0
    for i in range(0, len(test_S1), batch_size):
        # Call the data generator with shuffle=False using next()
        # use batch size chuncks of sentences as S1 & S2 arguments of the data generator. 
        s1, s2 = next(data_generator(test_S1[i:i + batch_size], test_S2[i:i + batch_size], batch_size, vocab['<PAD>'], shuffle=False))
        # use batch size chuncks of actual output targets
        y_test2 = y[i:i + batch_size]
        # Call the model
        v1, v2 = model((s1, s2)) 

        for j in range(batch_size):
            # take dot product to compute cos similarity of each pair of entries, v1[j], v2[j]
            d = np.dot(v1[j], v2[j].T) # second one needs to be transposed
            # is d greater than the threshold?
            res = d > threshold
            # increment accurancy if y_test is equal `res`
            accuracy += (y_test2[j] == res)
    # compute accuracy using accuracy and total length of test sentences
    accuracy = accuracy / len(test_S1)
    
    return accuracy

In [98]:
def predict(sent1, sent2,  model, vocab, data_generator=data_generator, verbose=False):

    # use jieba word tokenize function to tokenize Chinese characters
    s1 = " ".join(jieba.cut(sent1, cut_all=False)).split()  # tokenize
    s2 = " ".join(jieba.cut(sent2, cut_all=False)).split()  # tokenize
    S1, S2 = [], []
    for word in s1:  # encode s1
        # increment by checking the 'word' index in `vocab`
        S1 += [vocab[word]]
    for word in s2:  # encode s2
        # increment by checking the 'word' index in `vocab`
        S2 += [vocab[word]]
        
    # Call the data generator  using next()
    # pass [s1] & [s2] as s1 & s2 arguments of the data generator. Set batch size as 1
    S1, S2 = next(data_generator([S1], [S2], 1, vocab['<PAD>']))
    # Call the model
    v1, v2 = model((S1, S2)) 
    # take dot product to compute cos similarity of each pair of entries, v1, v2
    d = np.dot(v1[0], v2[0].T) #get the similarity score
    
    
    return d

In [544]:
train_steps = 800
training_loop = train_model(lr_schedule)
training_loop.run(train_steps)


Step      1: Total number of trainable weights: 816256
Step      1: Ran 1 train steps in 5.79 secs
Step      1: train TripletLoss |  0.50076336
Step      1: eval  TripletLoss |  0.50642514

Step    100: Ran 99 train steps in 114.24 secs
Step    100: train TripletLoss |  0.50036424
Step    100: eval  TripletLoss |  0.50189894

Step    200: Ran 100 train steps in 147.81 secs
Step    200: train TripletLoss |  0.48591897
Step    200: eval  TripletLoss |  1.04569507

Step    300: Ran 100 train steps in 152.23 secs
Step    300: train TripletLoss |  0.21912900
Step    300: eval  TripletLoss |  1.11851835

Step    400: Ran 100 train steps in 147.46 secs
Step    400: train TripletLoss |  0.14935488
Step    400: eval  TripletLoss |  1.07237673

Step    500: Ran 100 train steps in 149.09 secs
Step    500: train TripletLoss |  0.10622285
Step    500: eval  TripletLoss |  1.14495122

Step    600: Ran 100 train steps in 153.19 secs
Step    600: train TripletLoss |  0.06816491
Step    600: eval  Tri

In [99]:
model=Siamese()
model.init_from_file('./model/model.pkl.gz')



(((array([[-0.10104602,  0.02889115, -0.10710341, ...,  0.17211324,
            0.11238155, -0.00545579],
          [-0.00494196,  0.00592238, -0.08847794, ..., -0.02457219,
            0.12076508, -0.00646752],
          [ 0.07731185,  0.11890233, -0.08921237, ...,  0.14748329,
            0.13896737, -0.12982987],
          ...,
          [ 0.11542351, -0.08621721,  0.13382736, ..., -0.09279124,
           -0.03013758, -0.136706  ],
          [-0.10708649,  0.11113342,  0.10610324, ..., -0.079561  ,
           -0.12771045,  0.02540564],
          [-0.09707364,  0.12381083, -0.07750284, ...,  0.01700191,
           -0.09695008,  0.13629757]], dtype=float32),
   (((), ((), ())),
    ((array([[ 0.00669629,  0.00151234, -0.01979298, ...,  0.00151354,
              -0.01387604, -0.00649653],
             [-0.00878541,  0.02212545,  0.00285433, ..., -0.00119053,
              -0.00374965, -0.00775282],
             [-0.0116117 , -0.02906881, -0.02117639, ...,  0.01683593,
               0.

In [623]:
accuracy = test(S1_test,S2_test, y_test, 0.7, model, vocab, batch_size = 500) 
print("Accuracy", accuracy)

Accuracy 0.5254


## Assume "他很好" is reference

In [661]:
predict("他很好","他非常好",model,vocab,verbose=True) 

0.96708274

based on the result below, we can see that if we use bleu score to evlauate it, the 1 gram score will be 0.5, but actually these two sentences have pretty much same meaning. If we use rogue elvaluation metric, the score will be 0.67, which can not still reflect the real MT quality.

In [660]:
predict("他很好","他很出色",model,vocab,verbose=True)

0.8861375

## Assume "怎样修改号码 is reference

In [668]:
predict("怎样修改号码","如何修改号码",model,vocab,verbose=True)

0.7488622

In [678]:
predict("怎样修改号码","怎样改变号码",model,vocab,verbose=True)

0.7620132

## Train the model with less steps

In [630]:
train_steps = 300
training_loop = train_model(lr_schedule,output_dir='model2/')
training_loop.run(train_steps)


Step      1: Total number of trainable weights: 816128
Step      1: Ran 1 train steps in 5.03 secs
Step      1: train TripletLoss |  0.50999707
Step      1: eval  TripletLoss |  0.50219840

Step    100: Ran 99 train steps in 73.89 secs
Step    100: train TripletLoss |  0.50474632
Step    100: eval  TripletLoss |  0.50047755

Step    200: Ran 100 train steps in 77.71 secs
Step    200: train TripletLoss |  0.47630322
Step    200: eval  TripletLoss |  0.57024050

Step    300: Ran 100 train steps in 78.18 secs
Step    300: train TripletLoss |  0.13668355
Step    300: eval  TripletLoss |  0.75268292


In [631]:
model2=Siamese()
model2.init_from_file('./model2/model.pkl.gz')

(((array([[-0.09794217,  0.07738896, -0.11017518, ...,  0.10171206,
            0.04932896, -0.01923384],
          [-0.0230925 ,  0.00876028, -0.11206853, ..., -0.04132047,
            0.11689867, -0.00949296],
          [ 0.07769928,  0.11949887, -0.08965942, ...,  0.14822248,
            0.1396659 , -0.13048118],
          ...,
          [ 0.11600288, -0.08664934,  0.13449791, ..., -0.0932569 ,
           -0.03028884, -0.13739145],
          [-0.10762369,  0.11169153,  0.10663596, ..., -0.07995961,
           -0.1283512 ,  0.02553323],
          [-0.09756147,  0.12443251, -0.0778916 , ...,  0.01708739,
           -0.09743696,  0.13698302]], dtype=float32),
   (((), ((), ())),
    ((array([[ 9.78134479e-03,  8.52713478e-04, -5.36919106e-03, ...,
               1.00522665e-02, -4.56843816e-04, -9.80535173e-04],
             [-6.45524589e-03,  9.66709387e-03, -5.77989733e-03, ...,
               5.65170031e-03,  5.55751147e-04, -3.93061247e-03],
             [ 9.50212780e-05, -3.708902

In [632]:
accuracy = test(S1_test,S2_test, y_test, 0.7, model2, vocab, batch_size = 500) 
print("Accuracy", accuracy)

Accuracy 0.5764


In [655]:
predict("他很好","他非常好",model2,vocab,verbose=True)

0.8709724

In [659]:
predict("他很好","他很出色",model2,vocab,verbose=True)

0.8197402

In [679]:
predict("怎样修改号码","如何修改号码",model2,vocab,verbose=True)

0.74816334

In [681]:
predict("怎样修改号码","怎样改变号码",model2,vocab,verbose=True)

0.6986873

In [648]:
train_steps = 400
training_loop = train_model(lr_schedule,output_dir='model4/')
training_loop.run(train_steps)


Step      1: Total number of trainable weights: 816128
Step      1: Ran 1 train steps in 5.03 secs
Step      1: train TripletLoss |  0.50999707
Step      1: eval  TripletLoss |  0.50219840

Step    100: Ran 99 train steps in 35.32 secs
Step    100: train TripletLoss |  0.50474632
Step    100: eval  TripletLoss |  0.50047755

Step    200: Ran 100 train steps in 47.82 secs
Step    200: train TripletLoss |  0.47630322
Step    200: eval  TripletLoss |  0.57024050

Step    300: Ran 100 train steps in 66.41 secs
Step    300: train TripletLoss |  0.13668355
Step    300: eval  TripletLoss |  0.75268292

Step    400: Ran 100 train steps in 74.59 secs
Step    400: train TripletLoss |  0.06465720
Step    400: eval  TripletLoss |  0.73997897


In [651]:
model4=Siamese()
model4.init_from_file('./model4/model.pkl.gz')

(((array([[-0.09432655,  0.08404654, -0.10992641, ...,  0.10568206,
            0.04466164, -0.01650544],
          [-0.02112508,  0.00744944, -0.10839731, ..., -0.03950041,
            0.1107209 , -0.00784869],
          [ 0.0776218 ,  0.11937916, -0.08957   , ...,  0.14807346,
            0.13952583, -0.13035005],
          ...,
          [ 0.11588665, -0.08656292,  0.1343638 , ..., -0.09316377,
           -0.03025847, -0.13725436],
          [-0.10751566,  0.11157978,  0.10652941, ..., -0.07987989,
           -0.12822305,  0.02550771],
          [-0.09746387,  0.12430809, -0.07781336, ...,  0.01707025,
           -0.09733935,  0.13684593]], dtype=float32),
   (((), ((), ())),
    ((array([[ 0.00948785,  0.00025126, -0.00730977, ...,  0.01080846,
              -0.00014758,  0.00044009],
             [-0.00745586,  0.00868924, -0.00682038, ...,  0.00601123,
               0.00183884, -0.00259202],
             [-0.00239765, -0.00643752, -0.00682915, ...,  0.01017619,
               0.

In [652]:
accuracy = test(S1_test,S2_test, y_test, 0.7, model4, vocab, batch_size = 500) 
print("Accuracy", accuracy)

Accuracy 0.5652


In [656]:
predict("他很好","他非常好",model4,vocab,verbose=True)

0.88383526

In [658]:
predict("他很好","他很出色",model4,vocab,verbose=True)

0.8027178

In [682]:
predict("怎样修改号码","如何修改号码",model4,vocab,verbose=True)

0.7491411

In [683]:
predict("怎样修改号码","怎样改变号码",model4,vocab,verbose=True)

0.70876765

Based on the accuracy and predict result, I chose the first model I trained to implement the EQ of MT. Due to the time limit, I selected parallel zh-en coprus just with a few sentences and transalte by Google translate and use bleu, rouge and siamese to evaluate the MT quality respectively.

# Google translate the sentences

In [97]:
import pandas as pd
df=pd.read_csv("news-commentary-v14.en-zh.tsv",sep="\t",engine="python",nrows=5,names=["source","reference"],dtype={"source":str(),"reference":str()})

In [98]:
df["reference"]

0                                        1929年还是1989年?
1    巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正...
2    一开始，很多人把这次危机比作1982年或1973年所发生的情况，这样得类比是令人宽心的，因为...
3    如今人们的心情却是沉重多了，许多人开始把这次危机与1929年和1931年相比，即使一些国家政...
4               目前的趋势是，要么是过度的克制（欧洲 ） ， 要么是努力的扩展（美国 ） 。
Name: reference, dtype: object

In [42]:
source=df["source"].values.tolist()
reference=df["reference"].values.tolist()

In [57]:
from google_trans_new import google_translator  
translator = google_translator() 
google_translation=[]
for i,sent in enumerate(source):
    google_translation.append(translator.translate(sent,lang_tgt="zh-cn").strip().replace(" - ","-"))
    
google_translation

['1929年或1989年？',
 '巴黎-随着经济危机加深和扩大，世界一直在寻求历史模拟，以帮助我们了解发生了什么。',
 '在危机的开始时，很多人将它与1982年或1973年相比，这是令人放心的，因为这两个日期都指的是经典的周期性下滑。',
 '今天，这种情绪更加严峻，参考于1929年和1931年开始比比皆是，即使一些政府继续表现得好像危机比特殊的危机更加古典。',
 '趋势是过度克制（欧洲）或努力的扩散（美国）。']

In [44]:
reference

['1929年还是1989年?',
 '巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正在发生的情况。',
 '一开始，很多人把这次危机比作1982年或1973年所发生的情况，这样得类比是令人宽心的，因为这两段时期意味着典型的周期性衰退。',
 '如今人们的心情却是沉重多了，许多人开始把这次危机与1929年和1931年相比，即使一些国家政府的表现仍然似乎把视目前的情况为是典型的而看见的衰退。',
 '目前的趋势是，要么是过度的克制（欧洲 ） ， 要么是努力的扩展（美国 ） 。']

# Google translate with Bleu evaluation

In [58]:
reference_token=[]
google_translation_token=[]
for sent1,sent2 in zip(reference,google_translation):
    reference_token.append([" ".join(jieba.cut(sent1, cut_all=False)).split()])
    google_translation_token.append(" ".join(jieba.cut(sent2, cut_all=False)).split())

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.499 seconds.
Prefix dict has been built successfully.


In [125]:
# Using bleu score to evaluate the MT quality.
from nltk.translate.bleu_score import corpus_bleu
score_bleu1=corpus_bleu(reference_token,google_translation_token,weights=[1,0,0,0]) # align different weights to differnet n-grams
score_bleu2=corpus_bleu(reference_token,google_translation_token,weights=[0,2,0,0])
score_bleu3=corpus_bleu(reference_token,google_translation_token,weights=[0.5,0.2,0.2,0.1])
print("%.2f%%"%(score_bleu*100))
print("%.2f%%"%(score_bleu2*100))
print("%.2f%%"%(score_bleu3*100))

39.29%
7.93%
26.51%


# Google translate with Rouge evalution

ROUGE-1 refers to the overlap of unigram (each word) between the system and reference summaries.
ROUGE-2 refers to the overlap of bigrams2 between the system and reference summaries.
ROUGE-L: Longest Common Subsequence based statistics. Longest common subsequence problem takes into account sentence level structure similarity naturally and identifies longest co-occurring in sequence n-grams automatically.

In [121]:
# from rouge_metric import PerlRouge
# rouge=PerlRouge(rouge_n_max=3,rouge_l=True,rouge_w=True,rouge_w_weight=1.2,rouge_s=True,rouge_su=True,skip_gap=4)

# translation_rouge=["\n".join(google_translation)]
# referance_rouge=[["\n".join(reference)]]
# score=rouge.evaluate(translation_rouge,referance_rouge)
# print(score)
# I tried to use rouge_metric, but it didnt work on the Chinese characters, so I chose another module
import lawrouge
translation_rouge=["\n".join(google_translation)]
referance_rouge=["\n".join(reference)]
rouge = lawrouge.Rouge(exclusive=True) # exclusive=True means the text is the collection of characters
scores = rouge.get_scores(translation_rouge, referance_rouge, avg=1)# avg=1 means After splitting the two substrings passed by the user according to the sentence separator, the average Rouge value is returned.
print(scores)

{'rouge-1': {'f': 0.6798418922507773, 'p': 0.7226890756302521, 'r': 0.6417910447761194}, 'rouge-2': {'f': 0.4266666617102223, 'p': 0.47058823529411764, 'r': 0.3902439024390244}, 'rouge-l': {'f': 0.6561264772310144, 'p': 0.6974789915966386, 'r': 0.6194029850746269}}


In [46]:
translation_rouge

['1929年或1989年？\n巴黎 - 随着经济危机加深和扩大，世界一直在寻求历史模拟，以帮助我们了解发生了什么。\n在危机的开始时，很多人将它与1982年或1973年相比，这是令人放心的，因为这两个日期都指的是经典的周期性下滑。\n今天，这种情绪更加严峻，参考于1929年和1931年开始比比皆是，即使一些政府继续表现得好像危机比特殊的危机更加古典。\n趋势是过度克制（欧洲）或努力的扩散（美国）。']

In [47]:
referance_rouge

[['1929年还是1989年?\n巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正在发生的情况。\n一开始，很多人把这次危机比作1982年或1973年所发生的情况，这样得类比是令人宽心的，因为这两段时期意味着典型的周期性衰退。\n如今人们的心情却是沉重多了，许多人开始把这次危机与1929年和1931年相比，即使一些国家政府的表现仍然似乎把视目前的情况为是典型的而看见的衰退。\n目前的趋势是，要么是过度的克制（欧洲 ） ， 要么是努力的扩展（美国 ） 。']]

# Google translate with Siamese evaluation

I used the model I trained to evaluate the MT quality，because my evaluation metric is by semantics, I chose to use averaged weight to calculate the overall score for all 5 sentences(hereby I assumed the longer the sentence is, the more semantics weight it will contribute to the whole corpus. The weight will be the length of each sentence divided by sum of lenghts of all sentences.

In [107]:
sent_score=[]
sent_length=[]
for sent1,sent2 in zip(reference,google_translation):
    sent_score.append(predict(sent1,sent2,model,vocab,verbose=True))
    sent_length.append(len(sent2))
weights=np.divide(sent_length,sum(sent_length))    

In [116]:
score_siamese=np.average(np.array(sent_score),weights=weights)
print("%.2f%%"%(score_siamese*100))

76.80%


In [9]:
import pandas as pd
df=pd.read_csv("news-commentary-v14.en-zh.tsv",sep="\t",engine="python",nrows=5,names=["source","reference"],dtype={"source":str(),"reference":str()})

In [10]:
df["reference"].replace("1929","o0000")

0                                        1929年还是1989年?
1    巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正...
2    一开始，很多人把这次危机比作1982年或1973年所发生的情况，这样得类比是令人宽心的，因为...
3    如今人们的心情却是沉重多了，许多人开始把这次危机与1929年和1931年相比，即使一些国家政...
4               目前的趋势是，要么是过度的克制（欧洲 ） ， 要么是努力的扩展（美国 ） 。
Name: reference, dtype: object

In [8]:
print(df)

0                                        1929 or 1989?
1    PARIS – As the economic crisis deepens and wid...
2    At the start of the crisis, many people likene...
3    Today, the mood is much grimmer, with referenc...
4    The tendency is either excessive restraint (Eu...
Name: source, dtype: object


In [13]:
df["reference"]=df["reference"].apply(lambda x: str(x))

In [12]:
df["reference"]

0                                        1929年还是1989年?
1    巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正...
2    一开始，很多人把这次危机比作1982年或1973年所发生的情况，这样得类比是令人宽心的，因为...
3    如今人们的心情却是沉重多了，许多人开始把这次危机与1929年和1931年相比，即使一些国家政...
4               目前的趋势是，要么是过度的克制（欧洲 ） ， 要么是努力的扩展（美国 ） 。
Name: reference, dtype: object

In [29]:
batch2.T

array([[1071, 1131,   53],
       [4259, 4802,  405],
       [2052,  755,    0],
       [ 910, 1039,    0],
       [   0, 1201,  104],
       [ 861,   10,  105],
       [ 208,   78,    0],
       [   1,  959,    0],
       [   1,   14,  133],
       [   1,  473,    1],
       [   1, 3366,    1],
       [   1, 1938,    1],
       [   1,   74,    1],
       [   1,    1,    1],
       [   1,    1,    1],
       [   1,    1,    1]])

In [30]:
batch1

array([[ 910, 2397,  755, 1039, 1938,  171,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1],
       [1071, 4259,  281,   33, 2231, 1015, 1001,    1,    1,    1,    1,
           1,    1,    1,    1,    1],
       [ 473, 2143, 1358,   19,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1]])

In [31]:
np.dot(batch1,batch2.T)

array([[13825631, 16525303,  1238662],
       [20974896, 24683947,  2120397],
       [12438604, 11878796,   893333]])