In [1]:
# Mount Google Drive at /content/drive so later cells can read the corpus stored under MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import json
from tensorflow.keras import optimizers
import numpy as np

# Containers filled while loading: parsed JSON records, plus the parallel
# English and Chinese sentence lists.
data, en_data, ch_data = [], [], []

def extract(string):
    """Return True if ``string`` contains at least one CJK Unified Ideograph.

    A character qualifies when it lies in the inclusive range U+4E00..U+9FFF.
    An empty string yields False.
    """
    return any('\u4e00' <= ch <= '\u9fff' for ch in string)

# Build a small parallel corpus: the first 100 sentence pairs whose Chinese side
# contains at least one CJK character and is shorter than 10 characters.
# The file is streamed inside a `with` block, so the handle is always closed and
# reading stops at the 100th pair instead of parsing every record into `data` first.
with open('/content/drive/MyDrive/Colab Notebooks/translation2019zh_train.json', 'r', encoding='utf-8') as corpus_file:
    for line in corpus_file:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        record = json.loads(line)
        chinese = record['chinese']
        if extract(chinese) and len(chinese) < 10:
            en_data.append(record['english'])
            # '\t' is the decoder's start-of-sequence character and '。' its end-of-sequence character.
            ch_data.append('\t' + chinese + '。')
            if len(ch_data) >= 100:
                break
# `data` stays an empty list, which is the state this cell has always left behind.
data = []

In [3]:
# Character-level vocabularies and id lookups for both languages.
# The characters are sorted so every run assigns the same ids: iterating a set of
# strings directly follows per-process string hashing and can differ between kernel
# sessions, which would invalidate any saved weights or cached encodings.
en_vocab = set(''.join(en_data))
id2en = sorted(en_vocab)
en2id = {c: i for i, c in enumerate(id2en)}

ch_vocab = set(''.join(ch_data))
id2ch = sorted(ch_vocab)
ch2id = {c: i for i, c in enumerate(id2ch)}

In [4]:
# Preview the first ten English sentences; the extra "\n" argument leaves a blank line after each.
for sample_idx in range(10):
    sentence = en_data[sample_idx]
    print(sentence, "\n")

Look at these coasters over here. 

Choose a recorder. 

I hadn't paid the telephone bill. 

That's easier said than done, of course. 

Side-to-Side Movements. 

about like 80 degrees. 

We all are from Shandong. 

She was possessed by a devil. 

This wool knits up well. 

The majority was wrong last time. 



In [5]:
# Turn every sentence into a list of character ids.
en_num_data = [[en2id[char] for char in sentence] for sentence in en_data]
ch_num_data = [[ch2id[char] for char in sentence] for sentence in ch_data]
# Decoder targets are the decoder inputs shifted left by one step (the leading character is dropped).
de_num_data = [ids[1:] for ids in ch_num_data]
data = []

In [6]:
import numpy as np

# Longest encoder / decoder sequences; they set the padded time dimension.
max_encoder_seq_length = max(len(seq) for seq in en_num_data)
max_decoder_seq_length = max(len(seq) for seq in ch_num_data)
print('max encoder length:', max_encoder_seq_length)
print('max decoder length:', max_decoder_seq_length)

# One-hot tensors of shape (samples, time steps, vocabulary size); positions past the
# end of a sequence stay all-zero.
encoder_shape = (len(en_num_data), max_encoder_seq_length, len(en2id))
decoder_shape = (len(ch_num_data), max_decoder_seq_length, len(ch2id))
encoder_input_data = np.zeros(encoder_shape, dtype='float32')
decoder_input_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')

# Each tensor is filled from its own id sequences with the same triple loop.
one_hot_jobs = (
    (encoder_input_data, en_num_data),
    (decoder_input_data, ch_num_data),
    (decoder_target_data, de_num_data),
)
for tensor, id_sequences in one_hot_jobs:
    for row in range(len(ch_num_data)):
        for step, token_id in enumerate(id_sequences[row]):
            tensor[row, step, token_id] = 1.

max encoder length: 49
max decoder length: 11


In [7]:
# ======= Predefined model hyperparameters =======
EN_VOCAB_SIZE = len(en2id)  # number of distinct English-side characters (encoder one-hot width)
CH_VOCAB_SIZE = len(ch2id)  # number of distinct Chinese-side characters (decoder one-hot width)
HIDDEN_SIZE = 256  # units in every LSTM layer

LEARNING_RATE = 0.001  # passed to the Adam optimizer
BATCH_SIZE = 10  # batch_size argument of model.fit
EPOCHS = 600  # epochs argument of model.fit

In [8]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.optimizers import Adam 
import numpy as np

# ============== encoder ==============
# Two stacked LSTMs over one-hot English characters. The final (h, c) state of each
# layer is kept so it can initialise the matching decoder layer.
encoder_inputs = Input(shape=(None, EN_VOCAB_SIZE))
encoder_lstm1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
encoder_h1, encoder_state_h1, encoder_state_c1 = encoder_lstm1(encoder_inputs)
encoder_lstm2 = LSTM(HIDDEN_SIZE, return_state=True)
encoder_h2, encoder_state_h2, encoder_state_c2 = encoder_lstm2(encoder_h1)

In [9]:
# ============== decoder ==============
decoder_inputs = Input(shape=(None, CH_VOCAB_SIZE))

# The layers are created once and kept in variables so the inference graph can
# re-apply the same trained weights.
lstm1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
lstm2 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
decoder_dense = Dense(CH_VOCAB_SIZE, activation='softmax')

# Each decoder layer starts from the final state of the corresponding encoder layer.
encoder_states_1 = [encoder_state_h1, encoder_state_c1]
encoder_states_2 = [encoder_state_h2, encoder_state_c2]
decoder_h1, _, _ = lstm1(decoder_inputs, initial_state=encoder_states_1)
decoder_h2, _, _ = lstm2(decoder_h1, initial_state=encoder_states_2)
decoder_outputs = decoder_dense(decoder_h2)

In [10]:
# Training model: (encoder one-hot input, decoder one-hot input) -> per-step softmax
# over the Chinese-side characters.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that newer Keras releases reject.
opt = Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# validation_split=0. keeps every sample in the training set (nothing is held out).
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 66)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 403)]  0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, None, 256),  330752      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  675840      input_2[0][0]                    
                                                                 lstm[0][1]                   

<keras.callbacks.History at 0x7fe223cf6e90>

In [11]:
# Inference encoder: same input and layers as in training, but it outputs the final
# (h, c) states of both encoder LSTM layers.
encoder_model = Model(encoder_inputs, [encoder_state_h1, encoder_state_c1, encoder_state_h2, encoder_state_c2])

# At inference time the decoder's initial states are fed in explicitly, so each
# state gets its own Input placeholder.
decoder_state_input_h1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_h2 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c2 = Input(shape=(HIDDEN_SIZE,))

# Re-apply the trained decoder layers, initialised from the supplied states.
# The results go into `inference_*` names so the training tensors (`decoder_h1`,
# `decoder_h2`, `decoder_outputs`) are not overwritten; otherwise re-running the
# training cell would build a model from tensors that depend on the state inputs above.
inference_h1, state_h1, state_c1 = lstm1(decoder_inputs, initial_state=[decoder_state_input_h1, decoder_state_input_c1])
inference_h2, state_h2, state_c2 = lstm2(inference_h1, initial_state=[decoder_state_input_h2, decoder_state_input_c2])
inference_outputs = decoder_dense(inference_h2)

# One decoding step: (previous character, states) -> (character distribution, updated states).
decoder_model = Model([decoder_inputs, decoder_state_input_h1, decoder_state_input_c1, decoder_state_input_h2, decoder_state_input_c2],
                      [inference_outputs, state_h1, state_c1, state_h2, state_c2])

In [12]:
# NOTE(review): TestPath is not read anywhere in the visible code; the loop below decodes
# the first 50 *training* samples from encoder_input_data, not the validation file.
TestPath = '/content/drive/MyDrive/Colab Notebooks/translation2019zh_valid.json'
# min(...) keeps the loop within the available samples when fewer than 50 were loaded.
for k in range(min(50, len(encoder_input_data))):
    test_data = encoder_input_data[k:k+1]
    # Encode the source sentence; A*/B* are the (h, c) states of LSTM layers 1 and 2.
    A1, B1, A2, B2 = encoder_model.predict(test_data)
    # Decoding starts from the '\t' start-of-sequence character.
    target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
    target_seq[0, 0, ch2id['\t']] = 1
    outputs = []
    # Greedy decoding, bounded by the longest training sequence so that a model which
    # never emits the '。' end character cannot loop forever.
    while len(outputs) < max_decoder_seq_length:
        output_tokens, A1, B1, A2, B2 = decoder_model.predict([target_seq, A1, B1, A2, B2])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        outputs.append(sampled_token_index)
        # Feed the character just produced back in as the next decoder input.
        target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
        target_seq[0, 0, sampled_token_index] = 1
        if sampled_token_index == ch2id['。']:
            break

    print(en_data[k])
    print(''.join([id2ch[i] for i in outputs]))

Look at these coasters over here.
看看这边的杯垫。
Choose a recorder.
选择一个记录员.。
I hadn't paid the telephone bill.
我还没交电话费。
That's easier said than done, of course.
当然，知易行难。
Side-to-Side Movements.
侧向运动。
about like 80 degrees.
大概华氏80度吧。
We all are from Shandong.
俺都是山东人。
She was possessed by a devil.
她着了魔。
This wool knits up well.
这种毛线很好织。
The majority was wrong last time.
方是错的。
Stone Soup Stories to Go!
石头汤（故事去！）。
Done. See you tomorrow.
一言为定。
He eased some of the strains on the poor.
缓解了穷人的压力。
Could it be that it was written wrongly?
莫非写错了?。
What a terrible temper!
脾气真够坏的！。
Great talents flower late.
大器晚成。
I forbid you to make a sortie today.
你今天不许出击。
C：My surname is Jiang.
C：我姓姜。
Well, if it was greater .
如果拉力很大。
They looked over to the left.
她们朝左边看。
To supervise the management of printing industry.
监督管理印刷业。
no one else can see you shake your head.
没人看的见你摇头。
All photos dials.
所有照片刻盘。
Stained glass window panels;
彩色玻璃窗板；。
The murderer was caught red-handed.
凶犯当场被抓住。
You don’t love Melanie.
你是不