**автоматический переводчик на основе рекуррентных нейросетей seq2seq**

кодируем слова word2vec

Евгений Борисов <borisov.e@solarl.ru>

## Библиотеки

In [1]:
import numpy as np
import re
import gzip

import pandas as pd
pd.options.display.max_colwidth = 200  


In [2]:
def pp(d): return "{:,.0f}".format(d).replace(",", " ")
def ppr(d): print('записей:', pp(len(d)) )  

## Учебные данные

In [3]:
with gzip.open('../data/text/rus-eng/rus.txt.gz','rt',encoding='utf8') as f: 
    pair = pd.DataFrame([ p.split('\t') for p in f.read().split('\n') if p.strip() ])

In [4]:
# with open('../data/text/pairs.txt','rt',encoding='utf8') as f: 
#     pair = pd.DataFrame([ p.split('%%') for p in f.read().split('\n') if p.strip() ])

In [5]:
pair.columns=['Q','A']
pair['Q'] = pair['Q'].str.strip()
pair['A'] = pair['A'].str.strip()

In [6]:
ppr(pair)
pair.sample(9)

записей: 336 666


Unnamed: 0,Q,A
15772,Those men died.,Те люди погибли.
203454,You didn't answer my question.,Ты не ответила на мой вопрос.
22278,This can't wait.,Это не может ждать.
144928,Tom is completely useless.,Том совершенно бесполезен.
1540,Who drove?,Кто вёл машину?
307800,We'll let you know the result within a week.,Мы сообщим вам о результатах в течение недели.
28952,It's in your bag.,Оно у Вас в сумке.
222518,It was in 1950 that he was born.,Родился он в 1950 году.
256677,Tom won't know anything about that.,Том ничего об этом не узнает.


In [7]:
pair = pair.iloc[100000:110000]

## Чистим тексты

In [8]:
pair['Q_clean'] = pair['Q'].str.lower()
pair['Q_clean'] = pair['Q_clean'].str.replace(r'([,.?!])', r' \1 ')

pair['A_clean'] = pair['A'].str.lower()
pair['A_clean'] = pair['A_clean'].str.replace(r'([,.?!])', r' \1 ')

# pair['A_clean'] = pair['A_clean'].apply(lambda s: re.sub( r'(\W)', ' \1 ', s))
# pair['A_clean'] = pair['A_clean'].apply(lambda s: re.sub( r'\W', ' ', s))
# pair['A_clean'] = pair['A_clean'].apply(lambda s: re.sub( r'\b\d+\b', ' digit ', s)) 

In [9]:
# (pair['Q_clean'] + ' <START>').str.split()

In [10]:
# добавляем "служебные" слова - начало и конец последовательности
pair['Q_clean'] = (pair['Q_clean'] + ' <START>').str.split() 
# выстраиваем входные последовательности в обратном порядке
pair['Q_clean'] = pair['Q_clean'].apply(lambda t: [ w for w in reversed(t) if w.strip() ] )

pair['A_clean'] = (pair['A_clean'] + ' <EOS>').str.split()  

# pair['A_clean'].str.split() + ['<EOS>']

In [11]:
pair[['Q_clean','A_clean']].sample(9)

Unnamed: 0,Q_clean,A_clean
101734,"[<START>, ., begun, has, season, rainy]","[настал, сезон, дождей, ., <EOS>]"
100825,"[<START>, ., you, of, kind, very, is, it]","[очень, любезно, с, вашей, стороны, ., <EOS>]"
107637,"[<START>, ., be, i'll, where, know, you]","[вы, знаете, ,, где, буду, я, ., <EOS>]"
106709,"[<START>, ?, cost, actual, the, what's]","[какова, реальная, стоимость, ?, <EOS>]"
108691,"[<START>, ., friend, your, me, consider]","[считайте, меня, своим, другом, ., <EOS>]"
107331,"[<START>, ?, car, my, in, you, were, why]","[почему, вы, в, моей, машине, ?, <EOS>]"
103692,"[<START>, ., me, impress, doesn't, tom]","[том, не, производит, на, меня, впечатления, ., <EOS>]"
107192,"[<START>, ?, us, tell, tom, didn't, why]","[почему, том, нам, не, сказал, ?, <EOS>]"
102522,"[<START>, ., ten, at, start, class, the]","[занятие, начинается, в, десять, ., <EOS>]"


---

In [12]:
sent_len_a_max = pair['A_clean'].str.len().max()
sent_len_q_max = pair['Q_clean'].str.len().max()

sent_len_a_max,sent_len_q_max

(12, 9)

In [13]:
# выравниваем длинну последовательностей,
# дополняем короткие словом "служебным" словом PAD
pair['Q_clean'] = pair['Q_clean'].apply( lambda t: ['<PAD>']*(sent_len_q_max-len(t)) + t )
pair['A_clean'] = pair['A_clean'].apply( lambda t: t+['<PAD>']*(sent_len_a_max-len(t)) )

In [14]:
pair[['Q_clean','A_clean']].sample(9)

Unnamed: 0,Q_clean,A_clean
103000,"[<PAD>, <PAD>, <PAD>, <PAD>, <START>, ., dangerous, very, they're]","[они, очень, опасные, ., <EOS>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>]"
102516,"[<PAD>, <PAD>, <START>, ., sofa, my, ruined, cat, the]","[кот, испортил, мой, диван, ., <EOS>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>]"
105580,"[<PAD>, <PAD>, <PAD>, <START>, ., later, explain, will, tom]","[том, потом, объяснит, ., <EOS>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>]"
103904,"[<PAD>, <PAD>, <PAD>, <START>, ., kidnapped, been, has, tom]","[тома, похитили, ., <EOS>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>]"
107951,"[<PAD>, <PAD>, <PAD>, <START>, ., listener, good, a, you're]","[ты, хороший, слушатель, ., <EOS>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>]"
105727,"[<PAD>, <PAD>, <PAD>, <START>, ., empty, was, wallet, tom's]","[бумажник, тома, был, пуст, ., <EOS>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>]"
103195,"[<PAD>, <PAD>, <START>, ., well, me, fits, shoe, this]","[эта, туфля, мне, по, ноге, ., <EOS>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>]"
108610,"[<PAD>, <START>, ?, this, do, me, help, you, can]","[можешь, помочь, мне, это, сделать, ?, <EOS>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>]"
101731,"[<PAD>, <PAD>, <START>, ., sleep, to, child, the, put]","[уложи, ребёнка, спать, ., <EOS>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>, <PAD>]"


## Кодируем тексты

In [15]:
%%time

from gensim.models.word2vec import Word2Vec

w2v_size = 256

w2v_q = Word2Vec( pair['Q_clean'].values.tolist(), min_count=1, size=w2v_size, window=4, workers=4)
w2v_a = Word2Vec( pair['A_clean'].values.tolist(), min_count=1, size=w2v_size, window=4, workers=4)

CPU times: user 2.39 s, sys: 33.9 ms, total: 2.42 s
Wall time: 2.03 s


In [16]:
pair['Q_code'] = pair['Q_clean'].apply(lambda t: [ w2v_q.wv.get_vector(w) for w in t ] )
pair['A_code'] = pair['A_clean'].apply(lambda t: [ w2v_a.wv.get_vector(w) for w in t ] )


-----

In [17]:
# pair = pair.sample(1000)
# pair = pair.sample(283800)
# pair = pair.sample(600)

In [18]:
encoder_input_data = np.stack( pair['Q_code'].values ).astype(np.float32)

decoder_input_data  = np.stack( pair['A_code'].values )[:,:-1,:].astype(np.float32)

#decoder_target_data = np.stack( pair['A_code'].values )[:,1:,:].astype(np.float32)
decoder_target_data = np.stack( pair['A_code'].values ).astype(np.float32)

In [19]:
encoder_input_data.shape, decoder_input_data.shape, decoder_target_data.shape

((10000, 9, 256), (10000, 11, 256), (10000, 12, 256))

In [20]:
# np.abs(encoder_input_data).max(), 
act_fact = np.ceil(np.abs(decoder_input_data).max())
act_fact

2.0

## Строим нейросеть

In [21]:
from keras.models import Model

from keras.layers import Input
from keras.layers import LSTM
from keras.layers import Dense

from keras.layers import Reshape
from keras.layers import Concatenate 

from keras import backend as K

# from tensorflow.keras.backend import expand_dims

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [22]:
latent_dim = 256  # размер сети

In [23]:
encoder_inputs = Input(shape=(None, w2v_size))

encoder = LSTM(latent_dim, return_state=True)

# encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# encoder_states = [state_h, state_c]

encoder_outputs = encoder(encoder_inputs)
encoder_states = encoder_outputs[1:]

W0910 12:05:25.656138 140355813832512 deprecation_wrapper.py:119] From /usr/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:72: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0910 12:05:25.678682 140355813832512 deprecation_wrapper.py:119] From /usr/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:515: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0910 12:05:25.682814 140355813832512 deprecation_wrapper.py:119] From /usr/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4048: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



In [24]:
# K.expand_dims(encoder_outputs,1)

In [25]:
decoder_inputs = Input(shape=(None,w2v_size))

In [26]:
# decoder_inputs = concatenate([ K.expand_dims(encoder_outputs,1), decoder_inputs],axis=1)
decoder_concat_inputs = Concatenate(axis=1)([K.expand_dims(encoder_outputs[0],1), decoder_inputs])
# decoder_inputs = Concatenate(axis=1)([encoder_outputs, decoder_inputs])

In [27]:
decoder_concat_inputs

<tf.Tensor 'concatenate_1/concat:0' shape=(?, ?, 256) dtype=float32>

In [28]:
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm( decoder_concat_inputs, initial_state=encoder_states )

In [29]:
decoder_outputs

<tf.Tensor 'lstm_2/transpose_1:0' shape=(?, ?, 256) dtype=float32>

In [30]:
def custom_activation(x):  return K.tanh(x)*act_fact

decoder_dense = Dense(w2v_size, activation=custom_activation)
decoder_outputs = decoder_dense(decoder_outputs)

In [31]:
decoder_outputs

<tf.Tensor 'dense_1/mul:0' shape=(?, ?, 256) dtype=float32>

In [32]:
# <tf.Tensor 'dense_1/mul:0' shape=(?, ?, 256) dtype=float32>

---

In [33]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

AttributeError: 'NoneType' object has no attribute '_inbound_nodes'

In [None]:
model.summary()

In [None]:
# model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.compile(loss='mse', optimizer='rmsprop')
# model.compile(loss='mse', optimizer='adam')
# model.compile(loss='mse', optimizer='sgd')

In [None]:
%%time 

history = model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=50,
          epochs=10,
          validation_split=0.1
        ) 

## Проверяем результат

In [None]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model( [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [None]:
def decode_sequence(input_seq):
    # генерируем состояние энкодера
    states_value = encoder_model.predict(input_seq)

    # вход декодера - последовательность из одного слова GO
    output_w2v = w2v_a.wv['<GO>'].reshape([1,1,w2v_size])

    # выходная последовательность
    decoded_sentence = []
    
    for i in range(sent_len_a_max): 
        output_w2v, h, c = decoder_model.predict([output_w2v] + states_value)

        # декодируем cлово
        cc = output_w2v.reshape(w2v_size)
        w = w2v_a.wv.similar_by_vector(cc)[0][0] 
        
        # если очередное слово это EOS
        if(w=='<EOS>'): break # то завершаем работу

        decoded_sentence.append(w)
       
        states_value = [h,c] # обновляем состояние сети

    return ' '.join(decoded_sentence)

In [None]:
ii = np.random.permutation(len(encoder_input_data))[:10]
for seq_index in ii:
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print( pair.iloc[seq_index]['Q'],' -> ', decoded_sentence )

----

In [None]:
import matplotlib.pyplot as plt

In [None]:
history_dict = history.history
history_dict.keys()

In [None]:
# acc = history.history['acc']
#val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# plt.clf()   # clear figure
# acc_values = history_dict['acc']
# val_acc_values = history_dict['val_acc']

# plt.plot(epochs, acc, 'b', label='Training acc')
# plt.plot(epochs, val_acc, 'r', label='Validation acc')
# plt.title('Training and validation accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend()

# plt.show()

---

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.client import device_lib

In [None]:
print('tensorflow:', tf.__version__)
print('keras:', keras.__version__)

if tf.test.is_built_with_cuda():
    print('GPU devices:\n  ',
        [ [x.name, x.physical_device_desc] 
          for x in device_lib.list_local_devices() 
          if x.device_type == 'GPU' ]
    )
    print('default GPU device:', tf.test.gpu_device_name() )

else:
    print('no GPU device found')
    

---

In [None]:
# w2v_q_vocab = sorted([w for w in w2v_q.wv.vocab])
# ppr(w2v_q_vocab)
# w2v_a_vocab = sorted([w for w in w2v_a.wv.vocab])
# ppr(w2v_a_vocab)

In [None]:
# ii = np.random.permutation(len(w2v_q_vocab))[:10]
# for i in ii:
#     w = w2v_q_vocab[i]
#     ww = [ v[0] for v in w2v_q.wv.most_similar(w, topn=5) ]
#     print( w,':',ww )

In [None]:
# ii = np.random.permutation(len(w2v_a_vocab))[:10]
# for i in ii:
#     w = w2v_a_vocab[i]
#     ww = [ v[0] for v in w2v_a.wv.most_similar(w, topn=5) ]
#     print( w,':',ww )

---

In [None]:
# input1 = Input(shape=(16,))
# x1 = Dense(8, activation='relu')(input1)
#
# input2 = Input(shape=(32,))
# x2 = Dense(8, activation='relu')(input2)
# added = add([x1, x2])
#
# out = Dense(4)(added)
# model = Model( inputs=[input1, input2], outputs=out )

In [None]:
# x1  x2  x3
#  \  /   /
#   y1   /
#    \  /
#     y2

In [None]:
# first = Sequential()
# first.add(Dense(1, input_shape=(2,), activation='sigmoid'))

# second = Sequential()
# second.add(Dense(1, input_shape=(1,), activation='sigmoid'))

# third = Sequential()
# # of course you must provide the input to result with will be your x3
# third.add(Dense(1, input_shape=(1,), activation='sigmoid'))

# # lets say you add a few more layers to first and second.
# # concatenate them
# merged = Concatenate([first, second])

# # then concatenate the two outputs

# result = Concatenate([merged,  third])

# ada_grad = Adagrad(lr=0.1, epsilon=1e-08, decay=0.0)

# result.compile(optimizer=ada_grad, loss='binary_crossentropy',
#                metrics=['accuracy'])

In [None]:
# from keras.models import Model
# from keras.layers import Concatenate, Dense, LSTM, Input, concatenate
# from keras.optimizers import Adagrad

# first_input = Input(shape=(2, ))
# first_dense = Dense(1, )(first_input)

# second_input = Input(shape=(2, ))
# second_dense = Dense(1, )(second_input)

# merge_one = concatenate([first_dense, second_dense])

# third_input = Input(shape=(1, ))
# merge_two = concatenate([merge_one, third_input])

# model = Model(inputs=[first_input, second_input, third_input], outputs=merge_two)
# ada_grad = Adagrad(lr=0.1, epsilon=1e-08, decay=0.0)
# model.compile(optimizer=ada_grad, loss='binary_crossentropy',
#                metrics=['accuracy'])

In [None]:
# encoder_inputs = Input(shape=(None, w2v_size))
# encoder = LSTM(latent_dim, return_state=True)
# encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# encoder_states = [state_h, state_c]

---

In [None]:
# merge( [ UpSampling2D(size=(2,2), dim_ordering="th")(conv5), conv4], mode='concat', concat_axis=1)

In [None]:
# x1  x2  x3
#  \  /   /
#   y1   /
#    \  /
#     y2
#
# from keras.models import Model
# from keras.layers import Input
# from keras.layers import Dense
# from keras.layers import concatenate
# from keras.optimizers import Adagrad

# first_input = Input(shape=(2, ))
# first_dense = Dense(1, )(first_input)

# second_input = Input(shape=(2, ))
# second_dense = Dense(1, )(second_input)

# merge_one = concatenate([first_dense, second_dense])

# third_input = Input(shape=(1, ))
# merge_two = concatenate([merge_one, third_input])

# model = Model(inputs=[first_input, second_input, third_input], outputs=merge_two)

# ada_grad = Adagrad(lr=0.1, epsilon=1e-08, decay=0.0)
# model.compile(optimizer=ada_grad, loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
# import keras
# from keras.layers import *

# input_3D = Input(shape=(None,100,), dtype='int32', name='input_3D')
# input_2D = Input(shape=(100,), dtype='int32', name='input_2D')
# input_2D_repeat = RepeatVector(K.shape(input_3D)[1])(input_2D)
# merged = Concatenate(axis=1)([input_3D, input_2D_repeat])

In [None]:
# import keras
# from keras.layers import *

# input_3D = Input(shape=(None,100,), dtype='int32', name='input_3D')
# input_2D = Input(shape=(100,), dtype='int32', name='input_2D')
# input_2D_repeat = RepeatVector(K.shape(input_3D)[1])(input_2D)
# merged = concatenate([input_3D, input_2D_repeat], axis=1)

In [None]:
# import keras
# from keras.layers import Input, Lambda
# import keras.backend as K

# def repeat_and_concatenate(inputs):
#     input_3D, input_2D = inputs
#     # Repeat 2D vectors
#     input_2D_repeat = K.tile(K.expand_dims(input_2D, 1), [1, K.shape(input_3D)[1], 1])
#     # Concatenate feature-wise
#     return K.concatenate([input_3D, input_2D_repeat], axis=-1)

# input_3D = Input(shape=(None,100,), dtype='int32', name='input_3D')
# input_2D = Input(shape=(50,), dtype='int32', name='input_2D')
# merged = Lambda(repeat_and_concatenate)([input_3D, input_2D])
# print(merged)
# # Tensor("lambda_1/concat:0", shape=(?, ?, 150), dtype=int32)

In [None]:
# Concatenate

# decoder_inputs = Input(shape=(None, w2v_size))
# decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

# # FIXME к данным добавить выход энкодера

# decoder_outputs, _, _ = decoder_lstm(decoder_inputs,initial_state=encoder_states)

# # decoder_dense = Dense(w2v_size)
# # decoder_dense = Dense(w2v_size, activation='softmax')
# # decoder_dense = Dense(w2v_size, activation='tanh')
# # decoder_dense = Dense(w2v_size, activation='sigmoid')
# decoder_dense = Dense(w2v_size, activation=custom_activation)

# decoder_outputs = decoder_dense(decoder_outputs)