**чатбот на рекуррентных нейросетях (Keras+TensorFlow)**

Евгений Борисов <borisov.e@solarl.ru>


----

## Библиотеки

In [1]:
import numpy as np
import re

import pandas as pd
pd.options.display.max_colwidth = 200  


In [2]:
def pp(d):
    """Format a number without decimals, using spaces as thousands separators."""
    return f"{d:,.0f}".replace(",", " ")


def ppr(d):
    """Print the number of records in ``d`` (any object that supports ``len``)."""
    count = pp(len(d))
    print('записей:', count)

## Учебные данные

In [3]:
# Load the parallel corpus: one pair per line, the two sentences separated by a tab.
# (For a corpus whose fields are separated by '%%', split on '%%' instead.)
with open('../data/text/rus-eng/rus.txt', 'rt', encoding='utf8') as f:
    raw_lines = f.read().split('\n')

# Skip blank lines; every remaining line becomes one row of fields.
pair = pd.DataFrame([line.split('\t') for line in raw_lines if line.strip()])

In [100]:
# Alternative corpus: one question/answer pair per line, fields separated by '%%'.
# NOTE(review): running this cell rebinds `pair`; in the recorded run the file
# was missing and the cell raised FileNotFoundError.
with open('../data/text/pair.txt','rt',encoding='utf8') as f: 
    pair = pd.DataFrame([ p.split('%%') for p in f.read().split('\n') if p.strip() ])

FileNotFoundError: [Errno 2] No such file or directory: '../data/text/pair.txt'

In [5]:
# Name the two columns (Q = question / source sentence, A = answer / target
# sentence) and trim surrounding whitespace from both.
pair.columns = ['Q', 'A']
for column in ('Q', 'A'):
    pair[column] = pair[column].str.strip()

In [6]:
# Print the corpus size and display a random sample of nine raw pairs.
ppr(pair)
pair.sample(9)

записей: 336 666


Unnamed: 0,Q,A
54015,Watch your luggage.,Следи за багажом.
8267,Have some ham.,Поешь ветчины.
152248,I don't think I can fix it.,"Не думаю, что смогу это починить."
86743,I think you're stupid.,"Думаю, что ты глуп."
9953,It's not here.,Это не здесь.
290975,"Unfortunately, I have to disappoint you.","К сожалению, должен вас разочаровать."
114870,Nobody knew what to say.,"Никто не знал, что сказать."
174308,Tom thought nobody was home.,"Том думал, что никого нет дома."
111086,I got one for Christmas.,Мне такой на Рождество подарили.


## Чистим тексты

In [7]:
# Normalise both sides: lower-case the text and surround , . ? ! with spaces
# so that a later whitespace split yields every punctuation mark as its own token.
# regex=True is passed explicitly: the pattern is a regular expression with a
# back-reference, and pandas >= 2.0 treats the pattern as a literal string by
# default, which would silently leave the text unchanged.
pair['Q_clean'] = pair['Q'].str.lower().str.replace(r'([,.?!])', r' \1 ', regex=True)
pair['A_clean'] = pair['A'].str.lower().str.replace(r'([,.?!])', r' \1 ', regex=True)

# Further normalisation that was tried and is currently disabled:
# replacing every non-word character with a space, and replacing standalone
# numbers with a ' digit ' token.

In [8]:
# Split the cleaned text into tokens and add the service tokens:
# the question gets a trailing <START>, the answer is wrapped in <GO> ... <EOS>.
# The concatenation is done per row with plain Python lists; adding a list
# directly to a Series depends on how the installed pandas version interprets
# a list operand (scalar vs. array-like) and is not portable.
pair['Q_clean'] = pair['Q_clean'].str.split().apply(lambda tokens: tokens + ['<START>'])
pair['A_clean'] = pair['A_clean'].str.split().apply(lambda tokens: ['<GO>'] + tokens + ['<EOS>'])

In [9]:
pair[['Q_clean','A_clean']].sample(9)

Unnamed: 0,Q_clean,A_clean
125227,"[i, just, don't, want, to, die, ., <START>]","[<GO>, я, просто, не, хочу, умирать, ., <EOS>]"
85223,"[i, didn't, drink, coffee, ., <START>]","[<GO>, я, не, пил, кофе, ., <EOS>]"
87116,"[i, went, into, the, house, ., <START>]","[<GO>, я, вошёл, в, дом, ., <EOS>]"
319963,"[the, orchestra, is, drowning, out, the, singer's, voice, ., <START>]","[<GO>, оркестр, заглушает, голос, певца, ., <EOS>]"
273391,"[you, still, haven't, told, me, what, to, do, ., <START>]","[<GO>, вы, так, и, не, сказали, мне, ,, что, делать, ., <EOS>]"
176345,"[who, will, you, be, coming, with, ?, <START>]","[<GO>, с, кем, вы, придёте, ?, <EOS>]"
37524,"[i'll, recommend, it, ., <START>]","[<GO>, я, порекомендую, это, ., <EOS>]"
309546,"[it, took, her, all, afternoon, to, finish, the, work, ., <START>]","[<GO>, на, завершение, работы, у, неё, ушла, вся, вторая, половина, дня, ., <EOS>]"
31800,"[was, tom, a, sailor, ?, <START>]","[<GO>, том, был, моряком, ?, <EOS>]"


---

In [10]:
# Count the tokens in every sequence (service tokens included) and
# show summary statistics of the lengths.
for len_col, tok_col in (('lenQ', 'Q_clean'), ('lenA', 'A_clean')):
    pair[len_col] = pair[tok_col].str.len()
pair.describe()

Unnamed: 0,lenQ,lenA
count,336666.0,336666.0
mean,8.022399,8.499317
std,2.307285,2.412566
min,3.0,4.0
25%,6.0,7.0
50%,8.0,8.0
75%,9.0,10.0
max,47.0,51.0


In [11]:
# 95th percentile of the question and answer lengths, as a guide for
# choosing the maximum sequence length.
tuple(pair[col].quantile(0.95) for col in ('lenQ', 'lenA'))

(12.0, 13.0)

In [12]:
# Keep only pairs whose question AND answer are of medium length
# (bounds are inclusive and count the service tokens).
sent_len_min, sent_len_max = 7, 10

ppr(pair)
length_ok = (
    pair['lenQ'].between(sent_len_min, sent_len_max)
    & pair['lenA'].between(sent_len_min, sent_len_max)
)
# .copy() makes the subset an independent DataFrame: columns are assigned to
# `pair` again further down, which on a plain slice raises
# SettingWithCopyWarning and is not guaranteed to write to the intended object.
pair = pair[length_ok].copy()
ppr(pair)

записей: 336 666
записей: 165 112


In [13]:
# Bring every sequence to exactly sent_len_max tokens using the <PAD> service token:
#  - questions are reversed and padded on the left,
#  - answers keep their order and are padded on the right.
pad = ['<PAD>'] * sent_len_max
pair['Q_clean'] = pair['Q_clean'].apply(lambda toks: pad[len(toks):] + toks[::-1])
pair['A_clean'] = pair['A_clean'].apply(lambda toks: toks + pad[len(toks):])

In [14]:
pair[['Q_clean','A_clean']].sample(9)

Unnamed: 0,Q_clean,A_clean
263445,"[<PAD>, <PAD>, <START>, ., questions, more, three, mary, asked, tom]","[<GO>, том, задал, мэри, еще, три, вопроса, ., <EOS>, <PAD>]"
187094,"[<PAD>, <PAD>, <START>, ., say, would, mary, what, knew, tom]","[<GO>, том, знал, ,, что, мэри, скажет, ., <EOS>, <PAD>]"
310127,"[<PAD>, <START>, ., simply, quite, answered, be, can, questions, these]","[<GO>, на, эти, вопросы, очень, легко, ответить, ., <EOS>, <PAD>]"
218834,"[<PAD>, <START>, ?, this, doing, be, you, will, long, how]","[<GO>, долго, вы, будете, это, делать, ?, <EOS>, <PAD>, <PAD>]"
124856,"[<PAD>, <PAD>, <START>, ., apple, an, boy, each, gave, i]","[<GO>, я, дал, мальчикам, по, яблоку, ., <EOS>, <PAD>, <PAD>]"
158098,"[<PAD>, <PAD>, <START>, ., tree, that, to, horse, the, tie]","[<GO>, привяжи, лошадку, к, тому, дереву, ., <EOS>, <PAD>, <PAD>]"
268417,"[<START>, ., driver, taxi, a, be, i'd, thought, never, i]","[<GO>, никогда, не, думал, ,, что, буду, таксистом, ., <EOS>]"
290534,"[<START>, ., himself, for, think, to, learn, to, needs, tom]","[<GO>, тому, нужно, научиться, думать, самостоятельно, ., <EOS>, <PAD>, <PAD>]"
221490,"[<PAD>, <START>, ., anybody, to, it, tell, never, will, i]","[<GO>, я, никогда, никому, не, скажу, об, этом, ., <EOS>]"


## Кодируем тексты

In [15]:
# data = pair['Q_clean'].values.tolist() + pair['A_clean'].values.tolist()

In [16]:
%%time

from gensim.models.word2vec import Word2Vec

# Dimensionality of the word vectors; also used as the feature size of the
# network inputs and outputs.
w2v_size = 256

# Two separate word2vec models, one trained on the question token sequences and
# one on the answer token sequences. min_count=1 keeps every token in the
# vocabulary (service tokens and rare words included).
# NOTE(review): `size=` is the gensim 3.x parameter name (gensim 4 renamed it
# to `vector_size`) — confirm against the installed version before upgrading.
w2v_q = Word2Vec( pair['Q_clean'].values.tolist(), min_count=1, size=w2v_size, window=4, workers=4)
w2v_a = Word2Vec( pair['A_clean'].values.tolist(), min_count=1, size=w2v_size, window=4, workers=4)

CPU times: user 18 s, sys: 136 ms, total: 18.2 s
Wall time: 9.21 s


In [17]:
# Alphabetically sorted vocabularies of both embedding models, with their sizes.
w2v_q_vocab = sorted(w2v_q.wv.vocab)
ppr(w2v_q_vocab)

w2v_a_vocab = sorted(w2v_a.wv.vocab)
ppr(w2v_a_vocab)

записей: 12 143
записей: 35 286


In [18]:
# Sanity check of the question embeddings: for ten random vocabulary words
# print their five nearest neighbours.
for i in np.random.permutation(len(w2v_q_vocab))[:10]:
    w = w2v_q_vocab[i]
    neighbours = [word for word, _ in w2v_q.wv.most_similar(w, topn=5)]
    print(w, ':', neighbours)

lifeguards : ['saturdays', 'beach', 'planning', 'counting', 'seesaw']
bench : ['couch', 'spot', 'grass', 'sofa', 'farm']
feeds : ['scraps', 'struggled', 'bareback', 'knotted', 'checks']
50 : ['300', '60', '200', '4', 'fifty']
observant : ['petty', 'permissive', 'illegible', 'strict', 'immature']
kind : ['capable', 'sort', 'proud', 'motherly', 'unaware']
ballot : ['absentee', 'eel', 'pamphlet', 'multiple', 'accusations']
italian : ['banned', 'band', 'asia', 'national', 'hockey']
hop : ['fade', 'return', 'run', 'shipbuilding', "pilot's"]
idiotic : ['acupuncture', 'ark', 'grand', 'majestically', 'hoover']


  if np.issubdtype(vec.dtype, np.int):


In [19]:
# Sanity check of the answer embeddings: for ten random vocabulary words
# print their five nearest neighbours.
for i in np.random.permutation(len(w2v_a_vocab))[:10]:
    w = w2v_a_vocab[i]
    neighbours = [word for word, _ in w2v_a.wv.most_similar(w, topn=5)]
    print(w, ':', neighbours)

дозвонился : ['возьмет', 'выпускником', 'гиннес', 'воспользуйся', 'приютил']
зубрил : ['выносил', 'выберет', 'разлуку', 'взбесился', 'растранжирил']
напугал : ['службу', 'духа', 'четвёртую', 'вежливый', 'искусство']
потайная : ['свободные', 'ос', '"оставь', 'капли', 'где-нибудь']
незаконченные : ['сырых', 'здоровую', 'wi-fi', 'сомнения', 'велосипеды']
сведению : ['возражение', 'мальчишка', 'засунь', 'множеству', 'призывали']
осуществлять : ['оплатить', 'обратить', 'проглотить', 'отыскать', 'разобрать']
сбивали : ['нападению', 'искренние', 'кучей', 'берега', 'реки']
протеин : ['сумасшедшим', 'джексона', 'подушкой', 'воображение', 'подбородок']
приведены : ['записке', 'диснейленде', 'спрингстин', 'планетарии', 'вранье']


---

In [20]:
# c = w2v_a.wv['<GO>']
# w2v_a.wv.similar_by_vector(c)[0][0] 
# c.min(), c.max()

---

In [21]:
# Replace every token by its word2vec vector: each row becomes a list of
# vectors, looked up in the model that matches its side (question / answer).
pair['Q_code'] = pair['Q_clean'].apply(lambda toks: list(map(w2v_q.wv.get_vector, toks)))
pair['A_code'] = pair['A_clean'].apply(lambda toks: list(map(w2v_a.wv.get_vector, toks)))

In [22]:
pair[['Q_code','A_code']].sample(3)

Unnamed: 0,Q_code,A_code
96901,"[[0.33058208, -0.28316858, -0.22658816, -0.37592202, -0.32576087, -0.09522664, 0.78433275, -0.79865706, -0.20137724, 1.0254937, -0.25442362, 0.37049827, 0.56521356, 0.23883432, -0.14680867, 0.3908...","[[-0.49798346, -2.3365552, 0.15383142, 1.4701685, 0.30655286, -1.0703925, 0.4387113, -0.6881707, 0.23141465, 0.46207544, -0.47387442, -0.5941366, -0.5359087, -0.23798801, -0.773308, -1.1179799, 0...."
135141,"[[0.33058208, -0.28316858, -0.22658816, -0.37592202, -0.32576087, -0.09522664, 0.78433275, -0.79865706, -0.20137724, 1.0254937, -0.25442362, 0.37049827, 0.56521356, 0.23883432, -0.14680867, 0.3908...","[[-0.49798346, -2.3365552, 0.15383142, 1.4701685, 0.30655286, -1.0703925, 0.4387113, -0.6881707, 0.23141465, 0.46207544, -0.47387442, -0.5941366, -0.5359087, -0.23798801, -0.773308, -1.1179799, 0...."
308563,"[[0.33058208, -0.28316858, -0.22658816, -0.37592202, -0.32576087, -0.09522664, 0.78433275, -0.79865706, -0.20137724, 1.0254937, -0.25442362, 0.37049827, 0.56521356, 0.23883432, -0.14680867, 0.3908...","[[-0.49798346, -2.3365552, 0.15383142, 1.4701685, 0.30655286, -1.0703925, 0.4387113, -0.6881707, 0.23141465, 0.46207544, -0.47387442, -0.5941366, -0.5359087, -0.23798801, -0.773308, -1.1179799, 0...."



-----

In [23]:
ppr(pair)

записей: 165 112


In [24]:
# Train on a random subset of 1000 pairs. No random_state is given, so the
# subset differs from run to run.
pair = pair.sample(1000)

# Other subset sizes that were tried:
# pair = pair.sample(283800)
# pair = pair.sample(600)

In [68]:
# Encoder input: all question vectors, (samples, steps, w2v_size).
encoder_input_data = np.stack(pair['Q_code'].values).astype(np.float32)

# Decoder input and target come from the same answer sequences:
# the input drops the last step, the target drops the first one, so at every
# step the target is the vector that follows the current input vector.
answer_vectors = np.stack(pair['A_code'].values)
decoder_input_data = answer_vectors[:, :-1, :].astype(np.float32)
decoder_target_data = answer_vectors[:, 1:, :].astype(np.float32)

encoder_input_data.shape, decoder_input_data.shape, decoder_target_data.shape

((1000, 10, 256), (1000, 9, 256), (1000, 9, 256))

## Строим нейросеть

In [73]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [74]:
latent_dim = 512  # number of units (state size) of the encoder and decoder LSTM layers

In [75]:
# Encoder: reads a variable-length sequence of w2v_size-dimensional word vectors.
# Only its final hidden and cell states are kept (they initialise the decoder);
# encoder_outputs is not used afterwards.
encoder_inputs = Input(shape=(None, w2v_size))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

In [76]:
# Use the backend that belongs to tensorflow.keras: the layers and models in
# this notebook come from tensorflow.keras, and mixing them with the backend of
# the standalone `keras` package can fail or operate on a different graph/session.
from tensorflow.keras import backend as K


def custom_activation(x):
    """Scaled tanh activation: maps ``x`` into the open interval (-5, 5).

    NOTE(review): the factor 5 is presumably chosen so the output can reach
    word2vec component values, which lie outside tanh's (-1, 1) range —
    confirm against the actual range of the target vectors.
    """
    return K.tanh(x) * 5

# usage: Dense(units, activation=custom_activation)

Using TensorFlow backend.


In [78]:
# Decoder: an LSTM that starts from the encoder's final states and returns an
# output for every time step; its own final states are ignored during training.
decoder_inputs = Input(shape=(None, w2v_size))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,initial_state=encoder_states)

# Output layer: maps every decoder step to a w2v_size-dimensional vector.
# Activations tried before the custom one: none (linear), softmax, tanh, sigmoid.
decoder_dense = Dense(w2v_size, activation=custom_activation)

decoder_outputs = decoder_dense(decoder_outputs)

In [79]:
# Training model: takes `encoder_input_data` and `decoder_input_data`
# and is trained to produce `decoder_target_data`.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [80]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, None, 256)    0                                            
__________________________________________________________________________________________________
input_15 (InputLayer)           (None, None, 256)    0                                            
__________________________________________________________________________________________________
lstm_4 (LSTM)                   [(None, 512), (None, 1574912     input_13[0][0]                   
__________________________________________________________________________________________________
lstm_6 (LSTM)                   [(None, None, 512),  1574912     input_15[0][0]                   
                                                                 lstm_4[0][1]                     
          

In [81]:
# The model outputs word vectors, so it is trained as a regression with
# mean squared error, using the Adam optimiser.
# Other configurations tried: rmsprop with categorical cross-entropy,
# rmsprop with MSE, SGD with MSE.
model.compile(loss='mse', optimizer='adam')


In [86]:
%%time 

# Train the model; 10% of the samples are held out for validation.
# NOTE(review): epochs is set to 200 here, while the recorded log shows
# 120 epochs — the output presumably comes from an earlier run.
history = model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=100,
          epochs=200,
          validation_split=0.1
        )

Train on 900 samples, validate on 100 samples
Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78/120
Epoch 79/120
Epoch 80/120
Epoch 81/120
Epoch 82/120
Epoch 83/120
Epoch 84/120
Epoch 85/120
Epoch 86/120
Epoch 87/120
Epoch 88/120
Epoch 89/120
Epoch 90/120
Epoch 91/120
Epoch 92/120
Epoch 93/120
Epoch 94/120
Epoch 95/120
Epoch 96/120
Epoch 97/120
Epoch 98/120
Epoch 99/120
Epoch 100/120
Epoch 101/120
Epoch 102/120
Epoch 103/120
Epoch 104/120
Epoch 105/120
Epoch 106/120
Epoch 107/120
Epoch 108/120
Epoch 109/120
Epoch 110/120
Epoch 111/120
Epoch 112/120
Epoch 113/120
Epoch 114/120
Epoch 115/120
Epoch 116/120
Epoch 117/120
Epoch 118/120
Epoch 119/120
Epoch 120/120
CPU times: user 3min 4s, sys: 1min 32s, total: 4min 37s
Wall time: 6min 7s


## Проверяем результат

In [87]:
# Inference-time models; both reuse the layers defined for training.

# Encoder model: question vectors -> final LSTM states [h, c].
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model: receives the previous states as explicit inputs and returns
# the output vectors together with the updated states, so that decoding can be
# run one step at a time.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model( [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [88]:
def decode_sequence(input_seq):
    """Greedily decode an answer for one encoded question.

    Args:
        input_seq: encoder input holding a single question as a sequence of
            word2vec vectors (a one-element slice of ``encoder_input_data``).

    Returns:
        The decoded tokens joined by single spaces. Decoding stops at the
        first ``<EOS>`` (which is not included) or after ``sent_len_max``
        steps, whichever comes first.
    """
    # The encoder's final states seed the decoder.
    states_value = encoder_model.predict(input_seq)

    # First decoder input: a one-step sequence holding the <GO> vector.
    output_w2v = w2v_a.wv['<GO>'].reshape([1, 1, w2v_size])

    decoded_sentence = []

    for _ in range(sent_len_max):
        # The predicted vector is fed back as the next decoder input.
        output_w2v, h, c = decoder_model.predict([output_w2v] + states_value)

        # Map the predicted vector to the nearest word of the answer vocabulary;
        # only the best match is used, so only one neighbour is requested.
        nearest = w2v_a.wv.similar_by_vector(output_w2v.reshape(w2v_size), topn=1)
        w = nearest[0][0]

        if w == '<EOS>':
            break

        decoded_sentence.append(w)

        # Carry the decoder states over to the next step.
        states_value = [h, c]

    return ' '.join(decoded_sentence)

In [90]:
# Decode ten random questions from the training subset and print each
# original question next to the generated answer.
for seq_index in np.random.permutation(len(encoder_input_data))[:10]:
    question_vectors = encoder_input_data[seq_index:seq_index + 1]  # keep the batch axis
    answer = decode_sequence(question_vectors)
    print(pair.iloc[seq_index]['Q'], ' -> ', answer)

Do you want to look at it?  ->  хочешь на неё посмотреть ?
What're you doing in my room?  ->  что вы делаете у меня в комнате ?
Maybe I can help you.  ->  возможно , я смогу тебе помочь .
I have plenty of friends.  ->  у меня полно друзей .
Tom doesn't know how to enjoy life.  ->  том не умеет ездить тихо .
You told me not to tell anybody.  ->  ты мне сказал никому не говорить .
Tom came with a gift.  ->  том пришёл с рукавом .
I can deal with it.  ->  я могу с этим случиться .
At least they listened to me.  ->  по застигнут любому , они меня репортажа .
I'd forgotten that you'd forgotten.  ->  я забыл , что ты забыл .


----

In [94]:
# import matplotlib.pyplot as plt

In [95]:
# history_dict = history.history
# history_dict.keys()

In [96]:
# # acc = history.history['acc']
# #val_acc = history.history['val_acc']
# loss = history.history['loss']
# val_loss = history.history['val_loss']

# epochs = range(1, len(loss) + 1)
# plt.plot(epochs, loss, 'b', label='Training loss')
# plt.plot(epochs, val_loss, 'r', label='Validation loss')
# plt.title('Training and validation loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()

# plt.show()

In [None]:
# plt.clf()   # clear figure
# acc_values = history_dict['acc']
# val_acc_values = history_dict['val_acc']

# plt.plot(epochs, acc, 'b', label='Training acc')
# plt.plot(epochs, val_acc, 'r', label='Validation acc')
# plt.title('Training and validation accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend()

# plt.show()

---

In [69]:
# q_min = encoder_input_data.min()
# q_max = encoder_input_data.max()
# encoder_input_data = (encoder_input_data-q_min)/(q_max-q_min)

# q_fact = np.max( [np.abs(encoder_input_data.max()), np.abs(encoder_input_data.min())] )
# encoder_input_data = encoder_input_data/q_fact

In [70]:
# encoder_input_data.min(),encoder_input_data.max()

In [71]:
# a_min = decoder_input_data.min()
# a_max = decoder_input_data.max()
# decoder_input_data  = (decoder_input_data-a_min)/(a_max-a_min)
# decoder_target_data = (decoder_target_data-a_min)/(a_max-a_min)

# a_fact = np.max( [np.abs(decoder_input_data.max()), np.abs(decoder_input_data.min())] )
# decoder_input_data  = decoder_input_data/a_fact
# decoder_target_data = decoder_target_data/a_fact

In [72]:
# decoder_target_data.min(),decoder_target_data.max()