In [8]:
import re
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from tensorflow.keras import utils
import pickle

In [2]:
# Questions and answers live in two separate UTF-8 text files, one entry per line.
# Both files are read inside `with` blocks so the handles are closed right away
# (the bare `open(...).readlines()` form leaves the handle open until it is
# garbage-collected).
with open('friends_questions_short.txt', encoding='UTF-8') as questions_file:
    questions = questions_file.readlines()

# Each answer is stripped of surrounding whitespace and wrapped in
# '<START>' / '<END>' markers.
with open('friends_answers_short.txt', 'r', encoding='utf-8') as answers_file:
    answers = ['<START> ' + answer.strip() + ' <END>' for answer in answers_file]

# The tokenizer is fitted on questions and answers together, so both share one vocabulary.
combined_list = questions + answers
len(combined_list)

41734

In [3]:
# Sanity check: number of questions, then number of answers, one per line.
print(len(questions), len(answers), sep='\n')

20867
20867


In [4]:
# Build the word index from the combined question + answer texts.
# NOTE(review): with the default Tokenizer settings the characters '<' and '>'
# are presumably filtered out and text is lowercased, so the '<START>' / '<END>'
# markers would be indexed as 'start' / 'end' - confirm against the decoding code.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=combined_list)


In [5]:
# Word -> integer index mapping learned by the tokenizer.
word_dict = tokenizer.word_index
# One slot more than the number of indexed words (index 0 is treated as
# reserved elsewhere in this notebook).
num_tokens = 1 + len(word_dict)
num_tokens

4907

In [6]:
# Persist the fitted tokenizer. Writing inside a `with` block guarantees the
# file is flushed and closed before any later cell tries to read it back.
with open('tokenizer_friends_short.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [9]:
# Turn every question into its sequence of vocabulary indices.
tokenized_questions = tokenizer.texts_to_sequences(questions)
# The longest tokenized question determines the padded encoder width.
length_list = list(map(len, tokenized_questions))
max_input_length = np.max(length_list)
print('Question max length is {}'.format(max_input_length))

Question max length is 16


In [10]:
# Pad every question at the end ('post') up to the longest question length.
padded_questions = pad_sequences(
    tokenized_questions,
    padding='post',
    maxlen=max_input_length,
)
encoder_input_data = np.array(padded_questions)
print('Encoder input data shape -> {}'.format(encoder_input_data.shape))

Encoder input data shape -> (20867, 16)


In [11]:
# Store the padded question index matrix on disk for later use.
np.save(file='encoder_input_data_friends_short.npy', arr=encoder_input_data)

In [12]:
# Turn every wrapped answer into its sequence of vocabulary indices.
tokenized_answers = tokenizer.texts_to_sequences(answers)

# The longest tokenized answer determines the padded decoder width.
length_list = list(map(len, tokenized_answers))
max_output_length = np.max(length_list)
print('Answers max length is {}'.format(max_output_length))

# Pad every answer at the end ('post') up to that width.
padded_answers = pad_sequences(
    tokenized_answers,
    padding='post',
    maxlen=max_output_length,
)
decoder_input_data = np.array(padded_answers)
print('Decoder input data shape -> {}'.format(decoder_input_data.shape))

Answers max length is 19
Decoder input data shape -> (20867, 19)


In [13]:
# Store the padded answer index matrix on disk for later use.
np.save(file='decoder_input_data_friends_short.npy', arr=decoder_input_data)

In [14]:
# Targets are the answer sequences with their first token (the start marker)
# removed, i.e. the decoder input shifted left by one step.
decoder_target_data = [token_seq[1:] for token_seq in tokenized_answers]

# Pad back to the decoder width and one-hot encode over the vocabulary.
padded_answers = pad_sequences(decoder_target_data, maxlen=max_output_length, padding='post')
onehot_answers = utils.to_categorical(padded_answers, num_tokens)
# np.asarray instead of np.array: the one-hot tensor is very large
# (samples x max_output_length x num_tokens), and np.array would allocate a
# second full copy of it. asarray reuses the existing buffer when it is
# already an ndarray.
decoder_target_data = np.asarray(onehot_answers)
print('Decoder target data shape -> {}'.format(decoder_target_data.shape))

Decoder target data shape -> (20867, 19, 4907)


In [15]:
# Store the one-hot decoder targets on disk for later use.
np.save(file='decoder_target_data_friends_short.npy', arr=decoder_target_data)

In [2]:
# Reload the tokenizer saved earlier in this notebook.
# NOTE(review): pickle.load can execute arbitrary code - only load a pickle
# file that was produced by this notebook / a trusted source.
with open('tokenizer_friends_short.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

word_dict = tokenizer.word_index
# +1 for the reserved index 0, matching how num_tokens is computed when the
# tokenizer is first fitted. Without it the vocabulary size is one too small
# and the highest word index does not fit into arrays sized by num_tokens.
num_tokens = len(word_dict) + 1

In [16]:
# Size the matrix directly from the tokenizer: one row per indexed word plus
# one for the reserved index 0. Computing it here (rather than trusting
# whichever num_tokens value is in the kernel) guarantees the largest word
# index is a valid row.
vocab_size = len(tokenizer.word_index) + 1
# One 200-dimensional embedding row per vocabulary index, zero-initialised.
embedding_matrix_glove = np.zeros((vocab_size, 200))


In [18]:
# Fill the embedding matrix from the GloVe text file: each non-empty line is a
# word followed by its vector components, separated by whitespace.
# NOTE(review): the path is an absolute, machine-specific location - adjust it
# (or move it to a config variable) before running elsewhere.
with open('C:\\Users\\septo\\Lera\\glove\\glove.6B.200d.txt', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # A blank line would otherwise crash the star-unpacking below.
            continue
        word, *vector = fields
        # Only words present in the tokenizer vocabulary get a row; all other
        # rows keep their initial zeros.
        idx = tokenizer.word_index.get(word)
        if idx is not None:
            embedding_matrix_glove[idx] = np.array(vector, dtype=np.float32)

In [19]:
# Display the (rows, embedding dimension) shape of the filled matrix.
np.shape(embedding_matrix_glove)

(4907, 200)

In [20]:
# Spot-check: show the full embedding row stored at index 7.
embedding_matrix_glove[7, :]

array([ 0.37830001,  0.039647  ,  0.36320001, -0.098111  , -0.82511997,
        0.020173  , -0.86044002,  0.30083999, -0.25029001,  0.86778003,
       -0.089421  , -0.029677  , -0.060158  ,  0.15734001, -0.13674   ,
        0.64468002, -1.03859997,  0.26578999,  0.53983998, -0.37524   ,
        0.51603001,  0.55084997, -0.15421   , -0.13323   ,  0.093934  ,
       -0.040462  , -0.097016  ,  0.12977999, -0.55970001, -0.34797999,
       -0.10582   , -0.13418999, -0.26113999,  0.24352001, -0.41727999,
        0.15101001, -0.24849001, -0.77275997, -0.75265002,  0.42407   ,
       -1.03789997,  0.39776999,  0.21489   ,  0.22041   ,  0.0040744 ,
        0.0086975 ,  0.54936999, -0.026504  , -0.63075   ,  0.13698   ,
        0.79771   ,  0.029406  ,  0.54597998, -0.11041   , -0.32098001,
        0.31044999, -0.03568   , -0.30546001,  0.41762999, -0.26973999,
        0.53738999,  0.15655001,  0.18486001,  0.28977999, -0.76911002,
       -0.65816998, -0.22201   , -0.28143001,  1.36210001, -0.42

In [21]:
# Count the rows that contain at least one non-zero component, i.e. the
# vocabulary indices that received a GloVe vector.
rows_with_vector = embedding_matrix_glove.any(axis=1)
nonzero_elements = np.count_nonzero(rows_with_vector)
nonzero_elements

4489

In [22]:
# Store the GloVe-initialised embedding matrix on disk for later use.
np.save(file='embedding_matrix_friends_short.npy', arr=embedding_matrix_glove)