In [8]:
import numpy as np
import re
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from tensorflow.keras import utils
import pickle

In [2]:
# Read the question corpus; each line of the file becomes one list entry
# (trailing newlines are kept, exactly as readlines() returns them).
# A context manager is used so the file handle is closed deterministically.
with open('friends_questions.txt', encoding='UTF-8') as f:
    questions = f.readlines()

# Read the answer corpus and wrap every stripped line in <START>/<END>
# markers so the beginning and end of each answer are marked explicitly.
with open('friends_answers.txt', 'r', encoding='utf-8') as f:
    answers = ['<START> ' + answer.strip() + ' <END>' for answer in f]

# Questions followed by answers: the single corpus the tokenizer is fitted on.
combined_list = questions + answers
len(combined_list)

48760

In [3]:
# Show the size of each corpus on its own line (questions first, then answers).
print(len(questions), len(answers), sep='\n')

24380
24380


In [4]:
# Fit one tokenizer (default settings) on questions and answers together,
# so both sides share the same word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=combined_list)


In [5]:
# Word index built by the fitted tokenizer.
word_dict = tokenizer.word_index
# One extra slot on top of the number of indexed words
# (a later cell refers to this as the reserved 0 index).
num_tokens = 1 + len(word_dict)
num_tokens

8871

In [6]:
# Persist the fitted tokenizer. The context manager guarantees the file is
# flushed and closed as soon as the dump finishes, instead of leaving an
# open handle around until garbage collection.
with open('tokenizer_friends.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [9]:
# Convert every question to its sequence of token ids.
tokenized_questions = tokenizer.texts_to_sequences(questions)

# The longest question (in tokens) determines the encoder input width.
length_list = [len(token_seq) for token_seq in tokenized_questions]
max_input_length = np.max(length_list)
print(f'Question max length is {max_input_length}')

Question max length is 16


In [10]:
# Pad every question at the end ('post') up to the longest question length.
padded_questions = pad_sequences(
    tokenized_questions,
    maxlen=max_input_length,
    padding='post',
)
encoder_input_data = np.array(padded_questions)
print(f'Encoder input data shape -> {encoder_input_data.shape}')

Encoder input data shape -> (24380, 16)


In [11]:
# Write the padded encoder inputs to disk.
np.save(file='encoder_input_data_friends.npy', arr=encoder_input_data)

In [12]:
# Convert every answer (including its start/end markers) to token ids.
tokenized_answers = tokenizer.texts_to_sequences(answers)

# The longest answer (in tokens) determines the decoder sequence width.
length_list = [len(token_seq) for token_seq in tokenized_answers]
max_output_length = np.max(length_list)
print(f'Answers max length is {max_output_length}')

# Pad every answer at the end ('post') up to the longest answer length.
padded_answers = pad_sequences(
    tokenized_answers,
    maxlen=max_output_length,
    padding='post',
)
decoder_input_data = np.array(padded_answers)
print(f'Decoder input data shape -> {decoder_input_data.shape}')

Answers max length is 19
Decoder input data shape -> (24380, 19)


In [13]:
# Write the padded decoder inputs to disk.
np.save(file='decoder_input_data_friends.npy', arr=decoder_input_data)

In [14]:
# Targets are the answers shifted left by one step: drop the first token
# (the start marker) from every answer sequence.
shifted_answers = [token_seq[1:] for token_seq in tokenized_answers]

# Pad to the same width as the decoder input, then one-hot encode each
# token id over the whole vocabulary.
# NOTE(review): the dense one-hot tensor has one slot per
# (sample, step, vocabulary entry) and can be very large in memory.
padded_answers = pad_sequences(
    shifted_answers,
    maxlen=max_output_length,
    padding='post',
)
onehot_answers = utils.to_categorical(padded_answers, num_tokens)
decoder_target_data = np.array(onehot_answers)
print(f'Decoder target data shape -> {decoder_target_data.shape}')

Decoder target data shape -> (24380, 19, 8871)


In [15]:
# Write the one-hot decoder targets to disk.
np.save(file='decoder_target_data_friends.npy', arr=decoder_target_data)

In [2]:
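# Optional: reload the previously saved tokenizer instead of refitting it.
# tokenizer = pickle.load( open('tokenizer_friends.pkl' , 'rb'))

# num_tokens = len( tokenizer.word_index ) + 1  # +1 for the reserved 0 index, as in the cell that first defines num_tokens
# word_dict = tokenizer.word_index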

In [16]:
# num_tokens already contains the +1 for the reserved 0 index,
# so it is used as the vocabulary size without further adjustment.
vocab_size = num_tokens
# One row per token id; embeddings of dimensionality 200 are used.
embedding_matrix_glove = np.zeros(shape=(vocab_size, 200))


In [17]:
# Fill the embedding matrix from the GloVe text file: each line is split on
# whitespace into a word followed by its vector components.
# NOTE(review): the path below is an absolute, machine-specific location.
glove_path = 'C:\\Users\\septo\\Lera\\glove\\glove.6B.200d.txt'
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        word, *vector = line.split()
        idx = tokenizer.word_index.get(word)
        if idx is None:
            # Word is not in the tokenizer's word index; its row stays zero.
            continue
        embedding_matrix_glove[idx] = np.array(vector, dtype=np.float32)

In [18]:
# Display the dimensions of the embedding matrix.
np.shape(embedding_matrix_glove)

(8871, 200)

In [19]:
# Inspect the embedding row stored for token id 7.
embedding_matrix_glove[7, :]

array([ 5.73459983e-01,  5.41700006e-01, -2.34770000e-01, -3.62399995e-01,
        4.03699994e-01,  1.13860004e-01, -4.49330002e-01, -3.09909999e-01,
       -5.34110004e-03,  5.84259987e-01, -2.59559993e-02,  4.93930012e-01,
       -3.72090004e-02, -2.84280002e-01,  9.76959988e-02, -4.89069998e-01,
        2.60269996e-02,  3.76489997e-01,  5.77879995e-02, -4.68070000e-01,
        8.12880024e-02,  3.28250003e+00, -6.36900008e-01,  3.79559994e-01,
        3.81670007e-03,  9.36070010e-02, -1.28549993e-01,  1.73800007e-01,
        1.05219997e-01,  2.86480010e-01,  2.10889995e-01, -4.70759988e-01,
        2.77330000e-02, -1.98029995e-01,  7.63280019e-02, -8.46289992e-01,
       -7.97079980e-01, -3.87430012e-01, -3.04220002e-02, -2.68489987e-01,
        4.85850006e-01,  1.28950000e-01,  3.83540004e-01,  3.87219995e-01,
       -3.85239989e-01,  1.90750003e-01,  4.89980012e-01,  1.32780001e-01,
        1.07920002e-02,  2.67699987e-01,  1.78120002e-01, -1.14330001e-01,
       -3.34939986e-01,  

In [20]:
# Count the rows of the embedding matrix that contain at least one non-zero
# value, i.e. the rows that were filled in from the GloVe file.
row_has_values = np.any(embedding_matrix_glove, axis=1)
nonzero_elements = np.count_nonzero(row_has_values)
nonzero_elements

7767

In [21]:
# Write the GloVe-initialised embedding matrix to disk.
np.save(file='embedding_matrix_friends.npy', arr=embedding_matrix_glove)