In [1]:
import pickle
import tensorflow as tf
import gensim
from nltk.tokenize import TweetTokenizer
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
tf.reset_default_graph()

# Read the two parallel collections (questions, answers) from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only point this at a data.pickle you produced or otherwise trust.
with open("data.pickle", "rb") as data_file:
    quests, ans = pickle.load(data_file)

print("Total number of lines : ", len(np.append(quests, ans)))

['Can we make this quick Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break up on the quad Again'
 'Well I thought we d start with pronunciation if that s okay with you'
 'Not the hacking and gagging and spitting part Please' ...
 'Well lose it And why aren t you pushing the large sizes Didn t you get training about upsizing'
 'But I feel weird it s so sleazy' 'It s not optional']
Total number of lines :  50298


In [3]:
def read_input(input_lines):
    """Lazily tokenize each line of text.

    Yields, for every element of ``input_lines`` in order, the result of
    ``gensim.utils.simple_preprocess`` applied to that element.
    """
    yield from map(gensim.utils.simple_preprocess, input_lines)


In [4]:
# Tokenize every line (questions followed by answers) into a list of words,
# giving a list of token lists for Word2Vec to consume.
documents = list(
    read_input(np.append(quests, ans))
)
print("Done reading data file")

Done reading data file


In [6]:
# Fit a Word2Vec embedding on the tokenized corpus.
# NOTE(review): per the gensim docs, passing the corpus to the constructor
# already trains the model; the explicit train() call below then runs ten
# further epochs on top of that -- confirm this is intended.
embedding_model = gensim.models.Word2Vec(
    documents,
    size=100,
    window=10,
    min_count=2,
    workers=10,
)
embedding_model.train(documents, total_examples=len(documents), epochs=10)

(3612643, 4939460)

In [8]:
# Report the size of the trained Word2Vec model's vocabulary.
print("number of words in our vocabulary : ", len(embedding_model.wv.vocab))

number of words in our vocabulary :  12083


In [9]:
# Snapshot the model's vocabulary tokens as a list; list positions are used
# later as word indices.
vocabulary = list(embedding_model.wv.vocab)

In [10]:
# Add padding as the 0th index of the vocabulary.
# Guarded so that re-running this cell does not prepend a second '<pad>'
# and silently shift every word index by one.
if not vocabulary or vocabulary[0] != '<pad>':
    vocabulary = ['<pad>'] + vocabulary

In [11]:
# Width of one word vector.
# NOTE(review): looks like this has to agree with the `size` passed to
# Word2Vec when the embedding was trained -- confirm.
vector_dimension = 100

In [12]:
"""
Get vectorized representation of a words in a sentence using the custom embedding
"""
def get_vec_representation_of_text_from_custom_embedding(input_text, embedding_model, word_limit, vector_dimension, vocabulary):
    """Embed a tokenized sentence as a fixed-size matrix of word vectors.

    Parameters
    ----------
    input_text : iterable of tokens. Only the first ``word_limit`` tokens are
        used; any extra tokens are ignored.
    embedding_model : object whose ``.wv[token]`` returns the vector for a token.
    word_limit : number of rows in the returned matrix.
    vector_dimension : number of columns; each word vector is cut to this length.
    vocabulary : container of known tokens, tested with ``in``.

    Returns
    -------
    numpy.ndarray of shape ``(word_limit, vector_dimension)``. Row ``i`` holds
    the vector of token ``i``; rows for unknown tokens, and rows past the end
    of the sentence, stay all-zero.
    """
    embedding_matrix = np.zeros([word_limit, vector_dimension])
    # zip() against range(word_limit) truncates over-long input, so a caller
    # that forgets to slice cannot index past the end of the matrix.
    for index, token in zip(range(word_limit), input_text):
        if token not in vocabulary:
            continue  # the row is already zero
        try:
            embedding_vector = embedding_model.wv[token]
        except KeyError:
            # In the vocabulary container but unknown to the model (e.g. the
            # '<pad>' entry added by hand): keep the zero row.
            continue
        embedding_matrix[index] = embedding_vector[:vector_dimension]

    return embedding_matrix


# Maximum number of tokens kept per sentence (rows of each embedded matrix).
word_limit = 20
# Width of each word vector. Derived from vector_dimension rather than
# repeating the literal, so the two values cannot drift apart.
dimension = vector_dimension
# Accumulators for the embedded questions and answers.
trainX_question = []
Y_answers = []


In [13]:
# Tokenize questions and answers separately.
trainx_quests = list(read_input(np.array(quests)))

trainy_ans = list(read_input(np.array(ans)))

# Embed every sentence as a (word_limit, dimension) matrix.
# The lists are rebuilt from scratch here instead of appended to lists
# created in an earlier cell, so re-running this cell cannot duplicate the
# training examples.
trainX_question = [
    get_vec_representation_of_text_from_custom_embedding(
        sentence[:word_limit], embedding_model, word_limit=word_limit,
        vector_dimension=dimension, vocabulary=vocabulary)
    for sentence in trainx_quests
]

Y_answers = [
    get_vec_representation_of_text_from_custom_embedding(
        sentence[:word_limit], embedding_model, word_limit=word_limit,
        vector_dimension=dimension, vocabulary=vocabulary)
    for sentence in trainy_ans
]

In [14]:
# Stack the per-sentence matrices into single arrays and show their shapes.
trainX_question = np.asarray(trainX_question)
Y_answers = np.asarray(Y_answers)

print(trainX_question.shape, Y_answers.shape, sep="\n")

(25149, 20, 100)
(25149, 20, 100)


In [15]:
# get decoders --> answers sequences mapped from the vocabulary
def get_decoder_idx_sequences(answer, vocabulary, word_limit):
    """Map a tokenized answer to a fixed-length sequence of vocabulary indices.

    Parameters
    ----------
    answer : iterable of tokens. Only the first ``word_limit`` tokens are used.
    vocabulary : either a list of tokens (a token's index is its first position
        in the list) or a dict mapping token -> index. Passing a precomputed
        dict avoids rebuilding the lookup on every call.
    word_limit : length of the returned sequence.

    Returns
    -------
    numpy.ndarray of shape ``(word_limit,)`` (float, as produced by
    ``np.zeros``). Unknown tokens and unused trailing positions are 0.
    """
    if isinstance(vocabulary, dict):
        token_to_index = vocabulary
    else:
        # One pass over the list replaces the two linear scans per token
        # (`in` followed by `.index`). setdefault keeps the first position
        # of a repeated token, matching list.index.
        token_to_index = {}
        for position, known_token in enumerate(vocabulary):
            token_to_index.setdefault(known_token, position)

    sequence = np.zeros([word_limit])
    # zip() against range(word_limit) truncates over-long answers instead of
    # indexing past the end of `sequence`.
    for i, token in zip(range(word_limit), answer):
        try:
            sequence[i] = token_to_index.get(token, 0)
        except TypeError:
            # An unhashable token can never be a vocabulary entry.
            sequence[i] = 0
    return sequence

# One index sequence per tokenized answer, each capped at word_limit tokens.
decoder_idx_seq = [
    get_decoder_idx_sequences(sentence[:word_limit], vocabulary, word_limit)
    for sentence in trainy_ans
]

25149
25149


In [16]:
# Stack the per-answer index sequences into one array and display its shape.
decoder_idx_seq = np.asarray(decoder_idx_seq)
decoder_idx_seq.shape

(25149, 20)

In [17]:
# Expand every vocabulary index into a one-hot vector of length len(vocabulary).
# NOTE(review): this materialises a dense (answers, word_limit, vocabulary)
# array, which is very large for this vocabulary size -- keep an eye on memory.
target_decoder = tf.keras.utils.to_categorical(
    decoder_idx_seq,
    num_classes=len(vocabulary),
)
print(target_decoder.shape)

(25149, 20, 12084)


In [18]:
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
# NOTE(review): num_of_epochs is not used by the visible cells; the fit call
# passes its own epochs value.
num_of_epochs = 100
tf.keras.backend.clear_session()

# Both inputs are (timesteps, features) sequences of pre-computed word vectors;
# their shapes are taken from the training arrays.
encoder_input_begin = Input(shape=(trainX_question.shape[1], trainX_question.shape[2]))
decoder_input_begin = Input(shape=(Y_answers.shape[1], Y_answers.shape[2]))

# ENCODER
# Two independent LSTMs read the question, one per direction. Only their
# final (h, c) states are used downstream, to initialise the decoders.
encoder_forward_activation, state_h_f, state_c_f = tf.keras.layers.LSTM(
    dimension, return_state=True, activation='relu')(encoder_input_begin)

encoder_forward_states = [state_h_f, state_c_f]

encoder_backward_activation, state_h_b, state_c_b = tf.keras.layers.LSTM(
    dimension, return_state=True, go_backwards=True, activation='relu')(encoder_input_begin)

encoder_backward_states = [state_h_b, state_c_b]


# DECODER
# One LSTM per direction, each started from the matching encoder states and
# returning an output for every timestep.
decoder_forward_activation = tf.keras.layers.LSTM(
    dimension, return_state=True, return_sequences=True, activation='relu')

decoder_backward_activation = tf.keras.layers.LSTM(
    dimension, return_state=True, go_backwards=True, return_sequences=True, activation='relu')

decoder_f_outputs, _, _ = decoder_forward_activation(
    decoder_input_begin, initial_state=encoder_forward_states)

decoder_b_outputs, _, _ = decoder_backward_activation(
    decoder_input_begin, initial_state=encoder_backward_states)

# A layer built with go_backwards=True returns its output sequence in reversed
# time order. Flip it back along the time axis so that position t of the
# backward output lines up with position t of the forward output before the
# two are concatenated (this is what keras.layers.Bidirectional does
# internally). Without the flip each timestep pairs forward features for t
# with backward features for T-1-t.
decoder_b_outputs = tf.keras.layers.Lambda(
    lambda seq: tf.keras.backend.reverse(seq, axes=1),
    name='realign_backward_outputs',
)(decoder_b_outputs)

# merges decoder Bi-LSTM outputs
merged_outputs = tf.keras.layers.concatenate([decoder_f_outputs, decoder_b_outputs])

# Dense layer: a softmax over the vocabulary at every timestep.
decoder_dense = tf.keras.layers.Dense(len(vocabulary), activation=tf.keras.activations.softmax)
output = decoder_dense(merged_outputs)

model = Model(inputs=[encoder_input_begin, decoder_input_begin], outputs=output)
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 20, 100)      0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 20, 100)      0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 100), (None, 80400       input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 100), (None, 80400       input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LS

In [19]:
# Train on [encoder input, decoder input] -> one-hot decoder targets and keep
# the value returned by fit() in H.
H = model.fit(
    x=[trainX_question, Y_answers],
    y=target_decoder,
    batch_size=256,
    epochs=7,
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [20]:
# Write the model to model.h5 (path is relative to the current working directory).
model.save("model.h5")



In [21]:
# uncomment when you want to load the model saved
# model = tf.keras.models.load_model('./model.h5')

In [None]:
# Start chatting
# Type "quit" or "exit" to leave the loop.
while True:
    user_input = input('User: ')
    if user_input.strip().lower() in ('quit', 'exit'):
        break

    # read_input yields one token list per input line. Exactly one line is
    # passed in, so take that single token list. (Wrapping the generator in
    # list() produced a list *of token lists*; every "token" was then itself a
    # list, never matched the vocabulary, and the encoder input was all zeros
    # whatever the user typed.)
    user_tokens = next(read_input([user_input]))
    X_inp = np.array([get_vec_representation_of_text_from_custom_embedding(
        user_tokens[:word_limit], embedding_model, word_limit=word_limit,
        vector_dimension=dimension, vocabulary=vocabulary)])

    # The decoder input must have the same batch size as X_inp (one sample).
    # Only the first prediction was ever read, so feeding the first training
    # answer alone gives the same decoder input as before without running the
    # model over the whole training set on every turn.
    # NOTE(review): the decoder is still driven by a training answer, so the
    # reply largely echoes that answer. Real generation needs a dedicated
    # inference decoder -- TODO.
    decoder_inp = Y_answers[:1]
    predictions = model.predict([X_inp, decoder_inp])
    predicted_words = np.argmax(predictions, axis=2)[0]

    print("Bot  : ")
    # Index 0 is the '<pad>' entry; skip it. print() ends the line so the next
    # prompt starts on a fresh one.
    print(' '.join(vocabulary[word_idx] for word_idx in predicted_words if word_idx != 0))

User:  how are you


Bot  : 
well thought we start with if that okay with you 

User:  say something


Bot  : 
well thought we start with if that okay with you 