In [0]:
!pip install emoji



In [0]:
# Mount Google Drive at /content/drive (Colab only). The helper module, the
# GloVe vectors, the intents file and the saved model are all read from paths
# under this mount point in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
from keras.models import Model, load_model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
import os
import sys
import json
import string

In [0]:
# Project folder on the mounted Drive. Defined once, before first use, so the
# same location is not hard-coded twice in this cell.
BASE_PATH = '/content/drive/My Drive/Machine Learning/Chatbot_Glove_model/'

# Make the helper module stored in that folder importable.
sys.path.append(os.path.abspath(BASE_PATH))

# NOTE(review): star import kept for compatibility. The names used later that
# are not defined in this notebook (read_glove_vecs, convert_to_one_hot)
# presumably come from here - consider importing them explicitly.
from emo_utils import *

In [0]:
# Download NLTK's stop-word corpus. `stopwords` is used by
# sentences_to_indices to drop English stop words before indexing.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Fixed sentence length fed to the network: sentences_to_indices is called
# with this value and returns rows of exactly maxLen indices, leaving unused
# positions at 0 for shorter sentences.
maxLen = 10

In [0]:
# Load the GloVe file (glove.6B.50d.txt) from the project folder. Going by the
# names, the helper returns word->index, index->word and word->vector lookups
# (helper is not defined in this notebook - presumably from emo_utils).
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(BASE_PATH + 'glove.6B.50d.txt')

In [0]:
def sentences_to_indices(X, word_to_index, max_len, stop_words=None):
    """
    Converts sentences (strings) into a zero-padded matrix of vocabulary indices
    that can be given to `Embedding()`.

    Each sentence is stripped of ASCII punctuation, lower-cased, split on
    whitespace and filtered of stop words. The remaining words are written, in
    order, into one row of the result. A word that is missing from
    `word_to_index` keeps the value 0 but still occupies its position.

    Arguments:
    X -- sequence of m sentences (numpy array of strings or list of str).
         The input is NOT modified.
    word_to_index -- a dictionary containing each word mapped to its index
    max_len -- number of columns of the output. Words beyond this limit are
               dropped instead of raising IndexError.
    stop_words -- optional collection of lower-case words to skip. When None
                  (default) NLTK's English stop-word list is used.

    Returns:
    X_indices -- numpy array of shape (m, max_len) holding the word indices
                 (dtype is the np.zeros default), 0 where no word was written
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    else:
        stop_words = set(stop_words)

    # string.punctuation already contains the double quote, so one table
    # built from it covers every character the original removed.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    m = len(X)                                       # number of sentences
    X_indices = np.zeros(shape=(m, max_len))

    for i in range(m):
        # Work on a local copy so the caller's data keeps its punctuation.
        words = X[i].translate(strip_punctuation).lower().split()
        kept = [w for w in words if w not in stop_words]

        # Truncate to max_len so long sentences cannot index past the row.
        for j, w in enumerate(kept[:max_len]):
            # Unknown words fall back to 0, same as an untouched cell.
            X_indices[i, j] = word_to_index.get(w, 0)

    return X_indices

In [0]:
# Restore the previously trained classifier from the project folder.
# NOTE(review): `model` is rebound by the build/train cells further down,
# which save back to this same file name.
model_name = 'trained_lstm_128_128_dropout_4_3.h5'
model = load_model(os.path.join(BASE_PATH, model_name))

In [0]:
# Build the training set from the intents file.
# Every intent gets an integer id equal to its position in the file; each of
# its example patterns becomes one training sentence labelled with that id.
X_train = []
Y_train = []

all_tags = []          # all_tags[id] -> tag name
all_tags_index = []    # the ids themselves, in the same order

# Explicit UTF-8 so parsing does not depend on the platform default encoding.
# The file is closed as soon as it has been parsed.
with open(BASE_PATH + 'new_intents.json', encoding='utf-8') as json_data:
    intents = json.load(json_data)

for tag_index, intent in enumerate(intents['intents']):
    all_tags.append(intent['tag'])
    all_tags_index.append(tag_index)
    for pattern in intent['patterns']:
        X_train.append(pattern)
        Y_train.append(tag_index)

# Dump the id -> tag table next to the model for reference.
with open(BASE_PATH + 'output.txt', 'w', encoding='utf-8') as out_file:
    for tag_index, tag in zip(all_tags_index, all_tags):
        out_file.write('{}\t\t\t{}\n'.format(tag_index, tag))

# Quick sanity check: first samples and matching lengths.
print(X_train[:10])
print(Y_train[:10])
print(len(X_train))
print(len(Y_train))

# Convert to numpy array
X_train = np.array(X_train)
Y_train = np.array(Y_train)

['Hi', 'How are you', 'Is anyone there?', 'Hello', 'Good day', 'Bye', 'See you later', 'Goodbye', 'Thanks', 'Thank you']
[0, 0, 0, 0, 0, 1, 1, 1, 2, 2]
105
105


In [0]:
# Smoke test: run a handful of hand-written queries through the model and
# print the intent it predicts for each (labelled "Expected Intent" in the output).
test_sentence = [
    'what are the documents required for admission through Minority Quota',
    'what is the admission procedure for computer engineering',
    'what is the admission procedure for mechanical engineering',
    'what is the admission procedure for electrical engineering',
    'hi there',
    'thanks for your help',
    'bye, thanks',
    'what documents are required for obc',
    'what is the timing for office',
    'How can I apply as NRI?',
    'What will be the charge for application form?',
    'what time is it open today',
]

X_test = sentences_to_indices(np.array(test_sentence), word_to_index, maxLen)
pred = model.predict(X_test)
# One row of class scores per sentence; pick the best-scoring class per row.
pred_index = np.argmax(pred, axis=1)

for sentence, idx in zip(test_sentence, pred_index):
    print(sentence)
    print('{}   Expected Intent --->  {}\n'.format(idx, all_tags[idx]))

what are the documents required for admission through Minority Quota
39   Expected Intent --->  Document_Minority

what is the admission procedure for computer engineering
14   Expected Intent --->  admission_computerengineering

what is the admission procedure for mechanical engineering
15   Expected Intent --->  admission_mechanicalengineering

what is the admission procedure for electrical engineering
17   Expected Intent --->  admission_electronicalengineering

hi there
0   Expected Intent --->  greeting

thanks for your help
2   Expected Intent --->  thanks

bye, thanks
1   Expected Intent --->  goodbye

what documents are required for obc
30   Expected Intent --->  Documents_OBC

what is the timing for office
3   Expected Intent --->  application_date

How can I apply as NRI?
26   Expected Intent --->  NRI

What will be the charge for application form?
8   Expected Intent --->  application_fees

what time is it open today
5   Expected Intent --->  office_hours



In [0]:
# Print the layer stack, output shapes and parameter counts of the current model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 50)            20000050  
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 128)           91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 42)                5418      
__________

In [0]:
# Largest number of words in any training sentence.
# The previous form picked the sentence with the most *characters* and counted
# its words, which is not necessarily the maximum word count.
# NOTE(review): this `maxlen` is informational only; the model is built with
# `maxLen` (capital L) defined near the top of the notebook.
maxlen = max(len(sentence.split()) for sentence in X_train)
print(maxlen)

In [0]:
# One-hot encode the integer intent ids into 42 columns (matches the 42-unit
# output layer of the model), then show the first six labels in both forms.
Y_oh_train = convert_to_one_hot(Y_train, C=42)

print(Y_train[:6], Y_oh_train[:6], sep='\n')

[0 0 0 0 0 1]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [0]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a non-trainable Keras Embedding() layer initialised with pre-trained word vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation (numpy arrays
                       of equal length); must not be empty
    word_to_index -- dictionary mapping words to their row index in the embedding matrix

    Returns:
    embedding_layer -- Keras Embedding instance whose weights are the pre-trained vectors

    Raises:
    ValueError -- if word_to_vec_map is empty
    KeyError -- if a word of word_to_index has no vector in word_to_vec_map
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map must contain at least one word vector")

    # One row more than the number of words (presumably because indices start
    # at 1, leaving row 0 as an all-zero padding vector - TODO confirm).
    vocab_len = len(word_to_index) + 1

    # Read the dimensionality from any stored vector instead of relying on a
    # specific word ("cucumber") being present in the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Row `index` of the matrix holds the vector of the word mapped to `index`.
    emb_matrix = np.zeros(shape=(vocab_len, emb_dim))
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # trainable=False keeps the pre-trained vectors fixed during training.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # The layer must be built before its weights can be set.
    embedding_layer.build((None,))

    # Load the pre-trained matrix into the layer.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [0]:
def Emojify_V2(input_shape, word_to_vec_map, word_to_index, num_classes=42):
    """
    Builds the intent classifier: pre-trained embeddings -> LSTM(128) -> Dropout(0.4)
    -> LSTM(128) -> Dropout(0.3) -> Dense(num_classes) -> softmax.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in the vocabulary to its vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary
    num_classes -- number of output classes (width of the final Dense layer); defaults to 42,
                   the value that used to be hard-coded

    Returns:
    model -- a (not yet compiled) Keras Model instance
    """

    # Input of the graph: one row of word indices per sentence. No dtype is
    # passed, so the Keras default input dtype applies.
    sentence_indices = Input(shape=input_shape)

    # Embedding layer initialised with the pre-trained vectors (created non-trainable).
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

    # Map every index of the sentence to its word vector.
    embeddings = embedding_layer(sentence_indices)

    # First LSTM returns the full sequence so the second LSTM can consume it.
    X = LSTM(128, return_sequences=True)(embeddings)
    # Dropout with rate 0.4
    X = Dropout(0.4)(X)
    # Second LSTM returns only its last hidden state (one vector per sentence).
    X = LSTM(128)(X)
    # Dropout with rate 0.3
    X = Dropout(0.3)(X)
    # Project to one score per class ...
    X = Dense(num_classes)(X)
    # ... and normalise the scores with softmax.
    X = Activation('softmax')(X)

    # Model mapping sentence_indices to the class probabilities.
    model = Model(inputs=sentence_indices, outputs=X)

    return model

In [0]:
# Build a fresh, untrained network for inputs of maxLen indices (this rebinds
# `model`) and list its layers.
model = Emojify_V2(
    input_shape=(maxLen,),
    word_to_vec_map=word_to_vec_map,
    word_to_index=word_to_index,
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 50)            20000050  
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 128)           91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 42)                5418      
__________

In [0]:
# Configure training: categorical cross-entropy against the one-hot intent
# labels, the Adam optimizer, and accuracy as the reported metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
# Encode the training sentences as a matrix of vocabulary indices with
# maxLen columns, the input format the model expects.
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)

In [0]:
# Train on all encoded patterns for 300 epochs in shuffled mini-batches of 32.
# No validation data is passed, so only training metrics are reported.
model.fit(X_train_indices, Y_oh_train, epochs = 300, batch_size = 32, shuffle=True)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.callbacks.History at 0x7f606d110940>

In [0]:
# Sanity check: the feature and label matrices must have the same number of rows.
print(X_train_indices.shape, Y_oh_train.shape, sep='\n')

(105, 10)
(105, 42)


In [0]:
# Classify a single query with the freshly trained model and print the
# predicted class id followed by its tag.
test_sentence = ['what are the Documents required for admission through Minority']
X_test = sentences_to_indices(np.array(test_sentence), word_to_index, maxLen)
pred = model.predict(X_test)
# Only one sentence is predicted, so the flat argmax is the class index.
pred_index = np.argmax(pred)
print(pred_index, all_tags[pred_index], sep='\n')

32
Documents_TFWS


In [0]:
# Persist the trained model in the project folder. This is the same file the
# "Load saved model" cell reads, so an existing copy is replaced.
model.save(BASE_PATH + 'trained_lstm_128_128_dropout_4_3.h5')