In [7]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/manas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
dataset = pd.read_csv('dataset.csv', encoding = "latin1", names=["Sentences", "Intent"])
print(dataset)

                                              Sentences          Intent
0                                      Need help pleese  commonQ.assist
1                                             Need help  commonQ.assist
2                                      I need some info  commonQ.assist
3                                     Will you help me?  commonQ.assist
4                                 What else can you do?  commonQ.assist
...                                                 ...             ...
1108  borrowed amount can be used for which of the p...  faq.borrow_use
1109  borrowed amount given can used by me for what ...  faq.borrow_use
1110  borrowed amount given can used by me for which...  faq.borrow_use
1111  borrowed money can be used for which of the pu...  faq.borrow_use
1112  borrow money given can used by me for what rea...  faq.borrow_use

[1113 rows x 2 columns]


In [37]:
intents = dataset['Intent'].tolist()
unique_intents = set(list(dataset['Intent']))
print(unique_intents, len(unique_intents))

{'faq.aadhaar_missing', 'faq.biz_new', 'commonQ.assist', 'faq.apply_register', 'faq.application_process', 'commonQ.not_giving', 'commonQ.just_details', 'commonQ.query', 'faq.borrow_use', 'faq.biz_category_missing', 'faq.borrow_limit', 'commonQ.name', 'faq.banking_option_missing', 'contact.contact', 'faq.address_proof', 'commonQ.bot', 'faq.approval_time', 'faq.bad_service', 'commonQ.how', 'faq.biz_simpler', 'commonQ.wait'} 21


In [38]:
sentences = list(dataset['Sentences'])
print(sentences[:5])

['Need help pleese', 'Need help', 'I need some info', 'Will you help me?', 'What else can you do?']


In [39]:
# DATA CLEANING
# Before we can feed our data into an algorithm to train our model, we would have to clean the raw data.

# Step 1: remove any punctuation marks or special characters (if any) from each sentence
# Step 2: tokenize each sentence i.e. convert each sentence into a list of words
# Tokenization is the process by which big quantity of text is divided into smaller parts called tokens
# Words are called tokens and the process of splitting text into tokens is called tokenization.
# Step 3: Apply Lemmatization on each word (token) in the sentence
# Lemmatization is the process of deriving the actual root lemma (word) from a given word
# for example: apologies => apology, horses => horse

lemmatizer = WordNetLemmatizer()
cleaned_sentences = []
for s in sentences:
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s) # replace any special characters with an empty space
    w = word_tokenize(clean)
    cleaned_sentences.append([lemmatizer.lemmatize(i.lower()) for i in w])
print(cleaned_sentences[:5])

[['need', 'help', 'pleese'], ['need', 'help'], ['i', 'need', 'some', 'info'], ['will', 'you', 'help', 'me'], ['what', 'else', 'can', 'you', 'do']]


In [40]:
# INPUT ENCODING: converting our input messages into numeric values understandable by our ML algorithm
# It is popular to represent a document as a sequence of integer values, where each word in the document 
# is represented as a unique integer.
token = Tokenizer(filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
token.fit_on_texts(cleaned_sentences)
max_sentence_length = len(max(cleaned_sentences, key=len))
print('Vocab size:', len(token.word_index)) # word_index: A dict of words and their uniquely assigned integers.
print('Length of the longest sentence:', max_sentence_length)

Vocab size: 461
Length of the longest sentence: 28


In [41]:
encoded_doc = token.texts_to_sequences(cleaned_sentences)
print(encoded_doc[:5])

[[25, 74, 316], [25, 74], [1, 25, 194, 176], [54, 10, 74, 16], [9, 261, 4, 10, 30]]


In [55]:
padded_doc = pad_sequences(encoded_doc, maxlen = max_sentence_length, padding = "post")
print(padded_doc)
print("Shape of features:",padded_doc.shape)

[[ 25  74 316 ...   0   0   0]
 [ 25  74   0 ...   0   0   0]
 [  1  25 194 ...   0   0   0]
 ...
 [ 59  28 129 ...   0   0   0]
 [ 59  44   4 ...   0   0   0]
 [ 81  44 129 ...   0   0   0]]
Shape of features: (1113, 28)


In [59]:
# OUTPUT ENCODING
# changed the filters here by removing . and _ so as to preserve the labels
output_token = Tokenizer(filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')
output_token.fit_on_texts(unique_intents)
print(output_token.word_index)

{'faq.aadhaar_missing': 1, 'faq.biz_new': 2, 'commonq.assist': 3, 'faq.apply_register': 4, 'faq.application_process': 5, 'commonq.not_giving': 6, 'commonq.just_details': 7, 'commonq.query': 8, 'faq.borrow_use': 9, 'faq.biz_category_missing': 10, 'faq.borrow_limit': 11, 'commonq.name': 12, 'faq.banking_option_missing': 13, 'contact.contact': 14, 'faq.address_proof': 15, 'commonq.bot': 16, 'faq.approval_time': 17, 'faq.bad_service': 18, 'commonq.how': 19, 'faq.biz_simpler': 20, 'commonq.wait': 21}


In [60]:
encoded_output = output_token.texts_to_sequences(intents)
print(encoded_output[:5]) # list

[[3], [3], [3], [3], [3]]


In [61]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)
print(encoded_output[:5]) # numpy array
print('Shape of label:', encoded_output.shape)

[[3]
 [3]
 [3]
 [3]
 [3]]
Shape of label: (1113, 1)


In [63]:
# one-hot encoding the output
o = OneHotEncoder(sparse = False)
output_one_hot = o.fit_transform(encoded_output)
print(output_one_hot)

[[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
