In [7]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint

# Fetch the NLTK resources this notebook needs so it runs on a fresh environment:
#   - 'punkt'   : tokenizer models required by word_tokenize (used in the data-cleaning cell)
#   - 'wordnet' : corpus required by WordNetLemmatizer
# NOTE(review): recent NLTK releases may ship the tokenizer models as 'punkt_tab' — adjust if needed.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/manas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Load the labelled utterances. Column names are supplied explicitly via `names`,
# and the file is decoded as latin1.
dataset = pd.read_csv(
    'dataset.csv',
    names=["Sentences", "Intent"],
    encoding="latin1",
)
print(dataset)

                                              Sentences          Intent
0                                      Need help pleese  commonQ.assist
1                                             Need help  commonQ.assist
2                                      I need some info  commonQ.assist
3                                     Will you help me?  commonQ.assist
4                                 What else can you do?  commonQ.assist
...                                                 ...             ...
1108  borrowed amount can be used for which of the p...  faq.borrow_use
1109  borrowed amount given can used by me for what ...  faq.borrow_use
1110  borrowed amount given can used by me for which...  faq.borrow_use
1111  borrowed money can be used for which of the pu...  faq.borrow_use
1112  borrow money given can used by me for what rea...  faq.borrow_use

[1113 rows x 2 columns]


In [9]:
# Collect the distinct intent labels; a set can be built straight from the column.
unique_intents = set(dataset['Intent'])
print(unique_intents, len(unique_intents))

{'faq.aadhaar_missing', 'faq.biz_new', 'commonQ.assist', 'faq.apply_register', 'faq.application_process', 'commonQ.not_giving', 'commonQ.just_details', 'commonQ.query', 'faq.borrow_use', 'faq.biz_category_missing', 'faq.borrow_limit', 'commonQ.name', 'faq.banking_option_missing', 'contact.contact', 'faq.address_proof', 'commonQ.bot', 'faq.approval_time', 'faq.bad_service', 'commonQ.how', 'faq.biz_simpler', 'commonQ.wait'} 21


In [10]:
# Pull the raw utterances out of the frame as a plain Python list.
sentences = dataset['Sentences'].tolist()
print(sentences[:5])

['Need help pleese', 'Need help', 'I need some info', 'Will you help me?', 'What else can you do?']


In [13]:
# DATA CLEANING
# The raw sentences are normalised before they are used to train the model:
#   1. every character that is not an ASCII letter, a digit or a space is replaced by a space
#   2. the sentence is tokenized, i.e. split into a list of words (tokens)
#   3. each token is lower-cased and lemmatized, i.e. reduced to its root lemma
#      (for example: apologies => apology, horses => horse)

lemmatizer = WordNetLemmatizer()
special_characters = re.compile(r'[^ a-z A-Z 0-9]')


def clean_sentence(sentence):
    """Return the lower-cased, lemmatized word tokens of a single raw sentence."""
    without_specials = special_characters.sub(" ", sentence)
    return [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(without_specials)]


cleaned_sentences = [clean_sentence(sentence) for sentence in sentences]
print(cleaned_sentences[:5])

[['need', 'help', 'pleese'], ['need', 'help'], ['i', 'need', 'some', 'info'], ['will', 'you', 'help', 'me'], ['what', 'else', 'can', 'you', 'do']]


In [29]:
# Build the word -> integer index over the cleaned corpus.
# The backslash in the filter string is escaped explicitly ('\\'): a bare '\]' is an
# invalid escape sequence that newer Python versions warn about, while '\\]' yields
# exactly the same characters (a backslash followed by ']').
# NOTE(review): cleaned_sentences are already token lists, and Keras appears to apply
# `filters` only to raw strings — confirm whether this argument has any effect here.
token = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~')
token.fit_on_texts(cleaned_sentences)

# Length, in tokens, of the longest cleaned sentence.
max_sentence_length = max(len(tokens) for tokens in cleaned_sentences)
print('Vocab size:', len(token.word_index))
print('Length of the longest sentence:', max_sentence_length)

Vocab size: 461
Length of the longest sentence: 28
