<a href="https://colab.research.google.com/github/maykon/intent-classifier/blob/master/Intent_classification_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import RSLPStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Activation, Dense, GRU, LSTM, Bidirectional, Embedding, Dropout, BatchNormalization
from keras.callbacks import ModelCheckpoint


In [0]:
def load_dataset(filename):
  df = pd.read_csv(filename, encoding = 'utf-8', names = ["Sentence", "Intent"])
  print(df.head())
  intent = df["Intent"]
  unique_intent = list(set(intent))
  sentences = list(df["Sentence"])
  
  return (intent, unique_intent, sentences)
  


In [227]:
intent, unique_intent, sentences = load_dataset("Dataset_pt.csv")

                              Sentence       Intent
0                        Frete grátis?  FreteGratis
1                   Quanto tá o frete?   DadosFrete
2  qual o valor do frete para maringá?   DadosFrete
3       quanto ta o frete pra maringá?   DadosFrete
4        quanto ta o frte para maringá   DadosFrete


In [228]:
print(sentences[:5])

['Frete grátis?', 'Quanto tá o frete?', 'qual o valor do frete para maringá?', 'quanto ta o frete pra maringá?', 'quanto ta o frte para maringá']


In [229]:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download('rslp')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [0]:
#define stemmer
stemmer = RSLPStemmer()

In [0]:
def cleaning(sentences):
  words = []
  for s in sentences:
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
    w = word_tokenize(clean, language='portuguese')
    #stemming
    words.append([stemmer.stem(i.lower()) for i in w])
    
  return words  

In [232]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])  
  


32
[['fret', 'gr', 'ti'], ['quant', 't', 'o', 'fret']]


In [0]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [0]:
def max_length(words):
  return(len(max(words, key = len)))
  

In [235]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = max_length(cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 69 and Maximum length = 14


In [0]:
def encoding_doc(token, words):
  return(token.texts_to_sequences(words))

In [0]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [0]:
def padding_doc(encoded_doc, max_length):
  return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

In [0]:
padded_doc = padding_doc(encoded_doc, max_length)

In [240]:
padded_doc[:5]

array([[ 2,  6,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 4, 20,  1,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [21,  1, 31, 32,  2,  5,  8,  0,  0,  0,  0,  0,  0,  0],
       [ 4, 10,  1,  2, 11,  8,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 4, 10,  1, 33,  5,  8,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=int32)

In [241]:
print("Shape of padded docs = ",padded_doc.shape)

Shape of padded docs =  (32, 14)


In [0]:
#tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')


In [243]:
output_tokenizer.word_index

{'dadosfrete': 3, 'dadosnotafiscal': 1, 'fretegratis': 2}

In [244]:
print(intent)
encoded_output = encoding_doc(output_tokenizer, intent)

0         FreteGratis
1          DadosFrete
2          DadosFrete
3          DadosFrete
4          DadosFrete
5          DadosFrete
6          DadosFrete
7          DadosFrete
8          DadosFrete
9          DadosFrete
10        FreteGratis
11         DadosFrete
12        FreteGratis
13        FreteGratis
14        FreteGratis
15         DadosFrete
16        FreteGratis
17        FreteGratis
18        FreteGratis
19         DadosFrete
20    DadosNotaFiscal
21    DadosNotaFiscal
22    DadosNotaFiscal
23    DadosNotaFiscal
24    DadosNotaFiscal
25    DadosNotaFiscal
26    DadosNotaFiscal
27    DadosNotaFiscal
28    DadosNotaFiscal
29    DadosNotaFiscal
30    DadosNotaFiscal
31    DadosNotaFiscal
Name: Intent, dtype: object


In [0]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [261]:
encoded_output.shape

(32, 1)

In [0]:
def one_hot(encode):
  o = OneHotEncoder(sparse = False)
  return(o.fit_transform(encode))

In [0]:
output_one_hot = one_hot(encoded_output)

In [279]:
output_one_hot.shape

(32, 3)

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)


In [282]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (25, 14) and train_Y = (25, 3)
Shape of val_X = (7, 14) and val_Y = (7, 3)


In [0]:
num_labels = len(unique_intent)

def create_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 256,
            input_length = max_length,  trainable = False))
  model.add(Bidirectional(LSTM(256)))
  model.add(Dense(256, activation = "relu"))
  model.add(Dropout(0.5))
  model.add(Dense(256, activation = "relu"))
  model.add(Dropout(0.5))
  model.add(BatchNormalization())
  model.add(Dense(num_labels, activation = "softmax"))

  #model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
  #model.add(Bidirectional(LSTM(128)))
  #model.add(Dense(512, activation = "relu"))
  #model.add(Dropout(0.5))
  #model.add(Dense(num_labels, activation = "softmax"))
  #model.add(Dense(512))
  #model.add(Activation('relu'))
  #model.add(Dropout(0.5))
  #model.add(Dense(512))
  #model.add(Activation('relu'))
  #model.add(Dropout(0.5))
  #model.add(Dense(num_labels))
  #model.add(Activation('softmax'))
  
  return model

In [285]:
model = create_model(vocab_size, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 14, 256)           17664     
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 512)               1050624   
_________________________________________________________________
dense_40 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_26 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_41 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_27 (Dropout)         (None, 256)               0         
_________________________________________________________________
batch_normalization_10 (Batc (None, 256)             

In [286]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

hist = model.fit(train_X, train_Y, epochs = 200, batch_size = 10, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Train on 25 samples, validate on 7 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to 1.04033, saving model to model.h5
Epoch 2/200

Epoch 00002: val_loss improved from 1.04033 to 0.83096, saving model to model.h5
Epoch 3/200

Epoch 00003: val_loss did not improve from 0.83096
Epoch 4/200

Epoch 00004: val_loss did not improve from 0.83096
Epoch 5/200

Epoch 00005: val_loss did not improve from 0.83096
Epoch 6/200

Epoch 00006: val_loss improved from 0.83096 to 0.79986, saving model to model.h5
Epoch 7/200

Epoch 00007: val_loss did not improve from 0.79986
Epoch 8/200

Epoch 00008: val_loss did not improve from 0.79986
Epoch 9/200

Epoch 00009: val_loss did not improve from 0.79986
Epoch 10/200

Epoch 00010: val_loss improved from 0.79986 to 0.69386, saving model to model.h5
Epoch 11/200

Epoch 00011: val_loss improved from 0.69386 to 0.65078, saving model to model.h5
Epoch 12/200

Epoch 00012: val_loss did not improve from 0.65078
Epoch 13/200

Epoch 00013: val_loss impr

In [0]:
 model = load_model("model.h5")

In [0]:
def predictions(text):
  clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
  test_word = word_tokenize(clean, language='portuguese')
  test_word = [stemmer.stem(w.lower()) for w in test_word] 
  test_ls = word_tokenizer.texts_to_sequences(test_word) 
  #Check for unknown words
  if [] in test_ls:
    test_ls = list(filter(None, test_ls))
    
  test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
  x = padding_doc(test_ls, max_length)
  
  pred = model.predict_proba(x)
  
  
  return pred


  

In [0]:
def get_final_output(pred, classes):
  predictions = pred[0]  
 
  classes = np.array(classes)
  ids = np.argsort(-predictions)
  classes = classes[ids]
  predictions = -np.sort(-predictions)
  
  print(predictions)
  for i in range(pred.shape[1]):
    print("%s has confidence = %s" % (classes[i], (predictions[i])))



In [301]:
text = "o aviso fala frete gratis."
pred = predictions(text)
get_final_output(pred, unique_intent)
print()

text = "tem frete?"
pred = predictions(text)
get_final_output(pred, unique_intent)
print()

text = "vem com nota? é frete gratis"
pred = predictions(text)
get_final_output(pred, unique_intent)
print()


text = "o valor do frete consta na nota?"
pred = predictions(text)
get_final_output(pred, unique_intent)

[0.606543   0.3638454  0.02961162]
FreteGratis has confidence = 0.606543
DadosFrete has confidence = 0.3638454
DadosNotaFiscal has confidence = 0.029611615

[0.9190556  0.04837725 0.03256721]
DadosFrete has confidence = 0.9190556
DadosNotaFiscal has confidence = 0.048377253
FreteGratis has confidence = 0.032567214

[0.9720848  0.02279115 0.005124  ]
DadosNotaFiscal has confidence = 0.9720848
DadosFrete has confidence = 0.02279115
FreteGratis has confidence = 0.0051240046

[0.6667947  0.2524448  0.08076047]
DadosFrete has confidence = 0.6667947
FreteGratis has confidence = 0.2524448
DadosNotaFiscal has confidence = 0.08076047
