In [2]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
intent_mapper = {
'AFFIRM': 1,
'AFFIRM_INTENT': 2,
'INFORM': 3,
'INFORM_INTENT': 4,
'NEGATE': 5,
'NEGATE_INTENT': 6,
'REQUEST': 7,
'REQUEST_ALTS': 8,
'SELECT':9,
'THANK_YOU':10
}

In [3]:
TRAIN_DATASET_PATH = './data/classification/train_dataset.csv'
TEST_DATASET_PATH = './data/classification/test_dataset.csv'
VALID_DATASET_PATH = './data/classification/dev_dataset.csv'

In [6]:
train_restaurant_dataset = pd.read_csv(TRAIN_DATASET_PATH)
# pega 2 np arrays, um com as frases e outro com os respectivos intents
train_dataset = train_restaurant_dataset.drop_duplicates(subset=['Sentence'])
train_phrases = np.array(train_restaurant_dataset.Sentence)[1:]
train_intents = np.array(train_restaurant_dataset.Intent.map(lambda x : intent_mapper[x]))[1:]


In [7]:

test_dataset = pd.read_csv(TEST_DATASET_PATH)
# pega 2 np arrays, um com as frases e outro com os respectivos intents
test_dataset = test_dataset.drop_duplicates(subset=['Sentence'])
test_phrases = np.array(test_dataset.Sentence)[1:]
test_intents = np.array(test_dataset.Intent.map(lambda x : intent_mapper[x]))[1:]


In [8]:
VOCAB_SIZE = 2000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_phrases)

In [9]:
vocab = np.array(encoder.get_vocabulary())
print(len(encoder.get_vocabulary()))
vocab[:20]

783


array(['', '[UNK]', 'i', 'to', 'the', 'is', 'a', 'movie', 'watch', 'in',
       'what', 'for', 'it', 'you', 'movies', 'want', 'of', 'can', 'no',
       'like'], dtype='<U15')

In [10]:

model = tf.keras.Sequential([
  encoder,
  tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    # Use masking to handle the variable sequence lengths
    mask_zero=True),
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
  tf.keras.layers.Dense(11, activation='softmax')
])

In [11]:

# predict on a sample text without padding.

sample_text = ('Please check the showtimes for next Friday.')
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

[0.09134612 0.08937164 0.09175269 0.0912028  0.08964152 0.09107782
 0.09174416 0.08992222 0.09105679 0.09239127 0.09049302]


In [12]:
# predict on a sample text with padding

padding = "the " * 2000
predictions = model.predict(np.array([sample_text, padding]))
print(predictions[0])

[0.09134612 0.08937164 0.09175269 0.0912028  0.08964152 0.09107782
 0.09174416 0.08992222 0.09105679 0.09239127 0.09049302]


In [14]:
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [15]:
history = model.fit(train_phrases, train_intents, epochs=20)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [16]:

from sklearn.metrics import classification_report

test_phrases = np.array(test_dataset.Sentence)[1:]
test_intents = np.array(test_dataset.Intent.map(lambda x : intent_mapper[x]))[1:]

predictions = model.predict(np.array(test_phrases))
y_pred = [] 
for pred in predictions:
    y_pred.append(pred.argmax())

print(classification_report(test_intents, y_pred, zero_division = 1))

              precision    recall  f1-score   support

           1       1.00      0.00      0.00        44
           2       1.00      0.00      0.00        13
           3       0.93      0.92      0.93      1179
           4       0.65      0.80      0.72       313
           5       0.69      0.80      0.74       157
           6       0.88      0.32      0.47       108
           7       0.63      0.66      0.64       233
           8       1.00      0.10      0.18        92
           9       0.68      0.96      0.80       415
          10       1.00      0.00      0.00        59

    accuracy                           0.79      2613
   macro avg       0.85      0.46      0.45      2613
weighted avg       0.82      0.79      0.76      2613

