In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

print(tf.__version__)

2.7.0-dev20210709


In [2]:
train_ds = pd.read_csv('./dataset/classification/train_dataset.csv')
test_ds  = pd.read_csv('./dataset/classification/test_dataset.csv')
valid_ds = pd.read_csv('./dataset/classification/dev_dataset.csv')

In [3]:
train_ds.head()

Unnamed: 0,Sentence,Intent
0,Can you show me some movie listings?,INFORM_INTENT
1,I am in San Jose. Also I want to see a 3D movie.,INFORM
2,"Yes, please find me some other movies. I want ...",INFORM
3,What times do you have available for Curse of ...,INFORM_INTENT
4,March 5th.,INFORM


In [4]:
intents_list = []
intents_list.extend(train_ds['Intent'].unique())
intents_list.extend(test_ds['Intent'].unique())
intents_list.extend(valid_ds['Intent'].unique())
intents_list = set(intents_list)
intents_list

{'AFFIRM',
 'AFFIRM_INTENT',
 'INFORM',
 'INFORM_INTENT',
 'NEGATE',
 'NEGATE_INTENT',
 'REQUEST',
 'REQUEST_ALTS',
 'SELECT',
 'THANK_YOU'}

In [5]:
intent_mapper = {
'AFFIRM': 1,
'AFFIRM_INTENT': 2,
'INFORM': 3,
'INFORM_INTENT': 4,
'NEGATE': 5,
'NEGATE_INTENT': 6,
'REQUEST': 7,
'REQUEST_ALTS': 8,
'SELECT':9,
'THANK_YOU':10
}

In [6]:
train_ds['Intent'] = np.array(train_ds.Intent.map(lambda x: intent_mapper[x]))
test_ds['Intent'] = np.array(test_ds.Intent.map(lambda x: intent_mapper[x]))
valid_ds['Intent'] = np.array(valid_ds.Intent.map(lambda x: intent_mapper[x]))
train_ds.head()

Unnamed: 0,Sentence,Intent
0,Can you show me some movie listings?,4
1,I am in San Jose. Also I want to see a 3D movie.,3
2,"Yes, please find me some other movies. I want ...",3
3,What times do you have available for Curse of ...,4
4,March 5th.,3


In [7]:
train_ds['Intent'].unique()

array([4, 3, 9, 7, 6, 5, 8])

In [8]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

In [9]:
train_text = tf.data.Dataset.from_tensor_slices(train_ds['Sentence'])
train_labels = tf.data.Dataset.from_tensor_slices(train_ds['Intent'])

test_text = tf.data.Dataset.from_tensor_slices(test_ds['Sentence'])
test_labels = tf.data.Dataset.from_tensor_slices(test_ds['Intent'])

valid_text = tf.data.Dataset.from_tensor_slices(valid_ds['Sentence'])
valid_labels = tf.data.Dataset.from_tensor_slices(valid_ds['Intent'])


train_dataset = tf.data.Dataset.zip((train_text, train_labels))
test_dataset = tf.data.Dataset.zip((test_text, test_labels))
valid_dataset = tf.data.Dataset.zip((valid_text, valid_labels))

In [10]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
valid_dataset = valid_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [11]:
for example, label in train_dataset.take(1):
    print('texts: ', example.numpy()[:3])
    print()
    print('labels: ', label.numpy()[:3])

texts:  [b'Check for other movies.'
 b'I want a 3D showing, on March 2nd, at the Vogue Theatre.'
 b'I am bored and would like to see a movie.']

labels:  [8 3 4]


# Text encoder

In [12]:
VOCAB_SIZE = 1000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))
encoder.adapt(test_dataset.map(lambda text, label: text))
encoder.adapt(valid_dataset.map(lambda text, label: text))

In [13]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'i', 'to', 'the', 'a', 'movie', 'is', 'for', 'in',
       'you', 'watch', 'it', 'what', 'want', 'of', 'like', 'movies',
       'can', 'me'], dtype='<U15')

In [14]:
encoded_example = encoder(example)[:3].numpy()
encoded_example

array([[248,   8,  57,  17,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  2,  14,   5,  72,  54,  24,  44,  82,  22,   4, 724, 122,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  2,  63, 338,  28,  27,  16,   3,  25,   5,   6,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])

In [15]:
for n in range(3):
    print("Original: ", example[n].numpy())
    print("Round-trip: ", " ".join(vocab[encoded_example[n]]))
    print()

Original:  b'Check for other movies.'
Round-trip:  check for other movies                        

Original:  b'I want a 3D showing, on March 2nd, at the Vogue Theatre.'
Round-trip:  i want a 3d showing on march 2nd at the vogue theatre                

Original:  b'I am bored and would like to see a movie.'
Round-trip:  i am bored and would like to see a movie                  



# create model

In [16]:

model = tf.keras.Sequential([
  encoder,
  tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    # Use masking to handle the variable sequence lengths
    mask_zero=True),
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
  tf.keras.layers.Dense(11, activation='softmax')
])

In [17]:

# predict on a sample text without padding.

sample_text = ('Please check the showtimes for next Friday.')
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

[0.09067997 0.09013047 0.09207603 0.09061903 0.09115911 0.09049734
 0.09089662 0.0903633  0.09132195 0.09059145 0.09166473]


In [18]:
# predict on a sample text with padding

padding = "the " * 2000
predictions = model.predict(np.array([sample_text, padding]))
print(predictions[0])

[0.09067997 0.09013047 0.09207603 0.09061903 0.09115911 0.09049734
 0.09089662 0.0903633  0.09132195 0.09059145 0.09166473]


In [19]:
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

# Train the model

In [22]:
history = model.fit(train_dataset, epochs=20,
                    validation_data=valid_dataset,
                    validation_steps=30)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [23]:
test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

Test Loss: 0.8521207571029663
Test Accuracy: 0.7830910682678223


In [24]:
# predict on a sample text without padding.

sample_text = ('Please check the showtimes for next Friday.')
predictions = model.predict(np.array([sample_text]))
print(predictions)

[[7.4509455e-07 2.0017237e-06 7.8007542e-07 9.8423541e-01 1.4781028e-02
  3.8477867e-07 1.5240630e-06 5.3030028e-05 2.1355312e-04 7.1055372e-04
  1.0588172e-06]]
