# **Intent Detection Using LSTM**

This example of intent detection is available in the Textbook: https://github.com/practical-nlp/practical-nlp-code/blob/master/Ch6/01_CNN_RNN_ATIS_intents.ipynb

The code has been updated due to keras restructuring and an equivalent in Pytorch has been added.

## Imports

In [17]:
#general imports
import os
import sys
import random
random.seed(0) #for reproducability of results

#basic imports
import numpy as np
import pandas as pd

#NN imports
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Model, Sequential
from keras.initializers import Constant

#encoder
from sklearn.preprocessing import LabelEncoder

## Loading Data

In [8]:
import pandas as pd
import numpy as np

def get_data(filename):
    df = pd.read_csv(filename,delim_whitespace=True,names=['word','label'])
    beg_indices = list(df[df['word'] == 'BOS'].index)+[df.shape[0]]
    sents,labels,intents = [],[],[]
    for i in range(len(beg_indices[:-1])):
        sents.append(df[beg_indices[i]+1:beg_indices[i+1]-1]['word'].values)
        labels.append(df[beg_indices[i]+1:beg_indices[i+1]-1]['label'].values)
        intents.append(df.loc[beg_indices[i+1]-1]['label'])    
    return np.array(sents, dtype=object), np.array(labels, dtype=object), np.array(intents, dtype=object)

def get_data2(filename):
    with open(filename) as f:
        contents = f.read()
    sents,labels,intents = [],[],[]
    for line in contents.strip().split('\n'):
        words,labs = [i.split(' ') for i in line.split('\t')]
        sents.append(words[1:-1])
        labels.append(labs[1:-1])
        intents.append(labs[-1])
    return np.array(sents, dtype=object), np.array(labels, dtype=object), np.array(intents, dtype=object)

In [9]:
sents,labels,intents = get_data2('atis.train.w-intent.iob')

train_sentences = [" ".join(i) for i in sents]

train_texts = train_sentences
train_labels= intents.tolist()

vals = []

for i in range(len(train_labels)):
    if "#" in train_labels[i]:
        vals.append(i)
        
for i in vals[::-1]:
    train_labels.pop(i)
    train_texts.pop(i)

print ("Number of training sentences :",len(train_texts))
print ("Number of unique intents :",len(set(train_labels)))

for i in zip(train_texts[:5], train_labels[:5]):
    print(i)

Number of training sentences : 4952
Number of unique intents : 17
('i want to fly from boston at 838 am and arrive in denver at 1110 in the morning', 'atis_flight')
('what flights are available from pittsburgh to baltimore on thursday morning', 'atis_flight')
('what is the arrival time in san francisco for the 755 am flight leaving washington', 'atis_flight_time')
('cheapest airfare from tacoma to orlando', 'atis_airfare')
('round trip fares from pittsburgh to philadelphia under 1000 dollars', 'atis_airfare')


In [11]:
sents,labels,intents = get_data('atis.test.w-intent.iob')

test_sentences = [" ".join(i) for i in sents]

test_texts = test_sentences
test_labels = intents.tolist()

new_labels = set(test_labels) - set(train_labels)

vals = []

for i in range(len(test_labels)):
    if "#" in test_labels[i]:
        vals.append(i)
    elif test_labels[i] in new_labels:
        print(test_labels[i])
        vals.append(i)
        
for i in vals[::-1]:
    test_labels.pop(i)
    test_texts.pop(i)

print ("Number of testing sentences :",len(test_texts))
print ("Number of unique intents :",len(set(test_labels)))

for i in zip(test_texts[:5], test_labels[:5]):
    print(i)

atis_day_name
atis_day_name
Number of testing sentences : 876
Number of unique intents : 15
('i would like to find a flight from charlotte to las vegas that makes a stop in st. louis', 'atis_flight')
('on april first i need a ticket from tacoma to san jose departing before 7 am', 'atis_airfare')
('on april first i need a flight going from phoenix to san diego', 'atis_flight')
('i would like a flight traveling one way from phoenix to san diego on april first', 'atis_flight')
('i would like a flight from orlando to salt lake city for april first on delta airlines', 'atis_flight')


In [12]:
MAX_SEQUENCE_LENGTH = 300
MAX_NUM_WORDS = 20000 
EMBEDDING_DIM = 100 
VALIDATION_SPLIT = 0.3

In [13]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts) #Converting text to a vector of word indexes
test_sequences = tokenizer.texts_to_sequences(test_texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 897 unique tokens.


In [14]:
le = LabelEncoder()
le.fit(train_labels)
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)

## Padding sequences

Padding sequences involves adding zeros (or any other designated padding value) to the beginning or end of a sequence so that all sequences have the same length. This is necessary for many machine learning models, such as neural networks, which require inputs to have a fixed shape.

In the code snippet you provided, the pad_sequences function is used to pad the training and test sequences to a maximum sequence length of MAX_SEQUENCE_LENGTH. This is done using zeros as the padding value. The resulting padded sequences are then split into training and validation sets using train_test_split.

After padding, the data is converted into a one-hot encoded format using the to_categorical function. The resulting arrays are then shuffled and split into training and validation sets using train_test_split based on a validation split percentage specified by VALIDATION_SPLIT.

In [18]:
#Converting this to sequences to be fed into neural network. Max seq. len is 1000 as set earlier
 #initial padding of 0s, until vector is of size MAX_SEQUENCE_LENGTH
trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
trainvalid_labels = to_categorical(train_labels)

test_labels = to_categorical(np.asarray(test_labels), num_classes= trainvalid_labels.shape[1])

# split the training data into a training set and a validation set
indices = np.arange(trainvalid_data.shape[0])
np.random.shuffle(indices)
trainvalid_data = trainvalid_data[indices]
trainvalid_labels = trainvalid_labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])
x_train = trainvalid_data[:-num_validation_samples]
y_train = trainvalid_labels[:-num_validation_samples]
x_val = trainvalid_data[-num_validation_samples:]
y_val = trainvalid_labels[-num_validation_samples:]
#This is the data we will use for CNN and RNN training
print('Splitting the train data into train and valid is done')

Splitting the train data into train and valid is done


## Training an LSTM using Keras

In [19]:
print("Defining and training an LSTM model, training embedding layer on the fly")

rnnmodel = Sequential()
rnnmodel.add(Embedding(MAX_NUM_WORDS, 128))
rnnmodel.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
rnnmodel.add(Dense(len(trainvalid_labels[0]), activation='sigmoid'))
rnnmodel.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

rnnmodel.summary()

print('Training the RNN')
rnnmodel.fit(x_train, y_train,
          batch_size=32,
          epochs=1,
          validation_data=(x_val, y_val))
score, acc = rnnmodel.evaluate(test_data, test_labels,
                            batch_size=32)
print('Test accuracy with RNN:', acc)

Defining and training an LSTM model, training embedding layer on the fly
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         2560000   
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 17)                2193      
                                                                 
Total params: 2,693,777
Trainable params: 2,693,777
Non-trainable params: 0
_________________________________________________________________
Training the RNN
Test accuracy with RNN: 0.7214611768722534


## Similar Pytorch Equivalent

Or if you want to do something similar in Pytorch see the code below. First, we need to prepare the data in a way that Pytorch expects. We can still use the padded sequences, but we

In [32]:
import torch
from torch.utils.data import TensorDataset, DataLoader

trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
trainvalid_labels = to_categorical(train_labels)

# Convert to PyTorch tensors
trainvalid_data_tensor = torch.tensor(trainvalid_data)
trainvalid_labels_tensor = torch.tensor(trainvalid_labels)

test_data_tensor = torch.tensor(test_data)
test_labels_tensor = torch.tensor(test_labels)

# Create a TensorDataset for train data
train_dataset = TensorDataset(trainvalid_data_tensor, trainvalid_labels_tensor)

# Create a DataLoader for train data
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Create a TensorDataset for test data
test_dataset = TensorDataset(test_data_tensor, test_labels_tensor)

# Create a DataLoader for test data
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [33]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.rnn(embedded)
        hidden = self.dropout(hidden[-1])
        predictions = self.fc(hidden)
        return predictions

model = LSTMModel(vocab_size=MAX_NUM_WORDS, embedding_dim=128, hidden_dim=128, output_dim=len(trainvalid_labels[0]), dropout=0.2)

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

N_EPOCHS = 1

for epoch in range(N_EPOCHS):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = torch.round(outputs)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    accuracy = correct / total
    print('Test accuracy with PyTorch LSTM:', accuracy)


Test accuracy with PyTorch LSTM: 0.728310502283105
