# Intent Recognition with Sequential Models and Word2Vec
The goal of this notebook will be to classify intents of sentences. <br>For the purpose of demonstration, we will be using the ATIS (Airline travel information system) dataset. 
This can be accomplished with the following steps:
- Reading the dataset (from iob files) and Understanding the labels
- Encoding the intent labels
- Loading the word2vec model and embedding the words.
- Creating our sequential model (Bi-RNN) with PyTorch
- Testing the model

## Reading the dataset and Understanding labels

In [228]:
import random
import pandas as pd
import numpy as np
import torch.nn as nn
import torch
from utils import fetch_data, read_method

sents,labels,intents = fetch_data('data2/atis.train.w-intent.iob')

def display(n):
    sense = []
    print ("INTENT : ",intents[n])
    for i in range(len(sents[n])):
    #     sense.append({"word_index":word_indices[0][i],"word":words2idx[word_indices[0][i]],"entity_index":name_entities[0][i],"entity":tables2idx[name_entities[0][i]],"label_index":labels[0][i],"label":labels2idx[labels[0][i]]})
        sense.append({"word":sents[n][i],"label":labels[n][i]})
    return pd.DataFrame(sense)

print ("Number of sentences :",len(sents))
print ("Number of unique intents :",len(set(intents)))

Number of sentences : 4978
Number of unique intents : 22


In [229]:
# sents - List of sentences where each sentence is a list of words
# intents - List of labelled intents
display(random.randint(0,len(sents)))

INTENT :  atis_ground_fare


Unnamed: 0,label,word
0,O,what
1,O,price
2,O,is
3,O,a
4,B-transport_type,limousine
5,O,service
6,O,in
7,B-city_name,boston


## ~~Loading~~ Training the word2vec model and embedding the words.

In [230]:
# Training word2vec model
from gensim.models import word2vec

file_names = read_method.keys()
data_sets = []
for f in file_names:
    data_sets.append(fetch_data(f))

all_sents = []    
all_intents = []
for temp_sents,_,temp_intents in data_sets:
    all_sents += list([list(x)+['EOS'] for x in temp_sents])
    all_intents += list(temp_intents)
    
w2v_model = word2vec.Word2Vec(all_sents,min_count=1)

In [None]:
# from gensim.models import KeyedVectors
# MODEL_PATH = '/home/b/Downloads/GoogleNews-vectors-negative300.bin.gz'
# w2v_model = KeyedVectors.load_word2vec_format(MODEL_PATH, binary=True,limit=2500000)

In [231]:
def embed_sentence(sent):
    return [w2v_model.wv[word] for word in list(sent)+['EOS']]

enc_sents = []
exceptions = []
for s in sents:
    try:
        enc_sents.append(embed_sentence(s))
    except KeyError:
        exceptions.append(s)

## Encoding the intent labels

In [234]:
from sklearn import preprocessing
intent_encoder = preprocessing.LabelEncoder()
intent_encoder.fit(all_intents)

enc_intents = intent_encoder.transform(intents)

target = torch.LongTensor(enc_intents).unsqueeze_(-1)

pd.DataFrame({"Intents":intents[:5],"Encoded Intents":enc_intents[:5]})

Unnamed: 0,Intents,Encoded Intents
0,atis_flight,14
1,atis_flight,14
2,atis_flight_time,19
3,atis_airfare,3
4,atis_airfare,3


## Creating our sequential model (Bi-RNN) with PyTorch

In [246]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.in2hid_fwd = nn.Linear(input_size + hidden_size, hidden_size)
        self.in2hid_bck = nn.Linear(input_size + hidden_size, hidden_size)
        
        self.hid2out = nn.Linear(hidden_size*2, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sentence):
                    
        hidden_fwd = self.initHidden()        
        
        for word in sentence:        
            temp_comb = (torch.from_numpy(word).view(1,-1), hidden_fwd)
            combined_fwd = torch.cat(temp_comb, 1)
            hidden_fwd = self.in2hid_fwd(combined_fwd)
        
        hidden_bck = self.initHidden()        
        
        for word in sentence[::-1]:
            temp_comb = (torch.from_numpy(word).view(1,-1), hidden_fwd)
            combined_bck = torch.cat(temp_comb, 1)
            hidden_bck = self.in2hid_bck(combined_bck)
            
        combined_full = torch.cat((hidden_fwd, hidden_bck), 1)
        
        output = self.hid2out(combined_full)
        output = self.softmax(output)

        return output

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [278]:
rnn = RNN(input_size=w2v_model.vector_size,
          hidden_size=50, 
          output_size=len(intent_encoder.classes_))

In [282]:
learning_rate = 0.005 
criterion = nn.NLLLoss()

def train(sentence, intent):    
    rnn.zero_grad()

    output = rnn(sentence)
    
    loss = criterion(output, intent.long())
    loss.backward()

    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)

    return output, loss.item()

train(enc_sents[0],target[0])

(tensor([[-3.2375, -3.4819, -3.1829, -3.3999, -3.5318, -3.3369, -3.2144,
          -3.1258, -3.2889, -3.1266, -3.2126, -3.1669, -3.3000, -3.4782,
          -3.2032, -3.3715, -3.1530, -3.3392, -3.0339, -3.2316, -3.1438,
          -3.2253, -3.4502, -3.1610, -3.2873, -3.2226]]), 3.2031524181365967)

In [283]:
import time
import math

n_iters = 2
print_every = 1000
all_losses = []

start = time.time()

for iter in range(1, n_iters + 1):
    for x in range(len(enc_sents)):
        output, loss = train(enc_sents[x],target[x])
#         print (output,loss)
        if math.isnan(x):
            print ("NAN loss")
            break

        total_loss += loss

        if x % print_every == 0:
            print('%.2fs since start | (Epoch : %d, %d%%) Loss : %.4f' % (time.time()-start, iter, iter / n_iters * 100, loss))


0.00s since start | (Epoch : 1, 50%) Loss : 3.1132
1.39s since start | (Epoch : 1, 50%) Loss : 0.0712
2.54s since start | (Epoch : 1, 50%) Loss : 0.2806
3.63s since start | (Epoch : 1, 50%) Loss : 1.2845
4.71s since start | (Epoch : 1, 50%) Loss : 0.0203
5.78s since start | (Epoch : 2, 100%) Loss : 0.0418
6.87s since start | (Epoch : 2, 100%) Loss : 0.0649
7.97s since start | (Epoch : 2, 100%) Loss : 8.6046
9.12s since start | (Epoch : 2, 100%) Loss : 0.0580
10.22s since start | (Epoch : 2, 100%) Loss : 0.0440


In [292]:
def test_one(sent,val,allow=3):
    pred = rnn(sent).topk(allow)[1].tolist()[0]
    return val in pred

def test():
    sents_test,_,intents_test = fetch_data('data2/atis.test.w-intent.iob')
    enc_intents_test = intent_encoder.transform(intents_test)
    target_test = torch.LongTensor(enc_intents_test).unsqueeze_(-1)
    
    num_correct = 0.0
    for sent,targ in zip(sents_test,target_test):
        sent = embed_sentence(sent)    
        if test_one(sent,targ,allow=1):
            num_correct+=1
            
    print ("Accuracy :",num_correct/len(sents_test)*100)

test()

Accuracy : 78.61142217245241
