In [None]:
import pandas as pd 
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Bidirectional, CuDNNLSTM, Dense, Embedding, Dropout, Input, CuDNNGRU, GRU, LSTM
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, GlobalAveragePooling1D, concatenate,AveragePooling1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint,ReduceLROnPlateau

from gensim.models import Word2Vec

from sklearn.metrics import confusion_matrix, f1_score 
from sklearn.model_selection import train_test_split

import pickle
import re
import string

In [None]:
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip

In [None]:
!unzip glove.840B.300d.zip

In [None]:
# Parse the GloVe text file into a {token: float32 vector} lookup.
glove_dim = 300  # number of coefficients per line in glove.840B.300d.txt
embeddings_index = dict()
with open('glove.840B.300d.txt', encoding='utf8') as glove:
  for line in glove:
    # Some tokens in this file contain spaces, so split from the right:
    # the last `glove_dim` fields are the coefficients and whatever remains
    # on the left is the token itself.
    values = line.rstrip('\n').rsplit(' ', glove_dim)
    word = values[0]
    try:
      coefs = np.asarray(values[1:], dtype='float32')
    except ValueError:
      # Non-numeric coefficient: report the token and skip the line.
      print(word)
      continue
    if coefs.shape[0] != glove_dim:
      # Truncated or malformed line: report and skip instead of storing a
      # vector of the wrong length.
      print(word)
      continue
    embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Load 'data cab.csv' (presumably the cab-intent examples — confirm). A later
# cell drops its 'Unnamed: 2' / 'Unnamed: 3' columns.
data = pd.read_csv('data cab.csv')

In [None]:
# Load 'data hotels.csv' (presumably the hotel-intent examples — confirm).
data2=pd.read_csv('data hotels.csv')

In [None]:
# Load 'data weather.csv' (presumably the weather-intent examples — confirm).
data3=pd.read_csv('data weather.csv')

In [None]:
# Load 'data flight.csv' (presumably the flight-intent examples — confirm).
data4=pd.read_csv('data flight.csv')

In [None]:
# Shuffle every row of the cab frame, renumber the index from 0, and discard
# the two unnamed columns.
data_sam1 = (
    data.sample(frac=1)
        .reset_index(drop=True)
        .drop(columns=['Unnamed: 2', 'Unnamed: 3'])
)

In [None]:
# Shuffle all rows of data2 (frac=1 keeps every row) and renumber the index.
# No random_state is given, so the order differs between runs.
data_sam2 = data2.sample(frac=1).reset_index(drop=True)

In [None]:
# Shuffle all rows of data3 (frac=1 keeps every row) and renumber the index.
# No random_state is given, so the order differs between runs.
data_sam3=data3.sample(frac=1).reset_index(drop=True)

In [None]:
# Shuffle all rows of data4 (frac=1 keeps every row) and renumber the index.
# No random_state is given, so the order differs between runs.
data_sam4=data4.sample(frac=1).reset_index(drop=True)

In [None]:
# Stack the four per-intent frames into one. DataFrame.append is deprecated
# and was removed in pandas 2.0; pd.concat is the supported equivalent and
# builds the result in a single pass. The frame order matches the original
# append chain, and the per-frame indices are kept (the next cell resets them
# after shuffling).
datamix = pd.concat([data_sam2, data_sam1, data_sam3, data_sam4])

In [None]:
# Shuffle the combined frame so the intents are interleaved, then renumber the
# index from 0. The `query` and `type` columns of df are read by later cells.
df=datamix.sample(frac=1).reset_index(drop=True)

In [None]:
# One-hot encode the intent label, one column per distinct `type` value.
# dtype is pinned to uint8 (the default before pandas 2.0) because newer
# pandas returns bool columns, which would change the dtype of the label
# matrix sliced out of the joined frame later.
dumPREV = pd.get_dummies(df["type"], dtype='uint8')

In [None]:
embed_size = 300 # length of each word vector; matches the "300d" GloVe file loaded earlier

maxlen = 30 # number of token positions every query is padded/truncated to

In [None]:
# Raw query strings as a NumPy array. The tokenizer cell later rebinds
# `queries` to the padded matrix of word ids.
queries = df["query"].values

In [None]:
# Learn the vocabulary from the raw queries, then rebind `queries` to a
# fixed-width (maxlen columns) matrix of word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(queries)
queries = pad_sequences(tokenizer.texts_to_sequences(queries), maxlen=maxlen)

In [None]:
# Wrap the padded id matrix in a DataFrame (one column per token position) so
# it can be joined with the one-hot label columns.
df1 = pd.DataFrame(queries)

In [None]:
# Single-column frame holding the raw `type` labels.
# NOTE(review): df2 is not referenced by any other visible cell — confirm it is still needed.
df2=pd.DataFrame(df['type'])

In [None]:
# Vocabulary learned by the tokenizer (word -> integer id) and its size;
# both are echoed, one per line.
word_index = tokenizer.word_index
nb_words = len(word_index)
print(nb_words, word_index, sep='\n')

In [None]:
# Put the one-hot label columns to the right of the token-id columns. join()
# aligns on the index; both frames carry a fresh 0..n-1 index at this point.
dataset = df1.join(dumPREV)

In [None]:
# Column-wise split of the joined frame: the last 4 columns are the one-hot
# labels, everything before them is the padded token-id sequence.
x, y = dataset.iloc[:, :-4].values, dataset.iloc[:, -4:].values
print("Shape X ", x.shape)
print("Shape Y ", y.shape)

In [None]:
# Hold out 4% of the rows for evaluation. No random_state is passed, so the
# split differs between runs.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.04)

In [None]:
from sklearn.utils import class_weight

# Balanced per-class weights, keyed by class index as Keras' `class_weight`
# argument expects ({0: w0, 1: w1, ...}).
# - compute_class_weight is called with keywords because current scikit-learn
#   no longer accepts these arguments positionally.
# - np.unique returns the labels sorted, which is the same order as the
#   one-hot columns produced by get_dummies, so index i here is column i of
#   the label matrix.
class_weights = dict(enumerate(
    class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(df['type']),
        y=df['type'],
    )
))


In [None]:
# Sanity check: row/column counts of the train and test partitions.
print("Shape of X, Y train", train_x.shape, train_y.shape)
print("Shape of X, Y test", test_x.shape, test_y.shape)

In [None]:
# Allocate the embedding table: one row per tokenizer id, plus row 0, which
# the Keras Tokenizer never assigns to a word and pad_sequences uses as filler.
# The width comes from embed_size (rather than a second hard-coded 300) so it
# always agrees with the Embedding layer built from this matrix.
nb_words = len(word_index)
embedding_matrix = np.zeros((nb_words + 1, embed_size))

In [None]:
# Copy the pretrained vector of every vocabulary word that GloVe knows into
# its row of the embedding table; unknown words keep their all-zero row.
for word, i in word_index.items():
    if word not in embeddings_index:
        continue
    embedding_vector = embeddings_index[word]
    embedding_matrix[i] = embedding_vector

In [None]:
# Intent classifier: pretrained embeddings -> BiLSTM -> dense head -> softmax.
# The intermediate tensors are named `hidden`/`outputs` rather than `x` so the
# feature matrix `x` from the data-preparation cells is not overwritten.

# One padded sequence of token ids per query; the length comes from maxlen so
# it always matches what pad_sequences produced.
inp1 = Input(shape=(maxlen,))

# Embedding table initialised from embedding_matrix.
hidden = Embedding(nb_words + 1, embed_size, weights=[embedding_matrix])(inp1)

# return_sequences=False: only the final output of each direction is kept.
hidden = Bidirectional(LSTM(10, return_sequences=False, dropout=0.1, recurrent_dropout=0.1))(hidden)

hidden = Dense(32, activation="relu")(hidden)
hidden = Dropout(0.1)(hidden)
# 4 output units, one per one-hot label column.
outputs = Dense(4, activation="softmax")(hidden)
model = Model(inputs=inp1, outputs=outputs)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

In [None]:
# Save the model to "weights.hdf5" whenever the monitored validation accuracy
# improves (save_best_only=True, mode='max').
# NOTE(review): the metric is logged as 'val_acc' by older Keras releases and
# as 'val_accuracy' by newer ones — confirm against the installed version.
checkpoint = ModelCheckpoint("weights.hdf5", monitor='val_acc', verbose=0, save_best_only=True, mode='max')

# Callback list handed to model.fit in the training cell.
callbacks = [checkpoint]

In [None]:
epochs = 2  # number of fit -> evaluate rounds

# Integer class ids of the held-out labels (argmax undoes the one-hot encoding).
y_test = np.argmax(test_y, axis=1)

for e in range(epochs):
    # The original call passed both epochs=1 and the legacy nb_epoch=5.
    # Keras 2.x's backwards-compatibility shim lets nb_epoch override epochs
    # (with a warning), so 5 was the value actually used; newer Keras rejects
    # nb_epoch outright. Only the supported keyword is kept, with that value.
    model.fit(train_x, train_y, class_weight=class_weights, batch_size=32,
              epochs=5, callbacks=callbacks, validation_split=0.1)

    # Class probabilities for the held-out rows, reduced to predicted class ids.
    pred_glove_val_y = model.predict(test_x, batch_size=30, verbose=1)
    y_pred = np.argmax(pred_glove_val_y, axis=1)

    # Micro-averaged F1 over the held-out split (for single-label multiclass
    # data this equals plain accuracy).
    score = f1_score(y_test, y_pred, average='micro')
    print(confusion_matrix(y_test, y_pred))

    print("Val F1 Score: {:.4f}".format(score))

In [None]:


def transformQ(query):
  """Encode one raw query string the same way the training data was encoded.

  The text is mapped to word ids with the module-level `tokenizer` and
  padded/truncated to `maxlen` positions.

  Returns a NumPy array with a single row, ready for `model.predict`.
  """
  sequences = tokenizer.texts_to_sequences([query])
  return np.array(pad_sequences(sequences, maxlen))

In [None]:
# Collects user feedback: each misclassified query and the type the user says
# it should have been (filled by the interactive loop in the next cell).
dd={'query':[],'type': []}

In [None]:

# Class names indexed by predicted class id. The order must match the one-hot
# label columns (get_dummies sorts them alphabetically) — presumably these are
# exactly the `type` values in the CSVs; confirm.
intent = ['cab','flight','hotel','weather']

choice=input('Want to enter a query Y/N')
while choice.strip().lower() == 'y':

  # Legend for the user; the original text listed flight as 3, duplicating hotel.
  print("cab=1; flight=2; hotel=3; weather=4\n")
  query = input("Enter Query ")

  q = transformQ(query)

  pred_glove_val_y = model.predict(q)

  # Most probable class and its probability for the single query row.
  y_pred = np.argmax(pred_glove_val_y[0])
  print(intent[y_pred],  pred_glove_val_y[0][y_pred], "\n\n")

  # Named `feedback` rather than `x` so the feature matrix `x` is not
  # overwritten; input() already returns str, so no str() wrapper is needed.
  feedback = input('Did I predict correct y/n?').strip().lower()

  if feedback in ('n', 'no'):
    # Record the miss together with the user's correction.
    m = input('What type of query was it?')
    dd["query"].append(query)
    dd['type'].append(m)

  choice=input('Want to enter a query')
