# ATIS Dataset
ATIS stands for Airline Travel Information System. The ATIS dataset is a standard benchmark dataset for the task of intent detection.

In [1]:
# Place the Google Drive Sharing link
file_share_link = "https://drive.google.com/file/d/1HHLFjA42D5zvRz2-tZ3GR4uzR-R5QrpJ/view?usp=drive_link"

# extract the ID of the file
file_id = file_share_link[file_share_link.find("d/")+2 : file_share_link.find("/v")]
print(file_id)
!gdown "$file_id" # Download the data from Google Drive

1HHLFjA42D5zvRz2-tZ3GR4uzR-R5QrpJ
Downloading...
From: https://drive.google.com/uc?id=1HHLFjA42D5zvRz2-tZ3GR4uzR-R5QrpJ
To: /Users/nachikethpro/Desktop/author-repo/Generative-AI-Natural-Language-Processing-Bootcamp/6.Deep-Dive-Dialog-Systems/atis.zip
100%|████████████████████████████████████████| 135k/135k [00:00<00:00, 1.64MB/s]


In [2]:
!unzip atis.zip
!rm atis.zip

Archive:  atis.zip
  inflating: atis_intents_train.csv  
  inflating: __MACOSX/._atis_intents_train.csv  
  inflating: atis_intents_test.csv   
  inflating: __MACOSX/._atis_intents_test.csv  
  inflating: atis_intents.csv        
  inflating: __MACOSX/._atis_intents.csv  


In [None]:
import os
import random

import gdown
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Embedding, LSTM
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
# The training CSV has no header row: read it raw, then name its two columns
# (intent label first, utterance text second).
train_df = (
    pd.read_csv('atis_intents_train.csv', header=None)
    .set_axis(['intents', 'text'], axis='columns')
)
train_df.head()

Unnamed: 0,intents,text
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...


In [None]:
# The test CSV has the same header-less, two-column layout as the training
# file, so it is read and labelled the same way.
test_df = (
    pd.read_csv('atis_intents_test.csv', header=None)
    .set_axis(['intents', 'text'], axis='columns')
)
test_df.head()

Unnamed: 0,intents,text
0,atis_flight,i would like to find a flight from charlotte ...
1,atis_airfare,on april first i need a ticket from tacoma to...
2,atis_flight,on april first i need a flight going from pho...
3,atis_flight,i would like a flight traveling one way from ...
4,atis_flight,i would like a flight from orlando to salt la...


# Data Preprocessing on Dataset

In [None]:
# Preprocessing / model hyper-parameters shared by the cells below.
EMBEDDING_DIM = 100        # width of each word vector (matches the 100-d GloVe file)
MAX_NUM_WORDS = 20_000     # vocabulary cap passed to the Tokenizer
MAX_SEQENCE_LENGTH = 300   # fixed length every token sequence is padded/truncated to
VALIDATION_SPLIT = 0.3     # share of the training data held out for validation

In [None]:
# Build the vocabulary from the training utterances only; the test split is
# encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_df['text'])

# Convert every utterance of each split into a list of integer word ids.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split['text']) for split in (train_df, test_df)
)

# word -> integer id mapping learned from the training text.
word_index = tokenizer.word_index
print(f'length of unique words (tokens) is {len(word_index)}')

length of unique words (tokens) is 871


In [None]:
# Peek at the first two training utterances after tokenisation: each is a
# variable-length list of integer word ids.
train_sequences[:2]

[[18, 67, 1, 38, 2, 9, 68, 394, 84, 16, 78, 15, 12, 68, 511, 15, 4, 35],
 [7, 3, 26, 57, 2, 19, 1, 21, 5, 75, 35]]

In [None]:
# Re-display the training frame while the intents column still holds the
# original string labels.
train_df.head()

Unnamed: 0,intents,text
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...


In [None]:
# Distinct intent labels present in the training split.
set(train_df['intents'])

{'atis_abbreviation',
 'atis_aircraft',
 'atis_airfare',
 'atis_airline',
 'atis_flight',
 'atis_flight_time',
 'atis_ground_service',
 'atis_quantity'}

In [None]:
# Learn the label -> integer id mapping from the training intents only, then
# overwrite the string labels in both splits with those ids.
le = LabelEncoder().fit(train_df['intents'])
train_df['intents'], test_df['intents'] = (
    le.transform(train_df['intents']),
    le.transform(test_df['intents']),
)

In [None]:
# Confirm the intents column now holds integer class ids instead of strings.
train_df.head()

Unnamed: 0,intents,text
0,4,i want to fly from boston at 838 am and arriv...
1,4,what flights are available from pittsburgh to...
2,5,what is the arrival time in san francisco for...
3,2,cheapest airfare from tacoma to orlando
4,2,round trip fares from pittsburgh to philadelp...


In [None]:
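# Every row must have the same number of features, so the variable-length
# token sequences are padded to a common length (MAX_SEQENCE_LENGTH) below.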

In [None]:
# Check the container type of the tokenised data (a plain Python list of
# lists, not yet a rectangular array).
type(train_sequences)

list

In [None]:
# Longest tokenised utterance in the training split.
max(map(len, train_sequences))

46

In [None]:
# Longest tokenised utterance in the test split.
max(map(len, test_sequences))

30

In [None]:
# Pad (or truncate) every token sequence to MAX_SEQENCE_LENGTH so both splits
# become rectangular integer arrays the model can consume.
train_valid_data = pad_sequences(train_sequences, maxlen=MAX_SEQENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQENCE_LENGTH)

# One-hot encode the integer intent ids. The width is pinned to the number of
# classes the LabelEncoder learned; left unset, to_categorical infers it from
# the largest id present in each split, so a split that happens to lack the
# highest-numbered intent would get a narrower matrix than the model outputs.
train_valid_labels = to_categorical(train_df['intents'], num_classes=len(le.classes_))
test_labels = to_categorical(test_df['intents'], num_classes=len(le.classes_))

In [None]:
# Hold out VALIDATION_SPLIT of the padded training data for validation.
# random_state is fixed so the split is the same on every run.
# (train_test_split is imported in the notebook's import cell.)
X_train, X_val, y_train, y_val = train_test_split(
    train_valid_data,
    train_valid_labels,
    test_size=VALIDATION_SPLIT,
    random_state=42,
)

# Embeddings

In [None]:
# Download the pretrained GloVe embeddings archive from Google Drive.
# (gdown is imported in the notebook's import cell.)
file_share_link = "https://drive.google.com/file/d/1qqTDo8h4WDcNBEFnOH-LEPhra0zjyoXj/view?usp=share_link"

# The file ID is the path segment that follows "/d/". Splitting on that marker
# (rather than slicing with str.find, which returns -1 on a miss and silently
# produces a garbage ID) makes a malformed link fail loudly, and also accepts
# links that have no trailing "/view" segment.
if "/d/" not in file_share_link:
    raise ValueError(f"Not a Google Drive file sharing link: {file_share_link!r}")
file_id = file_share_link.split("/d/", 1)[1].split("/", 1)[0].split("?", 1)[0]
print(file_id)

# With no output path given, the archive is saved into the current working
# directory; the call's return value (the saved file name) is shown as the
# cell output.
gdown.download(
    f"https://drive.google.com/uc?export=download&confirm=pbef&id={file_id}"
)

1qqTDo8h4WDcNBEFnOH-LEPhra0zjyoXj


Downloading...
From: https://drive.google.com/uc?export=download&confirm=pbef&id=1qqTDo8h4WDcNBEFnOH-LEPhra0zjyoXj
To: /content/glove.6B.100d.txt.zip
100%|██████████| 138M/138M [00:04<00:00, 29.3MB/s]


'glove.6B.100d.txt.zip'

In [None]:
!unzip "/content/glove.6B.100d.txt.zip"
!rm "/content/glove.6B.100d.txt.zip"

Archive:  /content/glove.6B.100d.txt.zip
  inflating: glove.6B.100d.txt       


In [None]:
# Directory holding the unzipped GloVe vectors. unzip extracts into the
# notebook's working directory, so a relative path works both on Colab (where
# that directory is /content) and on a local checkout.
GLOVE_DIR = '.'

# Map every GloVe token to its vector. Each line of the file is the token
# followed by its whitespace-separated coefficients.
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Row i of the matrix holds the vector of the word whose tokenizer id is i.
# The +1 makes room for the largest id, since row 0 is not used by any word in
# word_index. Rows start as zeros, so words missing from GloVe stay all-zeros.
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        # Id falls outside the rows allocated above; skip it.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Frozen embedding layer initialised from the GloVe matrix: trainable=False
# keeps the pretrained vectors fixed during training.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQENCE_LENGTH,
                            trainable=False)

# Model

In [None]:
# 1-D CNN intent classifier on top of the frozen GloVe embedding layer:
# three Conv1D stages (the first two followed by max-pooling, the last by
# global max-pooling), a dense hidden layer, and a softmax over the intents.
model = Sequential([
    embedding_layer,

    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(5),

    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(5),

    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),

    Dense(128, activation='relu'),

    # One output unit per intent class known to the label encoder, instead of
    # a hard-coded 8, so the head always matches the encoded labels.
    Dense(len(le.classes_), activation='softmax'),
])

In [None]:
# The targets are one-hot vectors, hence categorical cross-entropy; accuracy
# is tracked under the metric name 'acc'.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 100)          87200     
                                                                 
 conv1d_3 (Conv1D)           (None, 296, 128)          64128     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 59, 128)          0         
 1D)                                                             
                                                                 
 conv1d_4 (Conv1D)           (None, 55, 128)           82048     
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 11, 128)          0         
 1D)                                                             
                                                                 
 conv1d_5 (Conv1D)           (None, 7, 128)           

In [None]:
# Train for five epochs, monitoring the held-out validation split each epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=128,
)

# Score the trained model on the test split; the result is displayed as
# [loss, accuracy].
model.evaluate(test_data, test_labels)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.0774763971567154, 0.9762499928474426]