In [1]:
import pandas as pd
# Load the ATIS intent data set. The file has no header row, so the columns
# are numbered: 0 = intent label, 1 = utterance text.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="warn"` is its replacement (malformed rows are skipped and a
# warning is emitted for each of them, as before).
dataset = pd.read_csv("ATIS.csv", header=None, on_bad_lines="warn")

In [2]:
# Preview the first rows: column 0 holds the intent label, column 1 the utterance.
dataset.head()

Unnamed: 0,0,1
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...


In [3]:
# Print the first few utterances in full (the table preview truncates them).
first_utterances = dataset[1].head()
for utterance in first_utterances:
    print(utterance)

 i want to fly from boston at 838 am and arrive in denver at 1110 in the morning
 what flights are available from pittsburgh to baltimore on thursday morning
 what is the arrival time in san francisco for the 755 am flight leaving washington
 cheapest airfare from tacoma to orlando
 round trip fares from pittsburgh to philadelphia under 1000 dollars


In [4]:
# Number of utterances per intent label (column 0).
grouped = dataset.groupby(0).size()
grouped

0
atis_abbreviation                            147
atis_aircraft                                 81
atis_aircraft#atis_flight#atis_flight_no       1
atis_airfare                                 423
atis_airfare#atis_flight_time                  1
atis_airline                                 157
atis_airline#atis_flight_no                    2
atis_airport                                  20
atis_capacity                                 16
atis_cheapest                                  1
atis_city                                     19
atis_distance                                 20
atis_flight                                 3666
atis_flight#atis_airfare                      21
atis_flight_no                                12
atis_flight_time                              54
atis_ground_fare                              18
atis_ground_service                          255
atis_ground_service#atis_ground_fare           1
atis_meal                                      6
atis_quantity     

In [5]:
# Write the second comma-separated field of every row (the utterance) to ATIS.txt.
# NOTE(review): awk splits on every comma, so an utterance that itself contains
# a comma would be truncated here - confirm the corpus has none.
!awk -F ',' '{print $2}' ATIS.csv > ATIS.txt

### Extracting named entities with Matcher

In [6]:
pip install -U spacy



In [7]:
import spacy
from collections import Counter

In [8]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.1.0/en_core_web_md-3.1.0-py3-none-any.whl (45.4 MB)
[K     |████████████████████████████████| 45.4 MB 16 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [9]:
# Load the medium English pipeline that was downloaded in the previous cell.
nlp = spacy.load("en_core_web_md")

In [10]:
# Read the utterances, one per line. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open("ATIS.txt", "r") as atis_file:
    corpus = atis_file.read().split("\n")

In [11]:
# Collects the label of every named entity found in the corpus.
all_ent_labels = []

In [12]:
# Run the pipeline over every utterance and collect the label of each entity.
# The list is rebuilt from scratch here, so re-running this cell does not
# append to (and double-count) the labels gathered by a previous run.
all_ent_labels = [
    ent.label_
    for sentence in corpus
    for ent in nlp(sentence.strip()).ents
]

In [13]:
# Frequency of each entity label across the whole corpus.
c = Counter(all_ent_labels)

In [14]:
# The Counter repr lists the labels from most to least frequent.
print(c)

Counter({'GPE': 9219, 'DATE': 1454, 'TIME': 925, 'ORG': 425, 'CARDINAL': 275, 'ORDINAL': 201, 'FAC': 63, 'NORP': 60, 'MONEY': 52, 'PERSON': 17, 'PRODUCT': 14, 'LOC': 6, 'EVENT': 3})


In [15]:
# One "label count" pair per line, in the order the labels were first seen.
for label, count in c.items():
    print(label, count)

GPE 9219
TIME 925
DATE 1454
MONEY 52
ORDINAL 201
ORG 425
CARDINAL 275
FAC 63
PERSON 17
NORP 60
PRODUCT 14
EVENT 3
LOC 6


The most frequent entity labels are GPE (location names), DATE, TIME, and ORG (organizations).

Extract the location entities with the spaCy Matcher by searching for patterns
of the form "preposition location_name".

In [16]:
from spacy.matcher import Matcher

In [17]:
matcher = Matcher(nlp.vocab)

# Two-token pattern: an adposition (POS tag ADP covers both prepositions and
# postpositions) immediately followed by a token that belongs to a GPE
# (location) entity.

pattern = [{"POS": "ADP"}, {"ENT_TYPE": "GPE"}]
matcher.add("prepositionLocation", [pattern])
doc = nlp("show me flights from denver to boston on tuesday")
matches = matcher(doc)

# Each match is unpacked into three values; the last two are the token
# offsets of the matched span, which is sliced out of the doc below.
for mid, start, end in matches:
  print(doc[start:end])

from denver
to boston


In [18]:
# "to westchester" is absent from the output: only spans whose second token
# carries the GPE entity type can match.
doc = nlp("i'm looking for a flight that goes from ontario to westchester and stops in chicago")
matches = matcher(doc)
for _match_id, span_start, span_end in matches:
    matched_span = doc[span_start:span_end]
    print(matched_span)

from ontario
in chicago


In [19]:
# Same matcher on an utterance with a single preposition + location pair.
doc = nlp("what flights arrive in chicago on sunday on continental")
matches = matcher(doc)
for _match_id, span_start, span_end in matches:
    matched_span = doc[span_start:span_end]
    print(matched_span)

in chicago


In [20]:
# The pattern is exactly two tokens long, so a multi-word location such as
# "long beach" can at most contribute its first word to a match.
doc = nlp("yes i'd like a flight from long beach to st.louis by way of dallas")
matches = matcher(doc)
for _match_id, span_start, span_end in matches:
    matched_span = doc[span_start:span_end]
    print(matched_span)

to st.louis
of dallas


In [21]:
# For the two-word preposition "out of", only the token directly in front of
# the location ("of") is part of the two-token match.
doc = nlp("what are the evening flights flying out of dallas")
matches = matcher(doc)
for _match_id, span_start, span_end in matches:
    matched_span = doc[span_start:span_end]
    print(matched_span)

of dallas


In [22]:
matcher = Matcher(nlp.vocab)

# Extract the airline information. The ORG entity label marks an
# organization, which in this corpus corresponds to an airline company name.
# "OP": "+" matches one or more consecutive ORG tokens; every such sub-span
# is reported as its own match, which is why the output lists "united",
# "united airlines" and "airlines" for a single airline mention.

pattern = [{"ENT_TYPE": "ORG", "OP": "+"}]
matcher.add("AirlineName", [pattern])
doc = nlp("what is the earliest united airlines flight flying from denver")
matches = matcher(doc)
for mid,start,end in matches:
  print(doc[start:end])

united
united airlines
airlines


In [23]:
# Patterns for abbreviation-like entities (fare codes, booking classes, ...).

# A single token containing 1-2 word characters followed by 1-2 digits, e.g.
# "d10". Raw string: "\w" and "\d" are invalid escape sequences in a normal
# string literal and trigger a warning on recent Python versions.
# Note that \w also matches digits, so a purely numeric token such as "57"
# matches as well.
pattern1 = [{"TEXT": {"REGEX": r"\w{1,2}\d{1,2}"}}]
# A one- or two-letter lowercase token followed by a one- or two-digit token,
# e.g. "ap 57".
pattern2 = [{"SHAPE": {"IN": ["x", "xx"]}}, {"SHAPE": {"IN": ["d", "dd"]}}]
# A keyword announcing an abbreviation followed by a one- or two-letter
# lowercase token, e.g. "code qo".
pattern3 = [
    {"TEXT": {"IN": ["class", "code", "abbrev", "abbreviation"]}},
    {"SHAPE": {"IN": ["x", "xx"]}},
]
# A one- or two-letter lowercase token that the tagger labelled as a noun.
pattern4 = [{"POS": "NOUN", "SHAPE": {"IN": ["x", "xx"]}}]

In [24]:
# Fresh matcher that reports a match whenever any of the four patterns applies.
matcher = Matcher(nlp.vocab)
matcher.add("abbrevEntities", [pattern1, pattern2, pattern3, pattern4])

In [25]:
# Sample utterances that ask about an abbreviation, fare code or booking class.
sentences = [
'what does restriction ap 57 mean',
'what does the abbreviation co mean',
'what does fare code qo mean',
'what is the abbreviation d10',
'what does code y mean',
'what does the fare code f and fn mean',
'what is booking class c'
]

In [26]:
# Run the abbreviation matcher over every sample utterance. Several patterns
# can fire on the same tokens, so overlapping spans are printed separately.
for utterance in sentences:
    doc = nlp(utterance)
    matches = matcher(doc)
    for _match_id, span_start, span_end in matches:
        matched_span = doc[span_start:span_end]
        print(matched_span)

ap 57
57
abbreviation co
co
code qo
qo
d10
code y
code f
fn
class c
c


### Using dependency trees for extracting entities

ROOT is a special dependency label that is assigned to the root of the
sentence's dependency tree, which is usually the main verb. spaCy shows
syntactic relations as arcs between two tokens: one token is the syntactic
parent (called the HEAD) and the other is its dependent (called the CHILD).

In [27]:
import spacy
# NOTE(review): the same pipeline is already loaded into `nlp` in an earlier
# cell; reloading it here only makes this section runnable on its own.
nlp = spacy.load("en_core_web_md")

def reach_parent(source_token, dest_token):
    """Climb the dependency tree from ``source_token`` looking for ``dest_token``.

    The search starts at the head of ``source_token`` and follows ``.head``
    links upwards. A token that is its own head is treated as the root of
    the tree and ends the search.

    Returns:
        ``dest_token`` if it is reached on the way up, otherwise ``None``.
    """
    current = source_token.head
    while True:
        if current == dest_token:
            return current
        parent = current.head
        if parent == current:
            # Reached the root without meeting dest_token.
            return None
        current = parent

# Example sentence for the tree walk; the cells below pick out individual
# tokens by index (doc[3] is "to", doc[-2] is "Munich", doc[-1] is ".").
doc = nlp("I'm going to a conference in Munich.")

In [28]:
# Second-to-last token: the location name.
doc[-2]

Munich

In [29]:
# Fourth token ("I'm" is split into two tokens): the preposition "to".
doc[3]

to

In [30]:
# Last token: the sentence-final punctuation mark.
doc[-1]

.

In [31]:
# Following the heads upwards from "Munich" reaches "to", so "to" is returned.
reach_parent(doc[-2], doc[3])

to

In [32]:
# "to" is not among the ancestors of ".", so the walk ends at the root and the
# call returns None (which is why the cell shows no output).
reach_parent(doc[-1], doc[3])

### Using dependency relations for intent recognition

Extracting transitive verbs and their direct objects

In [33]:
# Build a camelCase intent name from every direct object and the verb that
# governs it (verb text + capitalised object text).
doc = nlp("find a flight from washington to sf")
for token in doc:
    if token.dep_ != "dobj":
        continue
    governing_verb = token.head.text
    print(f"{governing_verb}{token.text.capitalize()}")

findFlight


In [34]:
# Dependency label of every token in the utterance, one "text label" pair per line.
for tok in doc:
    print(tok.text, tok.dep_)

find ROOT
a det
flight dobj
from prep
washington pobj
to prep
sf pobj


In [35]:
from spacy import displacy
# Draw the dependency arcs of the utterance inline in the notebook.
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

Extracting multiple intents with the conjunction relation


In [36]:
# A direct object and the tokens conjoined with it ("flights and fares") each
# name something the user asks for from the same verb.
doc = nlp("show all flights and fares from denver to san francisco")
for token in doc:
    if token.dep_ == "dobj":
        dobj = token.text
        conj = [t.text for t in token.conjuncts]
        verb = token.head
        # Report inside the branch: if the parse contains no direct object
        # nothing is printed, instead of hitting a NameError or silently
        # printing values left over from an earlier cell.
        print(verb, dobj, conj)

show flights ['fares']


Recognizing the intent using wordlists

In [37]:
# Parse an utterance with a multi-word intent ("make a reservation") and draw its tree.
doc = nlp("i want to make a reservation for a flight")
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

### Semantic similarity methods for semantic parsing


In [38]:
# Hand-written word lists: each tuple groups verbs (or object nouns) that are
# presumably meant to be treated as synonyms - they are not used in the cells
# visible here, so confirm against the rest of the notebook.
verbSynsets = [("show", "list"), ("book", "make a reservation", "buy", "reserve")]
objSynsets = [("meal", "food"),("aircraft", "airplane", "plane")]

In [39]:
# First utterance to compare: parse it and draw its dependency tree.
doc = nlp("show me all aircrafts that cp uses")
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

In [40]:
# Second utterance to compare: parse it and draw its dependency tree.
doc2 = nlp("list all meals on my flight")
displacy.render(doc2, style='dep', jupyter=True, options={'distance': 90})

In [41]:
# Lemmas of the first direct object in `doc` and of the verb that governs it.
# `obj` and `verb` are only assigned when a direct object exists.
first_dobj = next((tok for tok in doc if tok.dep_ == "dobj"), None)
if first_dobj is not None:
    obj = first_dobj.lemma_
    verb = first_dobj.head.lemma_

In [42]:
# Lemmas of the first direct object in `doc2` and of the verb that governs it.
# `obj2` and `verb2` are only assigned when a direct object exists.
first_dobj2 = next((tok for tok in doc2 if tok.dep_ == "dobj"), None)
if first_dobj2 is not None:
    obj2 = first_dobj2.lemma_
    verb2 = first_dobj2.head.lemma_

In [43]:
# Lemmatised verb and direct object of the first utterance.
verb, obj

('show', 'aircraft')

In [44]:
# Lemmatised verb and direct object of the second utterance.
verb2, obj2

('list', 'meal')

### Using word vectors to recognize semantic similarity

In [45]:
# Parse both utterances again; this time the tokens themselves (not their
# lemma strings) are kept so that they can be compared with .similarity().
doc = nlp("show me all aircrafts that cp uses")
doc2 = nlp("list all meals on my flight")

In [46]:
# First direct-object token in `doc` and the verb token that governs it.
# `obj` and `verb` are only assigned when a direct object exists.
first_dobj = next((tok for tok in doc if tok.dep_ == "dobj"), None)
if first_dobj is not None:
    obj = first_dobj
    verb = first_dobj.head

In [47]:
# First direct-object token in `doc2` and the verb token that governs it.
# `obj2` and `verb2` are only assigned when a direct object exists.
first_dobj2 = next((tok for tok in doc2 if tok.dep_ == "dobj"), None)
if first_dobj2 is not None:
    obj2 = first_dobj2
    verb2 = first_dobj2.head


In [48]:
# Verb and direct-object tokens of the first utterance (surface forms, not lemmas).
verb, obj


(show, aircrafts)

In [49]:
# Verb and direct-object tokens of the second utterance (surface forms, not lemmas).
verb2, obj2

(list, meals)

In [50]:
obj.similarity(obj2)
# Similarity of the two direct objects ("aircrafts" vs. "meals"). The score is
# very low, so at this point we can deduce that the two utterances are not related.

0.15025872

In [51]:
# Similarity of the two governing verbs ("show" vs. "list").
verb.similarity(verb2)

0.33161193