Data analysis

Import basic libraries

In [2]:
import numpy as np
import pandas as pd
import nltk, pprint
import matplotlib.pyplot as plt
import random

gzip for reading the gz files, and pickle to save/dump trained model

In [3]:
import gzip, os, pickle
import _pickle as cPickle

services from sklearn

In [7]:
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

supress warnings

In [8]:
import warnings
warnings.filterwarnings('ignore')

Dataset is divided into 5 folds. Reading the first fold of them.

In [9]:
filename = '/content/atis.fold0.pkl'
f = open(filename,'rb')
try:
  train_set,valid_set,test_set,dicts = pickle.load(f,encoding='latin1')
except:
  train_set,valid_set,test_set,dicts = pickle.load(f)
finally:
  f.close()

Train, validation, and test sets are all tuples of length 3. Each tuple has 3 lists of same length

In [None]:
for i in train_set:
  print(type(i), len(i))
for i in valid_set:
  print(type(i), len(i))
for i in test_set:
  print(type(i), len(i))

Storing the three elements of the tuple in 3 objects

In [12]:
train_text,_,train_label = train_set
valid_text,_,valid_label = valid_set
test_text,_,test_label = test_set

In [13]:
words = dicts['words2idx']
labels = dicts['labels2idx']

In [None]:
random.sample(words.items(),10)

Mapping the numeric values v in a query with the k,v in the dict

In [15]:
[k for i in train_text[0] for k,v in words.items() if v==i]

['what',
 'flights',
 'leave',
 'atlanta',
 'at',
 'about',
 'DIGIT',
 'in',
 'the',
 'afternoon',
 'and',
 'arrive',
 'in',
 'san',
 'francisco']

Looking at first 10 queries

In [None]:
sents = []
for i in range(10):
  sents.append(' '.join([k for i in train_text[i] for k,v in words.items() if v==i]))
sents

In [None]:
random.sample(labels.items(),10)

Reversing the dicts for easy lookup

In [17]:
idx_to_words = {words[k]:k for k in words}
idx_to_labels = {labels[k]:k for k in labels}

A function to take an index and return a list of corresponding queries with labels

In [18]:
def print_queries(index):
  w = [idx_to_words[id] for id in train_text[index]]
  l = [idx_to_labels[id] for id in train_label[index]]
  return list(zip(w,l))

In [None]:
i=random.randrange(len(train_text))
print_queries(i)

Data preprocessig

POS tagging

In [20]:
def pos_tag(sent_list):
  pos_tags = []
  for sent in sent_list:
    tagged_words = nltk.pos_tag([idx_to_words[i] for i in sent])
    pos_tags.append(tagged_words)
  return pos_tags

In [21]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [22]:
train_pos = pos_tag(train_text)
valid_pos = pos_tag(valid_text)
test_pos = pos_tag(test_text)

In [23]:
i = random.randrange(len(train_pos))
train_pos[i]

[('show', 'VB'),
 ('me', 'PRP'),
 ('all', 'DT'),
 ('flights', 'NNS'),
 ('from', 'IN'),
 ('boston', 'NN'),
 ('to', 'TO'),
 ('detroit', 'VB')]

Preparing the training data

In [24]:
def create_word_pos_label(pos_tagged_data, labels):
    iob_labels = []
    for sent in list(zip(pos_tagged_data, labels)):
        pos = sent[0]
        labels = sent[1]
        zipped_list = list(zip(pos, labels))
        tuple_3 = [(word_pos_tuple[0], word_pos_tuple[1], idx_to_labels[label])
                   for word_pos_tuple, label in zipped_list]
        iob_labels.append(tuple_3)
    return iob_labels

In [25]:
train_data = create_word_pos_label(train_pos,train_label)
valid_data = create_word_pos_label(valid_pos,valid_label)
test_data = create_word_pos_label(test_pos,test_label)

A gazetteer to lookup for US cities, states, and counties

In [None]:
us_cities = pd.read_csv("/content/US City, State, and County.txt", sep="|")
us_cities.head()

In [27]:
cities = set(us_cities['City'].str.lower())
states = set(us_cities['State full'].str.lower())
counties = set(us_cities['County'].str.lower())

In [28]:
def gazetteer_lookup(word):
  return (word in cities, word in states, word in counties)

In [29]:
gazetteer_lookup('south dakota')
gazetteer_lookup('michigan')

(True, True, False)

Feature extraction for words in a sentence

In [30]:
def extract_features(sentence, i):
    word, pos, _ = sentence[i]

    # first word
    if i==0:
        prevword = '<START>'
        prevpos = '<START>'
    else:
        prevword = sentence[i-1][0]
        prevpos = sentence[i-1][1]

    # last word
    if i == len(sentence)-1:
        nextword = '<END>'
        nextpos = '<END>'
    else:
        nextword = sentence[i+1][0]
        nextpos = sentence[i+1][1]

    gazetteer = gazetteer_lookup(word)

    # prefixes and suffixes
    pref_1, pref_2, pref_3, pref_4 = word[:1], word[:2], word[:3], word[:4]
    suff_1, suff_2, suff_3, suff_4 = word[-1:], word[-2:], word[-3:], word[-4:]
    return {"pos": pos, "prevpos": prevpos,
           'prevword':prevword,
           'nextpos': nextpos,
           'nextword': nextword,
           'word':word,
           'word_is_city': gazetteer[0],
           'word_is_state': gazetteer[1],
           'word_is_county': gazetteer[2],
           'word_is_digit': word in 'DIGITDIGITDIGIT',
           'pref_1': pref_1,
           'pref_2': pref_2,
           'pref_3': pref_3,
           'pref_4': pref_4,
           'suff_1': suff_1,
           'suff_2': suff_2,
           'suff_3': suff_3,
           'suff_4': suff_4}

Building Conditional Random Field(CRF) classifier

In [31]:
!pip install sklearn_crfsuite
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.5/993.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.9 sklearn_crfsuite-0.3.6


Writing functions to convert the training, validation, and test datasets to the formaat required by sklearn CRF classifier

In [32]:
def sent2features(sent):
    return [extract_features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

Converting the training data into a standard format - X_train and Y_train

In [33]:
X_train = [sent2features(s) for s in train_data]
y_train = [sent2labels(s) for s in train_data]

X_valid = [sent2features(s) for s in valid_data]
y_valid = [sent2labels(s) for s in valid_data]

X_test = [sent2features(s) for s in test_data]
y_test = [sent2labels(s) for s in test_data]

Fitting CRF with arbitrary hyperparameters

In [34]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass

Remove 'O' from the labels

In [None]:
labels = list(crf.classes_)
labels.remove('O')
labels[:5]

Making predictions on validation data and evaluating model performance

In [56]:
y_pred = crf.predict(X_valid)
metrics.flat_f1_score(y_valid,y_pred,average='weighted',labels=labels)

0.9372302908496231