# POS Tagging

In [1]:
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os
import time
import tqdm
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
df = pd.read_json('data.json')

In [3]:
df

Unnamed: 0,sentence,tags
0,how long do you have to pay back debt after cl...,"[WH, ADV, AUX, PART, VERB, TO, VERB, PART, NOU..."
1,what shakespearean play featured shylock,"[WH, ADJ, NOUN, VERB, NOUN]"
2,which magazine is fine entertainment for men,"[WH, NOUN, AUX, ADJ, NOUN, PREP, NOUN]"
3,what features of the african elephant are larg...,"[WH, VERB, PREP, DT, ADJ, NOUN, AUX, ADJ, PREP..."
4,what ocean surrounds the maldive islands,"[WH, NOUN, VERB, DT, ADJ, NOUN]"
...,...,...
954,what english word has the most letters,"[WH, ADJ, NOUN, VERB, DT, ADJ, NOUN]"
955,what film ends with the line this is mrs. norm...,"[WH, NOUN, VERB, PREP, DT, NOUN, DT, AUX, ADJ,..."
956,who asked the musical question have you ever b...,"[WH, VERB, DT, ADJ, NOUN, VERB, PRON, VERB, VE..."
957,who wrote poems are made by fools like me but ...,"[WH, VERB, NOUN, AUX, VERB, PREP, NOUN, PREP, ..."


## Making Training and Test Set

In [4]:
train = df[0:int(.90*len(df))]
test = df[int(.90*len(df)):]

In [5]:
train['sentence'][0], train['tags'][0]

('how long do you have to pay back debt after claiming chapter 11 bankruptcy',
 ['WH',
  'ADV',
  'AUX',
  'PART',
  'VERB',
  'TO',
  'VERB',
  'PART',
  'NOUN',
  'PREP',
  'VERB',
  'NOUN',
  'NUMB',
  'NOUN'])

## Word Tag Pair / Sentence

In [6]:
train_ds = []
for i in range(len(train)):
    sentence = []
    for j in range(len(train['sentence'][i].split(' '))):
        d = tuple([train['sentence'][i].split(' ')[j], train['tags'][i][j]])
        sentence.append(d)
    train_ds.append(sentence)

test_ds = []
for i in range(len(test)):
    sentence = []
    for j in range(len(test['sentence'][i + len(train)].split(' '))):
        d = tuple([test['sentence'][i + len(train)].split(' ')[j], test['tags'][i + len(train)][j]])
        sentence.append(d)
    test_ds.append(sentence)

In [7]:
train_ds[0], test_ds[0]

([('how', 'WH'),
  ('long', 'ADV'),
  ('do', 'AUX'),
  ('you', 'PART'),
  ('have', 'VERB'),
  ('to', 'TO'),
  ('pay', 'VERB'),
  ('back', 'PART'),
  ('debt', 'NOUN'),
  ('after', 'PREP'),
  ('claiming', 'VERB'),
  ('chapter', 'NOUN'),
  ('11', 'NUMB'),
  ('bankruptcy', 'NOUN')],
 [('how', 'WH'),
  ('many', 'ADJ'),
  ('varieties', 'NOUN'),
  ('of', 'PREP'),
  ('apple', 'NOUN'),
  ('are', 'AUX'),
  ('there', 'ADV')])

## Define Features

In [8]:
def word2features(sent, i, m):
    word = sent[i][0]
    if m == 2: #with prefix and suffix
        features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[+2:]': word[2:],
        'word[+3:]': word[3:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    else:
        features = { #model1 without prefix and suffixx
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent, m):
    return [word2features(sent, i, m) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

## CRF

In [9]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [10]:
model1 = sklearn_crfsuite.CRF(

    algorithm = 'l2sgd',
    c2 = 0.1,
    max_iterations = 100,
    all_possible_transitions = True
)

model2 = sklearn_crfsuite.CRF(

    algorithm = 'l2sgd',
    c2 = 0.1,
    max_iterations = 100,
    all_possible_transitions = True
)

In [11]:
%%time
X_train_m1 = [sent2features(s, m = 1) for s in train_ds]
X_train_m2 = [sent2features(s, m = 2) for s in train_ds]
y_train = [sent2labels(s) for s in train_ds]

X_test_m1 = [sent2features(s, 1) for s in test_ds]
X_test_m2 = [sent2features(s, 2) for s in test_ds]
y_test = [sent2labels(s) for s in test_ds]

CPU times: user 292 ms, sys: 28.4 ms, total: 320 ms
Wall time: 318 ms


In [12]:
model1.fit(X_train_m1, y_train)
model2.fit(X_train_m2, y_train)



CRF(algorithm='l2sgd', all_possible_transitions=True, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [13]:
labels1 = list(model1.classes_)
labels2 = list(model2.classes_)

In [14]:
labels1, labels2

(['WH',
  'ADV',
  'AUX',
  'PART',
  'VERB',
  'TO',
  'NOUN',
  'PREP',
  'NUMB',
  'ADJ',
  'DT',
  'CONJ',
  'MOD',
  'PRON'],
 ['WH',
  'ADV',
  'AUX',
  'PART',
  'VERB',
  'TO',
  'NOUN',
  'PREP',
  'NUMB',
  'ADJ',
  'DT',
  'CONJ',
  'MOD',
  'PRON'])

In [15]:
y_pred1 = model1.predict(X_test_m1)
f1_1 = metrics.flat_f1_score(y_test, y_pred1,
                      average='weighted', labels=labels1)

y_pred2 = model2.predict(X_test_m2)
f1_2 = metrics.flat_f1_score(y_test, y_pred2,
                      average='weighted', labels=labels2)
print(f1_1, f1_2)

0.8658220138927282 0.9013922444431405


## Performance Matrix Model 1

In [16]:
sorted_labels1 = sorted(
    labels1,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred1, labels=sorted_labels1, digits=3
))

              precision    recall  f1-score   support

        PART      0.667     0.222     0.333         9
         ADJ      0.662     0.608     0.634        74
         ADV      0.333     0.333     0.333         3
        VERB      0.775     0.697     0.734        89
          WH      0.990     0.970     0.980       100
          TO      1.000     1.000     1.000        13
         MOD      0.800     0.571     0.667         7
        CONJ      1.000     0.818     0.900        11
        NOUN      0.792     0.900     0.843       250
        PREP      0.985     0.956     0.970        68
        PRON      0.917     0.846     0.880        13
          DT      1.000     0.967     0.983        90
        NUMB      1.000     0.889     0.941         9
         AUX      1.000     1.000     1.000        78

    accuracy                          0.869       814
   macro avg      0.851     0.770     0.800       814
weighted avg      0.869     0.869     0.866       814



## Performance Matrix Model 2

In [17]:
sorted_labels2 = sorted(
    labels2,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred2, labels=sorted_labels2, digits=3
))

              precision    recall  f1-score   support

        PART      0.333     0.111     0.167         9
         ADJ      0.784     0.784     0.784        74
         ADV      0.143     0.333     0.200         3
        VERB      0.842     0.719     0.776        89
          WH      0.990     0.980     0.985       100
          TO      1.000     1.000     1.000        13
         MOD      0.833     0.714     0.769         7
        CONJ      1.000     1.000     1.000        11
        NOUN      0.869     0.932     0.900       250
        PREP      0.985     0.971     0.978        68
        PRON      0.688     0.846     0.759        13
          DT      1.000     0.978     0.989        90
        NUMB      1.000     0.889     0.941         9
         AUX      1.000     1.000     1.000        78

    accuracy                          0.903       814
   macro avg      0.819     0.804     0.803       814
weighted avg      0.904     0.903     0.901       814

