# <center>Book: Steven Bird, Ewan Klein, Edward Loper, 2009. **Natural Language Processing (NLP) with Python**, O'Reilly.</center> 

### Part-of-Speech Tagging

In [1]:
import nltk
from nltk.corpus import brown
# Count how often each 1-, 2- and 3-character ending occurs across all
# lower-cased tokens of the Brown corpus. Tokens shorter than the slice width
# contribute their whole text again (e.g. a one-letter token is counted 3 times).
suffix_fdist = nltk.FreqDist()
for token in brown.words():
    lowered = token.lower()
    for width in (1, 2, 3):
        suffix_fdist[lowered[-width:]] += 1

In [2]:
# Keep only the suffix strings of the 100 most frequent endings (counts dropped).
top_suffix_counts = suffix_fdist.most_common(100)
common_suffixes = [ending for ending, _ in top_suffix_counts]
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [3]:
def pos_features(word):
    """Build suffix-indicator features for a single word.

    Args:
        word: The token (a string) to describe.

    Returns:
        A dict with one entry per suffix in the notebook-level
        ``common_suffixes`` list. Each key has the form ``'endwsith(<suffix>)'``
        and each value is True when the lower-cased word ends with that suffix.

    Note:
        The key spelling ``endwsith`` is kept unchanged on purpose: it is the
        feature name shown in the decision-tree pseudocode printed further down.
    """
    # Lower-case once instead of once per suffix; the result is loop-invariant.
    lowered = word.lower()
    return {
        'endwsith({})'.format(suffix): lowered.endswith(suffix)
        for suffix in common_suffixes
    }

In [4]:
# Turn every tagged word of the 'news' category into a (features, tag) pair,
# hold out the first 10% of the pairs for testing, train a decision tree on the
# remainder, and report accuracy on the held-out part.
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(token), tag) for token, tag in tagged_words]

size = int(0.1 * len(featuresets))
test_set = featuresets[:size]
train_set = featuresets[size:]

classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.6270512182993535

In [5]:
# Tag one example word with the trained suffix-based classifier.
cats_features = pos_features('cats')
classifier.classify(cats_features)

'NNS'

In [6]:
# Print the learned tree as nested if-statements, limited to depth 6.
tree_pseudocode = classifier.pseudocode(depth=6)
print(tree_pseudocode)

if endwsith(the) == False: 
  if endwsith(,) == False: 
    if endwsith(s) == False: 
      if endwsith(.) == False: 
        if endwsith(of) == False: 
          if endwsith(and) == False: return '.'
          if endwsith(and) == True: return 'CC'
        if endwsith(of) == True: return 'IN'
      if endwsith(.) == True: return '.'
    if endwsith(s) == True: 
      if endwsith(is) == False: 
        if endwsith(was) == False: 
          if endwsith(as) == False: return 'PP$'
          if endwsith(as) == True: return 'CS'
        if endwsith(was) == True: return 'BEDZ'
      if endwsith(is) == True: 
        if endwsith(his) == False: return 'BEZ'
        if endwsith(his) == True: return 'PP$'
  if endwsith(,) == True: return ','
if endwsith(the) == True: return 'AT'



### Exploiting Context

In [7]:
def pos_features(sentence, i):
    """Describe the word at position ``i`` of ``sentence`` with context features.

    Args:
        sentence: An indexable sequence of word strings.
        i: Index of the word to describe.

    Returns:
        A dict with the word's last one, two and three characters under
        ``"suffix(1)"``, ``"suffix(2)"`` and ``"suffix(3)"``, plus
        ``"prev-word"``: the preceding word, or ``"<START>"`` when ``i`` is 0.
    """
    current = sentence[i]
    # The first word of a sentence has no predecessor; use a marker instead.
    previous = "<START>" if i == 0 else sentence[i - 1]
    return {
        "suffix(1)": current[-1:],
        "suffix(2)": current[-2:],
        "suffix(3)": current[-3:],
        "prev-word": previous,
    }

In [8]:
# Show the context features for the word at index 8 of the first Brown sentence.
first_sentence = brown.sents()[0]
pos_features(first_sentence, 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [9]:
# Build one (features, tag) pair per word of every 'news' sentence. Features are
# computed from the untagged sentence so that the previous word is available.
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    featuresets.extend(
        (pos_features(untagged_sent, position), tag)
        for position, (_, tag) in enumerate(tagged_sent)
    )

# Hold out the first 10% of the pairs for testing and train Naive Bayes on the rest.
size = int(0.1 * len(featuresets))
test_set = featuresets[:size]
train_set = featuresets[size:]

classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678