# Part of Speech Tagging

### Imports:

In [1]:
import nltk
from nltk import word_tokenize
from nltk.corpus import brown

### Corpus: <br>
Tagged Sentences: pre-tagged brown corpus in sentence structure <br>
non-Tagged Sentences: non-tagged brown corpus in sentence structure <br>

In [2]:
tagged_sentences=brown.tagged_sents()
print("tagged_sentences:")
print("Total number of sentences:"+ str(len(tagged_sentences)))
print("First two sentences:"+str(tagged_sentences[:2])+"...")
print()
non_tagged_sentences=brown.sents()
print("non_tagged_sentences:")
print("Total number of sentences:"+ str(len(non_tagged_sentences)))
print("First two sentences:"+str(non_tagged_sentences[:2])+"...")

tagged_sentences:
Total number of sentences:57340
First two sentences:[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('o

## Basic Tagging Technique: <br>
### Representing Tagged Tokens
Tag by hand, The tagged token could be create using str2tuple() library in nltk package or create tuple object

In [3]:
tag_token = nltk.tag.str2tuple('test/NN')
print("Tagged Token:"+str(tag_token))

Tagged Token:('test', 'NN')


In [4]:
sentences='''
Computing/NN is/VBZ the/DT new/JJ mathematics/NNS and/CC new/JJ stethoscope/NN of/IN the/DT 21st/JJ century/NN
'''
tag_tokens=[nltk.tag.str2tuple(t) for t in sentences.split()]
print("List of Tagged Token:"+str(tag_tokens))

List of Tagged Token:[('Computing', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('new', 'JJ'), ('mathematics', 'NNS'), ('and', 'CC'), ('new', 'JJ'), ('stethoscope', 'NN'), ('of', 'IN'), ('the', 'DT'), ('21st', 'JJ'), ('century', 'NN')]


### Defualt Tagger: <br>
Using nltk package create the default tagger object, default tagger would tag all the tokens by given tag

In [5]:
default_tagger = nltk.DefaultTagger('NN')
example_text = "Default Tagger tagged everything as you given"
#tokenize the text
text_token = word_tokenize(example_text)
#tag the tokens
result= default_tagger.tag(text_token)
print(result)
#evaluate the tagger, by using the pre-tagged brown corpus
evaluation=default_tagger.evaluate(tagged_sentences)
print(evaluation)

[('Default', 'NN'), ('Tagger', 'NN'), ('tagged', 'NN'), ('everything', 'NN'), ('as', 'NN'), ('you', 'NN'), ('given', 'NN')]
0.13130472824476916


## Automatic Tagging
### NLTK POS-tagger
POS-tagger is a build in tagger in nltk package, using prebuilt tagger to tag. The tagger has trained and tested on the Wall Street Journal Corpus

In [6]:
tokens = word_tokenize("Computing is the new mathematics and new stethoscope of the 21st century")
tagTokens=nltk.pos_tag(tokens)
print(tagTokens)

[('Computing', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('new', 'JJ'), ('mathematics', 'NNS'), ('and', 'CC'), ('new', 'JJ'), ('stethoscope', 'NN'), ('of', 'IN'), ('the', 'DT'), ('21st', 'JJ'), ('century', 'NN')]


#### Perceptron Algorithm
pos_tagger use perceptron tagger to rain the data and store the tagger for user to use, the perceptron tagger use Averaged Perceptron Algorithm to form the tagger <br>

$$w(C,T)=\sum_{i=1}^{n} \alpha_{i} \phi_{i}(C,T)$$ <br>
$w(C,T)$:  transition weight for tag T incontext C <br>
$n$: number of feature <br>
$\alpha_{i}$: the weight coefficient of the $i^{th}$ feature
$\phi_{i}(C,T)$:  the evaluation of the $i^{th}$ feature for context C and tag T

### Regular Expression <br>
use regular expression to create the patterns to tag for <br>

Regular expression: <br>

| Operation|Behavior|
|:----------|:---------------------------------------|
|.| Wildcard, matches any character |
|^abc| Matches some pattern abc at the start of a string|
|abc$|Matches some pattern abc at the end of a string|
|[abc]|Matches one of a set of characters|
|[A-Z0-9]|Matches one of a range of characters|
|ed &#x7c; ing &#x7c;s|Matches one of the specified strings (disjunction)|
|*|Zero or more of previous item, e.g. a*, [a-z]*|
|+|One or more of previous item, e.g. a+, [a-z]+|
|?|Zero or one of the previous item (i.e. optional), e.g. a?, [a-z]?|
|{n}|Exactly n repeats where n is a non-negative integer|
|{n,}|At least n repeats|
|{,n}|No more than n repeats|
|{m,n}|At least m and no more than n repeats|
|a(b &#x7c; c)+|Parentheses that indicate the scope of the operators|

'r' tell the python that backslash is not a special character in the string
    

In [7]:
#define patterns
patterns = [
    (r'.*ing$', 'VBG'),
    (r'.*ed$', 'VBD'),                
    (r'.*es$', 'VBZ'),                
    (r'.*ould$', 'MD'),               
    (r'.*\'s$', 'NN$'),               
    (r'.*s$', 'NNS'),                 
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  
    (r'.*ness$', 'NN'),
    (r'.*ment$', 'NN'),
    (r'.*ful$', 'JJ'),
    (r'.*ious$', 'JJ'),
    (r'.*ble$', 'JJ'),
    (r'.*ic$', 'JJ'),
    (r'.*ive$', 'JJ'),
    (r'.*ic$', 'JJ'),
    (r'.*est$', 'JJ'),
    (r'.*', 'NN')                    
]
#create regular expession tagger using the paterns
regexp_tagger = nltk.RegexpTagger(patterns)
print(regexp_tagger.evaluate(tagged_sentences))

0.20163762754135406


### Lookup Tagger <br>
tag words with most common use POS

In [8]:
#create freqDist of words and CondFreqDist of words with tags
fd = nltk.FreqDist(brown.words())
cfd = nltk.ConditionalFreqDist(brown.tagged_words())
#find the first 100 most common words
most_freq_words = fd.most_common(100)
#create a dictionary that has frist 100 most common words as the key and the most used tag for that word as value
likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)
lookup_tagger = nltk.UnigramTagger(model=likely_tags)
print(lookup_tagger.evaluate(tagged_sentences))

0.4804614568477909


### n-gram Tagger

In [9]:
#bigram tagger
brownTagSent = brown.tagged_sents()
#train set 80%, test set 20%
size = int(len(brownTagSent)*0.8)
train = brownTagSent[:size]
test = brownTagSent[size:]
bigram = nltk.BigramTagger(train)
print(bigram.evaluate(test))

0.3390490564374869


n-gram perform poorly with unseen data

### Combine n-gram Tagger
combine the n-gram Tagger using backoff-tagger

In [10]:
default_tagger = nltk.DefaultTagger('NN')
uni_tagger = nltk.UnigramTagger(train, backoff=default_tagger)
bi_tagger = nltk.BigramTagger(train, backoff=uni_tagger)
print(bi_tagger.evaluate(test))

0.9068555627774779


the reslut improve significantly after combine the n-gram taggers

### Feature Extraction and DecisionTreeClassifier Part of Speech Tagger
Extracing features to create decision tree classifier to train the tagger

In [11]:
suffix_fdist = nltk.FreqDist()
#extract suffix
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
#list of common suffiexes
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(200)]

#part of speech features extract function
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features

tagged_words = brown.tagged_words(categories='news')
#create list of features of the words that match with tags
featuresets = [(pos_features(word),tag) for (word,tag) in tagged_words]
# train set 80%, test set 20%
size = int(len(featuresets)*0.2)
train_set=featuresets[size:]
test_set=featuresets[:size]
print('training')
classifier = nltk.DecisionTreeClassifier.train(train_set)
print(nltk.classify.accuracy(classifier,test_set))

training
0.6747886623570363


NLTK

@inproceedings{Loper:2002:NNL:1118108.1118117,
 author = {Loper, Edward and Bird, Steven},
 title = {NLTK: The Natural Language Toolkit},
 booktitle = {Proceedings of the ACL-02 Workshop on Effective Tools and Methodologies for Teaching Natural Language Processing    and Computational Linguistics - Volume 1},
 series = {ETMTNLP '02},
 year = {2002},
 location = {Philadelphia, Pennsylvania},
 pages = {63--70},
 numpages = {8},
 url = {https://doi.org/10.3115/1118108.1118117},
 doi = {10.3115/1118108.1118117},
 acmid = {1118117},
 publisher = {Association for Computational Linguistics},
 address = {Stroudsburg, PA, USA},
}

Perceptron Algorithm

@inproceedings{hajivc2009semi, <br>
  title={Semi-supervised training for the averaged perceptron POS tagger},
  author={Haji{\v{c}}, Jan and Raab, Jan and Spousta, Miroslav and others},
  booktitle={Proceedings of the 12th Conference of the European Chapter of the Association for Computational Linguistics},
  pages={763--771},
  year={2009},
  organization={Association for Computational Linguistics}
  abstract= {This paper describes POS tagging experiments with semi-supervised training as an extension to the (supervised) averaged perceptron algorithm, ﬁrst introduced for this task by (Collins, 2002). Experiments withaniterativetrainingonstandard-sized supervised (manually annotated) dataset (106 tokens) combined with a relatively modest (in the order of 108 tokens) unsupervised (plain) data in a bagging-like fashion showed signiﬁcant improvement of the POS classiﬁcation task on typologicallydifferentlanguages,yieldingbetter than state-of-the-art results for English and Czech (4.12 % and 4.86 % relative error reduction, respectively; absolute accuracies being 97.44 % and 95.89 %).
}
}

n-gram
@article{Brown:1992:CNG:176313.176316,
 author = {Brown, Peter F. and deSouza, Peter V. and Mercer, Robert L. and Pietra, Vincent J. Della and Lai, Jenifer C.},
 title = {Class-based N-gram Models of Natural Language},
 journal = {Comput. Linguist.},
 issue_date = {December 1992},
 volume = {18},
 number = {4},
 month = dec,
 year = {1992},
 issn = {0891-2017},
 pages = {467--479},
 numpages = {13},
 url = {http://dl.acm.org/citation.cfm?id=176313.176316},
 acmid = {176316},
 publisher = {MIT Press},
 address = {Cambridge, MA, USA},
 abstract={We address the problem of predicting a word from previous words in a sample of text. In particular, we discuss n-gram models based on classes of words. We also discuss several statistical algorithms for assigning words to classes based on the frequency of their co-occurrence with other words. We find that we are able to extract classes that have the flavor of either syntactically based groupings or semantically based groupings, depending on the nature of the underlying statistics.}
}

@inproceedings{schmid2013probabilistic,
  title={Probabilistic part-ofispeech tagging using decision trees},
  author={Schmid, Helmut},
  booktitle={New methods in language processing},
  pages={154},
  year={2013}
}