# Part of Speech Tagging

### Imports:

In [2]:
import nltk
from nltk import word_tokenize
from nltk.corpus import brown

## Basic Tagging Technique: <br>
### Representing Tagged Tokens
A tagged token can be created with the `str2tuple()` function in the nltk package, or by constructing a `(word, tag)` tuple directly.

In [16]:
# Parse a single "word/TAG" string into a (word, tag) tuple.
tag_token = nltk.tag.str2tuple('test/NN')
print("Tagged Token:", tag_token, sep="")

Tagged Token:('test', 'NN')


In [17]:
# A hand-annotated sentence in "word/TAG" form. The tokens are separated by
# whitespace, so the newlines wrapping the text are discarded by split().
sentences = '''
Computing/NN is/VBZ the/DT new/JJ mathematics/NNS and/CC new/JJ stethoscope/NN of/IN the/DT 21st/JJ century/NN
'''
# Convert every "word/TAG" chunk into its (word, tag) tuple.
tag_tokens = list(map(nltk.tag.str2tuple, sentences.split()))
print("List of Tagged Token:", tag_tokens, sep="")

List of Tagged Token:[('Computing', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('new', 'JJ'), ('mathematics', 'NNS'), ('and', 'CC'), ('new', 'JJ'), ('stethoscope', 'NN'), ('of', 'IN'), ('the', 'DT'), ('21st', 'JJ'), ('century', 'NN')]


#### Default Tagger: <br>
Use the nltk package to create a default tagger object, which assigns the same tag to every token.

In [18]:
# Tokenize the demo sentence first, then run it through a tagger that
# labels every token with the one tag it was constructed with ('NN').
example_text = "Default Tagger tagged everything as you given"
text_token = word_tokenize(example_text)

default_tagger = nltk.DefaultTagger('NN')
result = default_tagger.tag(text_token)
print(result)

[('Default', 'NN'), ('Tagger', 'NN'), ('tagged', 'NN'), ('everything', 'NN'), ('as', 'NN'), ('you', 'NN'), ('given', 'NN')]


## Automatic Tagging
### NLTK POS-tagger
`pos_tag` is the tagger built into the nltk package; it uses a pre-trained model to tag a list of tokens.

In [19]:
# Split the raw sentence into tokens, then tag them with NLTK's
# built-in part-of-speech tagger.
tokens = word_tokenize(
    "Computing is the new mathematics and new stethoscope of the 21st century"
)
tagTokens = nltk.pos_tag(tokens)
print(tagTokens)

[('Computing', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('new', 'JJ'), ('mathematics', 'NNS'), ('and', 'CC'), ('new', 'JJ'), ('stethoscope', 'NN'), ('of', 'IN'), ('the', 'DT'), ('21st', 'JJ'), ('century', 'NN')]


Load the tagged and the untagged sentences from the Brown corpus.

In [23]:
# Brown corpus sentences: once with gold-standard (word, tag) pairs and
# once as plain word lists.
tagged_sentences, non_tagged_sentences = brown.tagged_sents(), brown.sents()

### Tagger Evaluation
NLTK taggers provide a built-in `evaluate` function that scores a tagger against gold-standard tagged sentences.

In [25]:
# Score the always-'NN' default tagger against the gold-standard Brown tags.
# The original cell referenced `brown_tag_sent`, which is never defined in
# this notebook (NameError on a fresh kernel); `tagged_sentences` is the
# tagged Brown corpus loaded in the preceding cell.
evaluation = default_tagger.evaluate(tagged_sentences)
print(evaluation)

0.13130472824476916


### Regular Expression <br>
Use regular expressions to define patterns that map word forms to tags; the first matching pattern determines the tag.

In [30]:
# Ordered (regex, tag) rules: more specific suffixes are listed before the
# catch-all '.*' rule at the end.
patterns = [
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # simple past
    (r'.*es$', 'VBZ'),                # 3rd singular present
    (r'.*ould$', 'MD'),               # modals
    (r'.*\'s$', 'NN$'),               # possessive nouns
    (r'.*s$', 'NNS'),                 # plural nouns
    # NOTE(review): the '.' below is unescaped, so it matches any single
    # separator character ("3.14", "1,000", "1/2"), not only a decimal point.
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                     # nouns (default)
]
regexp_tagger = nltk.RegexpTagger(patterns)

# Tag one untagged Brown sentence as a quick example.
brown_reg_tagged = regexp_tagger.tag(non_tagged_sentences[3])

# The original cell evaluated against `brown_tag_sent`, which is never defined
# in this notebook; `tagged_sentences` is the tagged Brown corpus loaded above.
print(regexp_tagger.evaluate(tagged_sentences))

0.19537768086586887


### Lookup Tagger <br>
Tag each of the most frequent words with its most common part-of-speech tag.

In [31]:
# Word frequencies, and per-word tag frequencies, over the Brown corpus.
fd = nltk.FreqDist(brown.words())
cfd = nltk.ConditionalFreqDist(brown.tagged_words())

# Build a lookup table: each of the 100 most frequent words -> its most
# frequent tag.
most_freq_words = fd.most_common(100)
likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)

# A unigram tagger backed only by the lookup table.
baseline_tagger = nltk.UnigramTagger(model=likely_tags)

# The original cell evaluated against `brown_tag_sent`, which is never defined
# in this notebook; `tagged_sentences` is the tagged Brown corpus loaded above.
print(baseline_tagger.evaluate(tagged_sentences))

0.4804614568477909


## Machine Learning Part of Speech Tagging <br>
### n-gram Tagger

In [43]:
# Work on the 'news' category only: tagged sentences for training and
# scoring, plain sentences for the tagging example.
brownTagSent = brown.tagged_sents(categories='news')
brownSent = brown.sents(categories='news')

# First 90% of the sentences train the tagger, the remaining 10% test it.
size = int(len(brownTagSent) * 0.9)
train, test = brownTagSent[:size], brownTagSent[size:]

bigram = nltk.BigramTagger(train)
# Tag a single sentence; the returned tags are not used further in this cell.
bigram.tag(brownSent[2007])
print(bigram.evaluate(test))

0.10206319146815508


The bigram tagger on its own performs poorly on unseen data.

### Combine n-gram Tagger
Combine the n-gram taggers by chaining them with the `backoff` argument, so each tagger falls back to a simpler one.

In [44]:
# Backoff chain: bigram -> unigram -> constant 'NN'.
# Each tagger is given the next, simpler one as its backoff.
tagger0 = nltk.DefaultTagger('NN')
tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
tagger2 = nltk.BigramTagger(train, backoff=tagger1)

print(tagger2.evaluate(test))

0.8452108043456593


The result improves dramatically after combining the n-gram taggers.

### Feature Extraction and DecisionTreeClassifier Part of Speech Tagger
Extract features from each word and use them to train a decision tree classifier that acts as the tagger.

In [47]:
# Count the last 1, 2 and 3 characters of every lower-cased Brown word.
# For words shorter than the slice width the slices coincide, so the same
# key is incremented more than once for that word.
suffix_fdist = nltk.FreqDist()
for word in map(str.lower, brown.words()):
    suffix_fdist.update(word[-width:] for width in (1, 2, 3))

# Keep just the suffix strings of the 100 most frequent entries.
common_suffixes = [suffix for suffix, _ in suffix_fdist.most_common(100)]
# Part-of-speech feature extraction function
def pos_features(word, suffixes=None):
    """Build the suffix-based feature dictionary for a single word.

    Args:
        word: The word to describe; it is lower-cased before matching.
        suffixes: Optional iterable of suffix strings to test. When omitted,
            the module-level ``common_suffixes`` list is used.

    Returns:
        A dict mapping ``'endswith(<suffix>)'`` to a bool telling whether the
        lower-cased word ends with that suffix.
    """
    if suffixes is None:
        suffixes = common_suffixes
    # Lower-case once instead of once per suffix.
    lowered = word.lower()
    return {
        'endswith({})'.format(suffix): lowered.endswith(suffix)
        for suffix in suffixes
    }

# Turn every (word, tag) pair of the 'news' category into a
# (feature dict, tag) training example.
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for n, g in tagged_words]

# The first 10% of the examples are held out for testing; the rest train.
size = int(len(featuresets) * 0.1)
test_set = featuresets[:size]
train_set = featuresets[size:]

print('trainning')
classifier = nltk.DecisionTreeClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

# Final expression: displayed as the cell's output.
classifier.classify(pos_features('cat'))

trainning
0.6270512182993535


'IN'

References: <br>
    [NLTK online textbook, chapter 5: Categorizing and Tagging Words](https://www.nltk.org/book/ch05.html) <br>
    [NLTK online textbook, chapter 6: Learning to Classify Text](https://www.nltk.org/book/ch06.html) <br>
More information: <br>
    [All tags in NLTK for English](https://stackoverflow.com/questions/1833252/java-stanford-nlp-part-of-speech-labels?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa)
    