### From corpora: download "Brown", from models download "tagsets", "averaged_perceptron_tagger"

In [None]:
# import nltk

# nltk.download() # corpora, brown, ptb, models tagsets

### Using built in pos_tag function

In [1]:
from nltk import pos_tag,word_tokenize, help
# 1)a  --> Must tokenize string first

print(pos_tag(word_tokenize('The boy ate the delicious cake')))

# 1)b
help.brown_tagset('DT') # Gets the description of a tag and examples

[('The', 'DT'), ('boy', 'NN'), ('ate', 'VB'), ('the', 'DT'), ('delicious', 'JJ'), ('cake', 'NN')]
DT: determiner/pronoun, singular
    this each another that 'nother


### Tag first 10 files in simple-wki and get the Nouns

In [2]:
import os
from nltk import FreqDist
from collections import defaultdict

tagged = []

for root, dirs, files in os.walk('C:/Users/hadeel.mostafa/Desktop/my nlp/NLP w18/Labs/datasets/simple-wiki/single-docs'):
    for file in files[:10]:
        with open(os.path.join(root,file)) as f:
            tags = pos_tag(word_tokenize(f.read()))
            tagged+=tags # Build a list of pairs of all words and their tags in first 10 files

print(set([word for (word, tag) in tagged if tag[:2] == 'NN'])) # Print all words whose tags start with NN

{'religion', 'activity', 'arrow', 'brothers', 'Ages', 'boat', 'hooks', 'American', 'conservationÂ', 'Carolina', 'forms', 'word', 'Union', 'leaders', 'Thomas', 'street', 'representatives', 'States', 'world', 'end', 'statistics', 'Lincoln', 'km', 'characters', 'motors', 'Tale', 'Ricky', 'reason', 'Giovanni', 'Sumter', 'Sunfish', 'Citybus', 'months', 'nets', 'princes', 'forts', 'mammals', 'start', 'victims', 'contests', 'Confederacy', 'samples', 'disease', 'birthstone', 'Spain', 'Americans', 'Protestant', 'Arkansas', 'travels', 'Drift', 'actions', 'War', 'tuna', 'laws', 'Van', 'kinds', 'flea', 'ways', 'wires', 'includeÂ', 'spin', 'net', 'Treaty', '%', 'cost', 'half', 'frogsÂ', 'worldwide', 'Flanimals', 'Aquarium', 'others', 'Catholics', 'company', 'engine', 'US', 'river', 'Muskellunge', 'spots', 'country', 'month', 'andÂ', 'bait', 'engines', 'whales', 'winter', 'U.S.A.', 'gun', 'Lucie', 'Trout', 'seals', 'study', 'Middle', 'Sea', 'Steen', 'control', 'transport', 'Civil', 'Lower', 'Darnay'

### Using a default dict, no need to initialize key

In [3]:
test_dict = {}
# test_dict['plays']+=1 # This gives an error when uncommented, key has not been intialized

if 'plays' not in test_dict: # Using normal dict, must initalize key
    test_dict['plays'] = 0
    
#######################################

from collections import defaultdict

test_dict = defaultdict(int)
test_dict['plays']+=1

print(test_dict['plays'])
print(test_dict['eats'])

1
0


### Dictionary of FreqDist objects, to count the parts of speech assigned to every word

In [4]:
from nltk import FreqDist
tags = defaultdict(lambda: FreqDist())

tags['play']['NN']+=3
tags['play']['VB']+=10
tags['play']['DT']+=1

print(tags['play'].most_common(2)) # Gets the 2 most common elements
print(tags['play'].most_common(2)[0]) # Gets the first most common element, pair of format (tag,count)
print(tags['play'].most_common(2)[0][0]) # Gets the tag of the first most common element

[('VB', 10), ('NN', 3)]
('VB', 10)
VB


&nbsp;
## Exercise 5-3   &nbsp;&nbsp; (Unigram Tagger Implementation)
### Loading the brown corpus

In [5]:
from nltk.corpus import brown

# 3)a
all_tagged = brown.tagged_sents()
print(all_tagged[0:2]) # The first two sentences in the tagged corpus: List of a list of (word, tag) pairs
print('Number of sents in brown corpus', len(all_tagged))
print('Number of tokens in brown corpus', sum([len(sent) for sent in all_tagged]))

# 3)b
train = all_tagged[:50000] # First 50000 sentences for training
test = all_tagged[50000:] # Rest for testing


[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

### Building the unigram dictionary

In [6]:
# Example format for dictionary named "words"
#{'play': {'VB':200, 'NN':100}, eat:{'VB':100}}

# 3)c
words = defaultdict(lambda: FreqDist())

for sent in train: # Loop over all training sents
    for (word, pos) in sent: # Loop over all words and their true parts of speech in each setence
        words[word][pos]+=1 # This word has been tagged by this pos once

# Dictionary of words (keys) and their most repeated part of speech (values)
# 3) d
uni = {}

for word in words:
    uni[word] = words[word].most_common(1)[0][0]
    
print(list(uni.items())[:10])

[('uproar', 'NN'), ('borrowed', 'VBN'), ('potentiometer', 'NN'), ('Stetson', 'NP'), ('savory', 'JJ'), ('convalescence', 'NN'), ('barn', 'NN'), ("'till", 'IN'), ('demoralized', 'VBN'), ('Neglected', 'VBN-TL')]


### Compute the accuracy over the test set

In [7]:
correct = 0
total = 0
for sent in test:
    for (word, pos) in sent:
        # If word in test sentence is present in unigram dictionary
        # AND if the predicted part of speech (uni[word]) is equal to the true part of speech
        # Increment the number of correct predictions by 1
        if word in uni and uni[word] == pos:
            correct +=1
        total+=1 # Count the total number of words in the test set
        
print('Unigram model accuracy:', correct/total)

Unigram model accuracy: 0.8831634672471799


&nbsp;
## Exercise 5-4   &nbsp;&nbsp; (NLTK Taggers)
### Build a DeafultTagger that predicts all parts of speech as nouns

In [8]:
from nltk import DefaultTagger

default_tagger = DefaultTagger('NN')
print(default_tagger.evaluate(test))

print(default_tagger.tag(word_tokenize('The boy ate the apple')))

0.1091925588759153
[('The', 'NN'), ('boy', 'NN'), ('ate', 'NN'), ('the', 'NN'), ('apple', 'NN')]


### Unigram Tagger

In [9]:
from nltk import UnigramTagger

unigram_tagger = UnigramTagger(train)

print(unigram_tagger.evaluate(test))

print(unigram_tagger.tag(word_tokenize('He watched the play'))) # Tagged play as verb
print(unigram_tagger.tag(word_tokenize('The kids play in the garden')))
print(unigram_tagger.tag(word_tokenize('I saw a green spider')))
print(unigram_tagger.tag(word_tokenize('Salah scored the goal'))) # Can't find salah in train sents, tagged as none

0.8831634672471799
[('He', 'PPS'), ('watched', 'VBD'), ('the', 'AT'), ('play', 'VB')]
[('The', 'AT'), ('kids', 'NNS'), ('play', 'VB'), ('in', 'IN'), ('the', 'AT'), ('garden', 'NN')]
[('I', 'PPSS'), ('saw', 'VBD'), ('a', 'AT'), ('green', 'JJ'), ('spider', 'NN')]
[('Salah', None), ('scored', 'VBD'), ('the', 'AT'), ('goal', 'NN')]


### Bigram Tagger

In [10]:
from nltk import BigramTagger

bigram_tagger = BigramTagger(train)
print(bigram_tagger.evaluate(test))

print(bigram_tagger.tag(word_tokenize('He watched the play'))) # play correctly tagged as noun
print(bigram_tagger.tag(word_tokenize('The kids play in the garden')))
print(bigram_tagger.tag(word_tokenize('I saw a green spider'))) # can't find the bigram green spider

0.3462464542515997
[('He', 'PPS'), ('watched', 'VBD'), ('the', 'AT'), ('play', 'NN')]
[('The', 'AT'), ('kids', 'NNS'), ('play', 'VB'), ('in', 'IN'), ('the', 'AT'), ('garden', 'NN')]
[('I', 'PPSS'), ('saw', 'VBD'), ('a', 'AT'), ('green', 'JJ'), ('spider', None)]


### Backoff tagger, from Unigram to DefaultTagegr

In [11]:
backoff_unigram_tagger = UnigramTagger(train, backoff=default_tagger)
print(backoff_unigram_tagger.evaluate(test))
# couldn't find salah in unigram, so backed off to noun
print(backoff_unigram_tagger.tag(word_tokenize('Salah scored the goal'))) 

0.8897437166039976
[('Salah', 'NN'), ('scored', 'VBD'), ('the', 'AT'), ('goal', 'NN')]


### From Bigram to Unigram (which backs off to DefaultTagger)

In [12]:
backoff_bigram_tagger = BigramTagger(train, backoff = backoff_unigram_tagger)
print(backoff_bigram_tagger.evaluate(test))
print(backoff_bigram_tagger.tag(word_tokenize('I saw a green spider')))

0.911174879609473
[('I', 'PPSS'), ('saw', 'VBD'), ('a', 'AT'), ('green', 'JJ'), ('spider', 'NN')]
