# Assignment 3: Parts-of-Speech Tagging 

Name: Karan Patel  
Categorizing and Tagging Words: http://www.nltk.org/book/ch05.html

In [1]:
import nltk
from tqdm import tqdm
from nltk.corpus import brown

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\r631915\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\r631915\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\r631915\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\r631915\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

__Q1.__ Search the web for 2 “spoof newspaper headlines”, to find such gems as: _British Left Waffles on Falkland Islands_, and _Juvenile Court to Try Shooting Defendant_. Manually tag these headlines to see if knowledge of the part-of-speech tags removes the ambiguity.

__Answer__: 

Following are the two spoof newspaper headlines that I found from the web:
1. Cows lose their jobs as milk prices drop
2. Trump's Lawyers: Telling Armed Crazies to "Go to Capitol" and "Fight Like Hell" Was Just Metaphorical

In [2]:
tokens = nltk.word_tokenize("Cows lose their jobs as milk prices drop")
print("POS tags for headline #1:\n{}\n".format(nltk.pos_tag(tokens)))

tokens = nltk.word_tokenize('Trump\'s Lawyers: Telling Armed Crazies to "Go to Capitol" and "Fight Like Hell" Was Just Metaphorical')
print("POS tags for headline #2:\n{}".format(nltk.pos_tag(tokens)))

POS tags for headline #1:
[('Cows', 'NNS'), ('lose', 'VBP'), ('their', 'PRP$'), ('jobs', 'NNS'), ('as', 'IN'), ('milk', 'NN'), ('prices', 'NNS'), ('drop', 'NN')]

POS tags for headline #2:
[('Trump', 'NNP'), ("'s", 'POS'), ('Lawyers', 'NNS'), (':', ':'), ('Telling', 'NNP'), ('Armed', 'NNP'), ('Crazies', 'NNPS'), ('to', 'TO'), ('``', '``'), ('Go', 'VB'), ('to', 'TO'), ('Capitol', 'NNP'), ("''", "''"), ('and', 'CC'), ('``', '``'), ('Fight', 'NNP'), ('Like', 'IN'), ('Hell', 'NNP'), ("''", "''"), ('Was', 'NNP'), ('Just', 'NNP'), ('Metaphorical', 'NNP')]


__Q2.__ Tokenize and tag the following sentence: They wind back the clock, while we chase after the wind. What is the output?

In [3]:
tokens = nltk.word_tokenize('They wind back the clock, while we chase after the wind.')
print("POS tags for sentence:\n{}".format(nltk.pos_tag(tokens)))

POS tags for sentence:
[('They', 'PRP'), ('wind', 'VBP'), ('back', 'RB'), ('the', 'DT'), ('clock', 'NN'), (',', ','), ('while', 'IN'), ('we', 'PRP'), ('chase', 'VBP'), ('after', 'IN'), ('the', 'DT'), ('wind', 'NN'), ('.', '.')]


__Q3.__ Pick 2 words that can be either a noun or a verb (e.g., contest). Predict which POS tag is likely to be the most frequent in the Brown corpus, and compare with your predictions.


In [4]:
tag_fd = nltk.FreqDist(tag for (word, tag) in brown.tagged_words(tagset='universal') if word == 'increase')
print('Most common POS tags for word "increase" = {}'.format(tag_fd.most_common()))

tag_fd = nltk.FreqDist(tag for (word, tag) in brown.tagged_words(tagset='universal') if word == 'attack')
print('Most common POS tags for word "attack" = {}'.format(tag_fd.most_common()))

Most common POS tags for word "increase" = [('NOUN', 112), ('VERB', 82)]
Most common POS tags for word "attack" = [('NOUN', 78), ('VERB', 24), ('X', 1)]


__Q4.__ Use sorted() and set() to get a sorted list of tags used in the Brown corpus, removing duplicates.


In [5]:
sorted(set(tag for (word, tag) in brown.tagged_words(tagset='universal')))

['.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X']

__Q5.__ Write programs to process the Brown Corpus and find answers to the following questions:
1. Which nouns are more common in their plural form, rather than their singular form? (Only consider regular plurals, formed with the -s suffix.)
2. List tags in order of decreasing frequency. What do the 20 most frequent tags represent?

In [30]:
# Q5.1

# Note: foreign word noun tags are excluded from sets below since plural foreign words may not have a '-s' suffix
singular_noun_tags = {'NN', 'NN$', 'NN+BEZ', 'NN+HVD', 'NN+HVZ', 'NN+IN', 'NN+MD', 'NN+NN', 'NP', 'NP$', 'NP+BEZ', 'NP+HVZ', 'NP+MD', 'NR', 'NP$', 'NR+MD'}
plural_noun_tags = {'NNS', 'NNS$', 'NNS+MD', 'NPS', 'NPS$', 'NRS'}

singular_noun_freq_dist = nltk.FreqDist(word for (word, tag) in brown.tagged_words() if tag in singular_noun_tags)
all_singular_nouns = set(singular_noun_freq_dist.keys())
print(f'Found a total of {len(all_singular_nouns)} singular nouns in the corpus.')

all_regular_plural_nouns = {singular_noun + 's' for singular_noun in all_singular_nouns}
plural_noun_freq_dist = nltk.FreqDist(word for (word, tag) in brown.tagged_words() if tag in plural_noun_tags and word in all_regular_plural_nouns)

total_count = 0
for singluar_noun in all_singular_nouns:
    plural_noun = singluar_noun + 's'
    
    singular_noun_count = singular_noun_freq_dist[singluar_noun]
    plural_noun_count = plural_noun_freq_dist[plural_noun]
    
    if plural_noun_count > singular_noun_count:
        if total_count <= 10:
            print(f'(Singluar) noun "{singluar_noun}" is more common in its plural form "{plural_noun}". Singular form count = {singular_noun_count}, plural form count = {plural_noun_count}')
        total_count += 1

print(f'\nFound a total of {total_count} nouns that are more common in their plural form, rather than their singular form. Only showing a few above.')

Found a total of 23187 singular nouns in the corpus.
(Singluar) noun "lodging" is more common in its plural form "lodgings". Singular form count = 1, plural form count = 2
(Singluar) noun "interface" is more common in its plural form "interfaces". Singular form count = 3, plural form count = 5
(Singluar) noun "monosyllable" is more common in its plural form "monosyllables". Singular form count = 1, plural form count = 2
(Singluar) noun "critic" is more common in its plural form "critics". Singular form count = 24, plural form count = 26
(Singluar) noun "exploit" is more common in its plural form "exploits". Singular form count = 1, plural form count = 4
(Singluar) noun "periodical" is more common in its plural form "periodicals". Singular form count = 4, plural form count = 5
(Singluar) noun "export" is more common in its plural form "exports". Singular form count = 7, plural form count = 10
(Singluar) noun "lag" is more common in its plural form "lags". Singular form count = 1, plural

In [38]:
# Q5.2

tags_frequency_dist = nltk.FreqDist(tag for (word, tag) in brown.tagged_words())
print(tags_frequency_dist.most_common(20))

[('NN', 152470), ('IN', 120557), ('AT', 97959), ('JJ', 64028), ('.', 60638), (',', 58156), ('NNS', 55110), ('CC', 37718), ('RB', 36464), ('NP', 34476), ('VB', 33693), ('VBN', 29186), ('VBD', 26167), ('CS', 22143), ('PPS', 18253), ('VBG', 17893), ('PP$', 16872), ('TO', 14918), ('PPSS', 13802), ('CD', 13510)]


__Q6.__ Generate some statistics for tagged data to answer the following questions: 
1. What proportion of word types are always assigned the same part-of-speech tag?
2. How many words are ambiguous, in the sense that they appear with at least two tags?
3. What percentage of word tokens in the Brown Corpus involve these ambiguous words?

In [78]:

num_word_tokens = len(brown.words())
print(f'Number of work tokens = {num_word_tokens}')

all_word_types = {word for word in brown.words()}
print(f'Found a total of {len(all_word_types)} word types / unique words.')

all_pos_tags = {tag for (word, tag) in brown.tagged_words()}
print(f'Found a total of {len(all_pos_tags)} POS tags.\n')

word_types_with_unique_pos_tag_assignment_count = 0
num_ambiguous_words = 0
ambiguous_word_tokens_count = 0

word_type_to_pos_tags_dict = {word_type:set() for word_type in all_word_types}
for word_token, tag in brown.tagged_words():
    word_type_to_pos_tags_dict[word_token].add(tag)
    
for word_type, pos_tags_set in word_type_to_pos_tags_dict.items():
    if len(pos_tags_set) == 1:
        word_types_with_unique_pos_tag_assignment_count += 1
    elif len(pos_tags_set) > 1:
        num_ambiguous_words += 1

for word_token in brown.words():
    if len(word_type_to_pos_tags_dict[word_token]) > 1:
        ambiguous_word_tokens_count += 1
        
print(f'% of word types that are always assigned the same part-of-speech tag = {(word_types_with_unique_pos_tag_assignment_count / len(all_word_types)) * 100} %')
print(f'Number of words that are ambiguous (i.e. they appear with at least two tags) = {num_ambiguous_words}')
print(f'% of word tokens that involve ambiguous words = {(ambiguous_word_tokens_count / num_word_tokens) * 100} %')

Number of work tokens = 1161192
Found a total of 56057 word types / unique words.
Found a total of 472 POS tags.

% of word types that are always assigned the same part-of-speech tag = 84.42834971546819 %
Number of words that are ambiguous (i.e. they appear with at least two tags) = 8729
% of word tokens that involve ambiguous words = 78.64892283102192 %


__Q9.__

In [135]:
word_type_to_pos_tags_dict = {word_type:set() for word_type in all_word_types}
for word_token, tag in brown.tagged_words():
    word_type_to_pos_tags_dict[word_token].add(tag)

distinct_pos_tag_count_to_distinct_words_count = {i + 1:0 for i in range(10)}
word_to_distinct_pos_tags_count = (None, 0)  # word, distinct pos tags count
for word, pos_tags_set in word_type_to_pos_tags_dict.items():
    num_unique_tags = len(pos_tags_set)
    
    if num_unique_tags > 0 and num_unique_tags <= 10:
        distinct_pos_tag_count_to_distinct_words_count[num_unique_tags] = distinct_pos_tag_count_to_distinct_words_count[num_unique_tags] + 1
    
    if num_unique_tags > word_to_distinct_pos_tags_count[1]:
        word_to_distinct_pos_tags_count = word, num_unique_tags
    
print(distinct_pos_tag_count_to_distinct_words_count)
print(word_to_distinct_pos_tags_count)

for tag in word_type_to_pos_tags_dict[word_to_distinct_pos_tags_count[0]]:
    for tagged_sentence in brown.tagged_sents():
        if (word_to_distinct_pos_tags_count[0], tag) in tagged_sentence:
            sentence = ' '.join([word for word, tag in tagged_sentence])
            print(f'\nFor {tag} tag, sentence = {sentence}')
            break

{1: 47328, 2: 7186, 3: 1146, 4: 265, 5: 87, 6: 27, 7: 12, 8: 1, 9: 1, 10: 2}
('that', 12)

For WPO tag, sentence = He was able to smell a bargain -- and a masterpiece -- a continent away , and the Museum of Modern Art's Alfred Barr said of him : `` I have never mentioned a new artist that Thompson didn't know about '' .

For NIL tag, sentence = Thus , as a development program is being launched , commitments and obligations must be entered into in a given year which may exceed by twofold or threefold the expenditures to be made in that year .

For DT-NC tag, sentence = He has his own system of shorthand , devised by abbreviations : `` humility '' will be `` humly '' , `` with '' will be `` w '' , and `` that '' will be `` tt '' .

For WPO-NC tag, sentence = Thus to has light stress both in that was the conclusion that I came to and in that was the conclusion I came to .

For WPS tag, sentence = Regarding Atlanta's new multi-million-dollar airport , the jury recommended `` that when the 