In [1]:
import nltk
# Fetch the NLTK corpora that this notebook imports from nltk.corpus
# (treebank, cmudict, brown, movie_reviews). Each call is a no-op beyond a
# status message when the package is already up to date locally.
nltk.download("treebank")
nltk.download("cmudict")
nltk.download("brown")
nltk.download("movie_reviews")

[nltk_data] Downloading package treebank to /Users/niki/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package cmudict to /Users/niki/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package brown to /Users/niki/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/niki/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [2]:
from nltk.corpus import treebank,cmudict,brown,movie_reviews

## POS ambiguity

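Iterate through a POS-tagged corpus and count the number of times the word "fight" appears as a noun (the POS tag starts with NN) or as a verb (the tag starts with VB). Note: the cell below iterates over the Brown corpus (`brown.tagged_words()`), not the Penn Treebank sample, so the counts shown are Brown counts.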

In [3]:
noun_count = 0
verb_count = 0

# Walk every (token, tag) pair of the Brown corpus and tally how often
# "fight" (case-insensitive) carries a noun tag (NN*) versus a verb tag (VB*).
for token, tag in brown.tagged_words():
    if token.lower() != "fight":
        continue
    if tag.startswith("NN"):
        noun_count += 1
    elif tag.startswith("VB"):
        verb_count += 1

# Verb count first, then noun count.
print(verb_count)
print(noun_count)



43
55


a dictionary of dictionaries which contains information about how often different POS appear for all the words in the corpus

In [4]:
# Build a nested lexicon: lower-cased word -> {coarse POS tag -> count}.
# The coarse tag is the first two characters of the corpus tag, so that all
# tags sharing a two-letter prefix (e.g. those starting with "NN") are
# counted together.
POS_tag_lexicon = {}
for token, tag in brown.tagged_words():
    # setdefault creates the per-word dict on first sight and returns the
    # existing one afterwards, so a single lookup replaces check-then-insert.
    tag_counts = POS_tag_lexicon.setdefault(token.lower(), {})
    coarse_tag = tag[:2]
    tag_counts[coarse_tag] = tag_counts.get(coarse_tag, 0) + 1


print(POS_tag_lexicon["fight"])

{'NN': 55, 'VB': 43}


### Homophones

The CMU Pronouncing Dictionary, which is accessible through NLTK, contains information about the pronunciation of words. It is a dictionary whose keys are word types and whose values are lists of pronunciations.

Iterate through the lexicon and remember which pronunciations you have seen (in a set); when you run into a pronunciation you have already seen, add it to your set of duplicates.

In [5]:
p_dict = cmudict.dict()  # plain Python dict: word -> list of pronunciations
tot_set = set()          # every pronunciation string seen so far
duplicate_set = set()    # pronunciation strings seen more than once

for phoneme_lists in p_dict.values():
    for phonemes in phoneme_lists:
        # Join the phoneme symbols into one string so that a pronunciation
        # can be stored in a set and compared directly.
        spoken = " ".join(phonemes)
        if spoken in tot_set:
            duplicate_set.add(spoken)
        else:
            tot_set.add(spoken)

print(len(duplicate_set))

12827


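Estimate the share of English word tokens that are homophones, measured on the Brown corpus. The value printed below is a fraction between 0 and 1 (multiply by 100 for a percentage).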

In [8]:
# A word counts as a homophone when at least one of its pronunciations is in
# duplicate_set, i.e. that pronunciation string was seen more than once while
# scanning the pronouncing dictionary.
homophones = {
    word
    for word, pronounciations in p_dict.items()
    if any(" ".join(phonemes) in duplicate_set for phonemes in pronounciations)
}

# Count how many Brown tokens (lower-cased) are homophone words. Only the
# word itself is needed here, so the untagged word stream is used. This loop
# deliberately does not touch tot_set: that set holds pronunciation strings,
# and adding corpus words to it would corrupt it for any later use.
total_tokens = 0
homophone_tokens = 0
for word in brown.words():
    total_tokens += 1
    if word.lower() in homophones:
        homophone_tokens += 1

# Despite the name, this is a fraction in [0, 1], not a value out of 100.
percent_homophone = homophone_tokens / total_tokens
print(percent_homophone)

0.36493448111940147


### Comparing genres 

Finding words which appear considerably more often in one genre than in the other.

In [9]:
def get_count_dict(genre, min_count=3):
    '''Count lower-cased word frequencies in a genre of the Brown corpus.

    Parameters:
        genre: value passed through as ``categories`` to ``brown.words``.
        min_count: minimum number of occurrences a word needs in order to be
            kept in the result. Defaults to 3, the threshold this function
            previously had hard-coded.

    Returns:
        dict mapping each lower-cased word that occurs at least ``min_count``
        times in the genre to its count, in order of first appearance.
    '''
    all_counts = {}
    for word in brown.words(categories=genre):
        word = word.lower()
        all_counts[word] = all_counts.get(word, 0) + 1

    # Drop rare words; dict order (first appearance) is preserved.
    return {
        word: count
        for word, count in all_counts.items()
        if count >= min_count
    }