In [7]:
import nltk
from nltk.tokenize import RegexpTokenizer
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag


In [8]:
nltk.download('brown')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')




[nltk_data] Downloading package brown to
[nltk_data]     /Users/jitender.singh/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jitender.singh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jitender.singh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jitender.singh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [14]:
import nltk
from nltk.tokenize import RegexpTokenizer
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk.corpus import brown


"""
Program to find count of all words in a corpus and
print words having a minimum threshold of occurance
"""


class BrownCorpus:

    def __init__(self):
        self.word_counts = defaultdict(lambda: 0)     # to hold word counts
        self.word_tagset = defaultdict(set)           # holds distinct tags for a word
        self.tag_counts = defaultdict(lambda: 0)      # to hold word counts
        self.prevtag_counts = defaultdict(lambda: 0)  # holds counts for tags appearing before a noun tag.

        self.word_tokenizer = RegexpTokenizer(r'\w+')

    def read_corpus(self):
        # consider the previous sentence ended with '.'
        # presence of '.' tag can be used to mark words/tags that appear at the beginning of a sentence.
        prev_tag = '.'

        for tagged_word in brown.tagged_words():
            # treat all-upper words (US, WHO..) differently as they should be considered proper noun, except..
            # the single letter ones ('A' = 'a', 'I' = 'i')
            word = tagged_word[0] if tagged_word[0].isupper() and len(tagged_word[0]) > 1 else tagged_word[0].lower()

            # CLAWS tag corpus may have combination tags, separated by '-'
            tags = str.split(tagged_word[1], '-')

            # add tags
            self.word_tagset[word].update(tags)

            # set word count
            self.word_counts[word] += 1

            # set tag count
            for tag in tags:
                self.tag_counts[tag] += 1

            # find previous word of a noun.
            for tag in tags:
                if self.is_noun(tag):
                    self.prevtag_counts[prev_tag] += 1
                prev_tag = tag

    def is_noun(self, tag):     # noqa
        # if a tag starts with letter 'N', it's a noun, except NIL and NC,
        # see: CLAWS1 tags as used by brown corpus: http://ucrel.lancs.ac.uk/claws1tags.html
        if tag.upper().startswith('N') and tag not in ['NIL', 'NC']:
            return True
        return False

    def get_p_s_count_diff(self):
        plural_singular_diff = dict()
        wnl = WordNetLemmatizer()

        # for all words and their counts...
        for word, count in self.word_counts.items():

            # if the word looks like a plural form and has a noun tag...
            if word.endswith('s') and any(self.is_noun(tag) for tag in self.word_tagset[word]):
                # get its singular form and count of singular form
                singular_form = wnl.lemmatize(word, pos='n')    # wordnet lemmatizer takes small 'n' for noun pos tag.

                # take only the simple plurals, also ensure we have their singular form in the wordlist.
                if '{}s'.format(singular_form) == word and self.word_counts.get(singular_form):
                    count_diff = self.word_counts[word] - self.word_counts[singular_form]

                    # only pick the words where plural_form is more frequent than singular_form
                    if count_diff > 0:
                        plural_singular_diff[word] = {
                            'singular_form': singular_form,
                            'count_plural': self.word_counts[word],
                            'count_singular': self.word_counts[singular_form],
                            'count_diff': count_diff,
                        }
        return sorted(plural_singular_diff.items(), key=lambda x: x[1]['count_diff'], reverse=True)

    def get_words_with_most_tags(self):
        """
        Returns words having the highest tag counts.
        """
        return sorted(self.word_tagset.items(), key=lambda x: len(x[1]), reverse=True)

    def get_tags_by_count(self):
        """
        Returns list of tag with count, most frequently occuring tag first.
        """
        return sorted(self.tag_counts.items(), key=lambda x: x[1], reverse=True)

    def get_prevtag_counts(self):
        """
        Return tags with highest counts that are followed by a noun.
        """
        return sorted(self.prevtag_counts.items(), key=lambda x: x[1], reverse=True)


nltk.download('wordnet')
nltk.download('brown')
nltk.download('averaged_perceptron_tagger')

bc = BrownCorpus()
bc.read_corpus()



[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jitender.singh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     /Users/jitender.singh/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jitender.singh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [19]:
# Q1.a
# get the plural/singular counts / diff, sorted by 'count_diff', show first 20 items.
bc.get_p_s_count_diff()[:20]


[('years',
  {'singular_form': 'year',
   'count_plural': 950,
   'count_singular': 658,
   'count_diff': 292}),
 ('eyes',
  {'singular_form': 'eye',
   'count_plural': 401,
   'count_singular': 122,
   'count_diff': 279}),
 ('members',
  {'singular_form': 'member',
   'count_plural': 325,
   'count_singular': 137,
   'count_diff': 188}),
 ('minutes',
  {'singular_form': 'minute',
   'count_plural': 196,
   'count_singular': 55,
   'count_diff': 141}),
 ('miles',
  {'singular_form': 'mile',
   'count_plural': 173,
   'count_singular': 48,
   'count_diff': 125}),
 ('means',
  {'singular_form': 'mean',
   'count_plural': 310,
   'count_singular': 199,
   'count_diff': 111}),
 ('corps',
  {'singular_form': 'corp',
   'count_plural': 110,
   'count_singular': 1,
   'count_diff': 109}),
 ('sales',
  {'singular_form': 'sale',
   'count_plural': 133,
   'count_singular': 44,
   'count_diff': 89}),
 ('conditions',
  {'singular_form': 'condition',
   'count_plural': 180,
   'count_singular': 91

In [20]:
# Q1. b
# get the words having most tags, sorted by the respective tag count, show first 20 items.
bc.get_words_with_most_tags()[:20]


[('that', {'CS', 'DT', 'HL', 'NC', 'NIL', 'QL', 'TL', 'WPO', 'WPS'}),
 ('a', {'AT', 'FW', 'HL', 'IN', 'NC', 'NIL', 'NN', 'NP', 'TL'}),
 ('down', {'HL', 'IN', 'JJ', 'NN', 'NP', 'RB', 'RP', 'TL', 'VB'}),
 ('in', {'FW', 'HL', 'IN', 'NC', 'NIL', 'NN', 'RP', 'TL'}),
 ('to', {'HL', 'IN', 'NC', 'NIL', 'NPS', 'QL', 'TL', 'TO'}),
 ('well', {'HL', 'JJ', 'NN', 'QL', 'RB', 'TL', 'UH', 'VB'}),
 (':', {',', '.', ':', 'HL', 'IN', 'NIL', 'NP', 'TL'}),
 ('more', {'AP', 'HL', 'NC', 'NIL', 'NP', 'QL', 'RBR', 'TL'}),
 ('beat', {'HL', 'JJ', 'NN', 'NNS', 'TL', 'VB', 'VBD', 'VBN'}),
 ('place', {'FW', 'HL', 'NC', 'NN', 'NP', 'TL', 'VB'}),
 ('for', {'CS', 'HL', 'IN', 'NC', 'NN', 'RB', 'TL'}),
 ('best', {'HL', 'JJT', 'NP', 'QL', 'RBT', 'TL', 'VB'}),
 ('as', {'CS', 'HL', 'IN', 'NIL', 'QL', 'RB', 'TL'}),
 ('home', {'HL', 'NC', 'NN', 'NP', 'NR', 'TL', 'VB'}),
 ('present', {'AP', 'HL', 'JJ', 'NN', 'RB', 'TL', 'VB'}),
 ('out', {'HL', 'IN', 'NC', 'PP$', 'RB', 'RP', 'TL'}),
 ('near', {'HL', 'IN', 'JJ', 'QL', 'RB', 'TL

In [21]:
print("---\n\n")

# Q1. c
# get tags along with their count, sorted in decreasing order of count, show first 20 items.
bc.get_tags_by_count()[:20]


---




[('NN', 168028),
 ('IN', 122713),
 ('AT', 99146),
 ('JJ', 68728),
 ('.', 61254),
 (',', 58336),
 ('NNS', 58109),
 ('NP', 39045),
 ('CC', 38192),
 ('RB', 36614),
 ('VB', 33985),
 ('TL', 30169),
 ('VBN', 29944),
 ('VBD', 26195),
 ('CS', 22178),
 ('PPS', 18288),
 ('VBG', 18196),
 ('PP$', 16936),
 ('TO', 14996),
 ('CD', 14883)]

In [22]:
# Q1. d
# get list of most frequent tags that are followed by a noun, show first 20 items.
bc.get_prevtag_counts()[:20]


[('AT', 63830),
 ('JJ', 41836),
 ('IN', 32460),
 ('NN', 18658),
 ('TL', 14033),
 ('PP$', 12417),
 ('NP', 10294),
 ('CC', 8847),
 (',', 7977),
 ('.', 7313),
 ('CD', 5473),
 ('AP', 5280),
 ('VBG', 4700),
 ('DT', 4628),
 ('VBN', 4045),
 ('CS', 3100),
 ('VB', 3086),
 ('HL', 2295),
 ('VBD', 1839),
 ('``', 1814)]