In [1]:
import nltk
from nltk.tokenize import RegexpTokenizer
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag


In [2]:
# Fetch the NLTK resources this notebook relies on, in a fixed order.
# nltk.download() reports failures in its log output and returns a boolean
# rather than raising, so a failed fetch does not stop the cell.
for resource_name in ('brown', 'punkt', 'wordnet'):
    nltk.download(resource_name)

# Kept as the final bare expression so the cell still displays its return value.
nltk.download('averaged_perceptron_tagger')




[nltk_data] Error loading brown: [Errno 54] Connection reset by peer
[nltk_data] Error loading punkt: [Errno 54] Connection reset by peer
[nltk_data] Error loading wordnet: [Errno 54] Connection reset by peer
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jitender.singh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
import nltk
from nltk.tokenize import RegexpTokenizer
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk.corpus import brown


"""
Program to find count of all words in a corpus and
print words having a minimum threshold of occurrence
"""


class BrownCorpus:
    """Collects word and POS-tag statistics from a tagged corpus (Brown by default).

    After ``read_corpus()`` has run, the instance holds:

    * ``word_counts``    - word -> number of occurrences
    * ``word_tagset``    - word -> set of distinct tags seen for that word
    * ``tag_counts``     - tag  -> number of occurrences
    * ``prevtag_counts`` - tag  -> number of times it directly preceded a noun
    """

    def __init__(self):
        self.word_counts = defaultdict(int)      # word -> occurrence count
        self.word_tagset = defaultdict(set)      # word -> distinct tags for the word
        self.tag_counts = defaultdict(int)       # tag -> occurrence count
        self.prevtag_counts = defaultdict(int)   # tag -> times it appeared right before a noun

        # Not used by any method of this class; kept so existing users of the attribute still work.
        self.word_tokenizer = RegexpTokenizer(r'\w+')

    @staticmethod
    def _split_tag(raw_tag, include_combination_tags):
        """Split a (possibly combined) tag such as 'NN-TL' into its component tags."""
        # A tag made only of hyphens (the dash tag '--') would otherwise split into
        # empty strings; drop empty pieces and fall back to the raw tag itself.
        parts = [part for part in raw_tag.split('-') if part] or [raw_tag]
        return parts if include_combination_tags else parts[:1]

    def read_corpus(self, include_combination_tags=True, tagged_words=None):
        """Accumulate word, tag and "tag before a noun" counts.

        Counts are added to the existing ones, so calling this twice on the same
        instance doubles every count.

        :param include_combination_tags: when True, every part of a combined tag
            ('NN-TL' -> 'NN', 'TL') is recorded; when False only the first part is.
        :param tagged_words: optional iterable of ``(word, tag)`` pairs. Defaults to
            ``brown.tagged_words()``.
        """
        if tagged_words is None:
            tagged_words = brown.tagged_words()

        # Consider the previous sentence ended with '.'; the '.' tag therefore marks
        # words/tags that appear at the beginning of a sentence. This must be the same
        # variable the loop reads, otherwise a corpus starting with a noun fails.
        prev_tags = ['.']

        for raw_word, raw_tag in tagged_words:
            # Keep all-upper words (US, WHO..) as they are, since they should be considered
            # proper nouns, except the single letter ones ('A' = 'a', 'I' = 'i').
            if raw_word.isupper() and len(raw_word) > 1:
                word = raw_word
            else:
                word = raw_word.lower()

            # The tagged corpus may have combination tags, separated by '-'.
            tags = self._split_tag(raw_tag, include_combination_tags)

            self.word_tagset[word].update(tags)
            self.word_counts[word] += 1
            for tag in tags:
                self.tag_counts[tag] += 1

            # If the current word is a noun, credit every tag of the previous word.
            if any(self.is_noun(tag) for tag in tags):
                for preceding_tag in prev_tags:
                    self.prevtag_counts[preceding_tag] += 1

            # The current word's tag(s) become the "previous" tags for the next word.
            prev_tags = tags

    def is_noun(self, tag):     # noqa
        """Return True if ``tag`` is a noun tag.

        A tag starting with the letter 'N' is treated as a noun, except NIL and NC,
        see CLAWS1 tags as used by brown corpus: http://ucrel.lancs.ac.uk/claws1tags.html
        """
        return tag.upper().startswith('N') and tag not in ('NIL', 'NC')

    def get_p_s_count_diff(self):
        """Return simple plurals that are more frequent than their singular form.

        A word qualifies when it ends in a single 's', carries at least one noun tag,
        its WordNet noun lemma plus 's' equals the word, the lemma itself occurs in the
        corpus, and the plural count is strictly greater than the singular count.

        :return: list of ``(plural, info)`` tuples sorted by ``info['count_diff']``,
            largest first; ``info`` has the keys ``singular_form``, ``count_plural``,
            ``count_singular`` and ``count_diff``.
        """
        plural_singular_diff = dict()
        wnl = WordNetLemmatizer()

        for word, plural_count in self.word_counts.items():
            # Only consider words that look like a plural form...
            if not word.endswith('s') or word.endswith('ss'):
                continue
            # ...and that have a noun tag.
            if not any(self.is_noun(tag) for tag in self.word_tagset[word]):
                continue

            # WordNet lemmatizer takes small 'n' for the noun pos tag.
            singular_form = wnl.lemmatize(word, pos='n')

            # Take only the simple plurals (singular + 's').
            if '{}s'.format(singular_form) != word:
                continue

            # Also ensure the singular form is present in the word list.
            singular_count = self.word_counts.get(singular_form, 0)
            if not singular_count:
                continue

            # Only pick the words where the plural form is more frequent than the singular form.
            count_diff = plural_count - singular_count
            if count_diff > 0:
                plural_singular_diff[word] = {
                    'singular_form': singular_form,
                    'count_plural': plural_count,
                    'count_singular': singular_count,
                    'count_diff': count_diff,
                }
        return sorted(plural_singular_diff.items(), key=lambda x: x[1]['count_diff'], reverse=True)

    def get_words_with_most_tags(self):
        """
        Returns (word, tagset) pairs, words with the most distinct tags first.
        """
        return sorted(self.word_tagset.items(), key=lambda x: len(x[1]), reverse=True)

    def get_tags_by_count(self):
        """
        Returns list of (tag, count) pairs, most frequently occurring tag first.
        """
        return sorted(self.tag_counts.items(), key=lambda x: x[1], reverse=True)

    def get_prevtag_counts(self):
        """
        Returns (tag, count) pairs for tags that directly precede a noun, highest count first.
        """
        return sorted(self.prevtag_counts.items(), key=lambda x: x[1], reverse=True)


# Build the corpus statistics once; the cells below query this `bc` instance.
# read_corpus() is called with its default, include_combination_tags=True.
bc = BrownCorpus()
bc.read_corpus()



In [4]:
# Q1.a
# Words that are simple plurals and occur more frequently than their singular form
# (top 20, largest plural-minus-singular count difference first).
bc.get_p_s_count_diff()[:20]


[('years',
  {'singular_form': 'year',
   'count_plural': 950,
   'count_singular': 658,
   'count_diff': 292}),
 ('eyes',
  {'singular_form': 'eye',
   'count_plural': 401,
   'count_singular': 122,
   'count_diff': 279}),
 ('members',
  {'singular_form': 'member',
   'count_plural': 325,
   'count_singular': 137,
   'count_diff': 188}),
 ('minutes',
  {'singular_form': 'minute',
   'count_plural': 196,
   'count_singular': 55,
   'count_diff': 141}),
 ('miles',
  {'singular_form': 'mile',
   'count_plural': 173,
   'count_singular': 48,
   'count_diff': 125}),
 ('means',
  {'singular_form': 'mean',
   'count_plural': 310,
   'count_singular': 199,
   'count_diff': 111}),
 ('corps',
  {'singular_form': 'corp',
   'count_plural': 110,
   'count_singular': 1,
   'count_diff': 109}),
 ('sales',
  {'singular_form': 'sale',
   'count_plural': 133,
   'count_singular': 44,
   'count_diff': 89}),
 ('conditions',
  {'singular_form': 'condition',
   'count_plural': 180,
   'count_singular': 91

***Analysis***

The list shows plural forms that are used far more frequently than their singular forms,
for example: years, eyes, members and means.



In [5]:
# Q1. b
# Words having the most distinct tags (top 20).
# `bc` was read with combination tags included, so each part of a tag like NN-TL counts separately.
bc.get_words_with_most_tags()[:20]


[('that', {'CS', 'DT', 'HL', 'NC', 'NIL', 'QL', 'TL', 'WPO', 'WPS'}),
 ('a', {'AT', 'FW', 'HL', 'IN', 'NC', 'NIL', 'NN', 'NP', 'TL'}),
 ('down', {'HL', 'IN', 'JJ', 'NN', 'NP', 'RB', 'RP', 'TL', 'VB'}),
 ('in', {'FW', 'HL', 'IN', 'NC', 'NIL', 'NN', 'RP', 'TL'}),
 ('to', {'HL', 'IN', 'NC', 'NIL', 'NPS', 'QL', 'TL', 'TO'}),
 ('well', {'HL', 'JJ', 'NN', 'QL', 'RB', 'TL', 'UH', 'VB'}),
 (':', {',', '.', ':', 'HL', 'IN', 'NIL', 'NP', 'TL'}),
 ('more', {'AP', 'HL', 'NC', 'NIL', 'NP', 'QL', 'RBR', 'TL'}),
 ('beat', {'HL', 'JJ', 'NN', 'NNS', 'TL', 'VB', 'VBD', 'VBN'}),
 ('place', {'FW', 'HL', 'NC', 'NN', 'NP', 'TL', 'VB'}),
 ('for', {'CS', 'HL', 'IN', 'NC', 'NN', 'RB', 'TL'}),
 ('best', {'HL', 'JJT', 'NP', 'QL', 'RBT', 'TL', 'VB'}),
 ('as', {'CS', 'HL', 'IN', 'NIL', 'QL', 'RB', 'TL'}),
 ('home', {'HL', 'NC', 'NN', 'NP', 'NR', 'TL', 'VB'}),
 ('present', {'AP', 'HL', 'JJ', 'NN', 'RB', 'TL', 'VB'}),
 ('out', {'HL', 'IN', 'NC', 'PP$', 'RB', 'RP', 'TL'}),
 ('near', {'HL', 'IN', 'JJ', 'QL', 'RB', 'TL


***Analysis***

Words like 'that', 'a' and 'down' show up with the highest number of tags, followed by 'in', 'to' and 'well'.
However, many of the tags assigned in the Brown corpus are medium- to high-granularity tags
that are not usually found in other tagsets.
For example, TL (word in a title), NC (cited text) and HL (word in a headline) are combination tags,
often seen as NN-TL, NN-NC etc. (see: https://link.springer.com/content/pdf/bbm%3A978-94-015-9273-4%2F1.pdf )

The section below excludes the combination tags when finding the words with the most tags.

In [6]:
# Q1. b
# Words having the most tags (with combination tags excluded)
# A separate instance is used so the statistics already held in `bc` are left untouched.
bc_2 = BrownCorpus()
bc_2.read_corpus(include_combination_tags=False)
bc_2.get_words_with_most_tags()[:20]


[('down', {'IN', 'JJ', 'NN', 'NP', 'RB', 'RP', 'VB'}),
 ('still', {'JJ', 'NN', 'NP', 'QL', 'QLP', 'RB', 'VB'}),
 ('that', {'CS', 'DT', 'NIL', 'QL', 'WPO', 'WPS'}),
 ('well', {'JJ', 'NN', 'QL', 'RB', 'UH', 'VB'}),
 (':', {',', '.', ':', 'IN', 'NIL', 'NP'}),
 ('beat', {'JJ', 'NN', 'NNS', 'VB', 'VBD', 'VBN'}),
 ('in', {'FW', 'IN', 'NIL', 'NN', 'RP'}),
 ('to', {'IN', 'NIL', 'NPS', 'QL', 'TO'}),
 ('a', {'AT', 'FW', 'NIL', 'NN', 'NP'}),
 ('best', {'JJT', 'NP', 'QL', 'RBT', 'VB'}),
 ('as', {'CS', 'IN', 'NIL', 'QL', 'RB'}),
 ('fit', {'JJ', 'NN', 'VB', 'VBD', 'VBN'}),
 ('more', {'AP', 'NIL', 'NP', 'QL', 'RBR'}),
 ('present', {'AP', 'JJ', 'NN', 'RB', 'VB'}),
 ('near', {'IN', 'JJ', 'QL', 'RB', 'VB'}),
 ('then', {'CS', 'JJ', 'NIL', 'RB', 'RN'}),
 ('past', {'AP', 'IN', 'JJ', 'NN', 'RB'}),
 ('long', {'JJ', 'NP', 'QL', 'RB', 'VB'}),
 ('post', {'FW', 'IN', 'NN', 'NP', 'VB'}),
 ('left', {'JJ', 'NN', 'NR', 'VBD', 'VBN'})]

Excluding the combination tags, we find that words like 'down', 'still', 'that', 'well' and 'beat', and the punctuation mark ':',
have the most tags assigned to them.
As can be seen, most of these words belong to the adjective, preposition, adverb or qualifier categories, and are primarily used to join sentences or parts of sentences.

In [7]:

# Q1. c
# Most frequent tags (top 20, highest count first).
bc.get_tags_by_count()[:20]


[('NN', 168028),
 ('IN', 122713),
 ('AT', 99146),
 ('JJ', 68728),
 ('.', 61254),
 (',', 58336),
 ('NNS', 58109),
 ('NP', 39045),
 ('CC', 38192),
 ('RB', 36614),
 ('VB', 33985),
 ('TL', 30169),
 ('VBN', 29944),
 ('VBD', 26195),
 ('CS', 22178),
 ('PPS', 18288),
 ('VBG', 18196),
 ('PP$', 16936),
 ('TO', 14996),
 ('CD', 14883)]

***Analysis***

Noun tags are the most common, followed by prepositions, articles and adjectives, and then by the sentence-end and pause markers ('.' and ',').


In [8]:
# Q1. d
# Tags that frequently appear directly before a noun (top 20, highest count first).
bc.get_prevtag_counts()[:20]


[('AT', 64819),
 ('JJ', 45922),
 ('IN', 33846),
 ('NN', 23626),
 ('TL', 14234),
 ('NP', 12589),
 ('PP$', 12480),
 ('CC', 9209),
 (',', 8125),
 ('.', 7447),
 ('CD', 5672),
 ('AP', 5326),
 ('VBG', 4881),
 ('DT', 4643),
 ('VBN', 4643),
 ('VB', 3157),
 ('CS', 3109),
 ('HL', 2295),
 ('NP$', 1849),
 ('``', 1847)]

***Analysis***

Article tags appear most frequently before a noun. The next most frequent tags before a noun are adjectives and prepositions.