In [None]:
# COLX 521 Lecture 4: Corpus Statistics

* Advanced counting
* Sorting
* Simple statistics
* Comparing corpora in NLTK

## Advanced counting

Let's return to counting. Yet another solution to the initialization problem for counting (and other situations) is the [defaultdict](https://docs.python.org/3/library/collections.html#collections.defaultdict). When you initialize a defaultdict for integers, the default value is zero, so no need to do anything but add. Defaultdicts also work for lists, sets, and dictionaries (defaulting to empty).

In [2]:
from collections import defaultdict
from nltk.corpus import brown

# my code here
counts = defaultdict(int)

for word in brown.words():
    counts[word] += 1

print(counts["the"])
# my code here


62713


In [10]:
words_by_length = defaultdict(set)

# my code here
for word in brown.words():
    words_by_length[len(word)].add(word)

print(words_by_length[18])
# my code here

{'pressure-measuring', 'rookie-of-the-year', 'anti-authoritarian', 'head-and-shoulders', 'materials-handling', 'Characteristically', 'non-discrimination', 'fifteen-sixteenths', 'policeman-murderer', 'interconnectedness', 'value-orientations', 'university-trained', 'thirteenth-century', 'fiber-photocathode', "Journal-Bulletin's", 'radiation-produced', 'on-again-off-again', 'Brumidi-Costaggini', 'microcytochemistry', 'handyman-carpenter', 'commander-in-chief', 'Communist-inspired', 'concentration-camp', 'student-physicists', 'abrasion-resistant', 'Kohnstamm-negative', 'twenty-five-dollar', 'disenfranchisement', 'counter-escalation', 'disproportionately', 'Nineteenth-century', 'upper-middle-class', 'finite-dimensional', 'Commander-in-Chief', 'Diethylstilbestrol', 'macro-instructions', 'lower-middle-class', 'ring-around-a-rosy', 'triphosphopyridine', 'Mainliner-Highland', 'entropy-increasing', 'Clinico-pathologic', 'Icelandic-speaking', 'ultracentrifugally', 'non-propagandistic', 'Kristall

Often you need to normalize the counts in a dictionary, to control for the effect of corpus size and/or create a probability distribution. You can keep a running total or, more conveniently, just [sum](https://docs.python.org/3/library/functions.html#sum) the values of your count dict.

In [11]:
total_tokens = sum(counts.values())

for word in counts:
    counts[word] /= total_tokens
    
print(counts["the"])
print(sum(counts.values()))

0.05400743374050114
0.999999999999049


For easy interpretiblity, one often multiples these normalized word probabilities by some large number like 1000, at which point the resulting number can be understood as X occurences per 1000 tokens

In [12]:
for word in counts:
    counts[word] *= 1000
    
print(counts["the"])

54.00743374050114


Another common use case involving counts is removing words with high or low counts, which are often uninteresting or statistically unreliable. This is tricky, because in Python you can't delete from something you're iterating over! Unless you're very worried about lack of memory, usually easier to just create a new dictionary.

In [56]:
from collections import Counter

counts = Counter(brown.words())

#my code here
new_counts = {}
for word, count in counts.items():
    if 5 < count < 10000:
        new_counts[word] = count
#my code here
        
counts = new_counts
print(len(counts))


56057
13150


If you're just interested in the highest (or lowest) count item, it is easy enough just to iterate over the dictionary once and remember the top scoring item. Beyond that, you'll want to do some sorting.

In [14]:
highest_count_word = None
highest_count = 0
for word, count in counts.items():
    if count > highest_count:
        highest_count_word = word
        highest_count = count
print(highest_count_word)
print(highest_count)

was
9777


But if you're using a Counter object, the [most_common](https://docs.python.org/3/library/collections.html#collections.Counter.most_common) method is often handy. Counters have a few other neat options, for instance they can be added and subtracted.

In [57]:
brown_counts = Counter(brown.words())
#my code here
brown_counts.most_common(10)
#my code here

[('the', 62713),
 (',', 58334),
 ('.', 49346),
 ('of', 36080),
 ('and', 27915),
 ('to', 25732),
 ('a', 21881),
 ('in', 19536),
 ('that', 10237),
 ('is', 10011)]

In [16]:
from nltk.corpus import treebank
treebank_counts = Counter(treebank.words())
#my code here
both_counts = brown_counts + treebank_counts
print(both_counts.most_common(10))
#my code here

[('the', 66758), (',', 63219), ('.', 53174), ('of', 38399), ('and', 29426), ('to', 27896), ('a', 23759), ('in', 21108), ('that', 11044), ('is', 10682)]


## Sorting

Simple sorting of a list of objects in Python is fairly straightward. Use the [sort](https://docs.python.org/3/library/stdtypes.html#list.sort) method to sort in place, or [sorted](https://docs.python.org/3/library/functions.html#sorted) to create a new sorted list. Result is order from smallest to largest, use reverse keyword to reverse the order.

In [3]:
#provided code
nums = [3, 6, -4, 23, 0.5, 202, -24592, 3482]

In [4]:
sorted(nums)

[-24592, -4, 0.5, 3, 6, 23, 202, 3482]

In [5]:
nums

[3, 6, -4, 23, 0.5, 202, -24592, 3482]

In [6]:
nums.sort(reverse=True)

In [7]:
nums

[3482, 202, 23, 6, 3, 0.5, -4, -24592]

Note that strings are generally sorted alphabetically, but once you get outside of a-z things can get unpredictable.

In [17]:
strings = ["aardvark", "Aardvark", "Zebra", "zebra", "12", "110", "2"]

# my code here
sorted(strings)
# my code here

['110', '12', '2', 'Aardvark', 'Zebra', 'aardvark', 'zebra']

Often, though, you have a statistic associated with a group of objects (words, documents, corpora, etc.) and want to sort the objects based on the statistic. One easy strategy is to create a list of tuples where the statistic is the first element of the tuple, since sort operates on the first element of each tuple first.

In [21]:
counts = {"the":87925, "quick":327, "brown":539, "fox":69}

# my code here
sort_me = []
for word, count in counts.items():
    sort_me.append((count, word))
sort_me.sort()
print(sort_me)
# my code here


[(69, 'fox'), (327, 'quick'), (539, 'brown'), (87925, 'the')]


To do this compactly, we should use a handy piece of Python syntax, the list comprehension, which allows you to build a new list based on an exisiting iterable in a single expression.

In [22]:
sorted([(count,word) for word, count in counts.items()])

[(69, 'fox'), (327, 'quick'), (539, 'brown'), (87925, 'the')]

An similarly compact way is to use the "key" keyword for *sort/sorting* function which allows you to specify a function which will define the value to sort a given iterable. The typical way to specify the function for this is to use a [lambda expression](https://docs.python.org/3/tutorial/controlflow.html#lambda-expressions). One advantage of this is that you just get the sorted list without having to deal with extracting what you need from tuples

In [23]:
sorted(counts.keys(), key=lambda x: counts[x])

['fox', 'quick', 'brown', 'the']

Note that both list comprehensions and lambda expressions are relatively advanced Python syntax. If you are new to Python and find them confusing, it is totally okay not to use them in this course!

Once you have a sorted list, you can use slicing to get what you want.

In [11]:
counts = Counter(brown.words())

sorted_words = sorted(counts.keys(), key=lambda x: counts[x])
print(sorted_words[-50:])

[')', 'been', 'their', 'him', 'would', 'all', 'you', 'they', 'one', 'her', 'He', 'but', 'were', '--', 'which', 'an', 'have', 'this', 'or', 'from', 'are', 'not', '?', 'at', 'had', 'by', 'I', ';', 'be', 'on', 'his', 'he', 'as', 'it', 'with', 'The', "''", '``', 'for', 'was', 'is', 'that', 'in', 'a', 'to', 'and', 'of', '.', ',', 'the']


Exercise: Use sorting to get the 50 longest and shortest word types in the Penn Treebank corpus

In [24]:
sorted_by_length = sorted(set(treebank.words()), key=lambda x: len(x))
print(sorted_by_length[:50])
print(sorted_by_length[-50:])

['?', ':', ',', 'I', '5', '$', '3', 'B', 'G', 'X', '7', '6', 'A', '@', '4', '0', 'a', '`', '-', '1', '&', "'", '%', '.', 'R', 'F', '#', '!', '9', '*', '8', '2', 'b', ';', 'Mo', 'Al', '11', '95', '75', 'GM', '94', 'wo', 'AG', '25', 'H.', '70', '68', 'We', '71', 'at']
['Lafite-Rothschild', 'substance-abusing', 'larger-than-normal', 'industry-supported', 'investor-relations', 'telecommunications', 'Corton-Charlemagne', 'school-improvement', 'constitutional-law', 'dollar-denominated', 'computer-generated', 'stock-manipulation', 'recession-inspired', 'shareholder-rights', 'search-and-seizure', 'acquisition-minded', 'newspaper-printing', 'diethylstilbestrol', 'yttrium-containing', 'property\\/casualty', 'financial-services', 'Metallgesellschaft', 'housing-assistance', 'machine-gun-toting', 'Property\\/casualty', 'Philadelphia-based', 'automotive-lighting', 'identity-management', 'multibillion-dollar', 'limited-partnership', '238,000-circulation', 'disaster-assistance', 'less-than-brilliant',

There are two other built in functions, [min](https://docs.python.org/3/library/functions.html#min) and [max](https://docs.python.org/3/library/functions.html#min) which get the minimum and maximum values. Like sort/sorted, they have a *key* keyword argument

In [47]:
print(max(set(treebank.words()), key=lambda x: len(x)))

marketing-communications


There are more ways to sort when you are using numpy arrays, but that is beyond our scope here!

## Simple statistics

The easiest sorts of corpus statistics to calculate are averages: e.g. average word length, average sentence length, average words per text.

In [58]:
#provided code

def get_simple_stats(corpus):
    num_chars = sum([len(word) for word in corpus.words()])
    num_words = len(corpus.words())
    num_sents = len(corpus.sents())
    num_texts = len(corpus.fileids())
    print("average word length")
    print(num_chars/num_words)
    print("average sentence length")
    print(num_words/num_sents)
    print("average text length")
    print(num_words/num_texts)    

In [59]:
get_simple_stats(brown)

average word length
4.276538246904905
average sentence length
20.250994070456922
average text length
2322.384


One popular statistic for individual texts that reflects lexical diversity is the type-token ratio. Note that when you are using it for comparison, you generally need to fix the number of tokens. As we've seen already seen, sets are an easy way to get the number of types, though you'll want to lower case first

In [30]:
types = set(brown.words())
print(len(types)/len(brown.words()))

0.048275392872152066


In [29]:
types = set(brown.words()[:10000])
len(types)/10000

0.269

The relative quantity of the main closed-class POS can reflect the nature of a particular corpus. For POS tagged texts, this is easy to calculate.

In [58]:
noun_count = 0
for word, pos in brown.tagged_words():
    if pos[0] == "N":
        noun_count += 1
print(noun_count/len(brown.words()))

0.23521002555994186


One popular POS summary statistic is lexical density, which can also be calculated using a POS-tagged corpus. It is the ratio of open-class words (nouns, verbs, adjectives, adverbs) to all words.

In [3]:
open_class_prefix = {"N", "V", "J", "R"}
open_class_total = 0
for word, pos in brown.tagged_words():
    # my code here
    if pos[0] in open_class_prefix:
        open_class_total += 1
    # my code here
print(open_class_total/len(brown.words()))

0.43480664696277616


Of course, any word or POS sequence or otherwise easy identified linguistic property may be considered a potential statistic. For example, let's count how often split English infinitives (i.e. TO + RB +  V, "to boldly go") appear per 1000 words in the Brown (and print them out)

In [53]:
split_infinitives = 0
for sent in brown.tagged_sents():
    #my code here
    for i in range(len(sent) - 2):
        if sent[i][1] == "TO" and sent[i+1][1] == "RB" and sent[i+2][1][0] == "V":
            print(sent[i][0], sent[i+1][0], sent[i+2][0])
            split_infinitives += 1
    #my code here
        
print(1000*split_infinitives/len(brown.words()))
    

to formally request
to completely bypass
to merely go
to properly express
to properly display
to even name
to magically influence
to actually move
to roughly calculate
to first drill
to first confront
to ever leave
to substantially lessen
to so notify
to fully serve
to promptly salvage
to virtually destroy
to approximately quadruple
to deliberately behave
to often seclude
to properly relate
to partially destroy
to accurately measure
to automatically hold
to deliberately foul
to gradually reach
to just throw
0.023251968666680445


## Comparing corpora

Let's pick two corpora from NLTK (your choice, but not the Brown) and do some comparisons. First, let's look (again) at vocabulary overlap.

In [4]:
from nltk.corpus import gutenberg, switchboard

guten_types = set(gutenberg.words())
switch_types = set(switchboard.words())
both_types = guten_types&switch_types


In [61]:
print(len(guten_types))

51156


In [62]:
print(len(switch_types))

4729


In [63]:
print(len(both_types))

3631


Vocabulary overlap will overestimate the actual difference between two small corpora (Why?). A better measure: what percentage of the tokens of each corpus consist of types that appear in both?


In [35]:
#provided code
def percent_in_set(word_set, corpus):
    return len([word for word in corpus.words() if word in word_set])/len(corpus.words())

In [36]:
percent_in_set(both_types,gutenberg)

0.736242534653284

In [37]:
percent_in_set(both_types,switchboard)

0.9001473572325829


Now, let's look at some of the "simple" statistics for each corpus and see how they compare.

In [38]:
def get_simple_stats_spoken(corpus):
    num_chars = sum([len(word) for word in corpus.words()])
    num_words = len(corpus.words())
    num_sents = len(corpus.turns())
    num_texts = len(corpus.fileids())
    print("average word length")
    print(num_chars/num_words)
    print("average turn length")
    print(num_words/num_sents)
    print("average text length")
    print(num_words/num_texts)
    

In [39]:
get_simple_stats(gutenberg)

average word length
3.618868231123358
average sentence length
26.601317071190845
average text length
145645.16666666666


In [40]:
get_simple_stats_spoken(switchboard)

average word length
3.240325152188617
average turn length
15.612294927399585
average text length
82792.0


Exercise: Let's write a function that calculates the type-token ratio by using the first n words from each of the corpora. Then compare our two corpora with n = 1000

In [42]:
def type_token_first_n(corpus, n):
    types = set(corpus.words()[:n])
    return len(types)/n

In [43]:
type_token_first_n(gutenberg,1000)

0.41

In [44]:
type_token_first_n(switchboard,1000)

0.278

Now let's build a dictionary of counts for both corpora and (simultaneously) calculate the percentage of tokens which are *hapax legonema* (word types that only appear once) for each.

In [45]:
#provided code
def get_counts(corpus):
    return Counter([word.lower() for word in corpus.words()])

def percent_hapax(corpus):
    counts = get_counts(corpus)
    hapax = [word for word in counts if counts[word] == 1]
    print(len(hapax)/len(corpus.words()))
    

In [54]:
percent_hapax(gutenberg)

0.005886452348229887


In [55]:
percent_hapax(switchboard)

0.02512320030920862


Next, let's create a separate dictionary which consists of the ratios of counts of words appearing in both corpora first normalized by the size of the corpus.

In [64]:
#provided code
def normalize(counts):
    total = sum(counts.values())
    for word in counts:
        counts[word] /= total


In [None]:
guten_counts = get_counts(gutenberg)
normalize(guten_counts)
switch_counts = get_counts(switchboard)
normalize(switch_counts)

ratio_dict = {}
for word in guten_counts:
    if word in switch_counts:
        ratio_dict[word] = guten_counts[word]/switch_counts[word]

Let's look at the ratios for stopwords and punctuation and see if there are any clear differences

In [47]:
ratio_dict["the"]

2.2379976700132294

In [48]:
ratio_dict["a"]

0.6690428768208987

In [49]:
ratio_dict[","]

0.5694629379261056

In [69]:
ratio_dict["?"]

0.8555883871404298

In [68]:

ratio_dict["however"]

21.25371517458908

Finally, let's sort the words by their ratio and look at those words with the highest and lowest ratio

In [65]:
sorted_counts = sorted([(count,word) for word,count in ratio_dict.items()])

In [66]:
print(sorted_counts[:10])

[(0.0006445011727746333, 'ca'), (0.0015790278732978516, 'guy'), (0.0015790278732978516, 'um'), (0.0017544754147753906, 'dad'), (0.0019737848416223145, 'anymore'), (0.0021053704977304685, 'program'), (0.003508950829550781, 'pro'), (0.003947569683244628, 'taxes'), (0.003947569683244628, 'u'), (0.003947569683244629, 'restaurants')]


In [67]:
print(sorted_counts[-10:])

[(40.26521076909521, 'face'), (44.37068323966962, 'among'), (45.09703606138664, 'eyes'), (45.23914856998344, '!'), (51.41314755457804, 'therefore'), (81.85680495176061, 'israel'), (144.86001709634488, 'upon'), (284.54082276827285, 'unto'), (368.92407231731, 'shall'), (627.600418520964, "'")]


Exercise: Compare two English corpora in NLTK that have POS tag annotations for their lexical density.

In [5]:
def lexical_density(corpus):
    open_class_total = 0
    total = 0
    for word,pos in corpus.tagged_words():
        if pos[0] in open_class_prefix:
            open_class_total += 1
        total += 1
    print(open_class_total/total)
    
lexical_density(switchboard)
lexical_density(brown)

0.37422697845202435
0.43480664696277616


Advanced exercise: pick a non-English corpus with POS tagging and see how its lexical density compares with the Brown (you'll have to figure out the tag scheme of this other language). Is lexical density directly comparable across languages?

In [114]:
from nltk.corpus import sinica_treebank

lexical_tags = {"N","V"}

lexical = 0
total = 0
for word,tag in sinica_treebank.tagged_words():
    if tag[0] in lexical_tags:
        lexical += 1
    total += 1
print(lexical/total)

0.6579610813406529
