# Analyzing Corpora

Now that we have looked at analyzing and comparing documents, we can move to a higher unit of text. Sometime we want to look at a large collection of text in aggregate, such as the complete works of William Shakespeare, or all New York Times articles ever. The term we use for a collection of documents is corpus. And a corpus can be as large or as small as you want, but are usually collected together for some reason and have some meaning behind why they are grouped together. 

Lets look at a few examples we have direct access to through NLTK.

In [1]:
#
# Preamble
#

%matplotlib inline

# Import our core libraries
import nltk
from nltk.corpus import gutenberg
from pprint import pprint
from collections import Counter
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise_distances
from nltk.cluster.kmeans import KMeansClusterer
from numpy import array

In [7]:
filenames = ["1789-Washington.txt",
"1865-Lincoln.txt",
"1941-Roosevelt.txt",
"1793-Washington.txt",
"1869-Grant.txt",
"1945-Roosevelt.txt",
"1797-Adams.txt",
"1873-Grant.txt",
"1949-Truman.txt",
"1801-Jefferson.txt",
"1877-Hayes.txt",
"1953-Eisenhower.txt",
"1805-Jefferson.txt",
"1881-Garfield.txt",
"1957-Eisenhower.txt",
"1809-Madison.txt",
"1885-Cleveland.txt",
"1961-Kennedy.txt",
"1813-Madison.txt",
"1889-Harrison.txt",
"1965-Johnson.txt",
"1817-Monroe.txt",
"1893-Cleveland.txt",
"1969-Nixon.txt",
"1821-Monroe.txt",
"1897-McKinley.txt",
"1973-Nixon.txt",
"1825-Adams.txt",
"1901-McKinley.txt",
"1977-Carter.txt",
"1829-Jackson.txt",
"1905-Roosevelt.txt",
"1981-Reagan.txt",
"1833-Jackson.txt",
"1909-Taft.txt",
"1985-Reagan.txt",
"1837-VanBuren.txt",
"1913-Wilson.txt",
"1989-Bush.txt",
"1841-Harrison.txt",
"1917-Wilson.txt",
"1993-Clinton.txt",
"1845-Polk.txt",
"1921-Harding.txt",
"1997-Clinton.txt",
"1849-Taylor.txt",
"1925-Coolidge.txt",
"2001-Bush.txt",
"1853-Pierce.txt",
"1929-Hoover.txt",
"2005-Bush.txt",
"1857-Buchanan.txt",
"1933-Roosevelt.txt",
"2009-Obama.txt",
"1861-Lincoln.txt",
"1937-Roosevelt.txt"
]

In [11]:
#filename = "data/inaugural/1789-Washington.txt"

text_dict = {}


for filename in filenames:
    with open("data/inaugural/" + filename) as handle:
        name = filename.replace(".txt", "")
        text = handle.read()
        text_dict[name] = text
        
        
        

print(text_dict["1789-Washington"][0:50])

Fellow-Citizens of the Senate and of the House of 


In [4]:
# Lets turn the text of alice in wonderland into a bag of words with
# associated frequency distribution.

alice = gutenberg.raw('carroll-alice.txt')
alice = nltk.word_tokenize(alice)
alice = [word.lower() for word in alice]
frequencies = nltk.FreqDist(alice)
print(frequencies.most_common(50))

# FreqDist docs.
# http://www.nltk.org/api/nltk.html?highlight=freqdist#nltk.probability.FreqDist

[(u',', 2418), (u'the', 1616), (u"'", 1127), (u'.', 974), (u'and', 810), (u'to', 720), (u'a', 620), (u'she', 544), (u'it', 539), (u'of', 499), (u'said', 462), (u'!', 450), (u'alice', 396), (u'was', 366), (u'i', 364), (u'in', 359), (u'you', 356), (u'that', 284), (u'--', 264), (u'as', 256), (u'her', 248), (u':', 233), (u'at', 209), (u"n't", 204), (u'?', 202), (u"'s", 194), (u';', 194), (u'on', 191), (u'had', 184), (u'with', 179), (u'all', 178), (u"'i", 169), (u'be', 148), (u'for', 146), (u'so', 144), (u'very', 139), (u'they', 135), (u'not', 135), (u'this', 131), (u'but', 131), (u'little', 128), (u'do', 125), (u'he', 117), (u'is', 113), (u'out', 113), (u'what', 103), (u'down', 102), (u'one', 99), (u'up', 97), (u'his', 95)]


In [5]:
#Let’s create a vector space model out of some documents
#and calculate some pairwise similarities.

# Extract and normalize tokens for a given document.
def get_tokens(fileid, corpus):
    raw = corpus.raw(fileid)
    tokens = nltk.word_tokenize(raw)
    norm = [token.lower() for token in tokens]
    return norm

# Takes all the tokens that appear in token_lists and puts them in a 
# set to determine unique tokens. Then creates a dictionary mapping a token
# to its index in the set, this enables us to have a unique target position for
# each word in our vocabulary.
def build_vocabulary(token_lists):
    result = set()
    for tl in token_lists:
        result = result.union(set(tl))
    result = {v:i for i,v in enumerate(result)}
    return result

# Builds a vector for a given token list and vocabulary.
# The result is a vector with length equal to the number of words in the
# vocabulary, and term weights for each token in the token list set in 
# the appropriate position
def build_vector(tokens, vocabulary):
    result = [0] * len(vocabulary)
    freq = Counter(tokens)
    for token in tokens:
        pos = vocabulary[token]
        result[pos] = freq[token]
    return result

In [6]:
alice = get_tokens('carroll-alice.txt', gutenberg)
moby = get_tokens('melville-moby_dick.txt', gutenberg)
austen1 = get_tokens('austen-emma.txt', gutenberg)
austen2 = get_tokens('austen-persuasion.txt', gutenberg)
austen3 = get_tokens('austen-sense.txt', gutenberg)

vocabulary = build_vocabulary([alice, moby, austen1, austen2, austen3]);
print(len(vocabulary))


24588


In [7]:
alice_v = build_vector(alice, vocabulary)
moby_v = build_vector(moby, vocabulary)
austen1_v = build_vector(austen1, vocabulary)
austen2_v = build_vector(austen2, vocabulary)
austen3_v = build_vector(austen3, vocabulary)

In [8]:
print(alice_v[0:50])
print("\n")

#this code is slightly wrong - it was changed during the course to accurately match words to frequencies

print(list(vocabulary)[0:50])
print("\n")
print(zip(alice_v[0:50], list(vocabulary)[0:50]))

[0, 0, 0, 0, 6, 0, 0, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 0]


[u'gag', u'woods', u'clotted', u'pantheistic', u'hanging', u'woody', u'disobeying', u'canes', u'scold', u'stipulate', u'medicament', u'bringing', u'wooded', u'harville', u'wooden', u'wednesday', u'broiled', u'soladoes', u'crotch', u'sooty', u'insular', u'woollen-draper', u'miniatures', u'sooth', u'sustaining', u'consenting', u"frigate's", u'inanimate', u"mind'em", u'errors', u'semicircular', u'nature.', u'cooking', u'designing', u'shocks', u'crouch', u'work-bags', u'primogenitures', u'china', u'properest', u'natured', u'climbed', u'circumferences', u'natures', u'golden', u'_would_', u'projection', u'lengthen', u'hermaphroditical', u'stern']


[(0, u'gag'), (0, u'woods'), (0, u'clotted'), (0, u'pantheistic'), (6, u'hanging'), (0, u'woody'), (0, u'disobeying'), (0, u'canes'), (0, u'scold'), (3, u'stipulate'), (2, u'medicament'), (0, u'b

In [9]:
# Lets compare Alice in Wonderland to the Others
from scipy.spatial.distance import cosine


1.0
0.902451480786
0.881000554468
0.893427441523
0.88611793731


In [12]:
# Lets see this in action
# http://www.nltk.org/api/nltk.cluster.html
# http://www.nltk.org/_modules/nltk/cluster/kmeans.html

from nltk.cluster.kmeans import KMeansClusterer
from numpy import array

num_clusters = 3
kclusterer = KMeansClusterer(num_clusters, distance=nltk.cluster.util.cosine_distance, repeats=5)

vectors = [array(f) for f in all_vectors] 
clusters = kclusterer.cluster(vectors, True) 
print('Clustering results:', clusters)

('Clustering results:', [0, 2, 1, 1, 1])


In [13]:
# Lets do this for all 18 documents

doc_ids = gutenberg.fileids()
token_lists = [get_tokens(f, gutenberg) for f in doc_ids]
voc = build_vocabulary(token_lists)
doc_vectors = [array(build_vector(tl, voc)) for tl in token_lists]

In [17]:
num_clusters = 12 #note there are 12 authors
kclusterer = KMeansClusterer(num_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25)
clusters = kclusterer.cluster(doc_vectors, True) 

doc_clusters = zip(gutenberg.fileids(), clusters)
for dc in doc_clusters:
    pprint(dc)
    
# Note that running this multiple times produces different results

(u'austen-emma.txt', 9)
(u'austen-persuasion.txt', 6)
(u'austen-sense.txt', 9)
(u'bible-kjv.txt', 11)
(u'blake-poems.txt', 7)
(u'bryant-stories.txt', 3)
(u'burgess-busterbrown.txt', 0)
(u'carroll-alice.txt', 7)
(u'chesterton-ball.txt', 4)
(u'chesterton-brown.txt', 5)
(u'chesterton-thursday.txt', 4)
(u'edgeworth-parents.txt', 10)
(u'melville-moby_dick.txt', 7)
(u'milton-paradise.txt', 8)
(u'shakespeare-caesar.txt', 1)
(u'shakespeare-hamlet.txt', 2)
(u'shakespeare-macbeth.txt', 1)
(u'whitman-leaves.txt', 8)


In [18]:
print(clusters)
kclusterer.means()

[9, 6, 9, 11, 7, 3, 0, 7, 4, 5, 4, 10, 7, 8, 1, 2, 1, 8]


[array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 array([ 0.5,  0. ,  0. , ...,  1. ,  0. ,  0. ]),
 array([ 0.,  4.,  0., ...,  1.,  0.,  0.]),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 array([ 0. ,  0. ,  0. , ...,  0. ,  0.5,  0. ]),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 array([ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.66666667,  0.        ]),
 array([ 0.5,  0. ,  0.5, ...,  0. ,  1.5,  0. ]),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 array([ 0.,  0.,  0., ...,  0.,  0.,  3.])]

In [23]:
# Exercise 1.

from nltk.corpus import inaugural

from nltk.cluster.kmeans import KMeansClusterer
from numpy import array

# Extract and normalize tokens for a given document.
def get_tokens(fileid, corpus):
    raw = corpus.raw(fileid)
    tokens = nltk.word_tokenize(raw)
    norm = [token.lower() for token in tokens]
    return norm

# Takes all the tokens that appear in token_lists and puts them in a 
# set to determine unique tokens. Then creates a dictionary mapping a token
# to its index in the set, this enables us to have a unique target position for
# each word in our vocabulary.
def build_vocabulary(token_lists):
    result = set()
    for tl in token_lists:
        result = result.union(set(tl))
    result = {v:i for i,v in enumerate(result)}
    return result

# Builds a vector for a given token list and vocabulary.
# The result is a vector with length equal to the number of words in the
# vocabulary, and term weights for each token in the token list set in 
# the appropriate position
def build_vector(tokens, vocabulary):
    result = [0] * len(vocabulary)
    freq = Counter(tokens)
    for token in tokens:
        pos = vocabulary[token]
        result[pos] = freq[token]
    return result

# The ids for the speeches. Print these out if you want to get a sense of
# what is in this corpus.
speech_names = inaugural.fileids()
print speech_names

# Step 1. Tokenize the speeches
token_lists = [get_tokens(f, inaugural) for f in speech_names]

# Step 2. Build a vocabulary for all the speeches this is the
voc = build_vocabulary(token_lists)


# Step 3. Build feature vectors for the individual speeches.  
doc_vectors = [array(build_vector(tl, voc)) for tl in token_lists]

# Step 4. Do the clustering. Feel free to pick between K-Means and GAAC. 
# How many clusters might you want to generate?

num_clusters = 4
kclusterer = KMeansClusterer(num_clusters, distance=nltk.cluster.util.cosine_distance, repeats=5)
clusters = kclusterer.cluster(doc_vectors, True) 

doc_clusters = zip(inaugural.fileids(), clusters)
for dc in doc_clusters:
    pprint(dc)

[u'1789-Washington.txt', u'1793-Washington.txt', u'1797-Adams.txt', u'1801-Jefferson.txt', u'1805-Jefferson.txt', u'1809-Madison.txt', u'1813-Madison.txt', u'1817-Monroe.txt', u'1821-Monroe.txt', u'1825-Adams.txt', u'1829-Jackson.txt', u'1833-Jackson.txt', u'1837-VanBuren.txt', u'1841-Harrison.txt', u'1845-Polk.txt', u'1849-Taylor.txt', u'1853-Pierce.txt', u'1857-Buchanan.txt', u'1861-Lincoln.txt', u'1865-Lincoln.txt', u'1869-Grant.txt', u'1873-Grant.txt', u'1877-Hayes.txt', u'1881-Garfield.txt', u'1885-Cleveland.txt', u'1889-Harrison.txt', u'1893-Cleveland.txt', u'1897-McKinley.txt', u'1901-McKinley.txt', u'1905-Roosevelt.txt', u'1909-Taft.txt', u'1913-Wilson.txt', u'1917-Wilson.txt', u'1921-Harding.txt', u'1925-Coolidge.txt', u'1929-Hoover.txt', u'1933-Roosevelt.txt', u'1937-Roosevelt.txt', u'1941-Roosevelt.txt', u'1945-Roosevelt.txt', u'1949-Truman.txt', u'1953-Eisenhower.txt', u'1957-Eisenhower.txt', u'1961-Kennedy.txt', u'1965-Johnson.txt', u'1969-Nixon.txt', u'1973-Nixon.txt', u'