## Word count

In [1]:
import re
import string
import urllib2

regex = re.compile('[%s]' % re.escape(string.punctuation))

def word_count(text_file_url):
    text = urllib2.urlopen(text_file_url)
    wordcount={}
    for word in text.read().split():
        w = regex.sub('', word.lower())
        if w not in wordcount:
            wordcount[w] = 1
        else:
            wordcount[w] += 1
    print("Done with word count for %s" % (text_file_url))
    return wordcount

## Data

In [2]:
bible = word_count("http://www.gutenberg.org/cache/epub/10/pg10.txt")
hamlet = word_count("http://seattlecentral.edu/faculty/flepeint/java143/hw7/hamlet.text")
gettysburg = word_count("https://www.mathworks.com/moler/ncm/gettysburg.txt")

Done with word count for http://www.gutenberg.org/cache/epub/10/pg10.txt
Done with word count for http://seattlecentral.edu/faculty/flepeint/java143/hw7/hamlet.text
Done with word count for https://www.mathworks.com/moler/ncm/gettysburg.txt


## Get the top 10 words from each document

In [3]:
import operator

def sort_dict_by_value(dict):
    return sorted(dict.items(), key=operator.itemgetter(1), reverse=True)
    
bible_sorted = sort_dict_by_value(bible)
hamlet_sorted = sort_dict_by_value(hamlet)
gettysburg_sorted = sort_dict_by_value(gettysburg)

print("\nTop 10 words from Bible")
print(bible_sorted[:10])

print("\nTop 10 words from Hamlet")
print(hamlet_sorted[:10])

print("\nTop 10 words from Gettysburg Address")
print(gettysburg_sorted[:10])


Top 10 words from Bible
[('the', 64202), ('and', 51764), ('of', 34789), ('to', 13660), ('that', 12927), ('in', 12725), ('he', 10421), ('shall', 9840), ('unto', 8997), ('for', 8996)]

Top 10 words from Hamlet
[('the', 1090), ('and', 964), ('to', 742), ('of', 675), ('i', 577), ('a', 558), ('you', 554), ('my', 520), ('in', 434), ('it', 419)]

Top 10 words from Gettysburg Address
[('that', 13), ('the', 11), ('we', 10), ('to', 8), ('here', 8), ('', 7), ('a', 7), ('and', 6), ('nation', 5), ('not', 5)]


## Term frequency

In [4]:
def tf(d):
    max_ct = sort_dict_by_value(d)[0][1]
    term_freq = dict()
    for word in d:
        term_freq[word] = 0.5 + 0.5 * d[word] / max_ct
    return term_freq

print("\nTop term frequencies for the Bible\n")
print(sort_dict_by_value(tf(bible))[:20])

print("\nTop term frequencies for Hamlet\n")
print(sort_dict_by_value(tf(hamlet))[:20])

print("\nTop term frequencies for Gettysburg Address\n")
print(sort_dict_by_value(tf(gettysburg))[:20])



Top term frequencies for the Bible

[('the', 1.0), ('and', 0.9031338587582942), ('of', 0.7709339272919846), ('to', 0.6063829787234043), ('that', 0.6006744338182611), ('in', 0.5991012741036105), ('he', 0.5811579078533379), ('shall', 0.5766331266938725), ('unto', 0.5700679106569889), ('for', 0.5700601227376094), ('i', 0.5689542381857263), ('his', 0.5659870409021526), ('a', 0.564125728170462), ('lord', 0.5609794087411607), ('they', 0.5574670571010248), ('be', 0.5547646490763528), ('is', 0.5546244665275225), ('him', 0.5518597551478147), ('not', 0.5515326625338774), ('them', 0.5500763216099187)]

Top term frequencies for Hamlet

[('the', 1.0), ('and', 0.9422018348623853), ('to', 0.8403669724770642), ('of', 0.8096330275229358), ('i', 0.7646788990825688), ('a', 0.7559633027522936), ('you', 0.7541284403669725), ('my', 0.7385321100917431), ('in', 0.6990825688073394), ('it', 0.6922018348623853), ('that', 0.678440366972477), ('ham', 0.6642201834862386), ('is', 0.6587155963302752), ('not', 0.6444

## Inverse document frequency

In [5]:
import math

def merge_dict_keys(dicts):
    merged = set()
    for d in dicts:
        merged = merged | set(d.keys())
    return merged

def idf(dicts):
    inv_doc_freq = dict()
    merged_words = merge_dict_keys(dicts)
    num_docs = len(dicts)
    for word in merged_words:
        count = 0.0
        for d in dicts:
            if word in d:
                count += 1
        inv_doc_freq[word] = math.log(num_docs / (count + 1))  # max(count, 1)
    return inv_doc_freq

inv_doc_freq = sort_dict_by_value(idf([bible, hamlet, gettysburg]))

print("\nInverse document frequency (log scale)\n")
print(inv_doc_freq[:10])


Inverse document frequency (log scale)

[('giddel', 0.4054651081081644), ('writings', 0.4054651081081644), ('oertop', 0.4054651081081644), ('nunnery', 0.4054651081081644), ('tormenting', 0.4054651081081644), ('mozah', 0.4054651081081644), ('aijalon', 0.4054651081081644), ('spiders', 0.4054651081081644), ('5111', 0.4054651081081644), ('hanging', 0.4054651081081644)]


In [6]:
def tf_idf(document_dict, all_dicts):
    term_freq = tf(document_dict)
    inv_doc_freq = idf(all_dicts)
    weighted = dict()
    for word in document_dict:
        weighted[word] = term_freq[word] * inv_doc_freq[word]
    return weighted

In [7]:
all_counts = [bible, hamlet, gettysburg]
bible_results = sort_dict_by_value(tf_idf(bible, all_counts))
hamlet_results = sort_dict_by_value(tf_idf(hamlet, all_counts))
gettysburg_results = sort_dict_by_value(tf_idf(gettysburg, all_counts))

print("\nTop 10 results from the Bible\n")
print(bible_results[:10])

print("\nTop 10 results from Hamlet\n")
print(hamlet_results[:10])

print("\nTop 10 results from Gettysburg Address\n")
print(gettysburg_results[:10])


Top 10 results from the Bible

[('saith', 0.20671760877537207), ('david', 0.20592501865251644), ('jesus', 0.2058366022244688), ('moses', 0.2054229396503888), ('judah', 0.20529663046746366), ('jerusalem', 0.2052934727378905), ('offering', 0.2050187502650282), ('called', 0.2047345546034465), ('egypt', 0.2046619268232645), ('brethren', 0.20451035580375432)]

Top 10 results from Hamlet

[('hamlet', 0.2226338231217765), ('pol', 0.21872796657578042), ('tis', 0.21631005538064), ('th', 0.21612406221178304), ('laer', 0.2142641305232135), ('oph', 0.21352015784778564), ('horatio', 0.21147423299035914), ('laertes', 0.2112882398215022), ('ros', 0.21110224665264524), ('exit', 0.21017228080836045)]

Top 10 results from Gettysburg Address

[('devotion', 0.2339221777547102), ('proposition', 0.2183273659043962), ('unfinished', 0.2183273659043962), ('detract', 0.2183273659043962), ('honored', 0.2183273659043962), ('testing', 0.2183273659043962), ('civil', 0.2183273659043962), ('final', 0.218327365904396