In [None]:
import os, sys, json, re, argparse, urllib.request, html5lib
from bs4 import BeautifulSoup, Tag, UnicodeDammit

In [None]:
script_url = 'http://www.imsdb.com/scripts/Aliens.html'

request = urllib.request.Request(script_url)

webpage_bytes = urllib.request.urlopen(request)
soup = BeautifulSoup(webpage_bytes, 'lxml')

In [1]:
corpus = "Each time we gather to inaugurate a president, we bear witness to the enduring strength of our Constitution. We affirm the promise of our democracy. We recall that what binds this nation together is not the colors of our skin or the tenets of our faith or the origins of our names. What makes us exceptional – what makes us American – is our allegiance to an idea, articulated in a declaration made more than two centuries ago:    “We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable rights, that among these are Life, Liberty, and the pursuit of Happiness. For we, the people, understand that our country cannot succeed when a shrinking few do very well and a growing many barely make it. We believe that America’s prosperity must rest upon the broad shoulders of a rising middle class. We know that America thrives when every person can find independence and pride in their work; when the wages of honest labor liberate families from the brink of hardship. We, the people, still believe that every citizen deserves a basic measure of security and dignity. We must make the hard choices to reduce the cost of health care and the size of our deficit. But we reject the belief that America must choose between caring for the generation that built this country and investing in the generation that will build its future. We, the people, still believe that our obligations as Americans are not just to ourselves, but to all posterity. We will respond to the threat of climate change, knowing that the failure to do so would betray our children and future generations. We will defend our people and uphold our values through strength of arms and rule of law. We will show the courage to try and resolve our differences with other nations peacefully – not because we are naïve about the dangers we face, but because engagement can more durably lift suspicion and fear. America will remain the anchor of strong alliances in every corner of the globe; and we will renew those institutions that extend our capacity to manage crisis abroad, for no one has a greater stake in a peaceful world than its most powerful nation. Our journey is not complete until our wives, our mothers, and daughters can earn a living equal to their efforts. Our journey is not complete until our gay brothers and sisters are treated like anyone else under the law – for if we are truly created equal, then surely the love we commit to one another must be equal as well. Our journey is not complete until no citizen is forced to wait for hours to exercise the right to vote.  Our journey is not complete until we find a better way to welcome the striving, hopeful immigrants who still see America as a land of opportunity; until bright young students and engineers are enlisted in our workforce rather than expelled from our country."
corpus = corpus.lower()
book = corpus

In [None]:
spaces_regex = re.compile("^(\s*).*")
location_regex = re.compile("^\s*(INT\.|EXT\.)")

In [49]:
def tokenize():
    if book is not None:
        words = book.lower().split()
        return words
    else:
        return None
        

def map_book(tokens):
    hash_map = {}

    if tokens is not None:
        for element in tokens:
            # Remove Punctuation
            word = element.replace(",","")
            word = word.replace(".","")

            # Word Exist?
            if word in hash_map:
                hash_map[word] = hash_map[word] + 1
            else:
                hash_map[word] = 1

        return hash_map
    else:
        return None


# Tokenize the Book
words = tokenize()
word_list = ['our','people','journey']

# Create a Hash Map (Dictionary)
map = map_book(words)

# Show Word Information
for word in word_list:
    print('Word: [' + word + '] Frequency: ' + str(map[word]))

Word: [our] Frequency: 23
Word: [people] Frequency: 4
Word: [journey] Frequency: 4


In [50]:
from collections import defaultdict

def build_conditional_probabilities(corpus):
	"""
	The function takes as its input a corpus string (words separated by 
	spaces) and returns a 2D dictionnary of probabilities P(next|current) of
	seeing a word "next" conditionnaly to seeing a word "current". 
	"""

	# First we parse the string to build a double dimension dictionnary that
	# returns the conditional probabilities.

	# We parse the string to build a first dictionnary indicating for each
	# word, what are the words that follow it in the string. Repeated next
	# words are kept so we use a list and not a set. 

	tokenized_string = corpus.split()
	previous_word = ""
	dictionnary = defaultdict(list)

	for current_word in tokenized_string:
		if previous_word != "":
			dictionnary[previous_word].append(current_word)
		previous_word = current_word
		
	# We know parse dictionnary to compute the probability each observed
	# next word for each word in the dictionnary. 

	for key in dictionnary.keys():
		next_words = dictionnary[key]
		unique_words = set(next_words) # removes duplicated
		nb_words = len(next_words)
		probabilities_given_key = {}
		for unique_word in unique_words:
			probabilities_given_key[unique_word] = \
				float(next_words.count(unique_word)) / nb_words
		dictionnary[key] = probabilities_given_key

	return dictionnary


def bigram_next_word_predictor(conditional_probabilities, current, next_candidate):
	"""
	The function takes as its input a 2D dictionnary of probabilities 
	P(next|current) of seeing a word "next" conditionnaly to seeing a word 
	"current", the current word being read, and a next candidate word, and
	returns P(next_candidate|current).
	"""

	# We look for the probability corresponding to the 
	# current -> next_candidate pair

	if current in conditional_probabilities:
		if next_candidate in conditional_probabilities[current]:
			return conditional_probabilities[current][next_candidate]

	# If current -> next_candidate pair has not been observed in the corpus,
	# the corresponding dictionnary keys will not be defined. We return 
	# a probability 0.0

	return 0.0

# An example corpus to try out the function
corpus = "Each time we gather to inaugurate a president, we bear witness to the enduring strength of our Constitution. We affirm the promise of our democracy. We recall that what binds this nation together is not the colors of our skin or the tenets of our faith or the origins of our names. What makes us exceptional – what makes us American – is our allegiance to an idea, articulated in a declaration made more than two centuries ago:    “We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable rights, that among these are Life, Liberty, and the pursuit of Happiness. For we, the people, understand that our country cannot succeed when a shrinking few do very well and a growing many barely make it. We believe that America’s prosperity must rest upon the broad shoulders of a rising middle class. We know that America thrives when every person can find independence and pride in their work; when the wages of honest labor liberate families from the brink of hardship. We, the people, still believe that every citizen deserves a basic measure of security and dignity. We must make the hard choices to reduce the cost of health care and the size of our deficit. But we reject the belief that America must choose between caring for the generation that built this country and investing in the generation that will build its future. We, the people, still believe that our obligations as Americans are not just to ourselves, but to all posterity. We will respond to the threat of climate change, knowing that the failure to do so would betray our children and future generations. We will defend our people and uphold our values through strength of arms and rule of law. We will show the courage to try and resolve our differences with other nations peacefully – not because we are naïve about the dangers we face, but because engagement can more durably lift suspicion and fear. America will remain the anchor of strong alliances in every corner of the globe; and we will renew those institutions that extend our capacity to manage crisis abroad, for no one has a greater stake in a peaceful world than its most powerful nation. Our journey is not complete until our wives, our mothers, and daughters can earn a living equal to their efforts. Our journey is not complete until our gay brothers and sisters are treated like anyone else under the law – for if we are truly created equal, then surely the love we commit to one another must be equal as well. Our journey is not complete until no citizen is forced to wait for hours to exercise the right to vote.  Our journey is not complete until we find a better way to welcome the striving, hopeful immigrants who still see America as a land of opportunity; until bright young students and engineers are enlisted in our workforce rather than expelled from our country."
corpus = corpus.lower()
# We call the conditional probability dictionnary builder function
conditional_probabilities = build_conditional_probabilities(corpus)

# Some sample queries to the bigram predictor
assert bigram_next_word_predictor(conditional_probabilities, "our", "people") 
assert bigram_next_word_predictor(conditional_probabilities, "our", "journey")
#assert bigram_next_word_predictor(conditional_probabilities, "", "red") == 0.0

print(conditional_probabilities)

defaultdict(<class 'list'>, {'each': {'time': 1.0}, 'time': {'we': 1.0}, 'we': {'reject': 0.058823529411764705, 'face,': 0.058823529411764705, 'believe': 0.058823529411764705, 'bear': 0.058823529411764705, 'commit': 0.058823529411764705, 'will': 0.23529411764705882, 'recall': 0.058823529411764705, 'are': 0.11764705882352941, 'know': 0.058823529411764705, 'find': 0.058823529411764705, 'gather': 0.058823529411764705, 'must': 0.058823529411764705, 'affirm': 0.058823529411764705}, 'gather': {'to': 1.0}, 'to': {'the': 0.11764705882352941, 'manage': 0.058823529411764705, 'reduce': 0.058823529411764705, 'one': 0.058823529411764705, 'welcome': 0.058823529411764705, 'wait': 0.058823529411764705, 'exercise': 0.058823529411764705, 'their': 0.058823529411764705, 'an': 0.058823529411764705, 'try': 0.058823529411764705, 'be': 0.058823529411764705, 'inaugurate': 0.058823529411764705, 'do': 0.058823529411764705, 'vote.': 0.058823529411764705, 'ourselves,': 0.058823529411764705, 'all': 0.05882352941176

In [52]:
assert bigram_next_word_predictor(conditional_probabilities, "our", "people") 

In [4]:
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg
nltk.download('gutenberg')


nltk.corpus.gutenberg.fileids()


file0 = nltk.corpus.gutenberg.fileids()[12]
emmatext = nltk.corpus.gutenberg.raw(file0)
emmatokens = nltk.word_tokenize(emmatext) 
emmawords = [w.lower( ) for w in emmatokens]

print(len(emmawords))
print(emmawords[ :110])



[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/kenmckee/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
254989
['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']', 'etymology', '.', '(', 'supplied', 'by', 'a', 'late', 'consumptive', 'usher', 'to', 'a', 'grammar', 'school', ')', 'the', 'pale', 'usher', '--', 'threadbare', 'in', 'coat', ',', 'heart', ',', 'body', ',', 'and', 'brain', ';', 'i', 'see', 'him', 'now', '.', 'he', 'was', 'ever', 'dusting', 'his', 'old', 'lexicons', 'and', 'grammars', ',', 'with', 'a', 'queer', 'handkerchief', ',', 'mockingly', 'embellished', 'with', 'all', 'the', 'gay', 'flags', 'of', 'all', 'the', 'known', 'nations', 'of', 'the', 'world', '.', 'he', 'loved', 'to', 'dust', 'his', 'old', 'grammars', ';', 'it', 'somehow', 'mildly', 'reminded', 'him', 'of', 'his', 'mortality', '.', '``', 'while', 'you', 'take', 'in', 'hand', 'to', 'school', 'others', ',', 'and', 'to', 'teach', 'them', 'by', 'what', 'name', 'a', '

In [5]:
ndist = FreqDist(emmawords)
nitems = ndist.most_common(30)
for item in nitems:
    print (item[0], '\t', item[1])


, 	 19204
the 	 14416
. 	 7308
of 	 6586
and 	 6414
a 	 4694
to 	 4597
; 	 4173
in 	 4162
that 	 3080
his 	 2530
it 	 2507
i 	 2097
he 	 1890
but 	 1813
! 	 1767
is 	 1748
as 	 1741
with 	 1721
-- 	 1713
was 	 1651
's 	 1634
for 	 1616
'' 	 1615
all 	 1508
`` 	 1456
this 	 1391
at 	 1318
not 	 1218
by 	 1201


In [6]:
emmawords2 = gutenberg.words('melville-moby_dick.txt')
emmawords2lowercase = [w.lower() for w in emmawords2]

len(emmawords)
len(emmawords2lowercase)

print(emmawords[:160])
print(emmawords2lowercase[:160])



['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']', 'etymology', '.', '(', 'supplied', 'by', 'a', 'late', 'consumptive', 'usher', 'to', 'a', 'grammar', 'school', ')', 'the', 'pale', 'usher', '--', 'threadbare', 'in', 'coat', ',', 'heart', ',', 'body', ',', 'and', 'brain', ';', 'i', 'see', 'him', 'now', '.', 'he', 'was', 'ever', 'dusting', 'his', 'old', 'lexicons', 'and', 'grammars', ',', 'with', 'a', 'queer', 'handkerchief', ',', 'mockingly', 'embellished', 'with', 'all', 'the', 'gay', 'flags', 'of', 'all', 'the', 'known', 'nations', 'of', 'the', 'world', '.', 'he', 'loved', 'to', 'dust', 'his', 'old', 'grammars', ';', 'it', 'somehow', 'mildly', 'reminded', 'him', 'of', 'his', 'mortality', '.', '``', 'while', 'you', 'take', 'in', 'hand', 'to', 'school', 'others', ',', 'and', 'to', 'teach', 'them', 'by', 'what', 'name', 'a', 'whale-fish', 'is', 'to', 'be', 'called', 'in', 'our', 'tongue', 'leaving', 'out', ',', 'through', 'ignorance', ',', 'the', 'letter', 'h', ',', 'which', 

In [7]:
>>> emptydict = dict()
>>> phonedict = {'Bailey':'32-16','Char':'15-18', 'Dave': '20-15'}

>>> phonedict['Bailey']

'32-16'

In [8]:
>>> phonedict['Avi'] = '41-54'
>>> phonedict

{'Bailey': '32-16', 'Char': '15-18', 'Dave': '20-15', 'Avi': '41-54'}

In [9]:
>>> phonedict.keys()

dict_keys(['Bailey', 'Char', 'Dave', 'Avi'])

In [10]:
>>> phonedict.values()

dict_values(['32-16', '15-18', '20-15', '41-54'])

In [11]:
>>> phonedict.items()

dict_items([('Bailey', '32-16'), ('Char', '15-18'), ('Dave', '20-15'), ('Avi', '41-54')])

In [12]:
>>> 'Char' in phonedict


True

In [13]:
>>> 'Dave' not in phonedict

False

In [14]:
>>> for pair in phonedict.items():
           print(pair)



('Bailey', '32-16')
('Char', '15-18')
('Dave', '20-15')
('Avi', '41-54')


In [15]:
# the function doublesum takes 2 numbers as parameters, either int or float
#  and returns a result which is the sum of those numbers multiplied by 2
def doublesum (x, y):
    result = 2 * (x + y)
    return result


In [16]:
>>> doublesum(3, 5)


16

In [17]:
>>> num = doublesum(3.4, 2)
>>> num


10.8

In [18]:
# this function takes a string and a list of words as parameters.
#   It will return all the words in the list that contain the string as a substring
def searchstring (substring, wordlist):
    # initialize the result
    result = [ ]
    #  loop over all the words
    for word in wordlist:
        # test each word if it contains the substring
        if substring in word:
            # add it to the result
            result.append(word)
    return result


In [19]:
>>> searchstring('zz', emmawords)

['fuzzing',
 'drizzly',
 'puzzled',
 'dazzling',
 'puzzle',
 'puzzled',
 'puzzled',
 'mizzen',
 'huzza',
 'huzza',
 'huzza',
 'huzza',
 'huzza',
 'belshazzar',
 'belshazzar',
 'belshazzar',
 'belshazzar',
 'huzza',
 'huzza',
 'dazzlingly',
 'mizzen',
 'piazza',
 'plazza',
 'mizzen',
 'whizzings',
 'mizzen-mast-heads',
 'gizzard',
 'grizzled',
 'puzzling',
 'muezzin',
 'muzzle',
 'grizzled',
 'puzzling',
 'piazza',
 'belshazzar',
 'grizzled',
 'puzzle',
 'huzza',
 'puzzle',
 'grizzly',
 'dazzling',
 'dazzlingly',
 'dazzling']

In [20]:
# multiple variable assignment and use
>>> name, phone, location = ('Zack', '22-15', 'Room 159')
>>> name


'Zack'

In [21]:
>>> phone


'22-15'

In [22]:
>>> location


'Room 159'

In [23]:
import re
# this regular expression pattern matches any word that contains all non-alphabetical
#   lower-case characters [^a-z]+
# the beginning ^ and ending $ require the match to begin and end on a word boundary 
pattern = re.compile('^[^a-z]+$')


In [24]:
nonAlphaMatch = pattern.match('**')
#  if it matched, print a message
if nonAlphaMatch: 'matched non-alphabetical'



In [25]:
# function that takes a word and returns true if it consists only
#   of non-alphabetic characters

def alpha_filter(w):
  # pattern to match a word of non-alphabetical characters
    pattern = re.compile('^[^a-z]+$')
    if (pattern.match(w)):
        return True
    else:
        return False


In [26]:
alphaemmawords = [w for w in emmawords if not alpha_filter(w)]
print(len(alphaemmawords))
print(alphaemmawords[:100])


215607
['moby', 'dick', 'by', 'herman', 'melville', 'etymology', 'supplied', 'by', 'a', 'late', 'consumptive', 'usher', 'to', 'a', 'grammar', 'school', 'the', 'pale', 'usher', 'threadbare', 'in', 'coat', 'heart', 'body', 'and', 'brain', 'i', 'see', 'him', 'now', 'he', 'was', 'ever', 'dusting', 'his', 'old', 'lexicons', 'and', 'grammars', 'with', 'a', 'queer', 'handkerchief', 'mockingly', 'embellished', 'with', 'all', 'the', 'gay', 'flags', 'of', 'all', 'the', 'known', 'nations', 'of', 'the', 'world', 'he', 'loved', 'to', 'dust', 'his', 'old', 'grammars', 'it', 'somehow', 'mildly', 'reminded', 'him', 'of', 'his', 'mortality', 'while', 'you', 'take', 'in', 'hand', 'to', 'school', 'others', 'and', 'to', 'teach', 'them', 'by', 'what', 'name', 'a', 'whale-fish', 'is', 'to', 'be', 'called', 'in', 'our', 'tongue', 'leaving', 'out', 'through']


In [27]:
nltkstopwords = nltk.corpus.stopwords.words('english')
print(len(nltkstopwords))
print(nltkstopwords)


179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [28]:
print(emmawords[:100])
print(emmawords[15300:15310])


['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']', 'etymology', '.', '(', 'supplied', 'by', 'a', 'late', 'consumptive', 'usher', 'to', 'a', 'grammar', 'school', ')', 'the', 'pale', 'usher', '--', 'threadbare', 'in', 'coat', ',', 'heart', ',', 'body', ',', 'and', 'brain', ';', 'i', 'see', 'him', 'now', '.', 'he', 'was', 'ever', 'dusting', 'his', 'old', 'lexicons', 'and', 'grammars', ',', 'with', 'a', 'queer', 'handkerchief', ',', 'mockingly', 'embellished', 'with', 'all', 'the', 'gay', 'flags', 'of', 'all', 'the', 'known', 'nations', 'of', 'the', 'world', '.', 'he', 'loved', 'to', 'dust', 'his', 'old', 'grammars', ';', 'it', 'somehow', 'mildly', 'reminded', 'him', 'of', 'his', 'mortality', '.', '``', 'while', 'you', 'take', 'in', 'hand', 'to', 'school', 'others', ',']
['who-e', 'debel', 'you', '?', "''", '--', 'he', 'at', 'last', 'said']


In [29]:
morestopwords = ['could','would','might','must','need','sha','wo','y',"'s","'d","'ll","'t","'m","'re","'ve"]


In [30]:
stopwords = nltkstopwords + morestopwords
print(len(stopwords))
print(stopwords)


194
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [31]:
stoppedemmawords = [w for w in alphaemmawords if not w in stopwords]
print(len(stoppedemmawords))


107330


In [32]:
emmadist = FreqDist(stoppedemmawords)
emmaitems = emmadist.most_common(30)
for item in emmaitems:
  print(item)



('whale', 1086)
('one', 912)
('like', 580)
('upon', 565)
('ahab', 508)
('man', 490)
('ship', 463)
('old', 443)
('ye', 438)
('sea', 384)
('though', 382)
('yet', 344)
('time', 326)
('captain', 324)
('long', 318)
('still', 312)
('said', 304)
('great', 303)
('two', 288)
('boat', 287)
('seemed', 283)
('head', 277)
('last', 275)
('see', 268)
('thou', 268)
('whales', 267)
('way', 264)
('stubb', 254)
("n't", 252)
('queequeg', 252)


In [33]:
emmabigrams = list(nltk.bigrams(emmawords))

print(emmabigrams[:20])


[('[', 'moby'), ('moby', 'dick'), ('dick', 'by'), ('by', 'herman'), ('herman', 'melville'), ('melville', '1851'), ('1851', ']'), (']', 'etymology'), ('etymology', '.'), ('.', '('), ('(', 'supplied'), ('supplied', 'by'), ('by', 'a'), ('a', 'late'), ('late', 'consumptive'), ('consumptive', 'usher'), ('usher', 'to'), ('to', 'a'), ('a', 'grammar'), ('grammar', 'school')]


In [34]:
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()



In [35]:
finder = BigramCollocationFinder.from_words(emmawords)
scored = finder.score_ngrams(bigram_measures.raw_freq)


In [36]:
print(type(scored))
first = scored[0]
print(type(first), first)


<class 'list'>
<class 'tuple'> ((',', 'and'), 0.010357309530999377)


In [37]:
for bscore in scored[:30]:
    print (bscore)


((',', 'and'), 0.010357309530999377)
(('of', 'the'), 0.0073493366380510535)
(('in', 'the'), 0.004604120177733157)
((',', 'the'), 0.003600155300816898)
((';', 'and'), 0.0033609292949891957)
(('to', 'the'), 0.002851103380930158)
(('.', '``'), 0.002337355729070666)
(('.', 'but'), 0.002337355729070666)
((',', 'that'), 0.0023059818266670325)
((',', 'as'), 0.002058912345238422)
(('.', "''"), 0.001917729784422073)
(("''", '``'), 0.001854981979614807)
((',', 'i'), 0.0017961559126079948)
((',', 'he'), 0.0017687037480048158)
(('from', 'the'), 0.0017255646321998204)
((',', 'in'), 0.0015804603335830172)
(('.', 'the'), 0.0014981038397734803)
(('of', 'his'), 0.001458886461768939)
(('and', 'the'), 0.0014431995105671225)
(('the', 'whale'), 0.0014039821325625812)
(('on', 'the'), 0.0013686864923584939)
((',', 'but'), 0.001345156065555769)
((';', 'but'), 0.0013373125899548608)
(('of', 'a'), 0.0013059386875512277)
(('at', 'the'), 0.0012902517363494112)
(('to', 'be'), 0.0012902517363494112)
(('!', "''"), 0

In [38]:
finder.apply_word_filter(alpha_filter)
scored = finder.score_ngrams(bigram_measures.raw_freq)
for bscore in scored[:30]:
    print (bscore)


(('of', 'the'), 0.0073493366380510535)
(('in', 'the'), 0.004604120177733157)
(('to', 'the'), 0.002851103380930158)
(('from', 'the'), 0.0017255646321998204)
(('of', 'his'), 0.001458886461768939)
(('and', 'the'), 0.0014431995105671225)
(('the', 'whale'), 0.0014039821325625812)
(('on', 'the'), 0.0013686864923584939)
(('of', 'a'), 0.0013059386875512277)
(('at', 'the'), 0.0012902517363494112)
(('to', 'be'), 0.0012902517363494112)
(('by', 'the'), 0.0012471126205444156)
(('with', 'the'), 0.0012235821937416908)
(('for', 'the'), 0.0011961300291385118)
(('it', 'was'), 0.0011529909133335162)
(('it', 'is'), 0.001109851797528521)
(('in', 'his'), 0.001027495303718984)
(('in', 'a'), 0.0010118083525171675)
(('with', 'a'), 0.000972590974512626)
(('the', 'ship'), 0.0009647474989117178)
(('into', 'the'), 0.0009608257611112636)
(('upon', 'the'), 0.0008627823160999102)
(('as', 'the'), 0.0008510171026985478)
(('that', 'the'), 0.0008431736270976395)
(('the', 'sea'), 0.0008431736270976395)
(('all', 'the'), 0.

In [39]:
finder.apply_word_filter(lambda w: w in stopwords)
scored = finder.score_ngrams(bigram_measures.raw_freq)
for bscore in scored[:30]:
    print (bscore)


(('sperm', 'whale'), 0.0006784606394785658)
(('white', 'whale'), 0.00041570420684813853)
(('moby', 'dick'), 0.0003176607618367851)
(('old', 'man'), 0.0002941303350340603)
(('captain', 'ahab'), 0.00023922600582770238)
(('right', 'whale'), 0.00020393036562361513)
(('captain', 'peleg'), 0.0001254956096145324)
(('cried', 'ahab'), 0.0001254956096145324)
(('mr.', 'starbuck'), 0.00011373039621316998)
(('one', 'hand'), 0.00010980865841271584)
(('let', 'us'), 0.00010588692061226171)
(('ca', "n't"), 0.00010196518281180757)
(('every', 'one'), 9.412170721089929e-05)
(('cried', 'stubb'), 9.019996941044515e-05)
(('look', 'ye'), 8.627823160999102e-05)
(('never', 'mind'), 8.627823160999102e-05)
(('one', 'side'), 8.627823160999102e-05)
(("'ye", 'see'), 8.235649380953688e-05)
(('thou', 'art'), 8.235649380953688e-05)
(('ai', "n't"), 7.451301820862861e-05)
(('new', 'bedford'), 7.059128040817447e-05)
(('said', 'stubb'), 7.059128040817447e-05)
(('sperm', 'whales'), 7.059128040817447e-05)
(('years', 'ago'), 

In [40]:
finder2 = BigramCollocationFinder.from_words(emmawords)
finder2.apply_freq_filter(2)
scored = finder2.score_ngrams(bigram_measures.raw_freq)
for bscore in scored[:20]:
    print (bscore)

finder2.apply_ngram_filter(lambda w1, w2: len(w1) < 2)
scored = finder2.score_ngrams(bigram_measures.raw_freq)
for bscore in scored[:20]:
    print (bscore)


((',', 'and'), 0.010357309530999377)
(('of', 'the'), 0.0073493366380510535)
(('in', 'the'), 0.004604120177733157)
((',', 'the'), 0.003600155300816898)
((';', 'and'), 0.0033609292949891957)
(('to', 'the'), 0.002851103380930158)
(('.', '``'), 0.002337355729070666)
(('.', 'but'), 0.002337355729070666)
((',', 'that'), 0.0023059818266670325)
((',', 'as'), 0.002058912345238422)
(('.', "''"), 0.001917729784422073)
(("''", '``'), 0.001854981979614807)
((',', 'i'), 0.0017961559126079948)
((',', 'he'), 0.0017687037480048158)
(('from', 'the'), 0.0017255646321998204)
((',', 'in'), 0.0015804603335830172)
(('.', 'the'), 0.0014981038397734803)
(('of', 'his'), 0.001458886461768939)
(('and', 'the'), 0.0014431995105671225)
(('the', 'whale'), 0.0014039821325625812)
(('of', 'the'), 0.0073493366380510535)
(('in', 'the'), 0.004604120177733157)
(('to', 'the'), 0.002851103380930158)
(("''", '``'), 0.001854981979614807)
(('from', 'the'), 0.0017255646321998204)
(('of', 'his'), 0.001458886461768939)
(('and', 'th

In [41]:
finder3 = BigramCollocationFinder.from_words(emmawords)
scored = finder3.score_ngrams(bigram_measures.pmi)
for bscore in scored[:30]:
    print (bscore)

(('*in', 'sperm-whalemen'), 17.96007548627488)
(('11', 'nightgown'), 17.96007548627488)
(('12', 'biographical'), 17.96007548627488)
(('121', 'midnight.'), 17.96007548627488)
(('2,800', 'firkins'), 17.96007548627488)
(('25', 'postscript'), 17.96007548627488)
(('a.s.', 'walw-ian'), 17.96007548627488)
(('accidental', 'advantages'), 17.96007548627488)
(('adoring', 'cherubim'), 17.96007548627488)
(('agassiz', 'imagines'), 17.96007548627488)
(('agrarian', 'freebooting'), 17.96007548627488)
(('air-freighted', 'demijohn'), 17.96007548627488)
(('albert', 'durer'), 17.96007548627488)
(('all-ramifying', 'heartlessness'), 17.96007548627488)
(('amphitheatrical', 'heights'), 17.96007548627488)
(('anacharsis', 'clootz'), 17.96007548627488)
(('andrew', 'jackson'), 17.96007548627488)
(('anno', '1652'), 17.96007548627488)
(('annus', 'mirabilis'), 17.96007548627488)
(('arkansas', 'duellist'), 17.96007548627488)
(('aroostook', 'hemlock'), 17.96007548627488)
(('arrantest', 'topers'), 17.96007548627488)
(('

In [42]:
finder3.apply_freq_filter(5)
scored = finder3.score_ngrams(bigram_measures.pmi)
for bscore in scored[:30]:
    print (bscore)



(('samuel', 'enderby'), 14.278251446301137)
(('mrs.', 'hussey'), 13.872612645024544)
(('heidelburgh', 'tun'), 13.737683064938436)
(('don', 'sebastian'), 13.468222389945208)
(('st.', 'george'), 13.348640774192532)
(('father', 'mapple'), 13.259635768133789)
(('huzza', 'porpoise'), 13.034076067718662)
(('d', "'ye"), 12.915681366916429)
(('fiery', 'pit'), 12.889686158383487)
(('steering', 'oar'), 12.375112985553727)
(('cape', 'horn'), 11.932169489704997)
(('seven', 'hundred'), 11.901181797221312)
(('centuries', 'ago'), 11.79015048483257)
(('moby', 'dick'), 11.58482147992334)
(('new', 'york'), 11.55068455013718)
(('new', 'zealand'), 11.55068455013718)
(('new', 'bedford'), 11.550684550137179)
(('book', 'ii'), 11.12718547211014)
(('saturday', 'night'), 10.904793050773694)
(('she', 'blows'), 10.716149903388795)
(('drew', 'nigh'), 10.659341613656645)
(('chief', 'mate'), 10.643309096503298)
(('years', 'ago'), 10.452280846076183)
(('english', 'whalers'), 10.419366223603452)
(('forty', 'years'), 1