In [None]:
from nltk.corpus import brown

In [None]:
import random

<h3>Functions we'll need for text generation</h3>

In [None]:
# Add open and close sentence tags
def addS(text):
    """Wrap every sentence in sentence-boundary markers, in place.

    Each sentence (a list of tokens) in ``text`` gets u'<s>' put at the
    front and u'</s>' put at the end. The sentence lists themselves are
    modified and the same ``text`` object is returned.
    """
    for sentence in text:
        sentence.insert(0, u'<s>')
        sentence.append(u'</s>')

    return text

In [None]:
# Create list of all bigrams
def createBGList(text):
    """Collect every adjacent word pair from every sentence.

    Returns one flat list of two-element lists ``[previous, current]`` in
    corpus order. Sentences with fewer than two tokens contribute nothing.
    """
    return [
        [previous, current]
        for sent in text
        for previous, current in zip(sent, sent[1:])
    ]

In [None]:
# Create list of all trigrams
def createTGList(text):
    """Collect every run of three consecutive words from every sentence.

    Each entry has the form ``[[first, second], third]``: the leading pair
    is kept together so it can be compared against a two-word seed.
    Sentences with fewer than three tokens contribute nothing.
    """
    return [
        [[first, second], third]
        for sent in text
        for first, second, third in zip(sent, sent[1:], sent[2:])
    ]

In [None]:
# Make everything lowercase
def lowerAll(text):
    """Lowercase every token of every sentence, in place.

    The sentence lists inside ``text`` are overwritten token by token and
    the same ``text`` object is returned.
    """
    for sent in text:
        for position, token in enumerate(sent):
            sent[position] = token.lower()

    return text

In [None]:
# List only bigrams that start with seed word
# Randomize list
def nextWordBi(seed, BGs):
    """Return, in random order, every word that follows ``seed`` in ``BGs``.

    ``BGs`` is a list of ``[previous, current]`` pairs. A follower appears
    once per occurrence, so frequent followers are more likely to end up
    first. The result is empty when ``seed`` never starts a bigram.
    """
    followers = []
    for pair in BGs:
        if pair[0] == seed:
            followers.append(pair[1])

    random.shuffle(followers)
    return followers

In [None]:
# List only trigrams that start with seed bigram
# Randomize list
def nextWordTri(seed, TGs):
    """Return, in random order, every word that follows the pair ``seed``.

    ``TGs`` is a list of ``[[first, second], third]`` entries and ``seed``
    is a two-word list compared against the leading pair. A follower
    appears once per occurrence; the result is empty when the pair never
    starts a trigram.
    """
    followers = []
    for entry in TGs:
        if entry[0] == seed:
            followers.append(entry[1])

    random.shuffle(followers)
    return followers

<h3>Get our text set up</h3>

In [None]:
# Materialize the corpus sentences as a plain list before editing them.
# The original author found this works better, though apparently not for
# everyone; presumably brown.sents() is a lazy view that does not keep
# in-place edits -- TODO confirm
text = lowerAll(list(brown.sents()))

# addS edits the sentences in place and returns the same list,
# so sText and text refer to the same tagged sentences
sText = addS(text)

BGList = createBGList(sText)
TGList = createTGList(sText)

<h3>Bigram text</h3>

In [None]:
# Create sentence from bigrams
seed = u'<s>'
while seed != u'</s>':
    word = nextWordBi(seed, BGList)
    print word[0],
    seed = word[0]

<h3>Trigram text</h3>

In [None]:
# Create a sentence from trigrams
# First word is chosen from bigrams
firstWord = nextWordBi(u'<s>', BGList)
print firstWord[0],
seed = [u'<s>', firstWord[0]]
while seed[1] != u'</s>':
    word = nextWordTri(seed, TGList)
    print word[0],
    seed = [seed[1], word[0]]

<h3>Additional functions needed for spacing</h3>

In [None]:
# Create dictionary to count frequency of each word
def wordCount(text):
    """Count how many times each word occurs across all sentences.

    Parameters:
        text: iterable of sentences, each an iterable of hashable tokens.

    Returns:
        dict mapping each distinct token to its number of occurrences;
        an empty dict when ``text`` has no tokens.
    """
    count = {}
    for sent in text:
        for word in sent:
            # get() supplies 0 the first time a word is seen
            count[word] = count.get(word, 0) + 1

    return count

In [None]:
# Create dictionary to count frequency of each item in a flat list
# (spaceIt applies it to the words that followed a seed in the bigrams)
def bgCount(bgs):
    """Count how many times each item occurs in ``bgs``.

    Parameters:
        bgs: iterable of hashable items. Items are used as dict keys, so
            list-valued bigrams cannot be passed directly.

    Returns:
        dict mapping each distinct item to its number of occurrences;
        an empty dict when ``bgs`` is empty.
    """
    count = {}
    for bg in bgs:
        # get() supplies 0 the first time an item is seen
        count[bg] = count.get(bg, 0) + 1

    return count

In [None]:
# Create list of all possible words that could follow current word
def getPossibleWords(seed, BGs):
    """Return every word that follows ``seed`` in ``BGs``, in corpus order.

    ``BGs`` is a list of ``[previous, current]`` pairs. Followers are
    repeated once per occurrence; the result is empty when ``seed`` never
    starts a bigram.
    """
    possWords = []
    for pair in BGs:
        if pair[0] == seed:
            possWords.append(pair[1])

    return possWords

In [None]:
# Insert the first space
def spaceIt(seed, BGList, unspaced, unigramCount):
    """Split the first word off the front of an unspaced string.

    Candidates are the non-empty prefixes of ``unspaced``, including the
    whole string. A prefix that was seen following ``seed`` in ``BGList``
    is scored by how often it followed ``seed``. If no prefix is a known
    follower of ``seed``, prefixes are scored by ``unigramCount`` instead.
    The highest score wins; ties keep the shortest prefix.

    Parameters:
        seed: the previous word (u'<s>' at the start of a sentence).
        BGList: list of [previous, current] bigrams.
        unspaced: the remaining text with no spaces between its words.
        unigramCount: dict mapping word -> frequency, used as the backoff.

    Returns:
        (word, rest): the chosen prefix and the remainder of ``unspaced``.
        ``word`` is '' and ``rest`` is ``unspaced`` unchanged when no
        prefix is known at all.
    """
    # Frequency of every word observed directly after the seed
    choiceCount = bgCount(getPossibleWords(seed, BGList))

    def bestSplit(counts):
        # Return [score, cut] for the best-scoring prefix found in counts,
        # or [0, 0] when no prefix is a key of counts
        best = [0, 0]
        # Look at every possible string (1, 12, 123, etc), up to and
        # including the whole remaining string
        for cut in range(1, len(unspaced) + 1):
            prefix = unspaced[:cut]
            # Dict lookup: same keys as the follower list, no linear scan
            if prefix in counts:
                score = counts[prefix]
                # Strict comparison keeps the earliest prefix on ties
                if score > best[0]:
                    best = [score, cut]
        return best

    topChoice = bestSplit(choiceCount)

    # If no strings were found based on bigrams, back off to unigrams
    if topChoice[0] == 0:
        topChoice = bestSplit(unigramCount)

    return unspaced[:topChoice[1]], unspaced[topChoice[1]:]

<h3>A bit of additional setup for spacing</h3>

In [None]:
unigramCount = wordCount(text)
# A lot of single letters have a very high frequency in the corpus
# This messes up the unigram backoff part
# so lower the count of each to 1
# Skip 'a' and 'i' because we actually want those counts
# (every other letter of the alphabet is listed exactly once)
letters = 'bcdefghjklmnopqrstuvwxyz'
for letter in letters:
    unigramCount[letter] = 1

<h3>Get sentence from user</h3>

In [None]:
# Read the sentence, lowercase it to match the lowercased corpus, and pad it
# with a trailing space so the final word is still a strict prefix of
# whatever remains to be split
unspaced = raw_input("> ").lower() + u' '

<h3>Time to run it!</h3>

In [None]:
# Starting 'seed' is open sentence tag
seed = u'<s>'
# Print the word the spacing function found
# Call the spacing function on the remainder of the string
while seed != '':
    seed, unspaced = spaceIt(seed, BGList, unspaced, unigramCount)
    print seed,