Task 1: Third-Order Letter Approximation Model

In [17]:
# Imports.

# Selecting random items from lists.
import random

# Efficient data structures.
import collections

Step 1: A method that will read in the text from one of the books

In [18]:
#method to read in a book
def readBook(book):
    with open('books/'+book, 'r', encoding='utf-8') as file:#open the file
        english = file.read()#store the read in file in a variable

    return english#return the text

In [19]:
#Clean the text by putting it to lower case and only keeping ascii chars
def cleanBook(book):
    #change everything to lower case
    english = book.upper();
    # The characters to keep.
    keep = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ .'
    # Remove unwanted characters.
    cleaned = ''.join(c for c in english if c in keep)

    return cleaned


In [20]:
#method to remove the preamble and poatamble 
def getRidOfPreambleAndPostamble(book):
    #sentences that are at the start and end of the actual content
    startOfBook = "*** START OF THE PROJECT GUTENBERG EBOOK"
    endOfBook = "*** END OF THE PROJECT GUTENBERG EBOOK"
    #if the starter sentence is in the text
    if startOfBook in book:
        #split the text at the start line and keep the content after it
        book = book.split(startOfBook, 1)[1]
    #if the end sentence is in the text
    if endOfBook in book:
        #split the text at the end line and keep the content before it
        book = book.split(endOfBook, 1)[0]
    #return the book without the preamble and the postamble
    return book 



In [21]:
text = readBook('voyaging.txt')
strippedText = getRidOfPreambleAndPostamble(text)
cleanedText = cleanBook(strippedText)
print(cleanedText)

 DAVID GOES VOYAGING                                  DAVID                             GOES VOYAGING                                   BY                          DAVID BINNEY PUTNAM                WITH ILLUSTRATIONS FROM PHOTOGRAPHS AND              DECORATIONS BY ISABEL COOPER DON DICKERMAN                          AND DWIGHT FRANKLIN                          G. P. PUTNAMS SONS                          NEW YORK AND LONDON                        THE KNICKERBOCKER PRESS                                                  COPYRIGHT  BY G. P. PUTNAMS SONS                    FIRST PRINTING SEPTEMBER                     SECOND PRINTING OCTOBER                     THIRD PRINTING NOVEMBER                     FOURTH PRINTING NOVEMBER                     FIFTH PRINTING NOVEMBER                 MADE IN THE UNITED STATES OF AMERICA                                   TO                                GRANDMA                                  AND                              GRANDPA BUBA SOLEMN FOREWO

Building the trigram model

In [22]:
#method to build the trigram model
def makeTrigramModel(book):
    #create a default int dictionary
    trigramModel = collections.defaultdict(int)

    for i in range(len(book) - 2):
        #this is getting the 3 char sequence
        trigram = text[i:i+3]
        #increment the count
        trigramModel[trigram] += 1
    #retun the built trigram model
    return trigramModel

In [23]:
#getting a random trigram from the trigram model
def getARandomTrigram(trigramModel):
    #store the full list of trigrams
    listOfTrigrams = list(trigramModel.keys())
    #randomly pick one of the trigrams and return it
    return random.choice(listOfTrigrams)

In [24]:
#finding a matching trigram
def getMatchingTrigram(trigramModel, lastTwoChars):
    #store the full list of trigrams
    listOfTrigrams = list(trigramModel.keys())
    #finds all the trigrams that match with the last 2 characters
    return [trigram for trigram in listOfTrigrams if trigram.startswith(lastTwoChars)]

In [25]:
#add the last character of the next trigram to the text
def appendCharacter(createdText, nextTrigram):
    return createdText + nextTrigram[-1]

In [26]:
#makes the text using the trigram model that was created above 
def makeText(trigramModel, length=100):
    #call the methods to get a random trigram and initialize with the first trigram
    thisTrigram = getARandomTrigram(trigramModel)
    createdText = thisTrigram
    #loop that will create the text to be outputted
    for _ in range(length - 3):
        #get the last 2 chars for matching trigrams
        lastTwoChars = createdText[-2:]
        #call the method that matches the trigrams
        matchedTrigram = getMatchingTrigram(trigramModel, lastTwoChars)
        #if there is a matching trigram
        if matchedTrigram:
            #get the next trigram and append its last character
            nextTrigram = random.choice(matchedTrigram)
            createdText = appendCharacter(createdText, nextTrigram)
        else:
            #if there is no matches break
            break;
    #return the text that was just created
    return createdText

In [27]:
#Get the last 2 characters
def getLastTwoChars(text):
    return text[-2:]


In [28]:
#Sort through the trigram model to only get the trigrams that start with the last 2 characters and return the list
def sortMatchingTrigrams(model, twoChars):
    matchingTrigrams = []

    for trigram in model:
        if trigram.startwith(twoChars):
            matchingTrigrams.append(trigram)
    
    return matchingTrigrams

In [29]:
#get the trigram counts for each matching trigram
def getCounts(model, matchingTrigram):
    counts = [model[trigram] for trigram in matchingTrigram]
    return counts

In [30]:
#Get the matching trigrams and their counts
def getMatchingTrigrams(model, twoChars):
    matchingTrigrams = sortMatchingTrigrams(model, twoChars)

    if matchingTrigrams:
        counts = getCounts(model, matchingTrigrams)
        return matchingTrigrams, counts
    else:
        return None, None

In [31]:
#pick the 3rd char of a trigram based on the random counts choice
def pickNextCharacter(trigrams, counts):
    #return only the third character
    chosenTrigram = random.choices(trigrams, counts=counts, k=1)[0]
    return chosenTrigram[-1]

In [32]:
def generateText(model, startText="TH", length=10000):
    #start the generated text by storing the initial string given in the task
    generatedText = startText
    #keep going till the text is 10000
    while len(generatedText) < length:
        #take out the last 2 chars from the current text
        lastTwoChars = generatedText[-2:]
        #call the method that gets all the trigrams that start with the current 2 chars and their counts
        trigrams, counts = getMatchingTrigrams(model, lastTwoChars)
        #if there are matching trigrams then pick the next char based on the probablity of the counts
        if trigrams:
            nextTrigram = random.choices(trigrams, counts=counts, k=1)[0]
            generatedText += nextTrigram[-1]
        #if there are no mathcing trigrams stop generating the text
        else:
            break

    return generatedText