Task 1: Third-Order Letter Approximation Model

In [102]:
# Imports.

# Selecting random items from lists.
import random

# Efficient data structures.
import collections
import json

Step 1: A method that will read in the text from one of the books

In [103]:
#method to read in a book
def readBook(book):
    with open('books/'+book, 'r', encoding='utf-8') as file:#open the file
        english = file.read()#store the read in file in a variable

    return english#return the text

In [104]:
#Clean the text by putting it to lower case and only keeping ascii chars
def cleanBook(book):
    #change everything to lower case
    english = book.upper();
    # The characters to keep.
    keep = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ .'
    # Remove unwanted characters.
    cleaned = ''.join(c for c in english if c in keep)

    return cleaned


In [105]:
#method to remove the preamble and poatamble 
def getRidOfPreambleAndPostamble(book):
    #sentences that are at the start and end of the actual content
    startOfBook = "*** START OF THE PROJECT GUTENBERG EBOOK"
    endOfBook = "*** END OF THE PROJECT GUTENBERG EBOOK"
    #if the starter sentence is in the text
    if startOfBook in book:
        #split the text at the start line and keep the content after it
        book = book.split(startOfBook, 1)[1]
    #if the end sentence is in the text
    if endOfBook in book:
        #split the text at the end line and keep the content before it
        book = book.split(endOfBook, 1)[0]
    #return the book without the preamble and the postamble
    return book 



In [106]:
text = readBook('voyaging.txt')
strippedText = getRidOfPreambleAndPostamble(text)
cleanedText = cleanBook(strippedText)
print(cleanedText)

 DAVID GOES VOYAGING                                  DAVID                             GOES VOYAGING                                   BY                          DAVID BINNEY PUTNAM                WITH ILLUSTRATIONS FROM PHOTOGRAPHS AND              DECORATIONS BY ISABEL COOPER DON DICKERMAN                          AND DWIGHT FRANKLIN                          G. P. PUTNAMS SONS                          NEW YORK AND LONDON                        THE KNICKERBOCKER PRESS                                                  COPYRIGHT  BY G. P. PUTNAMS SONS                    FIRST PRINTING SEPTEMBER                     SECOND PRINTING OCTOBER                     THIRD PRINTING NOVEMBER                     FOURTH PRINTING NOVEMBER                     FIFTH PRINTING NOVEMBER                 MADE IN THE UNITED STATES OF AMERICA                                   TO                                GRANDMA                                  AND                              GRANDPA BUBA SOLEMN FOREWO

Building the trigram model

In [107]:
#method to build the trigram model
def makeTrigramModel(book):
    #create a default int dictionary
    trigramModel = collections.defaultdict(int)

    for i in range(len(book) - 2):
        #this is getting the 3 char sequence
        trigram = text[i:i+3]
        #increment the count
        trigramModel[trigram] += 1
    #retun the built trigram model
    return trigramModel

In [108]:
trigramModel = makeTrigramModel(cleanedText)
print(trigramModel)

defaultdict(<class 'int'>, {'The': 156, 'he ': 891, 'e P': 11, ' Pr': 8, 'Pro': 5, 'roj': 3, 'oje': 3, 'jec': 7, 'ect': 35, 'ct ': 10, 't G': 3, ' Gu': 5, 'Gut': 3, 'ute': 13, 'ten': 25, 'enb': 4, 'nbe': 5, 'ber': 23, 'erg': 12, 'rg ': 3, 'g e': 6, ' eB': 2, 'eBo': 3, 'Boo': 4, 'ook': 70, 'ok ': 41, 'k o': 20, ' of': 333, 'of ': 302, 'f D': 1, ' Da': 19, 'Dav': 12, 'avi': 21, 'vid': 13, 'id ': 31, 'd g': 23, ' go': 75, 'goe': 11, 'oes': 17, 'es ': 219, 's v': 16, ' vo': 15, 'voy': 5, 'oya': 6, 'yag': 6, 'agi': 4, 'gin': 22, 'ing': 369, 'ng\n': 26, 'g\n ': 1, '\n  ': 73, '   ': 1726, '  \n': 1, ' \nT': 1, '\nTh': 60, 'Thi': 20, 'his': 99, 'is ': 234, 's e': 20, ' eb': 2, 'ebo': 3, 'boo': 34, 'k i': 11, ' is': 173, 's f': 48, ' fo': 158, 'for': 151, 'or ': 246, 'r t': 139, ' th': 1204, 'the': 1192, 'e u': 15, ' us': 29, 'use': 44, 'se ': 77, 'e o': 140, 'f a': 34, ' an': 581, 'any': 63, 'nyo': 2, 'yon': 9, 'one': 138, 'ne ': 134, 'e a': 231, 'nyw': 7, 'ywh': 2, 'whe': 49, 'her': 232, 'er

TASK 2

In [109]:
#get the trigram counts for each matching trigram
def getCounts(model, matchingTrigram):
    counts = [model[trigram] for trigram in matchingTrigram]
    return counts

In [110]:
#Get the matching trigrams and their counts
def getMatchingTrigrams(model, twoChars):
    matchingTrigrams = [trigram for trigram in model if trigram.startswith(twoChars)]

    if matchingTrigrams:
        counts = getCounts(model, matchingTrigrams)
        return matchingTrigrams, counts
    else:
        return None, None

In [111]:
#pick the 3rd char of a trigram based on the random counts choice
def pickNextCharacter(trigrams, counts):
    #return only the third character
    chosenTrigram = random.choices(trigrams, counts, k=1)[0]
    return chosenTrigram[-1]

In [112]:
def generateText(model, startText="TH", length=10000):
    #start the generated text by storing the initial string given in the task
    generatedText = startText
    #keep going till the text is 10000
    while len(generatedText) < length:
        #take out the last 2 chars from the current text
        lastTwoChars = generatedText[-2:]
        #call the method that gets all the trigrams that start with the current 2 chars and their counts
        trigrams, counts = getMatchingTrigrams(model, lastTwoChars)
        #if there are matching trigrams then pick the next char based on the probablity of the counts
        if trigrams:
            nextCharacter = pickNextCharacter(trigrams, counts)
            generatedText += nextCharacter
        #if there are no mathcing trigrams stop generating the text
        else:
            break

    return generatedText

In [113]:
genText = generateText(trigramModel, startText="TH", length=10000)
print(genText)

THE Parklithe Moto th wit lan is his or orawat
whic Of arowe for at undry had we ourearsto elowas vere surrin st
sped, bing alls Coccimbs up eve had hip forned. The glancledged Unche ISEA


Even seare and. Alont ded ous It of cimbly wit
fres hite put luee drat rockis of
wity beader enthe slying sch go the prich tom a ples, Done.

              65

              Whe
cound
hadley forty to I he wer, riging themest up them
imed clooked ang the dowelmounk.

Mach afrigh
Bet. Tagat unden a siles on the bours here himmition tom a youtnamem I led hey big the moset a asited der a sh I re of gottionged a camat on a righ re duck
Zoon the by witeand tilest areas
1813. I bly wify aboad swimme hilon an has
ning on, lone booked overiver fir to and spin alosslan.

Oceactiound wil bare wand wo in the althe the the
stairst we
boonsts thead he pook, Dwithe ottles. Howbor hand on ther a prearwith putle of Pany day went ful cany bight mougher and beace and Moll
maill they
oneachatectublue, acks, dre sma sco

Task 3 

In [114]:
#get all the words from words.txt and store them in a set to get easily
def getAllTheWords(filePath):
    with open(filePath, 'r') as file:
        words = set(file.read().upper().splitlines())
    return words

In [115]:
words = getAllTheWords('words.txt')
print(words)



In [116]:
#take the words out 
def takeOutWords(text):
    words = [word.strip('.') for word in text.split()]
    return words

In [117]:
#count the valid words and calculate the percentage
def calculatePercentage(text, realWords):
    words = takeOutWords(text)
    wordCount = sum(1 for word in words if word in realWords)
    if len(words) == 0:
        return 0
    
    return (wordCount/len(words)) * 100

In [118]:
percentage = calculatePercentage(genText, words)
print(percentage)

0.16638935108153077


Task 4

In [119]:
with open('trigrams.json', 'w') as jsonFile:
    json.dump(trigramModel, jsonFile)