In [160]:
import numpy as np
import nltk, pprint
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from urllib.request import urlopen
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
import re
import time
# nltk.download() # Might need this if tokenize doens't work

In [161]:
# Creating word and sentence tokens
bookURLs = ["http://www.gutenberg.org/files/28054/28054-0.txt"] # takes about 1 hour per book on my cpu to encode 1 hots

#             "http://www.gutenberg.org/files/2554/2554-0.txt", 
#             "http://www.gutenberg.org/files/2600/2600-0.txt",
#             "http://www.gutenberg.org/files/1399/1399-0.txt",
#             "http://www.gutenberg.org/files/98/98-0.txt",
#             "http://www.gutenberg.org/files/1400/1400-0.txt",
#             "http://www.gutenberg.org/cache/epub/730/pg730.txt",
#             "http://www.gutenberg.org/cache/epub/84/pg84.txt",
#             "http://www.gutenberg.org/cache/epub/5200/pg5200.txt",
#             "http://www.gutenberg.org/cache/epub/7849/pg7849.txt"

wordTokens = []
sentTokens = []
for book in bookURLs:
    print("Reading book: " + book)
    response = urlopen(book)
    rawbook = response.read().decode('utf8')
    wordTokens += word_tokenize(rawbook)
    sentTokens += sent_tokenize(rawbook)
    
# Cleaning sentences
print("Cleaning sentences...")
for i in range(len(sentTokens)):
    sentTokens[i] = re.sub(r'\r*\n', " ", sentTokens[i])
    sentTokens[i] = re.sub(r' +', " ", sentTokens[i])
print("Finished Cleaning")

# Creating one-hot words
wordlb = preprocessing.LabelBinarizer()
wordEncoding = wordlb.fit_transform(wordTokens)
print("One-hot word dimensions:", wordEncoding.shape)

Reading book: http://www.gutenberg.org/files/28054/28054-0.txt
Cleaning sentences...
Finished Cleaning


In [164]:
# Creates an array called allSentences: each element is an array whose elements are one-hot words
# First element is "START", Last element is "END", padded with "PAD" until sentence is maxSentenceLength
# Otherwise it is truncated at maxSentenceLength and has no padding
# Dimensions of allSentences: numSentences * maxSentenceLength * numWords <-- number of unique words

allSentences = []
i = 0
maxSentenceLength = 25
numSentences = len(sentTokens)
averageTime = []
for sent in sentTokens:
    startTime = time.time()
    wordsInSent = word_tokenize(sent)
    sentOfOneHotWords = ['START']
    for word in wordsInSent:
        oneHotWord = wordlb.transform([word])[0]
        sentOfOneHotWords.append(oneHotWord)
    while len(sentOfOneHotWords) < (maxSentenceLength + 1):
        sentOfOneHotWords.append(["PAD"])
    sentOfOneHotWords = sentOfOneHotWords[:(maxSentenceLength + 1)] # truncating (+1 to account for "START" Token)
    sentOfOneHotWords.append(['END'])
    allSentences.append(sentOfOneHotWords)
    averageTime.append(time.time() - startTime)
    if i % 50 == 0:
        print(i, "sentences completed of:", numSentences, "in:", sum(averageTime), "seconds")
        print("ETA:", (sum(averageTime)/len(averageTime))*(numSentences-sentTokens.index(sent)), "seconds")
    i = i + 1

0 sentences completed of: 19843 in: 0.41724586486816406 seconds
ETA: 8279.40969657898
50 sentences completed of: 19843 in: 5.066877126693726 seconds
ETA: 1966.4450778166452
100 sentences completed of: 19843 in: 8.56989574432373 seconds
ETA: 1675.202491883004


KeyboardInterrupt: 

In [241]:
# Returns sentence at specified index with one-hot encoded words
def getSentence(sentenceIndex):
    if sentenceIndex > len(allSentences):
        raise ValueError("Sentence index is greater number of sentences in corpus")
    return(allSentences[sentenceIndex])

# Returns one-hot encoded word at specified sentence and word indices
def getWord(sentenceIndex, wordIndex):
    if wordIndex > maxSentenceLength+1:
        raise ValueError("Word index is greater than max sentence length")
    if allSentences[sentenceIndex][wordIndex][0] == "START":
        return("START")
    elif allSentences[sentenceIndex][wordIndex][0] == "END":
        return("END")
    elif allSentences[sentenceIndex][wordIndex][0] == "PAD":
        return("PAD")
    numWords = wordEncoding.shape[1]
    return(np.array(getSentence(sentenceIndex)[wordIndex]).reshape(1, numWords))

# Decodes word at specified sentence and word indicies back into English
def decode(sentenceIndex, wordIndex):
    word = getWord(sentenceIndex, wordIndex)
    if type(word) == str: # 'START', 'END', 'PAD'
        return word
    return(wordlb.inverse_transform(word)[0])