In [1]:
import os, sys, re, json, time, csv, copy, random, unittest
import time
import itertools, collections

import numpy as np
from scipy import stats
import pandas as pd
import cPickle as pickle

import nltk

from shared_lib import utils, vocabulary, ngram_lm, ngram_utils

random.seed(10)


In [2]:
# Display the installed NLTK version so the environment is recorded with the notebook.
nltk.__version__

'3.2.2'

In [4]:
'''
1. Read TSV tokens from files generated by Book-NLP into a pandas dataframe.
2. Read tokens for each sentence.
3. Pad the sentence with two start tags for Trigram Model.
4. Canonicalize the words - util.py for vocabulary
5. End the sentence with a closing sentence tag.
6. Split the dataset into train and test
'''

'\n1. Read TSV tokens from files generated by Book-NLP into a pandas dataframe.\n2. Read tokens for each sentence.\n3. Pad the sentence with two start tags for Trigram Model.\n4. Canonicalize the words - util.py for vocabulary\n5. End the sentence with a closing sentence tag.\n6. Split the dataset into train and test\n'

In [3]:
indir = '../book-nlp-master/data/tokens.gutenberg'
books = []
for root, dirs, filenames in os.walk(indir):
    for f in filenames:
         books.append(f)
print "Number of books tokenized: ", len(books)
print "A few titles: ", 
print books[3]
print books[10]
print books[102]
print books[305]
print books[340]
print books[500]
print books[654]

Number of books tokenized:  2998
A few titles:  Grant_Allen___Science_in_Arcady.tokens
Zane_Grey___The_Last_of_the_Plainsmen.tokens
Edward_Phillips_Oppenheim___Anna_the_Adventuress.tokens
Anthony_Trollope___The_Chateau_of_Prince_Polignac.tokens
Benjamin_Disraeli___The_Voyage_of_Captain_Popanilla.tokens
Henry_David_Thoreau___Canoeing_in_the_wilderness.tokens
Edward_Stratemeyer___The_Rover_Boys_in_Southern_Waters.tokens


In [None]:
tokens = []
canonicalized_words = []
V = 20681
# tempBooks = [books[3], books[5]]
# print tempBooks

start = time.time()
for book_num, book in enumerate(sorted(books)): 
    if book.startswith('Abraham_Lincoln_'):
        print "Processing book: ", book_num, book
        df = pd.read_csv('../book-nlp-master/data/tokens.gutenberg/'+ book, sep='\t', quoting=csv.QUOTE_NONE)
        for i in xrange(df["sentenceID"].max()):
            tokens.append('<s>')
            tokens.append('<s>')
            for word in df.query('sentenceID == '+ str(i) + ' & deprel != "punct"')['originalWord']:
                if type(word) != str:
                    word = str(word)
                word = word.decode('ascii', 'ignore')
                tokens.append(word)
                canonicalized_words.append(utils.canonicalize_word(word))
            tokens.append('</s>')


tokens_length = len(tokens)

print "tokens_length - ", tokens_length

vocab = vocabulary.Vocabulary(canonicalized_words, size=V)
train_tokens  = tokens[ : int(tokens_length * 0.8)]
test_tokens = tokens[int(tokens_length * 0.8) : ]

end = time.time()

print "Train set vocabulary: %d words" % vocab.size
print "Train set tokens: %d " % len(train_tokens)
print "Test set tokens: %d " % len(test_tokens)
print "First 10 Train Tokens: ", train_tokens[0:10]
print "First 10 Test Tokens: ", test_tokens[0:10]  

print "Time: {0}".format(end-start)

Processing book:  0 Abraham_Lincoln___Lincoln's_First_Inaugural_Address.tokens
Processing book:  1 Abraham_Lincoln___Lincoln's_Gettysburg_Address,_given_November_19,_1863.tokens
Processing book:  2 Abraham_Lincoln___Lincoln's_Inaugurals,_Addresses_and_Letters_(Selections).tokens
Processing book:  3 Abraham_Lincoln___Lincoln's_Second_Inaugural_Address.tokens
Processing book:  4 Abraham_Lincoln___Lincoln_Letters.tokens
Processing book:  5 Abraham_Lincoln___Speeches_and_Letters_of_Abraham_Lincoln,_1832-1865.tokens
Processing book:  6 Abraham_Lincoln___State_of_the_Union_Addresses.tokens
Processing book:  7 Abraham_Lincoln___The_Emancipation_Proclamation.tokens
Processing book:  8 Abraham_Lincoln___The_Life_and_Public_Service_of_General_Zachary_Taylor:_An_Address.tokens
Processing book:  9 Abraham_Lincoln___The_Writings_of_Abraham_Lincoln,_Volume_1:_1832-1843.tokens
Processing book:  10 Abraham_Lincoln___The_Writings_of_Abraham_Lincoln,_Volume_2:_1843-1858.tokens
Processing book:  11 Abrah

In [21]:
# Collapse the token stream to its distinct entries; the cell displays how many there are.
stokens = set(tokens)
len(stokens)

20681

In [11]:
reload(ngram_lm)
Model = ngram_lm.KNTrigramLM
t0 = time.time()
print "Building trigram LM...",
lm = Model(train_tokens)

print "done in %.02f s" % (time.time() - t0)
ngram_utils.print_stats(lm)

Building trigram LM... done in 3.59 s
=== N-gram Language Model stats ===
19049 unique 1-grams
173304 unique 2-grams
365207 unique 3-grams
Optimal memory usage (counts only): 12 MB


In [42]:
max_length = 20
num_sentences = 5

for _ in range(num_sentences):
    seq = ["<s>", "<s>"]
    for i in range(max_length):
        seq.append(ngram_utils.predict_next(lm, seq))
        # Stop at end-of-sentence.
        if seq[-1] == "</s>": break
    print " ".join(seq)
    print "[{1:d} tokens; log P(seq): {0:.02f}]".format(*ngram_utils.score_seq(lm, seq))
    print ""

<s> <s> A. LINCOLN </s>
[2 tokens; log P(seq): -4.86]

<s> <s> Paper was such an event you say so and that the place on the same source as reported in said
[20 tokens; log P(seq): -85.91]

<s> <s> I add that I now do with them </s>
[8 tokens; log P(seq): -41.31]

<s> <s> Let them beware of the States and people will have extinguished slavery in these resolutions read from MS. but with
[20 tokens; log P(seq): -90.50]

<s> <s> There is understood that on receiving this government was against it </s>
[11 tokens; log P(seq): -62.94]



In [22]:
log_p_data, num_real_tokens = ngram_utils.score_seq(lm, train_tokens)
print "Train perplexity: %.02f" % (2**(-1*log_p_data/num_real_tokens))
print "Number of test tokens: ", len(test_tokens)

lm.set_live_params(k = 0.001, delta=0.3)
log_p_data, num_real_tokens = ngram_utils.score_seq(lm, test_tokens)
print "Test perplexity: %.02f" % (2**(-1*log_p_data/num_real_tokens))

Train perplexity: 12.12
Number of test tokens:  164051
Test perplexity: 67.54


In [11]:
tokens = []
canonical_words = []
indir = 'processed/'
books = []
for root, dirs, filenames in os.walk(indir):
    for filename in filenames:
        print 'Processing book: ', filename
        if filename == 'canonical_words.p':
            with open(indir + filename, 'rb') as f:
                canonical_words = pickle.load(f)
        else:
            with open(indir + filename, 'rb') as f:
                tokens.append(pickle.load(f))
                
vocab = vocabulary.Vocabulary(canonicalized_words, size=V)
train_tokens  = tokens[ : int(tokens_length * 0.8)]
test_tokens = tokens[int(tokens_length * 0.8) : ]            

print "Train set vocabulary: %d words" % vocab.size
print "Train set tokens: %d " % len(train_tokens)
print "Test set tokens: %d " % len(test_tokens)
print "First 10 Train Tokens: ", train_tokens[0:10]
print "First 10 Test Tokens: ", test_tokens[0:10]

# with open('processed/'+book+'.p', 'a') as f:
#     pickle.dump(tokens, f)
# with open('processed/canonical_words.p', 'a') as c:
#     pickle.dump(canonicalized_words, c)

Processing book:  George_Alfred_Henty___March_to_Magdala.tokens.p
Processing book:  Baronness_Orczy___"Unto_Caesar".tokens.p
Processing book:  Henry_Rider_Haggard___Moon_of_Israel.tokens.p
Processing book:  Sir_Walter_Scott___Quentin_Durward.tokens.p
Processing book:  Sir_Walter_Scott___The_Black_Dwarf.tokens.p
Processing book:  George_Bernard_Shaw___Cashel_Byron's_Profession.tokens.p
Processing book:  R_M_Ballantyne___The_Butterfly's_Ball.tokens.p
Processing book:  Oscar_Wilde___The_Canterville_Ghost.tokens.p
Processing book:  Herman_Melville___The_Confidence-Man.tokens.p
Processing book:  James_Otis___Defending_the_Island.tokens.p
Processing book:  Grant_Allen___Science_in_Arcady.tokens.p
Processing book:  William_Dean_Howells___A_Chance_Acquaintance.tokens.p
Processing book:  Bertrand_Russell___Our_Knowledge_of_the_External_World_as_a_Field_for_Scientific_Method_in_Philosophy.tokens.p
Processing book:  Ambrose_Bierce___The_Collected_Works_of_Ambrose_Bierce,_Volume_8.tokens.p
Process

MemoryError: 

In [1]:
# Scratch check of str.startswith, the method used to filter book filenames by prefix.
a = 'abcde'
a.startswith('abc')

True