- Start with the tokenized Jane Austen books.
- Use the original word form at this point.
- The n-gram window needs to slide through one column across n rows.

In [22]:
import os, sys, re, json, time, csv, copy, random, unittest
import itertools, collections

import numpy as np
from scipy import stats

import nltk

from shared_lib import utils, vocabulary, ngram_lm, ngram_utils

# Seed the stdlib `random` module so that calls such as random.shuffle give the
# same result on every fresh run. This does not seed numpy's separate RNG.
random.seed(10)

In [19]:
# Load the tab-separated token table for "Emma" into a list of rows
# (each row is a list of column strings).
austen_emma_all = []
# 'rb' is the mode the Python 2 csv module expects for its input file.
with open('../book-nlp-master/data/tokens/austen.emma.tokens', 'rb') as f:
    # QUOTE_NONE: the columns hold raw text, so a field that is itself a double
    # quote must be read literally. With the dialect's default quoting it would
    # open a "quoted field" and silently merge the following rows into one.
    reader = csv.reader(f, dialect="excel-tab", quoting=csv.QUOTE_NONE)
    austen_emma_all = list(reader)
# Remove header
austen_emma_all = austen_emma_all[1:]

In [40]:
austen_emma_tokens = [line[7] for line in austen_emma_all]
print 'The first 10 tokens from Jane Austen\'s Emma:'
print ', '.join(austen_emma_tokens[0:10])

The first 10 tokens from Jane Austen's Emma:
Produced, by, An, Anonymous, Volunteer, EMMA, By, Jane, Austen, VOLUME


In [None]:
# Shuffle when needed
# NOTE: random.shuffle reorders the list in place, so re-running this cell
# shuffles the already-shuffled list again. Shuffling individual tokens also
# discards the original word order of the text.
random.shuffle(austen_emma_tokens)

In [74]:
tokens_length = len(austen_emma_tokens)
print tokens_length * 0.8
V = 30000
train_tokens  = austen_emma_tokens[ : int(tokens_length * 0.8)]
test_tokens = austen_emma_tokens[int(tokens_length * 0.8) : ]
vocab = vocabulary.Vocabulary((utils.canonicalize_word(w) for w in train_tokens), size=V)
print "Train set vocabulary: %d words" % vocab.size
print "Train set tokens: %d " % len(train_tokens)
print "Test set tokens: %d " % len(test_tokens)
print "First 10 Train Tokens: ", train_tokens[0:10]
print "First 10 Test Tokens: ", test_tokens[0:10]

154540.0
Train set vocabulary: 6684 words
Train set tokens: 154540 
Test set tokens: 38635 
First 10 Train Tokens:  ['\xe2\x80\x9c', 'with', 'her', 'be', 'dissolved', 'the', 'had', ',', 'well', 'indeed']
First 10 Test Tokens:  ['not', 'whose', 'Mrs.', 'Emma', 'to', 'very', 'they', 'for', 'union', 'my']


In [23]:
'''
Feeding original text instead of the tokens generated to see if this works immly with the code from assignment-2
'''
from nltk.corpus import gutenberg
assert(nltk.download('gutenberg'))
V = 10000
corpus = nltk.corpus.gutenberg
train_sents, test_sents = utils.get_train_test_sents(corpus, split=0.8, shuffle=False)
vocab = vocabulary.Vocabulary((utils.canonicalize_word(w) for w in utils.flatten(train_sents)), size=V)
# vocab = vocabulary.Vocabulary((utils.canonicalize_word(w) for w in utils.flatten(corpus.sents())), size=V)
print "Train set vocabulary: %d words" % vocab.size

[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/sharmila_velamur/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
Loaded 98552 sentences (2.62178e+06 tokens)
Training set: 78841 sentences (2108453 tokens)
Test set: 19711 sentences (513332 tokens)
Train set vocabulary: 10000 words


In [24]:

def sents_to_tokens(sents, wordset=None):
    """Return a flattened array of the words in the sentences, padded for a trigram model.

    Args:
        sents: iterable of sentences, each a sequence of word strings.
        wordset: optional collection of in-vocabulary words handed to
            utils.canonicalize_word. When omitted, the notebook-level
            ``vocab.wordset`` is used (looked up at call time), which is
            the behaviour this function always had.

    Returns:
        A 1-D numpy object array of tokens in which every sentence is
        wrapped as ``<s> <s> ... </s>``.
    """
    if wordset is None:
        # Fall back to the global vocabulary built earlier in the notebook.
        wordset = vocab.wordset
    # list(s) lets sentences be any sequence (tuple, array), not only lists.
    padded_sentences = (["<s>", "<s>"] + list(s) + ["</s>"] for s in sents)
    # This will canonicalize words, and replace anything not in vocab with <unk>
    return np.array([utils.canonicalize_word(w, wordset=wordset)
                     for w in utils.flatten(padded_sentences)], dtype=object)

# Convert the train/test sentence splits into padded token arrays.
# NOTE: this rebinds train_tokens/test_tokens, which the Emma split cell also
# assigns, so their contents depend on which of the two cells ran last.
train_tokens = sents_to_tokens(train_sents)
test_tokens = sents_to_tokens(test_sents)
# Show the first 20 tokens as a quick check of padding and canonicalization.
print "Sample data: \n" + repr(train_tokens[:20])

Sample data: 
array(['<s>', '<s>', u'[', u'emma', u'by', u'jane', '<unk>', u'DGDGDGDG',
       u']', '</s>', '<s>', '<s>', u'volume', u'i', '</s>', '<s>', '<s>',
       u'chapter', u'i', '</s>'], dtype=object)


In [25]:
Model = ngram_lm.KNTrigramLM
t0 = time.time()
print "Building trigram LM...",
lm = Model(train_tokens)
print "done in %.02f s" % (time.time() - t0)
ngram_utils.print_stats(lm)

Building trigram LM... done in 10.84 s
=== N-gram Language Model stats ===
10000 unique 1-grams
319307 unique 2-grams
968394 unique 3-grams
Optimal memory usage (counts only): 27 MB


In [13]:
max_length = 20
num_sentences = 5

for _ in range(num_sentences):
    seq = ["<s>", "<s>"]
    for i in range(max_length):
        seq.append(ngram_utils.predict_next(lm, seq))
        # Stop at end-of-sentence.
        if seq[-1] == "</s>": break
    print " ".join(seq)
    print "[{1:d} tokens; log P(seq): {0:.02f}]".format(*ngram_utils.score_seq(lm, seq))
    print ""

<s> <s> i can fainted proposed , and you know ,' said she , with an m , sometimes ; that it
[20 tokens; log P(seq): -126.56]

<s> <s> when you were so high !" </s>
[6 tokens; log P(seq): -32.81]

<s> <s> " perhaps there might be as though she did not intend to add to your pardon . </s>
[17 tokens; log P(seq): -74.59]

<s> <s> he had rather , the son of david . </s>
[9 tokens; log P(seq): -42.06]

<s> <s> " but why must you , my statutes and judgments which i command thee this day . </s>
[17 tokens; log P(seq): -52.59]



In [14]:
# Set the model's tunable parameters on the existing `lm` object.
# NOTE(review): k looks like an additive-smoothing constant and delta like the
# Kneser-Ney discount -- confirm against ngram_lm.KNTrigramLM.
# NOTE(review): this cell's execution count (14) is lower than that of the
# model-building cell (25), so the `lm` scored afterwards may not have had
# these values applied -- re-run in order to be sure.
lm.set_live_params(k = 0.001, delta=0.75)

In [26]:
log_p_data, num_real_tokens = ngram_utils.score_seq(lm, train_tokens)
print "Train perplexity: %.02f" % (2**(-1*log_p_data/num_real_tokens))

Train perplexity: 19.55


In [27]:
log_p_data, num_real_tokens = ngram_utils.score_seq(lm, test_tokens)
print "Test perplexity: %.02f" % (2**(-1*log_p_data/num_real_tokens))

Test perplexity: 302.38
