In [1]:
import nltk
import regex as re
import random
from collections import defaultdict
import numpy as np

In [2]:
from nltk.corpus import gutenberg, brown

# Fetch the corpora read by the following cells. The captured output shows
# that nltk reports "already up-to-date" when a package is present locally.
nltk.download('gutenberg')
nltk.download('brown')
# NOTE(review): 'punkt' is presumably required for sentence splitting inside
# the corpus readers' sents() -- confirm before removing.
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to /home/petey/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package brown to /home/petey/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /home/petey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Flatten every Gutenberg text into a single list of sentences, visiting the
# files in the order reported by gutenberg.fileids().
gutenberg_sentences = [
    sentence
    for fileid in gutenberg.fileids()
    for sentence in gutenberg.sents(fileid)
]

In [4]:
# Sentences of the whole Brown corpus; each sentence is a sequence of token
# strings (process_sents joins them with single spaces).
brown_sentences = brown.sents()

In [5]:
# Regex patterns used by process_sents. `re` is the third-party `regex` module
# in this notebook, which is what provides the Unicode property class \p{P}.
# The punctuation pattern is a unicode literal with an escaped backslash so the
# same source parses on Python 2 and Python 3 (the old `ur"..."` prefix is a
# SyntaxError on Python 3).
_PUNCTUATION_PATTERN = u"\\p{P}+"
_NUMBER_PATTERN = r"\s([\d]+[\d,\.]*)(th)?\s"
_CURRENCY_PATTERN = r"\s([\$]\.?[\d]+[\d,\.]*)\s"


def _mask_tokens(line, pattern, placeholder):
    """Replace whitespace-delimited tokens matching `pattern` by `placeholder`.

    `pattern` consumes the whitespace on both sides of a token, so two adjacent
    matching tokens share one separator and a single scan can only replace
    every other one. The tokens skipped by the first scan end up surrounded by
    freshly inserted placeholders, so a second scan replaces all of them.
    """
    # Pad so that tokens at the very start/end of the line are also delimited.
    line = ' ' + line + ' '
    for _ in range(2):
        line = re.sub(pattern, placeholder, line)
    return line


def process_sents(sents):
    """Normalize tokenized sentences into lists of lower-cased tokens.

    Parameters
    ----------
    sents : iterable of iterables of str
        Sentences, each given as a sequence of token strings.

    Returns
    -------
    list of list of str
        One token list per input sentence. Every list starts with
        '@@start@@' and ends with '@@end@@' (the markers are lower-cased
        together with the rest of the sentence). Numbers, optionally followed
        by 'th', become '@@number@@'; dollar amounts become '@@currency@@'.

    Notes
    -----
    Punctuation is stripped before anything else, so hyphenated and possessive
    forms are fused rather than split ("well-known" -> "wellknown",
    "john's" -> "johns"). The previous implementation contained later steps
    that looked for "'s ", '/' and '-', but they could never match after the
    punctuation removal and have been dropped; the output is unchanged.
    NOTE(review): if splitting hyphenated words is actually wanted, that has
    to happen before the punctuation is removed.

    Currency tokens are now masked with the same two-scan scheme as numbers,
    so the second of two directly adjacent amounts is no longer left as-is.
    """
    processed = []

    for sent in sents:
        line = ' '.join(sent)

        # Remove punctuation (this also removes ',' and '.' inside numbers,
        # e.g. "1,000" -> "1000", and the '.' in "$.50").
        line = re.sub(_PUNCTUATION_PATTERN, "", line)

        # Replace numbers first, then currency: a '$'-prefixed amount is not
        # preceded by whitespace at its digits, so the number pattern skips it.
        line = _mask_tokens(line, _NUMBER_PATTERN, " @@NUMBER@@ ")
        line = _mask_tokens(line, _CURRENCY_PATTERN, " @@CURRENCY@@ ")

        # Add start and end symbols
        line = '@@START@@ ' + line + ' @@END@@'

        # Split on single spaces, normalize case and drop the empty strings
        # produced by consecutive spaces.
        processed.append(
            [w.lower().strip() for w in line.split(' ') if w.strip() != '']
        )

    return processed

In [6]:
# Normalize both corpora with process_sents.
# NOTE: the results overwrite the variables holding the raw sentences, so this
# cell is not idempotent -- re-running it without first re-running the loading
# cells feeds already-processed token lists back into process_sents.
gutenberg_sentences = process_sents(gutenberg_sentences)
brown_sentences = process_sents(brown_sentences)

In [7]:
# Seed Python's `random` module so the random.sample() calls that pick the
# training indices give the same split on every run.
random.seed(1337)

In [8]:
def _split_indices(n_items):
    """Randomly split range(n_items) into (train, valid) index sets.

    90% of the indices (rounded down) are drawn without replacement for
    training; the validation set is everything that was not drawn.
    """
    train_idx = set(random.sample(range(n_items), int(0.9*n_items)))
    valid_idx = set(range(n_items)) - train_idx
    return train_idx, valid_idx


# Gutenberg is sampled before Brown; the order matters because both draws
# consume the same seeded random stream.
gutenberg_train_idx, gutenberg_valid_idx = _split_indices(len(gutenberg_sentences))

brown_train_idx, brown_valid_idx = _split_indices(len(brown_sentences))

In [9]:
def _select(sentences, indices):
    """Return the sentences at `indices`, in the iteration order of `indices`."""
    return [sentences[position] for position in indices]


# Materialize the train/validation splits from the sampled index sets.
gutenberg_train_sentences = _select(gutenberg_sentences, gutenberg_train_idx)
gutenberg_valid_sentences = _select(gutenberg_sentences, gutenberg_valid_idx)

brown_train_sentences = _select(brown_sentences, brown_train_idx)
brown_valid_sentences = _select(brown_sentences, brown_valid_idx)

In [34]:
# Corpora whose training split is appended to train_sentences.
# Recognized names are 'gutenberg' and 'brown'.
train_datasets = ['gutenberg', 'brown']
# Corpora whose validation split is appended to valid_sentences.
test_datasets  = ['brown']

In [35]:
# Assemble the final train/validation sets from the selected corpora.
# Gutenberg always comes before Brown in either list.
train_sentences = []
valid_sentences = []

for corpus_name, train_part, valid_part in (
    ('gutenberg', gutenberg_train_sentences, gutenberg_valid_sentences),
    ('brown', brown_train_sentences, brown_valid_sentences),
):
    if corpus_name in train_datasets:
        train_sentences.extend(train_part)
    if corpus_name in test_datasets:
        valid_sentences.extend(valid_part)

In [36]:
# Extract vocabulary: `vocab` collects every distinct training token and
# `frequency` maps each token to its number of occurrences.
vocab = set()
frequency = dict()

for word in (token for sentence in train_sentences for token in sentence):
    # Adding an already-known word to the set is a no-op, so both containers
    # see new words in first-occurrence order.
    vocab.add(word)
    frequency[word] = frequency.get(word, 0) + 1

In [37]:
# Minimum training-set count for a word to stay in the vocabulary: words whose
# frequency is below `threshold` are removed from `vocab` in the next cell.
threshold = 2

In [38]:
# Drop rare words from the vocabulary. `frequency` itself is left untouched.
# items() and the formatted print() call run on both Python 2 and Python 3
# (iteritems() and the print statement exist on Python 2 only) and print
# exactly the same text as before.
for word, freq in frequency.items():
    if freq < threshold:
        # discard() is a no-op for words already removed, so re-running this
        # cell is safe.
        vocab.discard(word)

print('Vocabulary size: %d' % len(vocab))

Vocabulary size: 39065


In [39]:
# Build mapping from vocabulary to IDs
# IDs are assigned from 1 upwards in the iteration order of `vocab`. ID 0 is
# deliberately not assigned: the n-gram counting uses 0 for any word that is
# missing from word_to_id, so id_to_word has no entry for 0.
word_to_id = {word: id_ for id_, word in enumerate(vocab, 1)}

# Inverse mapping. items() works on Python 2 and 3, unlike iteritems().
id_to_word = {id_: word for word, id_ in word_to_id.items()}

In [40]:
# Store counts
# n_grams[n] maps a tuple of n word IDs to the number of times that n-gram
# occurs in the training sentences, for n = 1 (unigrams) and n = 2 (bigrams).
n_grams = dict()

for n in range(1, 2 + 1):
    # The default factory is int (not dict): the values are integer counts, so
    # a missing n-gram must read as 0 and `+= 1` must work on first sight.
    counts = defaultdict(int)

    for sentence in train_sentences:
        # Words outside the vocabulary (pruned as rare) share the ID 0.
        ids = [word_to_id.get(word, 0) for word in sentence]

        for start in range(len(ids) - n + 1):
            counts[tuple(ids[start:start + n])] += 1

    n_grams[n] = counts

In [41]:
# File name prefix encodes the dataset selection,
# e.g. 'gutenberg_brown-brown' for train=[gutenberg, brown], test=[brown].
prefix = '_'.join(train_datasets) + '-' + '_'.join(test_datasets)

# The dictionaries are stored as pickled 0-d object arrays. Reading them back
# needs np.load(path, allow_pickle=True) followed by .item() on recent NumPy.
np.save('%s_word_to_id.npy' % prefix, word_to_id)
np.save('%s_id_to_word.npy' % prefix, id_to_word)
np.save('%s_n_grams.npy' % prefix, n_grams)

# Sentences have different lengths, so the nested lists are ragged. They are
# wrapped explicitly in an object array (one list per element): NumPy >= 1.24
# refuses to build a ragged array implicitly, and older versions produce the
# same object array, so the file format is unchanged.
np.save('%s_train.npy' % prefix, np.array(train_sentences, dtype=object))
np.save('%s_valid.npy' % prefix, np.array(valid_sentences, dtype=object))