In [1]:
import keras
import collections
import numpy as np
import os
import random
import re
import string
import time

Using Theano backend.
Using gpu device 0: GeForce GTX 950 (CNMeM is enabled with initial size: 66.0% of memory, cuDNN 5105)


Data processing

In [12]:
# Get relevant files.

def remove_hidden(l):
    """Return the entries of ``l`` that are not hidden (dot-prefixed) names.

    Args:
        l: iterable of file or directory names (strings).

    Returns:
        A list of the names that do not start with '.', in input order.
    """
    # startswith() is safe on empty strings (indexing s[0] is not), and a
    # list comprehension yields a real list that supports len() everywhere.
    return [s for s in l if not s.startswith('.')]

# Corpus root: every entry of this directory is listed, and the '.txt' files
# found one level below it are loaded as documents.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
d = "//home//xgamer//OANC-GrAF//data//written_1//journal//slate"

def get_files(d):
    """List the non-hidden entries of directory ``d``, each prefixed with ``d``.

    Args:
        d: path of the directory to list.

    Returns:
        A list of ``d + "//" + name`` strings, one per visible entry.
    """
    visible = remove_hidden(os.listdir(d))
    return [d + "//" + name for name in visible]

# Every entry of the corpus root is listed in turn; keep only the '.txt'
# files found one level down.
dirs = get_files(d)
files = [path for sub in dirs for path in get_files(sub)]
files = [path for path in files if path.endswith('txt')]

print(len(files))

# Read every document, skipping its first 7 lines
# (presumably a per-file header -- TODO confirm).
texts = []
for path in files:
    # `with` closes the handle even if reading fails.
    with open(path) as handle:
        # readlines() keeps each line's trailing newline, so joining with
        # '\n' doubles the line breaks; clean_text() later turns newlines
        # into spaces and collapses the runs.
        texts.append('\n'.join(handle.readlines()[7:]))

print(sum(len(text) for text in texts))

4531
27629896


In [13]:
# Get text.

def clean_text(text):
    """Normalize one raw document into a single line of printable characters.

    Steps, in order: drop every character that is not in
    ``string.printable``, strip surrounding whitespace, turn newlines and
    tabs into spaces, rewrite '--' as ' - ', and collapse runs of spaces
    into a single space.

    Args:
        text: raw document text.

    Returns:
        The cleaned text as a string.
    """
    # Remove non-printable characters first: dropping them after the strip
    # and the space collapse would leave double spaces ("a <x> b" -> "a  b")
    # or edge whitespace behind. ''.join() also returns a real string on
    # both Python 2 and 3, and the set makes each membership test O(1).
    printable = frozenset(string.printable)
    text = ''.join(char for char in text if char in printable)
    text = text.strip()
    text = text.replace('\n', ' ')
    text = text.replace('--', ' - ')
    text = text.replace('\t', ' ')
    return re.sub(r'\ \ +', r' ', text)

# Clean every document, then report how many characters remain in total.
texts = [clean_text(text) for text in texts]
print(sum(len(text) for text in texts))

25425786


In [14]:
# Determine alphabet.

# Collect every distinct character that occurs in the corpus, in sorted order.
alphabet = set()
for text in texts:
    alphabet.update(text)
alphabet = sorted(alphabet)

# Two extra symbols go at the end of the alphabet:
# '\t' marks the start of a sequence and '\n' marks its end.
alphabet += ['\t', '\n']

# Wrap every document in the start/end markers.
texts = ['\t' + text + '\n' for text in texts]
print(alphabet)

# Lookup tables between characters and their positions in the alphabet.
index_to_char = dict(enumerate(alphabet))
char_to_index = {char: i for i, char in enumerate(alphabet)}
print(index_to_char)
print(char_to_index)

[' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}', '~', '\t', '\n']
{0: ' ', 1: '!', 2: '"', 3: '#', 4: '$', 5: '%', 6: '&', 7: "'", 8: '(', 9: ')', 10: '*', 11: '+', 12: ',', 13: '-', 14: '.', 15: '/', 16: '0', 17: '1', 18: '2', 19: '3', 20: '4', 21: '5', 22: '6', 23: '7', 24: '8', 25: '9', 26: ':', 27: ';', 28: '<', 29: '=', 30: '>', 31: '?', 32: '@', 33: 'A', 34: 'B', 35: 'C', 36: 'D', 37: 'E', 38: 'F', 39: 'G', 40: 'H', 41: 'I', 42: 'J', 43: 'K', 44: 'L', 45: 'M', 46: 'N', 47: 'O', 48: 'P', 49: 'Q', 50: 'R', 51: 'S', 52: 'T', 53: 'U', 54: 'V', 55: 'W', 56: 'X', 57: 'Y', 58:

In [27]:
# All documents concatenated into one long string; it is the default text
# that get_batch() draws its random samples from.
text_seq = ''.join(texts)
# Default number of input characters per sample (sample() draws one more
# character, which toseqs() uses as the prediction target).
sample_len = 200

def sample(text, length=sample_len + 1):
    """Return a random contiguous window of ``length`` characters from ``text``.

    Args:
        text: the string to sample from.
        length: number of characters in the returned window. The default is
            computed from ``sample_len`` once, when this function is defined.

    Returns:
        ``text[i:i + length]`` for a start offset ``i`` drawn uniformly from
        all offsets that leave room for a full window.

    Raises:
        ValueError: if ``text`` is shorter than ``length``.
    """
    # random.randint() includes both end points, so the last valid start
    # offset is exactly len(text) - length. Using that bound lets the final
    # window be sampled and makes len(text) == length a valid input.
    last_start = len(text) - length
    if last_start < 0:
        raise ValueError(
            "text of length %d is shorter than the sample length %d"
            % (len(text), length))
    i = random.randint(0, last_start)
    return text[i:i + length]

def toseqs(string, alphabet=alphabet):
    """Encode a string as index inputs plus a one-hot next-character label.

    Args:
        string: at least two characters; all but the last are the inputs and
            the last one is the prediction target.
        alphabet: the symbol list; only its length is used, as the width of
            the label vector.

    Returns:
        (data, label): ``data`` is an int array holding the alphabet index of
        each input character; ``label`` is a uint8 vector over the alphabet
        with a 1 at the index of the final character.
    """
    assert len(string) > 1
    prefix, target = string[:-1], string[-1]
    data = np.zeros(len(prefix), dtype='int')
    for pos, char in enumerate(prefix):
        data[pos] = char_to_index[char]
    label = np.zeros(len(alphabet), dtype='uint8')
    label[char_to_index[target]] = 1
    return data, label

# Smoke test: encode a random 3-character window and show the string,
# its index-encoded inputs and its one-hot label.
s = sample(text_seq, length=3)
print(s)
x, y = toseqs(s)
print(x)
print(y)

def tostring(seq, alphabet=alphabet):
    """Decode an encoded sequence back into a string.

    Args:
        seq: either a 1-D array of alphabet indices (the ``data`` returned by
            toseqs()) or a 2-D array with one row of scores per character,
            in which case the arg-max of each row is decoded.
        alphabet: unused; kept for interface compatibility.

    Returns:
        The decoded string, one character per sequence element.
    """
    seq = np.asarray(seq)
    if seq.ndim == 1:
        # Index-encoded input: np.argmax() of a scalar is always 0, which
        # would decode every position to index_to_char[0].
        indices = seq
    else:
        indices = np.argmax(seq, axis=-1)
    return ''.join(index_to_char[int(i)] for i in indices)

# The decoded string must have one character per encoded input.
print("%d %d" % (len(x), len(tostring(x))))

ste
[83 84]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
2 2


Neural model

In [3]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, LSTM
from keras.layers.embeddings import Embedding
from keras.optimizers import RMSprop, SGD
import bisect

In [7]:
# Hyper-parameters.
embedding_size = len(alphabet)  # embedding width equals the alphabet size
n_lstm_cells = 1000
dropout_perc = 0.1
# Running count of training samples seen, keyed by sequence length.
seen_samples = collections.defaultdict(int)

print("%d %d" % (sample_len, len(alphabet)))

# Character indices -> embedding -> LSTM -> dropout -> softmax over the alphabet.
model = Sequential([
    Embedding(len(alphabet), embedding_size, input_length=None),
    LSTM(n_lstm_cells),
    Dropout(dropout_perc),
    Dense(len(alphabet)),
    Activation('softmax'),
])
# Alternative optimizer, kept for reference:
#opt = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
opt = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

200 96


In [16]:
# Restore the checkpoint written by model.save("incremental.h5"). This rebinds
# the global `model`; functions that take `model=model` as a default argument
# capture whichever object is bound when they are defined.
model = load_model("incremental.h5")

In [28]:
# Samples per training batch (the default batch size of train_until_converged).
batch_size = 1024
# Samples in the fixed evaluation batch drawn by train_until_converged.
test_size = 1024

def get_batch(size, n=sample_len, text=text_seq):
    """Draw ``size`` random training examples from ``text``.

    Args:
        size: number of examples in the batch.
        n: number of input characters per example.
        text: string to sample from.

    Returns:
        (xs, ys): ``xs`` is a (size, n) int array of alphabet indices and
        ``ys`` a (size, len(alphabet)) uint8 array of one-hot labels for the
        character that follows each input window.
    """
    inputs = np.zeros((size, n), dtype='int')
    targets = np.zeros((size, len(alphabet)), dtype='uint8')
    for row in xrange(size):
        # One extra character is drawn: toseqs() uses it as the label.
        inputs[row], targets[row] = toseqs(sample(text, length=n + 1))
    return inputs, targets

def train_until_converged(n, eval_every_n_samples=50000, batch_size=batch_size, model=model):
    """Train ``model`` on random samples of length ``n`` until accuracy stalls.

    Batches come from get_batch() and are fed to ``model.train_on_batch``.
    Each time another ``eval_every_n_samples`` samples have been seen, the
    model is evaluated on a fixed batch of ``test_size`` samples drawn once
    at the start, and progress is printed. Training stops once the evaluation
    accuracy has failed to improve in two consecutive evaluations.

    Args:
        n: number of input characters per sample.
        eval_every_n_samples: training samples between two evaluations.
        batch_size: samples per training batch; also passed to
            ``model.evaluate``.
        model: model to train; ``train_on_batch`` and ``evaluate`` must
            return ``(loss, accuracy)``. The default is bound when this
            function is defined, so re-run the definition after rebinding
            the global ``model``.

    Side effects:
        Adds the number of samples seen to the global ``seen_samples[n]``
        and prints a progress report at every evaluation.
    """
    # NOTE: the evaluation batch is sampled from the same text as the
    # training batches, so "Test accuracy" is not a held-out measure.
    vx, vy = get_batch(test_size, n=n)
    mu_acc = 0.0
    n_batches = 0
    # seen_samples[n] persists across calls. Scheduling relative to its
    # current value keeps the evaluation interval intact when a length is
    # trained again (otherwise every batch would trigger an evaluation).
    next_eval = seen_samples[n] + eval_every_n_samples
    start = time.time()
    accs_test = []
    while True:
        x, y = get_batch(batch_size, n=n)
        seen_samples[n] += batch_size
        _, acc = model.train_on_batch(x, y)
        mu_acc += acc
        n_batches += 1
        if seen_samples[n] >= next_eval:
            next_eval += eval_every_n_samples
            # Taken before evaluating, so the reported time excludes it.
            t = time.time()
            _, acc = model.evaluate(vx, vy, batch_size=batch_size)
            accs_test.append(acc)
            print("Saw %d samples of length %d. Ran %.2f seconds." % (seen_samples[n], n, t - start))
            print("Train accuracy: %.3f" % (mu_acc / (1. * n_batches)))
            print("Test accuracy: %.3f" % acc)
            print("")
            # The training accuracy is averaged per evaluation interval.
            mu_acc = 0.0
            n_batches = 0
            # Two consecutive evaluations without improvement (ties count
            # as no improvement) end the training.
            if len(accs_test) > 2 and accs_test[-1] <= accs_test[-2] <= accs_test[-3]:
                print("No improvement over last two epochs. Stopping.")
                return

In [None]:
# Curriculum, stage 1: start with very short inputs (2 to 10 characters),
# training each length until its accuracy stops improving.
for i in xrange(2, 11):
    train_until_converged(i)
    print("")

In [24]:
# Maximum number of characters generated after the seed.
text_len = 300
# Number of texts to generate and print.
n_samples = 5
# Seed text; it begins with '\t', the start-of-sequence token.
seed_string = "\tMicrosoft CEO Bill Gates says "

def tochar_prob(output, power=2):
    """Sample one character from predicted next-character probabilities.

    The first row of ``output`` is raised to ``power``, renormalized, and a
    character is drawn at random from the resulting distribution.

    Args:
        output: array whose first row holds one non-negative score per
            alphabet index, e.g. the result of ``model.predict`` for a
            single sequence.
        power: exponent applied to the scores before renormalizing; values
            above 1 favour the likelier characters. Defaults to 2.

    Returns:
        The sampled character (a value of ``index_to_char``).
    """
    weights = np.asarray(output[0]) ** power
    # Accumulate in float64 and normalize by the final total, so the last
    # cumulative value is exactly 1.0 and always exceeds random.random().
    cumulative = np.cumsum(weights, dtype='float64')
    cumulative /= cumulative[-1]
    choice = random.random()
    i = bisect.bisect(cumulative.tolist(), choice)
    # Guard against an out-of-range index (e.g. degenerate all-zero scores).
    i = min(i, len(cumulative) - 1)
    return index_to_char[i]
        
def generate(length, seed='\t', model=model):
    """Generate text by repeatedly sampling the model's next character.

    Args:
        length: maximum number of characters to generate after the seed.
        seed: text the generation starts from; every character must be in
            ``char_to_index``.
        model: model whose ``predict`` output is passed to tochar_prob();
            the default is bound when this function is defined.

    Returns:
        The seed followed by the generated characters. Generation ends early
        when the end-of-sequence character '\\n' is sampled; that character
        is included in the result.
    """
    prefix_len = len(seed)
    seq = np.zeros((1, length + prefix_len))
    for pos, char in enumerate(seed):
        seq[0, pos] = char_to_index[char]
    chars = list(seed)
    for step in xrange(length):
        end = prefix_len + step
        # Predict from everything written so far: the seed plus the
        # characters generated in earlier steps.
        nxt = tochar_prob(model.predict(seq[:, :end]))
        chars.append(nxt)
        if nxt == '\n':
            break
        seq[0, end] = char_to_index[nxt]
    return ''.join(chars)

# Print a few independently generated continuations of the seed text.
for i in xrange(n_samples):
    print(generate(text_len, seed_string))

	Microsoft CEO Bill Gates says the LAT , and the final conference in the New York Times , the New York Times and the WP and the WP described the hands and young words in the son surprised as a profits in the way the rest of the artists and winners may be the price of part of the story of the actual consequences of conflict of som
	Microsoft CEO Bill Gates says the NYT reports that Germany and the White House strength the subject of the primary form on my time. In the price of the indictment of the money in the door, and the president soon as in the top of the president with an international consequences of the princess. The American policy and the fact th
	Microsoft CEO Bill Gates says the Internet for the early 1940s to the biggest story that are screening any of the traditional reason to use him to the people to stop for an actress. The WP and the WP reports that a whole words that he was in the first bond, and some of the most contraction of the individual history of the contra
	Mic

In [None]:
# Per-length sample counts, followed by the total over all lengths.
print(seen_samples)
print(sum(seen_samples.values()))

In [12]:
# Half the previous training batch size for this stage.
batch_size = 512
test_size = 1024

# Curriculum, stage 2: inputs of 11 to 20 characters.
for i in xrange(11, 21):
    train_until_converged(i, batch_size=batch_size)
    print("")

# Show what the model generates after this stage.
for i in xrange(n_samples):
    print(generate(text_len, seed_string))

Saw 50176 samples of length 11. Ran 20.39 seconds.
Train accuracy: 0.576
Test accuracy: 0.568

Saw 100352 samples of length 11. Ran 42.30 seconds.
Train accuracy: 0.577
Test accuracy: 0.577

Saw 150016 samples of length 11. Ran 62.62 seconds.
Train accuracy: 0.576
Test accuracy: 0.573

Saw 200192 samples of length 11. Ran 83.17 seconds.
Train accuracy: 0.576
Test accuracy: 0.570

No improvement over last two epochs. Stopping.

Saw 50176 samples of length 12. Ran 22.41 seconds.
Train accuracy: 0.575
Test accuracy: 0.589

Saw 100352 samples of length 12. Ran 46.05 seconds.
Train accuracy: 0.580
Test accuracy: 0.597

Saw 150016 samples of length 12. Ran 69.40 seconds.
Train accuracy: 0.578
Test accuracy: 0.581

Saw 200192 samples of length 12. Ran 93.32 seconds.
Train accuracy: 0.582
Test accuracy: 0.593

Saw 250368 samples of length 12. Ran 117.44 seconds.
Train accuracy: 0.582
Test accuracy: 0.587

Saw 300032 samples of length 12. Ran 140.89 seconds.
Train accuracy: 0.580
Test accuracy:

RuntimeError: CudaNdarray_ZEROS: allocation failed.
Apply node that caused the error: forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{Switch(EQ(i0, i1), i2, i0)}}[(0, 0)].0, GpuSubtensor{int64:int64:int64}.0, GpuElemwise{tanh,no_inplace}.0, GpuAlloc{memset_0=True}.0, GpuElemwise{Composite{(i0 - sqr(i1))},no_inplace}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, Elemwise{Composite{Switch(EQ(i0, i1), i2, i0)}}[(0, 0)].0, Elemwise{Composite{Switch(EQ(i0, i1), i2, i0)}}[(0, 0)].0, Elemwise{Composite{Switch(EQ(i0, i1), i2, i0)}}[(0, 0)].0, Elemwise{Composite{Switch(EQ(i0, i1), i2, i0)}}[(0, 0)].0, Elemwise{Composite{Switch(EQ(i0, i1), i2, i0)}}[(0, 0)].0, lstm_1_U_o, lstm_1_U_f, lstm_1_U_i, lstm_1_U_c, GpuDimShuffle{1,0}.0, GpuDimShuffle{1,0}.0, GpuDimShuffle{1,0}.0, GpuDimShuffle{1,0}.0)
Toposort index: 314
Inputs types: [TensorType(int64, scalar), CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix)]
Inputs shapes: [(), (23, 512, 1000), (23, 512, 1000), (23, 512, 4000), (23, 512, 1000), (23, 512, 4000), (23, 512, 1000), (23, 512, 1000), (24, 512, 1000), (24, 512, 1000), (), (), (), (), (), (1000, 1000), (1000, 1000), (1000, 1000), (1000, 1000), (1000, 1000), (1000, 1000), (1000, 1000), (1000, 1000)]
Inputs strides: [(), (-512000, 1000, 1), (512000, 1000, 1), (2048000, 4000, 1), (512000, 1000, 1), (-4000, 92000, 1), (-512000, 1000, 1), (-512000, 1000, 1), (512000, 1000, 1), (512000, 1000, 1), (), (), (), (), (), (1000, 1), (1000, 1), (1000, 1), (1000, 1), (1, 1000), (1, 1000), (1, 1000), (1, 1000)]
Inputs values: [array(23), 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', array(23), array(23), array(23), array(23), array(23), 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown']
Outputs clients: [[], [], [GpuSubtensor{::int64}(forall_inplace,gpu,grad_of_scan_fn}.2, Constant{-1})], [GpuReshape{2}(forall_inplace,gpu,grad_of_scan_fn}.3, MakeVector{dtype='int64'}.0)], [GpuReshape{2}(forall_inplace,gpu,grad_of_scan_fn}.4, MakeVector{dtype='int64'}.0)], [GpuReshape{2}(forall_inplace,gpu,grad_of_scan_fn}.5, MakeVector{dtype='int64'}.0)], [GpuReshape{2}(forall_inplace,gpu,grad_of_scan_fn}.6, MakeVector{dtype='int64'}.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

In [14]:
# Checkpoint the current model to disk; load_model("incremental.h5") restores it.
model.save("incremental.h5")

In [None]:
# Smaller batches again for the longest inputs; the per-length sample
# counters start over for this stage.
batch_size = 256
test_size = 1024
seen_samples = collections.defaultdict(int)

# Curriculum, stage 3: inputs of 21 to 50 characters.
for i in xrange(21, 51):
    train_until_converged(i, batch_size=batch_size)
    print("")

# Show what the model generates after this stage.
for i in xrange(n_samples):
    print(generate(text_len, seed_string))

Saw 50176 samples of length 21. Ran 41.80 seconds.
Train accuracy: 0.604
Test accuracy: 0.610

Saw 100096 samples of length 21. Ran 84.92 seconds.
Train accuracy: 0.604
Test accuracy: 0.615

Saw 150016 samples of length 21. Ran 127.22 seconds.
Train accuracy: 0.597
Test accuracy: 0.607

