In [7]:
import codecs
import math
import random
import numpy as np
import h5py

In [34]:
class exporthdf5(object):
    """Turn a text file into character-level HDF5 datasets.

    Constructing an instance scans ``input_txt`` once to build a
    character -> integer vocabulary (indices start at 1; 0 is reserved for
    padding), writes that vocabulary to ``vocab_filename`` and computes the
    train / validation / test split sizes.  ``create_dataset`` then exports
    any character range of the file as three HDF5 arrays.

    Attributes set by ``__init__``:
        encoding, input_txt          -- the constructor arguments, kept for re-reading the file
        token_to_idx                 -- dict mapping each character to its 1-based index
        total_size                   -- number of characters in the file
        train_size, val_size, test_size -- split sizes in characters
    """

    def __init__(self, encoding, input_txt, vocab_filename, val_frac=0.1, test_frac=0.1):
        """Build the vocabulary, dump it to disk and compute the split sizes.

        Args:
            encoding: text encoding used to decode ``input_txt`` (e.g. 'utf-8').
            input_txt: path of the text file to convert.
            vocab_filename: path the vocabulary is written to, one
                ``unicode_escape``-encoded character per line, in index order.
            val_frac: fraction of the characters assigned to the validation split.
            test_frac: fraction of the characters assigned to the test split.
        """
        self.encoding = encoding
        self.input_txt = input_txt

        # First go through the file once to see how big it is and to build the vocab
        self.token_to_idx = {}
        self.total_size = 0
        with codecs.open(input_txt, 'r', encoding) as f:
            for line in f:
                self.total_size += len(line)
                for char in line:
                    if char not in self.token_to_idx:
                        self.token_to_idx[char] = len(self.token_to_idx) + 1

        # Characters ordered by their index (1, 2, ...); a single sort replaces
        # the quadratic keys()/values().index() lookup and works on Python 2 and 3.
        vocab = sorted(self.token_to_idx, key=self.token_to_idx.get)

        print('Dumping vocabulary to file: %s' % vocab_filename)
        with open(vocab_filename, 'wb') as vocab_file:
            for char in vocab:
                # The file is opened in binary mode, so write bytes explicitly.
                vocab_file.write(char.encode('unicode_escape') + b'\n')
        print('Done.')

        # Now we can figure out the split sizes
        self.val_size = int(val_frac * self.total_size)
        self.test_size = int(test_frac * self.total_size)
        self.train_size = self.total_size - self.val_size - self.test_size

        print('Total vocabulary size: %d' % len(self.token_to_idx))
        print('Total tokens in file: %d' % self.total_size)
        print('  Training size: %d' % self.train_size)
        print('  Val size: %d' % self.val_size)
        print('  Test size: %d' % self.test_size)

    def create_dataset(self, name, start_offset, size, max_chars=50, buffer_size=100, seed=None):
        """Export ``size`` characters starting at ``start_offset`` to an HDF5 file.

        The range is cut into samples of ``max_chars`` characters.  The sample
        list is padded with randomly chosen duplicates up to a multiple of
        ``buffer_size`` and laid out as ``buffer_size`` parallel streams.

        The file ``name`` (overwritten if it exists) receives three float32
        datasets of shape ``(samples_per_stream * max_chars, buffer_size)``:
            'input'  -- character indices of the samples (0 = padding)
            'target' -- indices of the text shifted by one character (0 = padding)
            'cont'   -- 1 where 'input' holds a real character, 0 where it is padding

        Args:
            name: path of the HDF5 file to write.
            start_offset: index of the first character to export.
            size: number of characters to export.
            max_chars: characters per sample.
            buffer_size: number of parallel streams (columns).
            seed: optional seed for the random padding; ``None`` uses the
                global ``random`` state.

        Raises:
            ValueError: if ``max_chars`` or ``buffer_size`` is not positive.
        """
        if max_chars < 1 or buffer_size < 1:
            raise ValueError('max_chars and buffer_size must be positive')

        train, target, cont = self._build_streams(start_offset, size, max_chars, buffer_size, seed)

        with h5py.File(name, 'w') as f:
            f.create_dataset('input', data=train)
            f.create_dataset('cont', data=cont)
            f.create_dataset('target', data=target)

    def _build_streams(self, start_offset, size, max_chars, buffer_size, seed):
        """Return the (input, target, cont) arrays that ``create_dataset`` writes."""
        with codecs.open(self.input_txt, 'r', self.encoding) as f:
            contents = f.read()
        sample_text = contents[start_offset:start_offset + size]
        # Targets are the same range shifted by one character.
        target_text = contents[start_offset + 1:start_offset + size + 1]

        # Chunk both texts with the same boundaries so that samples[i] and
        # targets[i] always exist together.  At the end of the file the target
        # text is one character shorter, which may leave the last target chunk
        # short or empty; it is zero-padded below.
        samples = []
        targets = []
        for begin in range(0, len(sample_text), max_chars):
            samples.append(sample_text[begin:begin + max_chars])
            targets.append(target_text[begin:begin + max_chars])

        num_samples = len(samples)
        print('Number of samples: %f' % num_samples)
        # Duplicates needed to reach the next multiple of buffer_size
        # (0 when the count already is a multiple, or when there is no data).
        num_needed = (-num_samples) % buffer_size
        print('Samples needed: %f' % num_needed)

        # fill up randomly
        rng = random if seed is None else random.Random(seed)
        for _ in range(num_needed):
            choice = rng.randint(0, num_samples - 1)
            samples.append(samples[choice])
            targets.append(targets[choice])

        samples_per_stream = len(samples) // buffer_size
        stream_length = samples_per_stream * max_chars
        print('Samples per stream: %d, Single stream length: %d' % (samples_per_stream, stream_length))

        # Zero-initialised, so anything not written below stays 0 (= padding).
        train = np.zeros((stream_length, buffer_size), dtype=np.float32)
        target = np.zeros((stream_length, buffer_size), dtype=np.float32)
        cont = np.zeros((stream_length, buffer_size), dtype=np.float32)

        for stream_num in range(buffer_size):
            for sample_num in range(samples_per_stream):
                # Stream k holds the consecutive samples k*samples_per_stream ...
                position = stream_num * samples_per_stream + sample_num
                row = sample_num * max_chars

                sample_chars = [self.token_to_idx[char] for char in samples[position]]
                target_chars = [self.token_to_idx[char] for char in targets[position]]

                train[row:row + len(sample_chars), stream_num] = sample_chars
                # 1s wherever the input holds a real character
                cont[row:row + len(sample_chars), stream_num] = 1
                target[row:row + len(target_chars), stream_num] = target_chars

        return train, target, cont

In [35]:
# Scan the tiny-shakespeare corpus: builds the vocabulary, writes it to
# vocab_shakespeare.txt and prints the split sizes.
shakespeare = exporthdf5(
    encoding='utf-8',
    input_txt='data/tiny-shakespeare.txt',
    vocab_filename='vocab_shakespeare.txt')

Dumping vocabulary to file: vocab_shakespeare.txt
Done.
Total vocabulary size: 65
Total tokens in file: 1115394
  Training size: 892316
  Val size: 111539
  Test size: 111539


In [36]:
# Training split: the first train_size characters of the corpus.
shakespeare.create_dataset(
    name='data/shakespeare_train.h5',
    start_offset=0,
    size=shakespeare.train_size)

Number of samples: 17847.000000
Samples needed: 53.000000
Samples per stream: 179, Single stream length: 8950


In [37]:
# Validation split: the val_size characters that follow the training split.
shakespeare.create_dataset(
    name='data/shakespeare_val.h5',
    start_offset=shakespeare.train_size,
    size=shakespeare.val_size)

Number of samples: 2231.000000
Samples needed: 69.000000
Samples per stream: 23, Single stream length: 1150


In [38]:
# Scan the reddit corpus: builds the vocabulary, writes it to
# vocab_reddit.txt and prints the split sizes.
reddit = exporthdf5(
    encoding='utf-8',
    input_txt='data/reddit.txt',
    vocab_filename='vocab_reddit.txt')

Dumping vocabulary to file: vocab_reddit.txt
Done.
Total vocabulary size: 656
Total tokens in file: 2688329
  Training size: 2150665
  Val size: 268832
  Test size: 268832


In [39]:
# Training split: the first train_size characters of the corpus.
reddit.create_dataset(
    name='data/reddit_train.h5',
    start_offset=0,
    size=reddit.train_size)

Number of samples: 43014.000000
Samples needed: 86.000000
Samples per stream: 431, Single stream length: 21550


In [40]:
# Validation split: the val_size characters that follow the training split.
reddit.create_dataset(
    name='data/reddit_val.h5',
    start_offset=reddit.train_size,
    size=reddit.val_size)

Number of samples: 5377.000000
Samples needed: 23.000000
Samples per stream: 54, Single stream length: 2700


In [None]:
# print example
# `train` and `token_to_idx` only exist inside the exporthdf5 instance and its
# methods, so read the exported array back from disk and decode it with the
# instance's vocabulary.
# NOTE(review): assumes the Shakespeare training file is the one meant to be
# previewed -- swap the path / instance to inspect another export.
with h5py.File('data/shakespeare_train.h5', 'r') as f:
    train = f['input'][:]
print(train.shape)

# Invert the vocabulary once instead of searching it for every character.
idx_to_token = dict((idx, token) for token, idx in shakespeare.token_to_idx.items())

# Column 0 is the first stream; index 0 is padding and has no vocabulary entry.
s = u''.join(idx_to_token[int(index)] for index in train[:, 0] if index != 0)

print(s)