In [2]:
import tensorflow as tf
import pandas as pd
from htrc_features import FeatureReader, utils
import itertools
import glob

  from ._conv import register_converters as _register_converters


In [3]:
# Collect every Extracted Features file. glob.glob() returns matches in
# arbitrary (filesystem-dependent) order, so sort them: the strided split
# paths[i::n_chunks] used later depends on list position, and a stable order
# keeps the volume-to-file assignment reproducible across runs and machines.
paths = sorted(glob.glob('data/literature/ef/*.bz2'))
len(paths)

103872

In [4]:
# Build the vocabulary lookup {token: integer id}. The id is the value that
# reset_index() puts in the 'index' column (the row number when the file has
# a single token column -- TODO confirm the file layout).
#
# na_filter=False and dtype=str keep every token as a literal string. With
# pandas' defaults, real words such as "null", "nan" or "NA" are parsed as
# float NaN, so they collapse into a single NaN key and can never be matched
# against the lowercase token strings looked up later.
token_dict = (pd.read_csv('eng-vocab-1.txt.bz2', names=['token'],
                          dtype={'token': str}, na_filter=False)
                .reset_index().set_index('token')
                .to_dict())['index']

In [5]:
%%time
# Split into 20 chunks. For the sample, that's about 230 books per file
n_chunks = 500
books_processed = 0

for i in range(n_chunks):
    filename = 'data/literature/tfrecords/ef-files-%d.tfrecord' % i
    ef_files = FeatureReader(paths=paths[i::n_chunks])

    with tf.python_io.TFRecordWriter(filename) as writer:
        for vol in ef_files.volumes():
            # List of pages to include
            page_list = [int(p.seq) for p in vol.pages() if {'en': '1.00'} in p.languages]

            # Volume-wide token counts, filtered to page_list.
            df = vol.tokenlist(section='body', case=False, pos=False).loc[page_list,].reset_index()
            
            # Key tokens with integer IDs and drop N/As
            df['token_id'] = df['lowercase'].apply(lambda x: token_dict[x] if x in token_dict else None)
                
            df = df.dropna()
            df['token_id'] = df['token_id'].astype(int)

            for page_num, df in df.groupby('page'):
                # An array where the first row is ids and the second row is counts
                arr = df[['token_id', 'count']].astype(int).values.T

                token_ids = tf.train.Feature(int64_list=tf.train.Int64List(value=arr[0]))
                counts = tf.train.Feature(int64_list=tf.train.Int64List(value=arr[1]))
                volid = tf.train.Feature(bytes_list=tf.train.BytesList(value=[vol.id.encode('utf-8')]))
                page_seq = tf.train.Feature(int64_list=tf.train.Int64List(value=[page_num]))

                example = tf.train.Example(
                    features=tf.train.Features(feature=
                                             {
                                                 'page_seq': page_seq,
                                                 'volid': volid,
                                                 'token_ids': token_ids,
                                                 'counts': counts
                                             })
                )

                writer.write(example.SerializeToString())
            books_processed += 1
            
            if books_processed % 20 == 0:
                print(books_processed, "books processed")

20 books processed
40 books processed
60 books processed
80 books processed
100 books processed
120 books processed
140 books processed
160 books processed
180 books processed
200 books processed
220 books processed
240 books processed
260 books processed
280 books processed
300 books processed
320 books processed
340 books processed
360 books processed
380 books processed
400 books processed
420 books processed
440 books processed
460 books processed
480 books processed
500 books processed
520 books processed
540 books processed
560 books processed
580 books processed
600 books processed
620 books processed
640 books processed
660 books processed
680 books processed
700 books processed
720 books processed
740 books processed
760 books processed
780 books processed
800 books processed
820 books processed
840 books processed
860 books processed
880 books processed
900 books processed
920 books processed
940 books processed
960 books processed
980 books processed
1000 books processed
102

KeyboardInterrupt: 