# Compare JSON and Parquet EF representations

In [1]:
from htrc_features import Volume, utils
import os
import pandas as pd

In [2]:
ids = pd.read_csv('../test_dataset_htids.csv.gz', names=['htid'])['htid']
jsonpaths = ids.apply(lambda x: '/data/extracted-features/' + utils.id_to_rsync(x))
parqpaths = ids.apply(lambda x: '/data/extracted-features-parquet/' + utils.id_to_rsync(x)).str.replace('.json.bz2', '')
parqchunkpaths = parqpaths.str.replace('extracted-features-parquet', 'extracted-features-parquet-chunked')

## File Size

In [3]:
def stat_if(path):
    try:
        return os.stat(path).st_size
    except:
        return 0

In [4]:
# In GB
jsonsize = jsonpaths.apply(stat_if).div(1024**3).sum()
metasize = (parqpaths + '.meta.json').apply(stat_if).div(1024**3).sum()
parqsize = (parqpaths + '.tokens.parquet').apply(stat_if).div(1024**3).sum()
parqchunksize = (parqchunkpaths + '.tokens.parquet').apply(stat_if).div(1024**3).sum()
jsonsize.round(2), metasize.round(2), parqsize.round(2), (metasize+parqsize).round(2), (metasize+parqchunksize).round(2)

(23.98, 0.14, 32.1, 32.24, 22.9)

In [5]:
print("Parquet is larger by {}%".format(int((metasize+parqsize)/jsonsize*100)))

Parquet is larger by 134%


In [6]:
print("Chunked parquet (5000 words/chunk) is {}% of the JSON size".format(int((metasize+parqchunksize)/jsonsize*100)))

Chunked parquet (5000 words/chunk) is 95% of the JSON size


## Performance on token loading

As you would expect, the Parquet option is much quicker. Some notes, though:

- the parquet option is not only reading parquet files, but their associated metadata file in JSON. It's possible to save without the metadata, but it's small enough.
- Of course it's quicker! In addition to not needing JSON parsing and using faster decompression than BZIP2, the data has already been preprocessed and formatted into a table format.

The point is that if you ever expect to read your files *more than once*, [converting your local Extracted Features collection to parquet](https://github.com/massivetexts/compare-tools/blob/master/scripts/convert-to-parquet.py) using the `Volume.save_parquet` function will save you a great deal of computing time. It is also processing that can be front-loaded - converting to Parquet can be done in the background while you're developing your project code, not at the end.

In [11]:
%%time
for path in jsonpaths.head(1000):
    vol = Volume(path, parser='json')
    tl = vol.tokenlist(pos=False, case=False)

CPU times: user 4min 9s, sys: 5.64 s, total: 4min 15s
Wall time: 4min 15s


In [10]:
%%time
for path in parqpaths.head(1000):
    vol = Volume(path, parser='parquet')
    tl = vol.tokenlist(pos=False, case=False)

CPU times: user 53.6 s, sys: 587 ms, total: 54.2 s
Wall time: 53.5 s


### Chunked

In [9]:
%%time
for path in jsonpaths.head(1000):
    vol = Volume(path, parser='json')
    tl = vol.chunked_tokenlist(chunk_target=5000, pos=False, case=False)

CPU times: user 9min 17s, sys: 7.7 s, total: 9min 25s
Wall time: 9min 26s


In [7]:
%%time
for path in parqpaths.head(1000):
    vol = Volume(path, parser='parquet')
    tl = vol.chunked_tokenlist(chunk_target=5000, pos=False, case=False)

CPU times: user 5min 42s, sys: 3.06 s, total: 5min 45s
Wall time: 5min 45s


In [8]:
%%time
for path in parqchunkpaths.head(1000):
    vol = Volume(path, parser='parquet')
    tl = vol.chunked_tokenlist(pos=False, case=False, suppress_warning=True)

CPU times: user 33.8 s, sys: 487 ms, total: 34.3 s
Wall time: 33.5 s
