# Read Parquet files using Python

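Dependencies (in addition to `pandas` and `boto3`, which are imported below; `pd.read_parquet` needs these to read `s3://` paths):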
* pyarrow
* fsspec
* s3fs

In [27]:
import pandas as pd
import boto3

In [28]:
# Shared boto3 S3 resource handle; later cells use it to open the target bucket.
s3 = boto3.resource(service_name='s3')



#### Settings

In [29]:
# --- Source: bucket/prefix holding the original binaries ---
BUCKET = 'sorel-20m'
PREFIX = '09-DEC-2020/binaries'
BUCKET_PATH = 's3://' + BUCKET
SRC_PATH = '/'.join([BUCKET_PATH, PREFIX])

# Active filter value. The original author kept '0' as a commented-out alternative;
# nothing in the visible cells reads this setting.
filter_chars = '0000'

# --- Target: bucket/prefix holding the generated parquet files ---
TARG_BUCKET = 'sorel-20m-demo'
TARG_PREFIX = 'output'
TARG_BUCKET_PATH = 's3://' + TARG_BUCKET
TARG_PATH = '/'.join([TARG_BUCKET_PATH, TARG_PREFIX])

# Bytes per MB (2**20); used when reporting sizes.
MB = 1 << 20


#### List the Parquet files that were created

In [30]:
# List every object under the target prefix once and keep the result.
# The boto3 collection returned by .filter() is lazy and queries S3 again each time
# it is iterated. `parquet_files` is looped over again in a later cell, so it is
# materialized here to make every pass see the same objects and sizes that
# `num_found` / `tot_size` were computed from.
targ_bucket = s3.Bucket(TARG_BUCKET)
parquet_files = list(targ_bucket.objects.filter(Prefix=TARG_PREFIX))

for o in parquet_files:
    print(o.key, o.size)

# Totals over the listing: object count and summed object size in bytes.
num_found = len(parquet_files)
tot_size = sum(obj.size for obj in parquet_files)

output/fff/part-00000-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet 103553959
output/fff/part-00001-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet 80111514
output/fff/part-00002-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet 123547332
output/fff/part-00003-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet 108759481
output/fff/part-00004-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet 102081776
output/fff/part-00005-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet 90662423
output/fff/part-00006-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet 75715927
output/fff/part-00007-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet 71012772
output/fff/part-00008-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet 70389673
output/fff/part-00009-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet 66506450
output/fff/part-00010-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet 62697546
output/fff/part-00011-788e43

In [31]:
# Summarize the listing. The average is guarded so that an empty listing is
# reported cleanly instead of raising ZeroDivisionError.
if num_found:
    print(f'Found {num_found} files.  Tot size = {tot_size / MB} MB.  Avg size = {tot_size / num_found / MB} MB')
else:
    print(f'Found 0 files under {TARG_PATH}.')

Found 92 files.  Tot size = 2263.423614501953 MB.  Avg size = 24.602430592412535 MB


In [32]:
# Show the column names of the generated parquet files.
# This cell used to read `df_parquet`, which is only assigned by the read loop in a
# later cell, so it raised NameError on a fresh kernel (Restart & Run All).
# It now loads the smallest listed parquet object itself, which keeps the cell
# self-contained and cheap.
# NOTE(review): assumes every part file shares the same schema - confirm.
schema_candidates = [obj for obj in parquet_files if obj.key.endswith('.parquet')]
schema_obj = min(schema_candidates, key=lambda obj: obj.size, default=None)
schema_columns = (
    pd.read_parquet(f'{TARG_BUCKET_PATH}/{schema_obj.key}').columns
    if schema_obj is not None
    else pd.Index([])  # nothing listed: display an empty index rather than fail
)
schema_columns

Index(['path', 'modificationTime', 'length', 'content'], dtype='object')

In [33]:
# Load each listed parquet object and count its rows, keeping a running total.
# `o` and `df_parquet` are deliberately left bound to the last object / frame after
# the loop: the "Detail for parquet file" cell below reads both.
tot_pe_files = 0
for o in parquet_files:
    parquet_uri = f'{TARG_BUCKET_PATH}/{o.key}'
    df_parquet = pd.read_parquet(parquet_uri)
    num_pe_files = df_parquet.shape[0]
    tot_pe_files = tot_pe_files + num_pe_files
    print(f'{o.key} has {num_pe_files} files.  Size = {o.size}.')

output/fff/part-00000-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet has 1 files.  Size = 103553959.
output/fff/part-00001-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet has 1 files.  Size = 80111514.
output/fff/part-00002-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet has 2 files.  Size = 123547332.
output/fff/part-00003-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet has 4 files.  Size = 108759481.
output/fff/part-00004-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet has 8 files.  Size = 102081776.
output/fff/part-00005-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet has 11 files.  Size = 90662423.
output/fff/part-00006-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet has 14 files.  Size = 75715927.
output/fff/part-00007-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet has 15 files.  Size = 71012772.
output/fff/part-00008-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet has 16 files.  Size = 70389673.
ou

In [36]:
# Roll-up statistics across all parquet files.
# Both averages are guarded so that an empty listing (or parquet files with no rows)
# prints "n/a" instead of raising ZeroDivisionError.
# NOTE: `tot_size` is the summed S3 object size of the parquet files, so the
# "Avg PE file size" figure is parquet bytes per PE file, not an average of the
# `length` column.
print(f'Num PE files contained in parquet files = {tot_pe_files}')
if num_found:
    print(f'Num parquet files = {num_found}.  Avg # of PE files/parquet file = {tot_pe_files / num_found}')
else:
    print(f'Num parquet files = {num_found}.  Avg # of PE files/parquet file = n/a')
if tot_pe_files:
    print(f'Avg PE file size = {tot_size / tot_pe_files} bytes')
else:
    print('Avg PE file size = n/a (no PE files found)')

Num PE files contained in parquet files = 2377
Num parquet files = 92.  Avg # of PE files/parquet file = 25.83695652173913
Avg PE file size = 998473.5717290703 bytes


In [35]:
# Per-row breakdown of the parquet file most recently loaded into `df_parquet`.
# `o` is the matching S3 object; both names are left over from the read loop above.
print(f'Detail for parquet file {o.key}')
num_pe_files = 0
tot_size_cur = 0
detail_columns = zip(
    df_parquet['path'],
    df_parquet['modificationTime'],
    df_parquet['length'],
)
# enumerate(..., start=1) leaves num_pe_files equal to the number of rows seen
# (and at 0 when the frame is empty).
for num_pe_files, (pe_path, pe_mtime, pe_length) in enumerate(detail_columns, start=1):
    tot_size_cur += pe_length
    print(pe_path, pe_mtime, pe_length)

Detail for parquet file output/fff/part-00091-788e433a-08e2-45fb-89db-13a511632cc3-c000.snappy.parquet
s3://sorel-20m/09-DEC-2020/binaries/fff224717c551708c7025b3c6150fb6b0716e6df686436fb94b2fe5a5ee157c4 2020-12-03 09:52:10 3408
s3://sorel-20m/09-DEC-2020/binaries/fffa8156c48752b5967cdb8955babcf5e8db8809589188412bf21b7b47a2f299 2020-12-03 09:54:02 3322
s3://sorel-20m/09-DEC-2020/binaries/ffff7fc2b23ce8832ec10ecdea8da3c7ffbeedece07864460f36d0546e5a9226 2020-12-03 09:55:24 3170
s3://sorel-20m/09-DEC-2020/binaries/fff96d12e2e3be66caacf2e1299d4c8a553b40ffc07fac763aba362dca6186cb 2020-12-03 09:53:48 3050
s3://sorel-20m/09-DEC-2020/binaries/fffe182691b22f7ad1fa415527eb3d0d030d12b62653bb5cc8b1585cd750ea1d 2020-12-03 09:55:06 3050
s3://sorel-20m/09-DEC-2020/binaries/fff353f4b43ef0a1c88efd383ce73857d6768a4697768d860f2d39b5f52b3524 2020-12-03 09:52:24 2988
s3://sorel-20m/09-DEC-2020/binaries/fff0b0c5f007abf2c1bc2c3e873f45eb0359fef364a369c0b8df246d41c4aa6c 2020-12-03 09:51:53 2850
s3://sorel-20m/

In [38]:
# Compare the summed `length` values of the rows against the S3 object size of the
# parquet file that contains them.
detail_summary = (
    f'In this parquet file, found {num_pe_files} PE files.  '
    f'Size of PE files = {tot_size_cur}.  '
    f'Size of parquet file = {o.size}.'
)
print(detail_summary)

In this parquet file, found 24 PE files.  Size of PE files = 49944.  Size of parquet file = 55038.


Manually verified that the sizes of the PE files match the source objects.