# Transfer SOREL Binaries without Spark

Use ```pyarrow``` https://arrow.apache.org/docs/python/

Other python libraries:  https://stackoverflow.com/questions/32940416/methods-for-writing-parquet-files-using-python



In [3]:
import boto3
from typing import List
import datetime
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [4]:
# Source
BUCKET = 'sorel-20m'
PREFIX = '09-DEC-2020/binaries'
BUCKET_PATH = f's3://{BUCKET}'
SRC_PATH = f'{BUCKET_PATH}/{PREFIX}'

MB = 1024 * 1024

In [5]:
os.environ['AWS_PROFILE'] = 'personal'  # Needed by pyarrow
session = boto3.Session(profile_name='personal')
s3 = session.resource('s3')

def get_objects(filter_chars: str, bucket: str, prefix: str) -> List[str]:
    bucket = s3.Bucket(bucket)
    prefix = f'{prefix}/{filter_chars}'
    objects = bucket.objects.filter(Prefix=prefix)
    return objects

def get_num_objects(filter_chars):
    objects = get_objects(filter_chars, BUCKET, PREFIX)
    object_list = [o for o in objects]
    return len(object_list)

# TODO Use logger https://docs.python.org/3/howto/logging.html
def logmsg(msg):
    print(msg)

def logmsg01(msg):
    logmsg(f'{datetime.datetime.now():%Y-%m-%d %H:%M:%S}:  {msg}')

In [6]:
filter_chars = '0000'

In [8]:
objects = get_objects(filter_chars, BUCKET, PREFIX)
num_objects = len([o for o in objects])

#max_size = 1024 * MB
max_size = 10 * MB
cur_size = 0
cur_num_rec = 0
tot_size = 0

delete_target = True

# TODO Add logmsg01
logmsg01('Begins')
start_time = datetime.datetime.now()
d = {'key': [],
        'last_modified': [],
        'size': [],
        'content': []}
for i, o in enumerate(objects):
    num_rec = i + 1
    cur_num_rec += 1
    # Read from S3
    o.load()
    response = o.get()
    content = response['Body'].read()
    #content = None
    d['key'].append(o.key)
    d['last_modified'].append(o.last_modified)
    d['size'].append(o.size)
    d['content'].append(content)
    if i and i % 10 == 0:
        logmsg01(f'i={i}, key={o.key}, cumulative size={cur_size}')
    cur_size += o.size 
    tot_size += o.size
    # if num_rec > 5:
    #     break

    # TODO Write file before size exceeds the limit.  However, if a single file is larger than the max, write it.
    if cur_size > max_size or num_rec == num_objects:
    # Write to parquet
        table = pa.Table.from_pydict(d)

        logmsg01(f'Writing {cur_num_rec} keys with content.  Size={cur_size}.')
        # https://stackoverflow.com/questions/58818227/how-to-write-pyarrow-parquet-data-to-s3-bucket
        pq.write_to_dataset(table, 
                            root_path=f's3://sorel-20m-demo/output_no_spark/', 
                            #filesystem=s3,
                            existing_data_behavior="delete_matching" if delete_target else "overwrite_or_ignore",
                            compression='snappy')
        delete_target = False
        cur_size = 0
        cur_num_rec = 0
        d = {'key': [],
                'last_modified': [],
                'size': [],
                'content': []}
end_time = datetime.datetime.now()        
logmsg01(f'Complete.  Processed {num_rec} files.  Total size={tot_size}.  Elapsed={end_time - start_time}')

2023-03-21 23:02:17:  Begins
2023-03-21 23:02:23:  i=10, key=09-DEC-2020/binaries/0000147e127de86fdec59ebafb0cca8d3002dc3513ab22f21c646fb7185a48f7, cumulative size=4456296
2023-03-21 23:02:28:  Writing 18 keys with content.  Size=11411026.
2023-03-21 23:02:43:  i=20, key=09-DEC-2020/binaries/00001c81777f5519a32efe91ac906f6b85adae2fc15b8884a35ef38f73dbb4d4, cumulative size=1917716
2023-03-21 23:02:52:  Writing 6 keys with content.  Size=14218896.
2023-03-21 23:03:21:  i=30, key=09-DEC-2020/binaries/000028e58d29d53833b17255293b73726c85f385e0052d4bddcb998311bdc5f7, cumulative size=5089559
2023-03-21 23:03:32:  Writing 16 keys with content.  Size=10530652.
2023-03-21 23:03:41:  i=40, key=09-DEC-2020/binaries/000031498f0ce7879b833b9fb180285a8649878cc556bb55afe1e18af52e4df5, cumulative size=0
2023-03-21 23:03:52:  i=50, key=09-DEC-2020/binaries/00003cb2964261e0125562a5728c7e3e1035c287659840a60e0fc6d8ae63fb5b, cumulative size=6583043
2023-03-21 23:03:57:  Writing 19 keys with content.  Size=1