In [1]:
import sys
import os
import re
from multiprocessing import Pool
from itertools import repeat
import warnings
from datetime import datetime
from time import sleep
import hashlib
import pandas as pd
import numpy as np

In [2]:
import bucket_manager as bm

In [3]:
# Initiate timing
start = datetime.now()
# Set the source directory, bucket name, and destination directory
subdir = 'dmu3' # change subdir to sys.argv[1] in script
source_dir = f"/rds/project/rds-rPTGgs6He74/ras81/lsst-ir-fusion/{subdir}"
log = f"{'-'.join(source_dir.split('/')[-3:])}-files.csv"
destination_dir = f"ip005-ras81-lsst-ir-fusion/{subdir}" 
folders = []
folder_files = []
ncores = 1 # change to adjust number of CPUs (= number of concurrent connections)
perform_checksum = True
upload_checksum = False
dryrun = False

In [4]:
# Add titles to log file
with open(log, 'w') as logfile: # elsewhere open(log, 'a')
    logfile.write('LOCAL_FOLDER,LOCAL_PATH,FILE_SIZE,BUCKET_NAME,DESTINATION_KEY,CHECKSUM,CHECKSUM_SIZE,CHECKSUM_KEY\n')

In [5]:
# Setup bucket
s3_host = 'echo.stfc.ac.uk'
keys = bm.get_keys(os.sep.join([os.environ['HOME'],'lsst_keys.json']))
access_key = keys['access_key']
secret_key = keys['secret_key']

In [6]:
client = bm.get_client(access_key, secret_key, s3_host)

In [7]:
client

<botocore.client.S3 at 0x2ab6d582a990>

In [8]:
bucket_name = 'csd3-backup-test'
if dryrun:
    mybucket = 'dummy_bucket'

In [9]:
if bucket_name not in bm.bucket_list(client):
    if not dryrun:
            bm.create_bucket(client,bucket_name)
            print(f'Added bucket: {bucket_name}')
else:
    if not dryrun:
        print(f'Bucket exists: {bucket_name}')
        #sys.exit('Bucket exists.')
    else:
        print(f'Bucket exists: {bucket_name}')
        print('dryrun = True, so continuing.')

Bucket exists: csd3-backup-test


In [10]:
current_objects = bm.object_list(client,bucket_name)
current_objects

['ip005-ras81-lsst-ir-fusion/dmu3/readme.md',
 'lsst-ir-fusion--f-files.csv',
 'ras81-lsst-ir-fusion-dmu3-files.csv']

In [11]:
def upload_to_bucket(s3_host,access_key,secret_key,bucket_name,folder,filename,object_name,perform_checksum,upload_checksum,dryrun):
    client = bm.get_client(access_key, secret_key, s3_host)
    file_data = open(filename, 'rb')
    if perform_checksum:
        checksum = hashlib.md5(file_data.read()).hexdigest().encode('utf-8')
        if upload_checksum and not dryrun:
            checksum_key = object_name + '.checksum'
            #create checksum object
            key = bucket.new_key(checksum_key)
            key.set_contents_from_string(checksum)
    """
    - Upload the file to the bucket
    """
    if not dryrun:
        client.upload_fileobj(file_data,bucket_name,object_name)
        # key = bucket.new_key(object_name)
        # key.set_contents_from_filename(filename)

    """
        report actions
        CSV formatted
        header: LOCAL_FOLDER,LOCAL_PATH,FILE_SIZE,BUCKET_NAME,DESTINATION_KEY,CHECKSUM,CHECKSUM_SIZE,CHECKSUM_KEY
    """
    return_string = f'{folder},{filename},{os.stat(filename).st_size},{bucket_name},{object_name}'
    if perform_checksum and upload_checksum:
        return_string += f',{checksum},{len(checksum)},{checksum_key}'
    elif perform_checksum:
        return_string += f',{checksum},n/a,n/a'
    else:
        return_string += ',n/a,n/a,n/a'
    return return_string

In [12]:
def print_stats(log,folder,file_count,total_size,folder_start,folder_end,upload_checksum):
    elapsed = folder_end - folder_start
    print(f'Finished folder {folder}, elapsed time = {elapsed}')
    elapsed_seconds = elapsed.seconds + elapsed.microseconds / 1e6
    avg_file_size = total_size / file_count / 1024**2
    if not upload_checksum:
        print(f'{file_count} files (avg {avg_file_size:.2f} MiB/file) uploaded in {elapsed_seconds:.2f} seconds, {elapsed_seconds/file_count:.2f} s/file',flush=True)
        print(f'{total_size / 1024**2:.2f} MiB uploaded in {elapsed_seconds:.2f} seconds, {total_size / 1024**2 / elapsed_seconds:.2f} MiB/s',flush=True)
    if upload_checksum:
        checksum_size = 32*file_count # checksum byte strings are 32 bytes
        total_size += checksum_size
        file_count *= 2
        print(f'{file_count} files (avg {avg_file_size:.2f} MiB/file) uploaded (including checksum files) in {elapsed_seconds:.2f} seconds, {elapsed_seconds/file_count:.2f} s/file',flush=True)
        print(f'{total_size / 1024**2:.2f} MiB uploaded (including checksum files) in {elapsed_seconds:.2f} seconds, {total_size / 1024**2 / elapsed_seconds:.2f} MiB/s',flush=True)

In [16]:
def process_files(s3_host,access_key,secret_key, bucket_name, current_objects, source_dir, destination_dir, ncores, perform_checksum, upload_checksum, dryrun, log):
    i = 0
    #processed_files = []
    with Pool(ncores) as pool: # use 4 CPUs by default - very little speed-up, might drop multiprocessing and parallelise at shell level
        #recursive loop over local folder
        for folder,subfolders,files in os.walk(source_dir):
            # check folder isn't empty
            if len(files) > 0:
                # all files within folder
                folder_files = [ os.sep.join([folder,filename]) for filename in files ]
                # keys to files on s3
                object_names = [ os.sep.join([destination_dir, os.path.relpath(filename, source_dir)]) for filename in folder_files ]
                print(folder_files)
                print(object_names)
                # remove current objects - avoids reuploading
                # could provide overwrite flag if this is desirable
                for oni,on in enumerate(object_names):
                    if on in current_objects:
                        object_names.remove(on)
                        del folder_files[oni]
                print(folder_files)
                print(object_names)
                folder_start = datetime.now()
                file_count = len(files)
                print('check for symlinks')
                for f in files:
                    if os.path.islink(f):
                        print(os.path.islink(f))
                        raise Exception("Not dealing with symlinks here yet.")
                # upload files in parallel and log output
                print(f'Uploading {file_count} files from {folder} using {ncores} processes.')
                with open(log, 'a') as logfile:
                    for result in pool.starmap(upload_to_bucket, zip(repeat(s3_host),repeat(access_key),repeat(secret_key), repeat(bucket_name), repeat(folder), folder_files, object_names, repeat(perform_checksum), repeat(upload_checksum), repeat(dryrun))):
                        logfile.write(f'{result}\n')
                folder_end = datetime.now()
                folder_files_size = np.sum(np.array([os.path.getsize(filename) for filename in folder_files]))
                print_stats(log, folder, file_count, folder_files_size, folder_start, folder_end, upload_checksum)

                # testing - stop after 1 folders
                i+=1
                if i == 1:
                    break
    # Upload log file
    if not dryrun:
        upload_to_bucket(s3_host,access_key,secret_key,bucket_name, '/', log, os.path.basename(log), False, False, False)

# Go!

In [17]:
# Process the files in parallel
print(f'Starting processing at {datetime.now()}, elapsed time = {datetime.now() - start}')
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    process_files(s3_host,access_key,secret_key, bucket_name, current_objects, source_dir, destination_dir, ncores, perform_checksum, upload_checksum, dryrun, log)

# Complete
print(f'Finished at {datetime.now()}, elapsed time = {datetime.now() - start}')

Starting processing at 2024-02-23 17:00:32.060236, elapsed time = 0:16:05.949593
['/rds/project/rds-rPTGgs6He74/ras81/lsst-ir-fusion/dmu3/readme.md']
['ip005-ras81-lsst-ir-fusion/dmu3/readme.md']
[]
[]
check for symlinks
Uploading 1 files from /rds/project/rds-rPTGgs6He74/ras81/lsst-ir-fusion/dmu3 using 1 processes.
Finished folder /rds/project/rds-rPTGgs6He74/ras81/lsst-ir-fusion/dmu3, elapsed time = 0:00:00.002559
1 files (avg 0.00 MiB/file) uploaded in 0.00 seconds, 0.00 s/file
0.00 MiB uploaded in 0.00 seconds, 0.00 MiB/s
Finished at 2024-02-23 17:01:33.082468, elapsed time = 0:17:06.971830


## Note to self: it's not going into subfolders anymore!