# Swift vs Boto3 for Parallelisation with Dask

In [2]:
import boto3
import swiftclient
import os

In [3]:
s3_keys_path = '~/.keys/lsst_keys.json'
swift_keys_path = '~/.keys/lsst-swift-credentials.json'
creds = {}

# boto3 (S3) credentials: for each matching line, the value is taken as the
# fourth '"'-delimited field of that line.
with open(os.path.expanduser(s3_keys_path), 'r') as s3_file:
    for entry in s3_file:
        if 'access_key' in entry:
            target = 'S3_ACCESS_KEY'
        elif 'secret_key' in entry:
            target = 'S3_SECRET_KEY'
        else:
            continue
        creds[target] = entry.split('"')[3]
creds['S3_HOST_URL'] = 'https://echo.stfc.ac.uk'

# Swift credentials: same line-based extraction as above.
with open(os.path.expanduser(swift_keys_path), 'r') as swift_file:
    for entry in swift_file:
        if 'user' in entry:
            target = 'ST_USER'
        elif 'secret_key' in entry:
            target = 'ST_KEY'
        else:
            continue
        creds[target] = entry.split('"')[3]
creds['ST_AUTH'] = 'https://s3.echo.stfc.ac.uk/auth/1.0'

# Some Swift operations expect environment variables,
# so mirror every entry of the creds dict into os.environ.
os.environ.update(creds)

## boto3 resource

In [4]:
session = boto3.Session(
    aws_access_key_id = creds['S3_ACCESS_KEY'],
    aws_secret_access_key = creds['S3_SECRET_KEY']
)

In [5]:
resource = session.resource(
    service_name = 's3',
    endpoint_url = creds['S3_HOST_URL']
)
resource

s3.ServiceResource()

In [6]:
# %%timeit
boto3_bucket_list = [ b.name for b in resource.buckets.all() ]

In [7]:
boto3_bucket_list

['DRP',
 'LSST-IR-FUSION',
 'LSST-IR-FUSION-Butlers',
 'LSST-IR-FUSION-TEST',
 'LSST-IR-FUSION-rdsip005',
 'LSST-IR-FUSION-testfromopenstack',
 'LSST-IR-FUSION_gen3_conversion',
 'dmu4',
 'lsst-dac',
 'lsst-drp-config',
 'lsst-test']

## Swift Connection

In [8]:
connection = swiftclient.Connection(
    user = creds['ST_USER'],
    key = creds['ST_KEY'],
    authurl = creds['ST_AUTH']
)

In [9]:
# %%timeit
containers = [ container['name'] for container in connection.get_account()[1] ]

In [10]:
containers

['DRP',
 'LSST-IR-FUSION',
 'LSST-IR-FUSION-Butlers',
 'LSST-IR-FUSION-TEST',
 'LSST-IR-FUSION-rdsip005',
 'LSST-IR-FUSION-testfromopenstack',
 'LSST-IR-FUSION_gen3_conversion',
 'dmu4',
 'lsst-dac',
 'lsst-drp-config',
 'lsst-test']

In [11]:
bucket = resource.Bucket('LSST-IR-FUSION-testfromopenstack')

In [12]:
# %%timeit
boto3_objects = [ obj.key for obj in bucket.objects.all() ]

In [13]:
boto3_objects

['dummy-lsst-backup.csv',
 'dummy/1/1_1.f',
 'dummy/1/1_2.f',
 'dummy/14/14_1.f',
 'dummy/14/14_2.f',
 'dummy/17/17_2.f',
 'dummy/18/18_1.f',
 'dummy/18/18_2.f',
 'dummy/2/2_1.f',
 'dummy/2/2_2.f',
 'dummy/21/21_1.f',
 'dummy/21/21_2.f',
 'dummy/30/30_1.f',
 'dummy/30/30_2.f',
 'dummy/31/31_1.f',
 'dummy/31/31_2.f',
 'dummy/36/36_1.f',
 'dummy/36/36_2.f',
 'dummy/44/44_1.f',
 'dummy/44/44_2.f',
 'dummy/46/46_1.f',
 'dummy/5/5_1.f',
 'dummy/5/5_2.f',
 'dummy/50/50_1.f',
 'dummy/50/50_2.f',
 'dummy/52/52_1.f',
 'dummy/52/52_2.f',
 'dummy/53/53_1.f',
 'dummy/53/53_2.f',
 'dummy/60/60_2.f',
 'dummy/67/67_1.f',
 'dummy/67/67_2.f',
 'dummy/7/7_1.f',
 'dummy/7/7_2.f',
 'dummy/72/72_1.f',
 'dummy/72/72_2.f',
 'dummy/8/8_1.f',
 'dummy/8/8_2.f',
 'dummy/81/81_1.f',
 'dummy/81/81_2.f',
 'dummy/87/87_1.f',
 'dummy/87/87_2.f',
 'dummy/88/88_1.f',
 'dummy/88/88_2.f',
 'dummy/97/97_1.f',
 'dummy/97/97_2.f',
 'dummy/98/98_1.f',
 'dummy/98/98_2.f',
 'dummy/collated_0.zip',
 'dummy/collated_0.zip.metada

In [14]:
# %%timeit
swift_objects = [ obj['name'] for obj in connection.get_container('LSST-IR-FUSION-testfromopenstack')[1] ]

In [15]:
swift_objects

['dummy-lsst-backup.csv',
 'dummy/1/1_1.f',
 'dummy/1/1_2.f',
 'dummy/14/14_1.f',
 'dummy/14/14_2.f',
 'dummy/17/17_2.f',
 'dummy/18/18_1.f',
 'dummy/18/18_2.f',
 'dummy/2/2_1.f',
 'dummy/2/2_2.f',
 'dummy/21/21_1.f',
 'dummy/21/21_2.f',
 'dummy/30/30_1.f',
 'dummy/30/30_2.f',
 'dummy/31/31_1.f',
 'dummy/31/31_2.f',
 'dummy/36/36_1.f',
 'dummy/36/36_2.f',
 'dummy/44/44_1.f',
 'dummy/44/44_2.f',
 'dummy/46/46_1.f',
 'dummy/5/5_1.f',
 'dummy/5/5_2.f',
 'dummy/50/50_1.f',
 'dummy/50/50_2.f',
 'dummy/52/52_1.f',
 'dummy/52/52_2.f',
 'dummy/53/53_1.f',
 'dummy/53/53_2.f',
 'dummy/60/60_2.f',
 'dummy/67/67_1.f',
 'dummy/67/67_2.f',
 'dummy/7/7_1.f',
 'dummy/7/7_2.f',
 'dummy/72/72_1.f',
 'dummy/72/72_2.f',
 'dummy/8/8_1.f',
 'dummy/8/8_2.f',
 'dummy/81/81_1.f',
 'dummy/81/81_2.f',
 'dummy/87/87_1.f',
 'dummy/87/87_2.f',
 'dummy/88/88_1.f',
 'dummy/88/88_2.f',
 'dummy/97/97_1.f',
 'dummy/97/97_2.f',
 'dummy/98/98_1.f',
 'dummy/98/98_2.f',
 'dummy/collated_0.zip',
 'dummy/collated_0.zip.metada

In a simple test of listing the buckets and the objects within a bucket, the boto3 API seems faster at this stage.

Now we'll try something a bit more in-depth.

In [16]:
def get_metadata_boto3(key, bucket):
    """Return the sidecar metadata text for a zip object, fetched via boto3.

    For a key ending in '.zip', the object named ``key + '.metadata'`` is read
    from ``bucket`` and decoded as UTF-8.

    Args:
        key: Object key whose metadata should be looked up.
        bucket: Object exposing ``Object(name).get()['Body'].read()``
            (e.g. a boto3 ``Bucket`` resource).

    Returns:
        The decoded metadata string, or '' when ``key`` does not end in '.zip'
        or the metadata object could not be fetched or decoded.
    """
    if not key.endswith('.zip'):
        return ''
    try:
        body = bucket.Object(key + '.metadata').get()['Body']
        return body.read().decode('UTF-8')
    except Exception:
        # Best effort: a missing or unreadable metadata object yields ''.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate, so a long-running
        # .apply() over many keys can still be interrupted.
        return ''

In [17]:
def get_metadata_swift(key, connection, container_name):
    """Return the sidecar metadata text for a zip object, fetched via Swift.

    For a key ending in '.zip', the object named ``key + '.metadata'`` is read
    from ``container_name`` through ``connection`` and decoded as UTF-8.

    Args:
        key: Object key whose metadata should be looked up.
        connection: Object exposing ``get_object(container, name)`` whose
            result holds the object body at index 1 (e.g. a
            ``swiftclient.Connection``).
        container_name: Name of the container holding the objects.

    Returns:
        The decoded metadata string, or '' when ``key`` does not end in '.zip'
        or the metadata object could not be fetched or decoded.
    """
    if not key.endswith('.zip'):
        return ''
    try:
        payload = connection.get_object(container_name, key + '.metadata')[1]
        return payload.decode('UTF-8')
    except Exception:
        # Best effort: a missing or unreadable metadata object yields ''.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate, so a long-running
        # .apply() over many keys can still be interrupted.
        return ''

In [18]:
# %%timeit
metadata_boto3 = get_metadata_boto3('dummy/collated_7.zip', bucket)

In [19]:
metadata_boto3

'18/18_1.f|44/44_1.f|44/44_2.f'

In [20]:
# %%timeit
metadata_swift = get_metadata_swift('dummy/collated_7.zip', connection, 'LSST-IR-FUSION-testfromopenstack')

In [21]:
metadata_swift

'18/18_1.f|44/44_1.f|44/44_2.f'

Now switch to Pandas, so we can use DataFrame.apply() to run these functions on every row of a DataFrame

In [22]:
import pandas as pd

In [23]:
# Repeat the object listing ten times to build a larger workload.
# List repetition is used instead of a `for _ in range(10)` loop because the
# loop would leave `_` bound in the notebook namespace, shadowing IPython's
# last-output variable.
expanded_objects = boto3_objects * 10

In [24]:
len(expanded_objects)

800

In [25]:
objects_boto3 = pd.DataFrame.from_dict({'key':expanded_objects},dtype=str)

In [26]:
objects_boto3

Unnamed: 0,key
0,dummy-lsst-backup.csv
1,dummy/1/1_1.f
2,dummy/1/1_2.f
3,dummy/14/14_1.f
4,dummy/14/14_2.f
...,...
795,dummy/collated_65.zip.metadata
796,dummy/collated_66.zip
797,dummy/collated_66.zip.metadata
798,dummy/collated_7.zip


In [28]:
%%timeit
objects_boto3['metadata'] = objects_boto3['key'].apply(get_metadata_boto3, bucket=bucket)

19.6 s ± 716 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
objects_boto3[objects_boto3['key'].str.endswith('.zip')]

Unnamed: 0,key,metadata
48,dummy/collated_0.zip,7/7_1.f|7/7_2.f|98/98_2.f
50,dummy/collated_1.zip,98/98_1.f|53/53_2.f|53/53_1.f
52,dummy/collated_2.zip,50/50_2.f|50/50_1.f|67/67_2.f
54,dummy/collated_3.zip,67/67_1.f|2/2_2.f|2/2_1.f
56,dummy/collated_4.zip,8/8_1.f|8/8_2.f|87/87_2.f
...,...,...
790,dummy/collated_63.zip,81/81_2.f|52/52_1.f|52/52_2.f
792,dummy/collated_64.zip,31/31_1.f|31/31_2.f|30/30_2.f
794,dummy/collated_65.zip,30/30_1.f|36/36_2.f|36/36_1.f
796,dummy/collated_66.zip,14/14_2.f|14/14_1.f


In [30]:
objects_swift = pd.DataFrame.from_dict({'key':expanded_objects},dtype=str)

In [31]:
objects_swift

Unnamed: 0,key
0,dummy-lsst-backup.csv
1,dummy/1/1_1.f
2,dummy/1/1_2.f
3,dummy/14/14_1.f
4,dummy/14/14_2.f
...,...
795,dummy/collated_65.zip.metadata
796,dummy/collated_66.zip
797,dummy/collated_66.zip.metadata
798,dummy/collated_7.zip


In [32]:
%%timeit
objects_swift['metadata'] = objects_swift['key'].apply(get_metadata_swift, connection=connection, container_name='LSST-IR-FUSION-testfromopenstack')

18.8 s ± 740 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [33]:
objects_swift[objects_swift['key'].str.endswith('.zip')]

Unnamed: 0,key,metadata
48,dummy/collated_0.zip,7/7_1.f|7/7_2.f|98/98_2.f
50,dummy/collated_1.zip,98/98_1.f|53/53_2.f|53/53_1.f
52,dummy/collated_2.zip,50/50_2.f|50/50_1.f|67/67_2.f
54,dummy/collated_3.zip,67/67_1.f|2/2_2.f|2/2_1.f
56,dummy/collated_4.zip,8/8_1.f|8/8_2.f|87/87_2.f
...,...,...
790,dummy/collated_63.zip,81/81_2.f|52/52_1.f|52/52_2.f
792,dummy/collated_64.zip,31/31_1.f|31/31_2.f|30/30_2.f
794,dummy/collated_65.zip,30/30_1.f|36/36_2.f|36/36_1.f
796,dummy/collated_66.zip,14/14_2.f|14/14_1.f


Swift and boto3 still take around the same amount of time to give us the metadata for all of the zip objects using Pandas and our metadata functions to fetch it.
Both have established connection (or resource) objects that are already authenticated with the server.
Both are bound by the internet connection speed.

Let's try to parallelise them using Dask

## Dask parallelisation

Note: this will use a parallel version of the pandas functionality, _not_ parallel versions of the boto3 or Swift functions.

In [34]:
from dask import dataframe as ddf
from distributed import Client

In [35]:
client = Client()

In [36]:
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 12,Total memory: 7.43 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:46787,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 12
Started: Just now,Total memory: 7.43 GiB

0,1
Comm: tcp://127.0.0.1:33695,Total threads: 3
Dashboard: http://127.0.0.1:32937/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:43553,
Local directory: /tmp/dask-scratch-space/worker-ei_tmq24,Local directory: /tmp/dask-scratch-space/worker-ei_tmq24

0,1
Comm: tcp://127.0.0.1:35461,Total threads: 3
Dashboard: http://127.0.0.1:40249/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:44997,
Local directory: /tmp/dask-scratch-space/worker-le9ulp3d,Local directory: /tmp/dask-scratch-space/worker-le9ulp3d

0,1
Comm: tcp://127.0.0.1:46223,Total threads: 3
Dashboard: http://127.0.0.1:39599/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:39931,
Local directory: /tmp/dask-scratch-space/worker-2cfeus67,Local directory: /tmp/dask-scratch-space/worker-2cfeus67

0,1
Comm: tcp://127.0.0.1:42045,Total threads: 3
Dashboard: http://127.0.0.1:41373/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:37837,
Local directory: /tmp/dask-scratch-space/worker-pjbn8lf6,Local directory: /tmp/dask-scratch-space/worker-pjbn8lf6


In [37]:
objects_swift = pd.DataFrame.from_dict({'key':expanded_objects},dtype=str)

In [38]:
# Split the frame into len(expanded_objects) // 12 partitions (66 for the 800
# rows used here, i.e. roughly 12 rows per partition).
# NOTE(review): 12 presumably mirrors the cluster's total thread count — confirm.
objects_swift_dask = ddf.from_pandas(objects_swift, npartitions=len(expanded_objects)//12)

In [39]:
objects_swift_dask

Unnamed: 0_level_0,key
npartitions=66,Unnamed: 1_level_1
0,string
13,...
...,...
788,...
799,...


In [40]:
# Builds a lazy Dask column: no objects are fetched until .compute() is called.
# `meta` declares the expected output Series type so Dask does not have to infer it.
objects_swift_dask['metadata'] = objects_swift_dask['key'].apply(get_metadata_swift, connection=connection, container_name='LSST-IR-FUSION-testfromopenstack', meta=pd.Series(dtype=str))

In [41]:
objects_swift_dask

Unnamed: 0_level_0,key,metadata
npartitions=66,Unnamed: 1_level_1,Unnamed: 2_level_1
0,string,object
13,...,...
...,...,...
788,...,...
799,...,...


In [42]:
%%timeit
objects_swift = objects_swift_dask.compute()

2.43 s ± 297 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [43]:
objects_swift

Unnamed: 0,key
0,dummy-lsst-backup.csv
1,dummy/1/1_1.f
2,dummy/1/1_2.f
3,dummy/14/14_1.f
4,dummy/14/14_2.f
...,...
795,dummy/collated_65.zip.metadata
796,dummy/collated_66.zip
797,dummy/collated_66.zip.metadata
798,dummy/collated_7.zip


In [44]:
objects_boto3 = pd.DataFrame.from_dict({'key':expanded_objects},dtype=str)

In [45]:
# Split the frame into len(expanded_objects) // 12 partitions (66 for the 800
# rows used here, i.e. roughly 12 rows per partition).
# NOTE(review): 12 presumably mirrors the cluster's total thread count — confirm.
objects_boto3_dask = ddf.from_pandas(objects_boto3, npartitions=len(expanded_objects)//12)

In [46]:
objects_boto3_dask

Unnamed: 0_level_0,key
npartitions=66,Unnamed: 1_level_1
0,string
13,...
...,...
788,...
799,...


In [47]:
# NOTE: this call is expected to fail. Dask raises a TokenizationError because
# the boto3 Bucket object passed as a keyword argument cannot be
# deterministically hashed.
objects_boto3_dask['metadata'] = objects_boto3_dask['key'].apply(get_metadata_boto3, bucket=bucket, meta=pd.Series(dtype=str))

TokenizationError: Object s3.Bucket(name='LSST-IR-FUSION-testfromopenstack') cannot be deterministically hashed. See https://docs.dask.org/en/latest/custom-collections.html#implementing-deterministic-hashing for more information.

## Conclusion

- OpenStack Swift API is not 10 times faster than S3 API, it's about as fast.
- OpenStack python-swiftclient is not 10 times faster than boto3, it's about as fast.
- Using swiftclient with Pandas is not 10 times faster than using boto3 with Pandas, it's about as fast.
- Using swiftclient with Dask _is_ __at least__ 10 times faster than using either swiftclient or boto3 with Pandas.
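- Using boto3 with Dask is impossible with this approach: passing a boto3 `Bucket` object to `apply()` raises a `TokenizationError`, because the object cannot be deterministically hashed.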

Therefore, the combination of swiftclient and Dask is __at least__ 10 times faster than the de facto industry standards of boto3 and Pandas, because Dask is faster than Pandas, and swiftclient objects __are__ _picklable_ and boto3 objects __are not__ _picklable_.


## Appendix

Minimal example of pickling.