# Check Swift ETag vs hashlib md5

In [1]:
import os
from bucket_manager import bucket_manager as bm
import pandas as pd

In [2]:
# Path to the Swift credentials file (user-relative; expanded below).
swift_keys_path = '~/.keys/lsst-swift-credentials.json'
creds = {}

# Scan the credentials file one line at a time. The value is taken as the
# fourth double-quote-delimited field, i.e. lines shaped like `"name": "value"`.
with open(os.path.expanduser(swift_keys_path), 'r') as swiftkeys:
    for entry in swiftkeys:
        if 'user' in entry:
            creds['ST_USER'] = entry.split('"')[3]
        elif 'secret_key' in entry:
            creds['ST_KEY'] = entry.split('"')[3]
creds['ST_AUTH'] = 'https://s3.echo.stfc.ac.uk/auth/1.0'

# Some Swift operations expect these settings as environment variables,
# so mirror the whole creds dict into os.environ.
os.environ.update(creds)

assert bm.check_keys(api='swift')

## Swift Connection

In [3]:
# Create the Swift connection object through the bucket_manager helper;
# every later cell talks to the object store through this handle.
connection = bm.get_conn_swift()

In [4]:
# %%timeit
# get_account() is indexed with [1] to reach the per-container records;
# only each record's 'name' field is kept.
containers = [ container['name'] for container in connection.get_account()[1] ]

In [5]:
# Show the container names found on the account.
containers

['DRP',
 'LSST-IR-FUSION',
 'LSST-IR-FUSION-Butlers',
 'LSST-IR-FUSION-TEST',
 'LSST-IR-FUSION-rdsip005',
 'LSST-IR-FUSION-testfromopenstack',
 'LSST-IR-FUSION_gen3_conversion',
 'dmu4',
 'lsst-dac',
 'lsst-drp-config',
 'lsst-test']

In [6]:
# %%timeit
# List the test container and keep only each object's 'name' field
# (element [1] of the get_container() result holds the per-object records).
swift_objects = [ obj['name'] for obj in connection.get_container('LSST-IR-FUSION-testfromopenstack')[1] ]

In [7]:
# Show the object names in the container.
# NOTE(review): this prints the entire list; a slice such as swift_objects[:10]
# would keep the saved notebook output short.
swift_objects

['dummy-lsst-backup.csv',
 'dummy/1/1_1.f',
 'dummy/1/1_2.f',
 'dummy/14/14_1.f',
 'dummy/14/14_2.f',
 'dummy/17/17_2.f',
 'dummy/18/18_1.f',
 'dummy/18/18_2.f',
 'dummy/2/2_1.f',
 'dummy/2/2_2.f',
 'dummy/21/21_1.f',
 'dummy/21/21_2.f',
 'dummy/30/30_1.f',
 'dummy/30/30_2.f',
 'dummy/31/31_1.f',
 'dummy/31/31_2.f',
 'dummy/36/36_1.f',
 'dummy/36/36_2.f',
 'dummy/44/44_1.f',
 'dummy/44/44_2.f',
 'dummy/46/46_1.f',
 'dummy/5/5_1.f',
 'dummy/5/5_2.f',
 'dummy/50/50_1.f',
 'dummy/50/50_2.f',
 'dummy/52/52_1.f',
 'dummy/52/52_2.f',
 'dummy/53/53_1.f',
 'dummy/53/53_2.f',
 'dummy/60/60_2.f',
 'dummy/67/67_1.f',
 'dummy/67/67_2.f',
 'dummy/7/7_1.f',
 'dummy/7/7_2.f',
 'dummy/72/72_1.f',
 'dummy/72/72_2.f',
 'dummy/8/8_1.f',
 'dummy/8/8_2.f',
 'dummy/81/81_1.f',
 'dummy/81/81_2.f',
 'dummy/87/87_1.f',
 'dummy/87/87_2.f',
 'dummy/88/88_1.f',
 'dummy/88/88_2.f',
 'dummy/97/97_1.f',
 'dummy/97/97_2.f',
 'dummy/98/98_1.f',
 'dummy/98/98_2.f',
 'dummy/collated_0.zip',
 'dummy/collated_0.zip.metada

In a simple test of listing buckets and the objects within a bucket, the boto3 API seems faster at this stage.

Now we'll try something a bit more in-depth.

In [8]:
def get_headers(key, connection, container_name):
    """Fetch the HEAD response headers for one object, or None on failure.

    Parameters
    ----------
    key : str
        Name of the object inside the container.
    connection
        Any object exposing ``head_object(container, obj)``; in this notebook
        it is the Swift connection created earlier.
    container_name : str
        Name of the container that holds ``key``.

    Returns
    -------
    dict or None
        Whatever ``connection.head_object`` returns, or ``None`` when the
        call raised an ordinary exception.
    """
    try:
        return connection.head_object(container_name, key)
    except Exception:
        # Best-effort lookup: a failed HEAD becomes None so that a bulk
        # ``apply`` over many keys keeps going. Catching Exception rather
        # than using a bare ``except`` lets KeyboardInterrupt and SystemExit
        # propagate, so a long-running scan can still be interrupted.
        return None

In [9]:
# One row per object name from the container listing.
df = pd.DataFrame(swift_objects, columns=['key'])
# Issue a HEAD request per object; get_headers returns None when the request fails.
df['header'] = df['key'].apply(lambda x: get_headers(x, connection, 'LSST-IR-FUSION-testfromopenstack'))
df

Unnamed: 0,key,header
0,dummy-lsst-backup.csv,"{'content-length': '2469', 'accept-ranges': 'b..."
1,dummy/1/1_1.f,"{'content-length': '0', 'accept-ranges': 'byte..."
2,dummy/1/1_2.f,"{'content-length': '0', 'accept-ranges': 'byte..."
3,dummy/14/14_1.f,"{'content-length': '0', 'accept-ranges': 'byte..."
4,dummy/14/14_2.f,"{'content-length': '0', 'accept-ranges': 'byte..."
...,...,...
75,dummy/collated_65.zip.metadata,"{'content-length': '29', 'accept-ranges': 'byt..."
76,dummy/collated_66.zip,"{'content-length': '71303378', 'accept-ranges'..."
77,dummy/collated_66.zip.metadata,"{'content-length': '19', 'accept-ranges': 'byt..."
78,dummy/collated_7.zip,"{'content-length': '106955056', 'accept-ranges..."


In [10]:
# Take the first row's header dict and display it to see which fields are available.
d = df['header'][0]
d

{'content-length': '2469',
 'accept-ranges': 'bytes',
 'last-modified': 'Tue, 04 Feb 2025 11:54:21 GMT',
 'x-timestamp': '1738670061.55192',
 'etag': '6e6565cc676378490db89c87857ed823',
 'x-trans-id': 'tx00000e1b82ddb7eb84d9d-0067bde4ee-2fd9a218e-default',
 'x-openstack-request-id': 'tx00000e1b82ddb7eb84d9d-0067bde4ee-2fd9a218e-default',
 'content-type': 'multipart/mixed',
 'date': 'Tue, 25 Feb 2025 15:42:38 GMT'}

In [11]:
def get_etag(header):
    """Return the ``'etag'`` entry of a header mapping, or None if unavailable.

    Parameters
    ----------
    header : dict or None
        Header mapping for one object, or ``None`` when no header was
        obtained for it.

    Returns
    -------
    The value stored under ``'etag'``, or ``None`` when ``header`` is not
    subscriptable by string key (e.g. ``None``) or has no ``'etag'`` entry.
    """
    try:
        return header['etag']
    except (KeyError, TypeError):
        # KeyError: the mapping has no 'etag'; TypeError: header is None or
        # otherwise not a mapping. Anything else (including Ctrl-C) is
        # deliberately allowed to propagate instead of being swallowed.
        return None

In [12]:
# Derive the ETag column from each row's header (None where there is no usable header).
df['etag'] = df['header'].apply(get_etag)

In [13]:
# Display the table now that the etag column has been added.
df

Unnamed: 0,key,header,etag
0,dummy-lsst-backup.csv,"{'content-length': '2469', 'accept-ranges': 'b...",6e6565cc676378490db89c87857ed823
1,dummy/1/1_1.f,"{'content-length': '0', 'accept-ranges': 'byte...",d41d8cd98f00b204e9800998ecf8427e
2,dummy/1/1_2.f,"{'content-length': '0', 'accept-ranges': 'byte...",d41d8cd98f00b204e9800998ecf8427e
3,dummy/14/14_1.f,"{'content-length': '0', 'accept-ranges': 'byte...",d41d8cd98f00b204e9800998ecf8427e
4,dummy/14/14_2.f,"{'content-length': '0', 'accept-ranges': 'byte...",d41d8cd98f00b204e9800998ecf8427e
...,...,...,...
75,dummy/collated_65.zip.metadata,"{'content-length': '29', 'accept-ranges': 'byt...",d09468b58628a6279416b6317f9b6c47
76,dummy/collated_66.zip,"{'content-length': '71303378', 'accept-ranges'...",d7e9fbbb851da3197c318abba27313e5
77,dummy/collated_66.zip.metadata,"{'content-length': '19', 'accept-ranges': 'byt...",dc056d37c09c9e010fadcb0737fd668c
78,dummy/collated_7.zip,"{'content-length': '106955056', 'accept-ranges...",0716b629b5b98730571f798191a8a809


In [14]:
# Show the connection object's repr to confirm it is still defined.
connection

<swiftclient.client.Connection at 0x7fb5928f2dd0>

In [24]:
def get_etag(key, bucket_name, connection):
    """Return the ``'etag'`` header from a HEAD request on a single object.

    NOTE(review): this reuses the name of the earlier one-argument
    ``get_etag(header)`` helper in this notebook and replaces it once this
    cell has run, so re-running the earlier ``df['etag']`` cell afterwards
    would call this three-argument version instead.

    Parameters
    ----------
    key : str
        Object name within the container.
    bucket_name : str
        Container to look in.
    connection
        Object exposing ``head_object(container, obj)``.

    Unlike ``get_headers``, nothing is caught here: errors raised by
    ``head_object`` and a missing ``'etag'`` entry both propagate.
    """
    response_headers = connection.head_object(bucket_name, key)
    return response_headers['etag']

In [25]:
# Container and the two object keys (one .json log, one .fits image) used for the ETag check.
bucket_name = 'LSST-IR-FUSION-Butlers'
keys = [
    'butler_full_20221201/data/u/ir-shir1/DRP/vikingSingleFrame/20221208T120837Z/isr_log/20140302/896356/isr_log_VIRCAM_VIRCAM-Ks_896356_ccd6_u_ir-shir1_DRP_vikingSingleFrame_20221208T120837Z.json',
    'butler_full_20221201/data/u/ir-shir1/DRP/vikingSingleFrame/20221208T120837Z/calexp/20110627/K/VIRCAM-Ks/314139/calexp_VIRCAM_K_VIRCAM-Ks_314139_ccd3_u_ir-shir1_DRP_vikingSingleFrame_20221208T120837Z.fits',
]

In [26]:
# Look up the ETag of every selected key, in the same order as `keys`.
etags = [get_etag(object_key, bucket_name, connection) for object_key in keys]

In [27]:
# Show the ETags returned for the two keys.
etags

['122206487683ef9a4a44ddd362a15ba8', '42f7a4b3bd16f5f58599a569307e11c2']

In [28]:
# Full HEAD response for the second key (the .fits file).
# NOTE(review): the name suggests this object's ETag is suspected not to match
# its MD5 — confirm and say so in a markdown cell.
wrong_header = connection.head_object(bucket_name,keys[1])
wrong_header

{'content-length': '30191040',
 'accept-ranges': 'bytes',
 'last-modified': 'Tue, 25 Feb 2025 12:52:04 GMT',
 'x-timestamp': '1740487924.55205',
 'etag': '42f7a4b3bd16f5f58599a569307e11c2',
 'x-trans-id': 'tx00000c555350ea4cb1c97-0067bde7d0-2fd9af127-default',
 'x-openstack-request-id': 'tx00000c555350ea4cb1c97-0067bde7d0-2fd9af127-default',
 'content-type': 'image/fits',
 'date': 'Tue, 25 Feb 2025 15:54:56 GMT',
 'connection': 'Keep-Alive'}

In [30]:
# Download the same object; element [1] of the get_object() result is kept as
# the payload that is hashed and measured in the cells below.
data = connection.get_object(bucket_name, keys[1])[1]

In [31]:
import hashlib

In [32]:
# Hex MD5 digest of the downloaded payload.
# NOTE(review): no visible cell compares this with the ETag from the HEAD
# response, although that comparison is the stated purpose of the notebook.
md5sum = hashlib.md5(data).hexdigest()

In [36]:
# `parts` is initialised here but not filled in by any visible cell.
parts = []
# Number of complete 512 MiB chunks in the payload (floor division).
# NOTE(review): this is 0 for any object smaller than 512 MiB; if the intent is
# "how many parts would an upload use", ceiling division is needed — confirm.
num_parts = len(data) // (512*1024**2)
# data.seek(0)
num_parts

0

In [37]:
# Length of the downloaded payload, for comparison with the 'content-length' header shown above.
len(data)

30191040