# S3 bucket search using a prefix does not find keys greater than that prefix

To search a bucket efficiently using prefixes (as in `.\sorel\01. bench_s3_ls\ls_sorel_bin_multiproc.py`), we need to ensure that the prefixes we select together cover all keys.

The approach in the program above does not do that.

In [None]:
# https://stackoverflow.com/questions/75759423/high-volume-read-of-many-keys-in-an-s3-bucket/75779792#75779792
import boto3
import ctypes
import multiprocessing

In [None]:
def validate_hex_digits(s3, bucket, prefix):
    """Check that keys under ``prefix`` continue with a lowercase hex digit.

    Two single-key listings are issued against ``bucket``:

    1. The first key returned for ``prefix`` must have a lowercase hex digit
       (``0-9`` or ``a-f``) as the character immediately after the prefix.
    2. No key may start with ``prefix + "g"``.

    Args:
        s3: Client object exposing ``list_objects_v2`` (e.g. a boto3 S3
            client).
        bucket: Name of the bucket to probe.
        prefix: Key prefix under which the next character is checked.

    Raises:
        AssertionError: If no key exists under ``prefix``, if the first key
            does not continue with a lowercase hex digit, or if any key
            starts with ``prefix + "g"``.
    """
    hex_digits = "0123456789abcdef"

    # Explicit raises (not ``assert`` statements) so the checks still run
    # when Python is started with -O.
    resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=1)
    contents = resp.get('Contents')
    if not contents:
        raise AssertionError(f"no keys found under prefix {prefix!r}")

    first_key = contents[0]['Key']
    # Slice instead of index so a key equal to the prefix yields '' rather
    # than an IndexError.
    next_char = first_key[len(prefix):len(prefix) + 1]
    # Membership test rather than a range comparison: '0' <= c <= 'f' would
    # also accept ':', '@', 'A'-'Z', '_' and other non-hex characters.
    if not next_char or next_char not in hex_digits:
        raise AssertionError(
            f"key {first_key!r} does not continue with a hex digit "
            f"after prefix {prefix!r}"
        )

    # Make sure there's nothing starting with 'g'.
    # NOTE: a Prefix listing only matches keys that begin with prefix + "g";
    # keys whose next character sorts later (e.g. 'h') are not detected here.
    resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix + "g", MaxKeys=1)
    if 'Contents' in resp:
        raise AssertionError(f"found keys under prefix {prefix + 'g'!r}")

In [30]:
# Bucket and key prefix that the checks below are run against
bucket = 'sorel-20m'
prefix = '09-DEC-2020/binaries/'

s3 = boto3.client('s3')

# Check the prefix itself, then each of its 16 one-hex-digit extensions, so
# that the first two characters after the prefix are both validated.
validate_hex_digits(s3, bucket, prefix)
for hex_digit in "0123456789abcdef":
    validate_hex_digits(s3, bucket, prefix + hex_digit)

In [28]:
# Example of a full object key under the prefix
key = '09-DEC-2020/binaries/0000029bfead495a003e43a7ab8406c6209ffb7d5e59dd212607aa358bfd66ea'
# Alternative prefix: key[:-25] (the example key without its last 25 characters)
key_short = prefix + '00000'
resp = s3.list_objects_v2(Bucket=bucket, Prefix=key_short)
# Print every object entry returned for this prefix
for entry in resp['Contents']:
    print(entry)

{'Key': '09-DEC-2020/binaries/0000029bfead495a003e43a7ab8406c6209ffb7d5e59dd212607aa358bfd66ea', 'LastModified': datetime.datetime(2020, 12, 1, 20, 39, 23, tzinfo=tzutc()), 'ETag': '"e532ad9cf73b762766117b0e64336702"', 'Size': 179128, 'StorageClass': 'INTELLIGENT_TIERING'}
{'Key': '09-DEC-2020/binaries/000003b99c3d4b9860ad0b0ca43450603e5322f2cca3c9b3d543a2d6440305a0', 'LastModified': datetime.datetime(2020, 12, 1, 20, 39, 23, tzinfo=tzutc()), 'ETag': '"29ad9300447e6b57bc73da72f2330b2d"', 'Size': 1786, 'StorageClass': 'STANDARD'}
{'Key': '09-DEC-2020/binaries/00000533148c26bcc09ab44b1acafe32dde93773d4a7e3dbd06c8232db5e437f', 'LastModified': datetime.datetime(2020, 12, 1, 20, 39, 23, tzinfo=tzutc()), 'ETag': '"73474271cd24ddcc0f1e62e53001532b"', 'Size': 2687186, 'StorageClass': 'INTELLIGENT_TIERING'}
{'Key': '09-DEC-2020/binaries/000005920ff4eb85cfc74fd51ef1d5d7518dc16f6cb5c53f94f619473321d594', 'LastModified': datetime.datetime(2020, 12, 1, 20, 39, 23, tzinfo=tzutc()), 'ETag': '"23959ed

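None of the keys above has '9' in the 6th position following the prefix.

Searching with a prefix that has '9' in that position does not return the keys that sort after it (e.g. those with 'b' in that position): a prefix search returns only keys that begin with the prefix, so here the response contains no 'Contents' at all.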

In [29]:
# Extend the prefix by one more character ('9') and list again
key_short = prefix + '000009'
resp = s3.list_objects_v2(Bucket=bucket, Prefix=key_short)
# 'Contents' is indexed directly, so a response without it raises KeyError
for entry in resp['Contents']:
    print(entry)

KeyError: 'Contents'