In [4]:
import boto3
import numpy as np
from pyspark.sql import SparkSession
from functools import partial

## Spark alone

In [5]:
# Get (or start) the Spark session named 'Untitled' and keep its
# SparkContext, which is the entry point for the RDD calls below.
sc = (
    SparkSession.builder
    .appName('Untitled')
    .getOrCreate()
    .sparkContext
)
print(sc)

<SparkContext master=local[*] appName=Untitled>


In [3]:
# Distribute the integers 0..19 as an RDD.
rdd = sc.parallelize(list(range(20)))

In [4]:
def gen_data(n, size=20):
    """Build a block of ``size`` bytes, each equal to the integer ``n``.

    Args:
        n: Integer in 0..255; it is encoded as a single byte.
        size: How many times that byte is repeated (default 20).

    Returns:
        A ``bytes`` object of length ``size``.

    Raises:
        OverflowError: If ``n`` does not fit in one unsigned byte.
    """
    return size * n.to_bytes(1, byteorder='little')

In [5]:
# Map every integer in `rdd` through gen_data: one bytes block per element.
blocks = rdd.map(gen_data)

In [6]:
# Bring the generated blocks back to the notebook and display them.
blocks.collect()

[b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
 b'\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01',
 b'\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02',
 b'\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03',
 b'\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04',
 b'\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05',
 b'\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06',
 b'\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07',
 b'\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08',
 b'\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t',
 b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n',
 b'\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b',
 b'\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\

## Boto3 alone

In [None]:
# Open an S3 client and start a multipart upload for junk.dtb. The UploadId
# shown here is what the later upload_part / complete / abort cells pass along.
s3 = boto3.client('s3')
mpu = s3.create_multipart_upload(
    Bucket='vmwishes-test',
    Key='junk.dtb',
)
mpu['UploadId']

In [None]:
# Upload part 1: 5,400,000 copies of b'H'.
# NOTE(review): the size is presumably chosen to clear S3's 5 MiB minimum
# for non-final parts -- confirm.
head_part = s3.upload_part(
    Bucket='vmwishes-test',
    Key='junk.dtb',
    PartNumber=1,
    UploadId=mpu['UploadId'],
    Body=b'H' * 5400000,
)
# Start the part manifest that complete_multipart_upload will need.
parts = [{'PartNumber': 1, 'ETag': head_part['ETag']}]

In [None]:
# Upload 20 more parts (PartNumber 2..21); part n is 5,400,000 copies of byte n.
# Keep only the head-part record first, so re-running this cell does not
# append duplicate part records to the manifest.
del parts[1:]
for n in range(20):
    # Do not name this `np`: that would rebind the notebook's numpy alias.
    part = s3.upload_part(
        Bucket='vmwishes-test',
        Key='junk.dtb',
        PartNumber=2 + n,
        UploadId=mpu['UploadId'],
        Body=5400000 * n.to_bytes(1, byteorder='little'),
    )

    parts.append({'PartNumber': 2 + n, 'ETag': part['ETag']})
    print(f"Uploaded {n}: {part['ETag']}")
    

In [None]:
# Show the PartNumber/ETag records accumulated so far.
parts

In [None]:
# Finish the upload by handing S3 the PartNumber/ETag manifest in `parts`.
result = s3.complete_multipart_upload(
    Bucket='vmwishes-test',
    Key='junk.dtb',
    UploadId=mpu['UploadId'],
    MultipartUpload={'Parts': parts},
)

In [16]:
# Abort the junk.dtb multipart upload identified by mpu['UploadId'].
# NOTE(review): this looks like a cleanup alternative to the complete cell,
# not a step to run after it -- confirm.
s3.abort_multipart_upload(
    Bucket='vmwishes-test',
    Key='junk.dtb',
    UploadId=mpu['UploadId'],
)

{'ResponseMetadata': {'RequestId': 'HPMD39ZVA22DJWN7',
  'HostId': 'XaBCsd7/d2IYHM2G7UI/kV1QFzxp/FkDSjMw3Kch9duHJEyMPdd2xQIGqZZE1WSqU12S3W6q0XQ=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': 'XaBCsd7/d2IYHM2G7UI/kV1QFzxp/FkDSjMw3Kch9duHJEyMPdd2xQIGqZZE1WSqU12S3W6q0XQ=',
   'x-amz-request-id': 'HPMD39ZVA22DJWN7',
   'date': 'Sat, 05 Jun 2021 04:37:57 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}

In [17]:
# List the bucket's multipart uploads, to check whether any are still open.
s3.list_multipart_uploads(Bucket='vmwishes-test')

{'ResponseMetadata': {'RequestId': 'N8VHNP644ZB8VKYQ',
  'HostId': '6/mHis32LxqbM0Vg5dPSe3lKvPT+72/gg2OeaegsFkHkB3lV08HnjY4C4D/oKs1jtdfHcY7o7OI=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '6/mHis32LxqbM0Vg5dPSe3lKvPT+72/gg2OeaegsFkHkB3lV08HnjY4C4D/oKs1jtdfHcY7o7OI=',
   'x-amz-request-id': 'N8VHNP644ZB8VKYQ',
   'date': 'Sat, 05 Jun 2021 04:38:00 GMT',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'Bucket': 'vmwishes-test',
 'KeyMarker': '',
 'UploadIdMarker': '',
 'NextKeyMarker': '',
 'NextUploadIdMarker': '',
 'MaxUploads': 1000,
 'IsTruncated': False}

## Boto3 + Spark

In [6]:
# Get (or start) the 'Untitled' Spark session and distribute the part
# indices 0..19 as an RDD.
sc = (
    SparkSession.builder
    .appName('Untitled')
    .getOrCreate()
    .sparkContext
)
rdd = sc.parallelize(list(range(20)))

In [18]:
# Open an S3 client and start a multipart upload for junk2.dtb. The UploadId
# shown here is what the upload, complete and abort cells below pass along.
s3 = boto3.client('s3')
mpu = s3.create_multipart_upload(
    Bucket='vmwishes-test',
    Key='junk2.dtb',
)
mpu['UploadId']

'rEH.lq5JR498uSq2IMfqDCRd8wAjkSbhk0r6a_b6.EJOU6lYQJY2ZN3u4mtBc4TQMWDWmkRrRM9WwmPGjeSqUw--'

In [19]:
def upload_func(uploadId, n, bucket='vmwishes-test', key='junk2.dtb',
                part_size=5400000):
    """Upload part ``n + 1`` of a multipart upload and return its ETag.

    The part body is ``part_size`` copies of the single byte ``n``. A new S3
    client is created on every call (presumably because the function runs
    inside Spark tasks, where the notebook's client is not available --
    confirm).

    Args:
        uploadId: UploadId of an already-created multipart upload.
        n: Zero-based part index; used as the byte value and, plus one, as
            the S3 PartNumber.
        bucket: Target bucket name.
        key: Target object key.
        part_size: Number of bytes in the uploaded part.

    Returns:
        The part's ETag string on success. If building or uploading the part
        raises, the exception is NOT propagated: a string starting with
        ``'caught:'`` that describes the error is returned instead, so
        callers must check for that prefix before treating results as ETags.
    """
    s3 = boto3.client('s3')
    try:
        part = s3.upload_part(Bucket=bucket, Key=key,
                              PartNumber=n + 1,
                              UploadId=uploadId,
                              Body=part_size * n.to_bytes(1, byteorder='little'))
    except Exception as e:
        # Return the failure as data so it shows up in the collected results.
        return f'caught:{e}  n={n}  uploadId={uploadId}'

    return part['ETag']

In [20]:
# Run upload_func over every part index in the RDD, with the UploadId bound
# as its first argument, and gather the returned strings in RDD order.
parts = (
    rdd
    .map(partial(upload_func, mpu['UploadId']))
    .collect()
)
parts

['"e4e80bf1a8de7278fabc4bf53886bc05"',
 '"54c39b5badf28f3c775a02f7c2d78e1f"',
 '"c851c4219420d2b8f2f710955fb56de5"',
 '"02d22b67104fcadfc965203fb80cdc1d"',
 '"a8da5c36041b5d25103319667d2a0c54"',
 '"e8c5d113a83d7e42ffe7d0e1df3e61dd"',
 '"9d5af2404a559f5fb956cccfb6966285"',
 '"2e63eb8b37f3d71e4240e5769d73203c"',
 '"a5539911783ad56099a58c275a8dae2d"',
 '"b19a63c6f68046dbfd0c4b3931b82eb1"',
 '"1ffc7fadb7897123f587fed226e4730c"',
 '"5db7ab3ed357e5a1d31fbd8a4ca326b3"',
 '"590a725a2aeca6c64744459814f5c6fb"',
 '"22226b8f9bf68e2ca24fa582407cc4b5"',
 '"a4c8eaeae6808ad523088ce78e27f15b"',
 '"535b2ea8a3c3416d153dad6445a95ef3"',
 '"ee8e5743208bdd5ff4ea3b354a0e69db"',
 '"2badfe70a30758cc9efa3236f933dd3f"',
 '"26d9d59cefc45055c7edb685cc36f445"',
 '"8ba328b001e041e5af659ec07b2bb787"']

In [24]:
# upload_func reports a failed part as a string starting with 'caught:'
# instead of an ETag; refuse to build the manifest from such entries.
failed = [etag for etag in parts if etag.startswith('caught:')]
if failed:
    raise RuntimeError(f'{len(failed)} part upload(s) failed: {failed}')

# Part numbers are 1-based and follow the order of `parts`.
part_info = [{'PartNumber': number, 'ETag': etag}
             for number, etag in enumerate(parts, start=1)]
part_info

[{'PartNumber': 1, 'ETag': '"e4e80bf1a8de7278fabc4bf53886bc05"'},
 {'PartNumber': 2, 'ETag': '"54c39b5badf28f3c775a02f7c2d78e1f"'},
 {'PartNumber': 3, 'ETag': '"c851c4219420d2b8f2f710955fb56de5"'},
 {'PartNumber': 4, 'ETag': '"02d22b67104fcadfc965203fb80cdc1d"'},
 {'PartNumber': 5, 'ETag': '"a8da5c36041b5d25103319667d2a0c54"'},
 {'PartNumber': 6, 'ETag': '"e8c5d113a83d7e42ffe7d0e1df3e61dd"'},
 {'PartNumber': 7, 'ETag': '"9d5af2404a559f5fb956cccfb6966285"'},
 {'PartNumber': 8, 'ETag': '"2e63eb8b37f3d71e4240e5769d73203c"'},
 {'PartNumber': 9, 'ETag': '"a5539911783ad56099a58c275a8dae2d"'},
 {'PartNumber': 10, 'ETag': '"b19a63c6f68046dbfd0c4b3931b82eb1"'},
 {'PartNumber': 11, 'ETag': '"1ffc7fadb7897123f587fed226e4730c"'},
 {'PartNumber': 12, 'ETag': '"5db7ab3ed357e5a1d31fbd8a4ca326b3"'},
 {'PartNumber': 13, 'ETag': '"590a725a2aeca6c64744459814f5c6fb"'},
 {'PartNumber': 14, 'ETag': '"22226b8f9bf68e2ca24fa582407cc4b5"'},
 {'PartNumber': 15, 'ETag': '"a4c8eaeae6808ad523088ce78e27f15b"'},
 {'P

In [25]:
# Finish the junk2.dtb upload by handing S3 the manifest in `part_info`.
result = s3.complete_multipart_upload(
    Bucket='vmwishes-test',
    Key='junk2.dtb',
    UploadId=mpu['UploadId'],
    MultipartUpload={'Parts': part_info},
)

In [26]:
# List the bucket's multipart uploads, to check whether any are still open.
s3.list_multipart_uploads(Bucket='vmwishes-test')

{'ResponseMetadata': {'RequestId': 'HVZRBXJ6W510RGR9',
  'HostId': '6PMi5P7Zb/Q3wQwE7EnHYLGaScf0Bs68uZLRQjw5SeglWL3T0t1oQ+BoeygXyadPUfxr7In3xGU=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '6PMi5P7Zb/Q3wQwE7EnHYLGaScf0Bs68uZLRQjw5SeglWL3T0t1oQ+BoeygXyadPUfxr7In3xGU=',
   'x-amz-request-id': 'HVZRBXJ6W510RGR9',
   'date': 'Sun, 06 Jun 2021 18:21:37 GMT',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'Bucket': 'vmwishes-test',
 'KeyMarker': '',
 'UploadIdMarker': '',
 'NextKeyMarker': '',
 'NextUploadIdMarker': '',
 'MaxUploads': 1000,
 'IsTruncated': False}

In [27]:
# Abort the junk2.dtb multipart upload identified by mpu['UploadId'].
# NOTE(review): this looks like a cleanup alternative to the complete cell,
# not a step to run after it -- confirm.
s3.abort_multipart_upload(
    Bucket='vmwishes-test',
    Key='junk2.dtb',
    UploadId=mpu['UploadId'],
)

{'ResponseMetadata': {'RequestId': 'QRC119E7REBKNHYX',
  'HostId': '90c4J3J0Lu6jbwnvKHDmaaG3QFIIfBshlt4oYY1Gfu+RIXPQbJmbirX4huMpOprHShzJClf/KZE=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': '90c4J3J0Lu6jbwnvKHDmaaG3QFIIfBshlt4oYY1Gfu+RIXPQbJmbirX4huMpOprHShzJClf/KZE=',
   'x-amz-request-id': 'QRC119E7REBKNHYX',
   'date': 'Sun, 06 Jun 2021 18:24:16 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}

In [16]:
# Abort a junk2.dtb multipart upload whose UploadId is pasted in literally
# rather than read from `mpu`.
# NOTE(review): presumably a leftover upload from an earlier run -- confirm.
s3.abort_multipart_upload(
    Bucket='vmwishes-test',
    Key='junk2.dtb',
    UploadId='hz_IQ7.wS1KAGkn1AqvV2LL98AZZc4Orjhj3AIq0xCep6vPVtzrP6NLGF5iNQvLT.h1ZTus54Gnpkrsj7JYk.g--',
)

{'ResponseMetadata': {'RequestId': '7BVVHRCHASZDHX8N',
  'HostId': '0HNoB8vjiJreCd02WQ5tK00fZk4B3rwP6yTx9gfW3WjS7+os//BtJ1u9uSjRl9ZsaeDiEateIIs=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': '0HNoB8vjiJreCd02WQ5tK00fZk4B3rwP6yTx9gfW3WjS7+os//BtJ1u9uSjRl9ZsaeDiEateIIs=',
   'x-amz-request-id': '7BVVHRCHASZDHX8N',
   'date': 'Sun, 06 Jun 2021 18:13:29 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}