In [None]:
import boto3
import numpy as np
from pyspark.sql import SparkSession
from functools import partial

## Spark alone

In [None]:
# Get (or create) the Spark session named 'Untitled' and keep its SparkContext.
sc = (
    SparkSession.builder
    .appName('Untitled')
    .getOrCreate()
    .sparkContext
)
print(sc)

In [None]:
# Distribute the integers 0..19 as an RDD.
rdd = sc.parallelize(list(range(20)))

In [None]:
def gen_data(n):
    """Return a 20-byte block in which every byte has the value ``n``.

    ``n`` is encoded as a single byte, so it must fit in one unsigned byte;
    ``int.to_bytes`` raises ``OverflowError`` otherwise.
    """
    single_byte = n.to_bytes(1, byteorder='little')
    return single_byte * 20

In [None]:
# Build an RDD that applies gen_data to each element of rdd.
blocks = rdd.map(gen_data)

In [None]:
# Gather the generated byte blocks into a local list and display them.
blocks.collect()

## Boto3 alone

In [None]:
# Create an S3 client and start a multipart upload for 'junk3.dtb'.
# The UploadId in the response identifies this upload in every later call.
s3 = boto3.client('s3')
mpu = s3.create_multipart_upload(
    Bucket='vmwishes-test',
    Key='junk3.dtb',
)
mpu['UploadId']

In [None]:
# Upload part 1: 5,400,000 bytes of b'H'.
# NOTE(review): the size is presumably chosen to clear S3's minimum part size
# for multipart uploads -- confirm against the S3 limits.
head_part = s3.upload_part(
    Bucket='vmwishes-test',
    Key='junk3.dtb',
    UploadId=mpu['UploadId'],
    PartNumber=1,
    Body=b'H' * 5400000,
)
# Start the part manifest (PartNumber + ETag) needed to complete the upload.
parts = [dict(PartNumber=1, ETag=head_part['ETag'])]

In [None]:
# Upload 20 further parts (part numbers 2..21). Part `2 + n` consists of
# 5,400,000 copies of the single byte with value `n`. Each part's ETag is
# appended to `parts` for the completion call.
#
# The response is bound to `part_response`, not `np`: using `np` here would
# overwrite the module-level `import numpy as np` alias for the rest of the
# kernel session.
#
# NOTE: this cell appends to `parts`, so re-running it without re-running the
# cell that initialises `parts` will produce duplicate entries.
for n in range(20):
    part_number = 2 + n
    part_response = s3.upload_part(Bucket='vmwishes-test', Key='junk3.dtb',
                                   PartNumber=part_number,
                                   UploadId=mpu['UploadId'],
                                   Body=5400000*(n.to_bytes(1, byteorder='little')))

    parts.append({'PartNumber': part_number, 'ETag': part_response['ETag']})
    print(f"Uploaded {n}: {part_response['ETag']}")
    

In [None]:
# Display the accumulated part manifest (PartNumber / ETag pairs).
parts

In [None]:
# Finish the 'junk3.dtb' multipart upload by submitting the part manifest.
result = s3.complete_multipart_upload(
    Bucket='vmwishes-test',
    Key='junk3.dtb',
    UploadId=mpu['UploadId'],
    MultipartUpload=dict(Parts=parts),
)

In [None]:
# Abort the multipart upload identified by mpu['UploadId'].
# The Key must be the one the upload was created with ('junk3.dtb'); the
# UploadId belongs to that key, so any other key targets the wrong object.
# NOTE(review): presumably a cleanup step for an upload that was NOT
# completed -- confirm whether it should be run after a successful completion.
s3.abort_multipart_upload(Bucket='vmwishes-test',
                          Key='junk3.dtb',
                          UploadId=mpu['UploadId'])

In [None]:
# Display the multipart-upload listing for the bucket, e.g. to check whether
# any upload is still open.
s3.list_multipart_uploads(Bucket='vmwishes-test')

## Boto3 + Spark

In [None]:
# Get (or create) the Spark session and distribute the integers 0..19;
# each element drives the upload of one part.
sc = (
    SparkSession.builder
    .appName('Untitled')
    .getOrCreate()
    .sparkContext
)
rdd = sc.parallelize(list(range(20)))

In [None]:
# Create an S3 client and start the multipart upload for 'junk4.dtb'.
# The UploadId is passed to the upload function that runs inside Spark.
s3 = boto3.client('s3')
mpu = s3.create_multipart_upload(
    Bucket='vmwishes-test',
    Key='junk4.dtb',
)
mpu['UploadId']

In [None]:
def upload_func(uploadId, n):
    """Upload part ``n + 1`` of the 'junk4.dtb' multipart upload.

    A new boto3 S3 client is created on every call. The part body is
    5,400,000 copies of the single byte whose value is ``n``.

    Args:
        uploadId: UploadId of the multipart upload the part belongs to.
        n: Integer used as the fill byte and, offset by one, as the part number.

    Returns:
        The ``ETag`` from the ``upload_part`` response on success. If building
        or sending the request raises, a ``'caught:...'`` string containing the
        exception, ``n`` and ``uploadId`` is returned in place of an ETag, so
        callers must inspect the result before treating it as an ETag.
    """
    client = boto3.client('s3')
    try:
        response = client.upload_part(
            Bucket='vmwishes-test',
            Key='junk4.dtb',
            PartNumber=n + 1,
            UploadId=uploadId,
            Body=n.to_bytes(1, byteorder='little') * 5400000,
        )
    except Exception as err:
        # Report the failure in-band instead of raising.
        return f'caught:{err}  n={n}  uploadId={uploadId}'
    else:
        return response['ETag']

In [None]:
# Run upload_func over the RDD with the UploadId pre-bound, and collect one
# result per element: an ETag, or a 'caught:...' error string.
parts = (
    rdd
    .map(partial(upload_func, mpu['UploadId']))
    .collect()
)
parts

In [None]:
# Pair each collected ETag with its 1-based position to form the manifest
# that the completion call expects.
part_info = [
    {'PartNumber': part_number, 'ETag': etag}
    for part_number, etag in enumerate(parts, start=1)
]
part_info

In [None]:
# Finish the 'junk4.dtb' multipart upload using the manifest in part_info.
result = s3.complete_multipart_upload(
    Bucket='vmwishes-test',
    Key='junk4.dtb',
    UploadId=mpu['UploadId'],
    MultipartUpload=dict(Parts=part_info),
)

In [None]:
# Display the multipart-upload listing for the bucket, e.g. to check whether
# any upload is still open.
s3.list_multipart_uploads(Bucket='vmwishes-test')

In [None]:
# Abort the multipart upload identified by mpu['UploadId'].
# The Key must be the one the upload was created with ('junk4.dtb'); the
# UploadId belongs to that key, so any other key targets the wrong object.
# NOTE(review): presumably a cleanup step for an upload that was NOT
# completed -- confirm whether it should be run after a successful completion.
s3.abort_multipart_upload(Bucket='vmwishes-test',
                          Key='junk4.dtb',
                          UploadId=mpu['UploadId'])