## Removing Duplicate Objects

In [None]:
#!/usr/bin/env python3
import boto3
import argparse

def build_parser():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: parser with a single positional
        ``bucketName`` argument.
    """
    # The text must be passed as ``description``; the first positional
    # argument of ArgumentParser is ``prog`` (the program name in usage).
    parser = argparse.ArgumentParser(
        description='Find and remove duplicate objects from an AWS S3 bucket')
    parser.add_argument('bucketName', help='S3 bucket to search')
    return parser


def iter_objects(s3, bucket):
    """Yield every object summary in ``bucket``, following pagination.

    Args:
        s3: boto3 S3 client.
        bucket (str): name of the bucket to list.

    Yields:
        dict: one entry of a ``list_objects_v2`` page's ``Contents`` list.
    """
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket):
        # 'Contents' is missing when a page (or the whole bucket) is empty.
        yield from page.get('Contents', [])


def find_duplicates(objects):
    """Pair each duplicate object with the first object of the same content.

    Two objects are treated as duplicates when both their ETag and their
    size match. Objects whose size is not positive are ignored.

    Args:
        objects: iterable of dicts with ``'Key'``, ``'Size'`` and ``'ETag'``.

    Returns:
        list: ``(original_key, duplicate_key)`` tuples in listing order. The
        same original key appears once per additional copy found.
    """
    first_seen = {}
    pairs = []
    for obj in objects:
        if obj['Size'] <= 0:
            continue
        key = obj['Key']
        # Include the size so objects of different lengths are never paired,
        # even if their ETags happen to be equal.
        fingerprint = (obj['ETag'], obj['Size'])
        if fingerprint in first_seen:
            print('!!Duplicate: - %s - %s' % (first_seen[fingerprint], key))
            pairs.append((first_seen[fingerprint], key))
        else:
            first_seen[fingerprint] = key
    return pairs


def merged_key(original_key, duplicate_key):
    """Build the key under which an original and its duplicate are merged.

    The duplicate's name (text before its last ``-``, with ``MAIN/``
    removed) is inserted before the last ``-`` separated part of the
    original key.

    Args:
        original_key (str): key of the object that is kept.
        duplicate_key (str): key of the object being folded into it.

    Returns:
        str: the combined key.
    """
    prefix, _, number_with_extension = original_key.rpartition('-')
    addition = duplicate_key.rpartition('-')[0].replace('MAIN/', '')
    return prefix + '-' + addition + '-' + number_with_extension


def plan_merges(pairs):
    """Turn duplicate pairs into an ordered list of copy/delete steps.

    When one original has several duplicates, each step starts from the key
    produced by the previous step for that original, so a step never reads
    a key that an earlier step has already deleted.

    Args:
        pairs: iterable of ``(original_key, duplicate_key)`` tuples.

    Returns:
        list: ``(source_key, new_key, duplicate_key)`` tuples.
    """
    current_key = {}  # original key -> key that presently holds its content
    steps = []
    for original, duplicate in pairs:
        source = current_key.get(original, original)
        new_key = merged_key(source, duplicate)
        steps.append((source, new_key, duplicate))
        current_key[original] = new_key
    return steps


def apply_merges(s3res, bucket, steps):
    """Execute the copy/delete steps against the bucket.

    Args:
        s3res: boto3 S3 service resource.
        bucket (str): name of the bucket being cleaned.
        steps: iterable of ``(source_key, new_key, duplicate_key)`` tuples.
    """
    for source, new_key, duplicate in steps:
        print('Copying ' + source + ' to ' + new_key)
        # The dict form avoids hand-building (and URL-encoding) the
        # "bucket/key" string.
        s3res.Object(bucket, new_key).copy_from(
            CopySource={'Bucket': bucket, 'Key': source})

        print('Deleting ' + source + ' and ' + duplicate)
        for stale_key in (source, duplicate):
            # Never delete the object that was just written.
            if stale_key != new_key:
                s3res.Object(bucket, stale_key).delete()


def main(argv=None):
    """Find duplicate objects in a bucket, merge their names and delete them.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    bucket = args.bucketName

    s3 = boto3.client('s3')

    print('Starting search for duplicate objects\n')
    pairs = find_duplicates(iter_objects(s3, bucket))
    print(f"Found {len(pairs)} duplicates")

    if pairs:
        apply_merges(boto3.resource('s3'), bucket, plan_merges(pairs))

    print('Copy & delete complete.')


if __name__ == '__main__':
    main()