# Move files around
This is much faster than using the website, and it is handy when you can't get on the CLI.

In [9]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import StringIO

import boto3
import pandas as pd
from tqdm import tqdm


In [10]:
# Temporary AWS credentials are read from the standard AWS environment
# variables so that secrets never have to be typed into (and saved with) the
# notebook. Each value falls back to "" when its variable is not set.
temp_aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID", "")
temp_aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
temp_aws_session_token = os.environ.get("AWS_SESSION_TOKEN", "")

In [11]:
# Build the S3 client from the three temporary credential values.
temp_credentials = {
    'aws_access_key_id': temp_aws_access_key_id,
    'aws_secret_access_key': temp_aws_secret_access_key,
    'aws_session_token': temp_aws_session_token,
}
s3_client = boto3.client('s3', **temp_credentials)

In [17]:
# S3 locations: the bucket, the prefix the objects currently live under, and
# the prefix they are moved to.
bucket_name = "dea-public-data-dev"
source_prefix = "projects/WIT/ANAEv3_WIT_result_19042025/"
destination_prefix = "projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/merged/"

In [18]:
# List every object under the source prefix.
# The list_objects_v2 paginator follows the continuation tokens itself, so
# there is no hand-rolled token bookkeeping here.
paginator = s3_client.get_paginator('list_objects_v2')
pages = paginator.paginate(
    Bucket=bucket_name,
    Prefix=source_prefix,
    PaginationConfig={'PageSize': 1000},  # ask for 1000 keys per request
)

all_objects = []
for page in pages:
    # A page may carry no 'Contents' entry (e.g. nothing matched the prefix).
    all_objects.extend(page.get('Contents', []))

print(f"Found {len(all_objects)} files to move.")

# Move (copy, then delete) a single object
def move_object(obj):
    """Move one S3 object from ``source_prefix`` to ``destination_prefix``.

    The object is copied to its new key inside ``bucket_name``; the original
    is deleted only if the copy call returned without raising.

    Args:
        obj: Mapping with a ``'Key'`` entry holding the object's current key.

    Returns:
        A ``"Moved: ..."`` message on success, or a ``"Failed: ..."`` message
        carrying the exception text when either S3 call raised an
        ``Exception`` (which is caught here, not propagated).
    """
    old_key = obj['Key']
    # Only the first occurrence of the source prefix is swapped out.
    new_key = old_key.replace(source_prefix, destination_prefix, 1)
    origin = {'Bucket': bucket_name, 'Key': old_key}

    try:
        s3_client.copy_object(
            Bucket=bucket_name,
            CopySource=origin,
            Key=new_key,
        )
        # Reached only when the copy above did not raise.
        s3_client.delete_object(Bucket=bucket_name, Key=old_key)
    except Exception as err:
        return f"Failed: {old_key} — {err!s}"
    return f"Moved: {old_key} → {new_key}"

# Parallel execution with progress bar
max_workers = 40

# move_object reports its outcome as a "Moved: ..." or "Failed: ..." string
# instead of raising, so failures have to be picked out of the results;
# otherwise a failed move would go completely unnoticed.
failed_moves = []

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {executor.submit(move_object, obj): obj for obj in all_objects}

    for future in tqdm(as_completed(futures), total=len(futures), desc="Moving files"):
        result = future.result()
        if result.startswith("Failed:"):
            failed_moves.append(result)

print(f"Moved {len(futures) - len(failed_moves)} of {len(futures)} files.")
if failed_moves:
    # Show only a sample so a large batch of failures does not flood the output;
    # the full list stays available in `failed_moves`.
    print(f"{len(failed_moves)} files failed to move; first few:")
    for message in failed_moves[:10]:
        print(message)

Found 270653 files to move.


Moving files: 100%|██████████| 270653/270653 [18:46<00:00, 240.19it/s]
