# Zip the files into 10 zip archives and upload them to S3

In [1]:
import boto3
import zipfile
import os
import shutil
from tqdm import tqdm


In [2]:
# Temporary AWS session credentials are read from the standard AWS environment
# variables so that secrets are never typed into (or saved with) the notebook.
# Unset or empty variables become None, which lets boto3 fall back to its
# default credential lookup instead of receiving blank credentials.
temp_aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID") or None
temp_aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY") or None
temp_aws_session_token = os.environ.get("AWS_SESSION_TOKEN") or None

In [3]:
# Build the S3 client used by the listing, download and upload cells,
# authenticating with the temporary session credentials defined above.
s3_client = boto3.client(
    's3',
    aws_access_key_id=temp_aws_access_key_id,
    aws_secret_access_key=temp_aws_secret_access_key,
    aws_session_token=temp_aws_session_token,
)

In [4]:
source_bucket = 'dea-public-data-dev'
source_prefix = 'projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/merged/'

# Quick look at the source folder: issue one listing request under the prefix.
# NOTE(review): a single list_objects_v2 call returns only one page of results,
# so this is a preview rather than the complete listing.
response = s3_client.list_objects_v2(
    Bucket=source_bucket,
    Prefix=source_prefix,
)

# Pull the object keys out of the response (empty list if it has no 'Contents')
file_keys = [entry['Key'] for entry in response.get('Contents', [])]

# Show the first 10 keys as a sanity check
file_keys[:10]


['projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/merged/r1dtvqu5m.csv',
 'projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/merged/r1dtygfug.csv',
 'projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/merged/r1dtyggsf.csv',
 'projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/merged/r1dtyjqxg.csv',
 'projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/merged/r1dtyjxp1.csv',
 'projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/merged/r1dtym0ms.csv',
 'projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/merged/r1dtyqx9v.csv',
 'projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/merged/r1dtyr729.csv',
 'projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/merged/r1dtytfzc.csv',
 'projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/merged/r1dtyvemv.csv']

In [10]:
# Where the finished zip archives are uploaded: bucket name and key prefix.
destination_bucket = "dea-public-data-dev"
destination_prefix = "projects/WIT/ANAEv3_WIT_result_22042025/"

In [6]:
# Create a temporary directory to store files for zipping
temp_dir = '/tmp/s3_zip_temp'
os.makedirs(temp_dir, exist_ok=True)

# Step 1: List files
# Walk every page of the listing with a paginator so the full set of keys
# under the prefix is collected, not just the first response.
print("Listing files in S3...")
paginator = s3_client.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket=source_bucket, Prefix=source_prefix)

file_keys = []
for page in tqdm(page_iterator, desc="Fetching file list"):
    # Skip "folder" placeholder keys (they end with '/'): they have no base
    # name, so they cannot be saved as a local file or added to a zip.
    file_keys.extend(
        obj['Key']
        for obj in page.get('Contents', [])
        if not obj['Key'].endswith('/')
    )

print(f"Total files: {len(file_keys)}")

Listing files in S3...


Fetching file list: 271it [00:47,  5.75it/s]

Total files: 270653





In [12]:
# Step 2: Define helper functions
# def download_files(keys, temp_dir):
#     """Download files with progress bar"""
#     for key in tqdm(keys, desc="Downloading files", leave=False):
#         local_path = os.path.join(temp_dir, os.path.basename(key))
#         s3_client.download_file(source_bucket, key, local_path)
from concurrent.futures import ThreadPoolExecutor

def download_files(file_keys, download_dir, max_workers=16):
    """Download S3 objects from ``source_bucket`` into ``download_dir`` in parallel.

    Every object is saved flat under the base name of its key, so all base
    names in ``file_keys`` must be non-empty and unique; otherwise one download
    would overwrite another and the resulting archive would be missing files.

    Args:
        file_keys: Sequence of S3 object keys to fetch (its length sizes the
            progress bar).
        download_dir: Local directory that receives the files; created if
            it does not exist yet.
        max_workers: Number of concurrent download threads.

    Raises:
        ValueError: If a key has no base name (e.g. it ends with '/') or two
            keys share the same base name. Raised before any download starts.
    """
    # Validate up front so a bad batch fails fast instead of part-way through.
    seen_names = set()
    for key in file_keys:
        filename = os.path.basename(key)
        if not filename:
            raise ValueError(f"S3 key has no file name component: {key!r}")
        if filename in seen_names:
            raise ValueError(
                f"Duplicate file name {filename!r} (from key {key!r}) "
                "would overwrite an earlier download"
            )
        seen_names.add(filename)

    os.makedirs(download_dir, exist_ok=True)

    def download(key):
        dest_path = os.path.join(download_dir, os.path.basename(key))
        s3_client.download_file(source_bucket, key, dest_path)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # list(...) drains the lazy map so the progress bar advances and any
        # exception raised by a download is re-raised here.
        list(tqdm(executor.map(download, file_keys), total=len(file_keys), desc="Downloading"))

def zip_files(files, zip_name):
    """Pack ``files`` into a DEFLATE-compressed archive ``zip_name`` inside ``temp_dir``.

    Each entry is stored under its base name only, so the archive has no
    directory structure. A transient progress bar tracks the files written.
    """
    archive_path = os.path.join(temp_dir, zip_name)
    with zipfile.ZipFile(archive_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        progress = tqdm(files, desc=f"Zipping {zip_name}", leave=False)
        for source_path in progress:
            entry_name = os.path.basename(source_path)
            archive.write(source_path, entry_name)

def upload_zip(zip_file, destination_key):
    """Send the local archive ``zip_file`` to ``destination_bucket`` under ``destination_key``.

    A one-line notice is emitted through ``tqdm.write`` before the transfer
    starts so it does not garble any active progress bar.
    """
    notice = f"Uploading {os.path.basename(zip_file)} to S3..."
    tqdm.write(notice)
    s3_client.upload_file(
        zip_file,
        destination_bucket,
        destination_key,
    )


In [13]:
# # Step 3: Split, zip, and upload
# files_per_zip = 27066
# last_zip_files = 27059
# total_files = len(file_keys)

# for i in range(0, total_files, files_per_zip):
#     # Determine which group of files we're zipping
#     if i + files_per_zip > total_files:
#         group_files = file_keys[i:i + last_zip_files]
#     else:
#         group_files = file_keys[i:i + files_per_zip]

#     part_num = i // files_per_zip
#     zip_name = f"ANAE_WIT_result_2025_04-part-{part_num}.zip"
#     tqdm.write(f"\n📦 Processing {zip_name} ({len(group_files)} files)")

#     # Download files to local temp dir
#     download_files(group_files, temp_dir)

#     # Build list of local file paths (only filenames)
#     local_file_paths = [os.path.join(temp_dir, os.path.basename(f)) for f in group_files]

#     # Create the zip file
#     zip_files(local_file_paths, zip_name)

#     # Upload zip file to S3 destination
#     destination_key = f"{destination_prefix}{zip_name}"
#     upload_zip(os.path.join(temp_dir, zip_name), destination_key)

#     # Cleanup temporary files
#     shutil.rmtree(temp_dir)
#     os.makedirs(temp_dir, exist_ok=True)

# print("\n✅ All zip files created and uploaded to S3 successfully!")


# Step 3: Split the listing into at most `num_zips` groups, then download,
# zip and upload each group in turn.
num_zips = 10
total_files = len(file_keys)

# Derive the group size from the listing (ceiling division) instead of
# hard-coding it, so every file is covered even if the listing changes.
# max(1, ...) keeps the range() step valid when the listing is empty.
files_per_zip = max(1, -(-total_files // num_zips))
# Size of the final, possibly shorter, group. Informational only: the slice
# below already yields a shorter last group without any special casing.
last_zip_files = total_files % files_per_zip or min(files_per_zip, total_files)

for part_num, start in enumerate(range(0, total_files, files_per_zip)):
    # Slicing past the end of a list is safe, so no file can be dropped here.
    group_files = file_keys[start:start + files_per_zip]

    zip_name = f"ANAE_WIT_result_2025_04-part-{part_num}.zip"
    tqdm.write(f"\n📦 Processing {zip_name} ({len(group_files)} files)")

    # Download with threading
    download_files(group_files, temp_dir)

    # Zip (files were saved flat under their base names)
    local_file_paths = [os.path.join(temp_dir, os.path.basename(f)) for f in group_files]
    zip_files(local_file_paths, zip_name)

    # Upload to S3
    destination_key = f"{destination_prefix}{zip_name}"
    upload_zip(os.path.join(temp_dir, zip_name), destination_key)

    # Clean up for next batch: wipe the scratch directory and recreate it empty
    shutil.rmtree(temp_dir)
    os.makedirs(temp_dir, exist_ok=True)

print("\n✅ All zip files created and uploaded to S3 successfully!")



📦 Processing ANAE_WIT_result_2025_04-part-0.zip (27066 files)


Downloading: 100%|██████████| 27066/27066 [02:52<00:00, 156.94it/s]
                                                                                                  

Uploading ANAE_WIT_result_2025_04-part-0.zip to S3...

📦 Processing ANAE_WIT_result_2025_04-part-1.zip (27066 files)


Downloading: 100%|██████████| 27066/27066 [03:06<00:00, 144.79it/s]
                                                                                                  

Uploading ANAE_WIT_result_2025_04-part-1.zip to S3...

📦 Processing ANAE_WIT_result_2025_04-part-2.zip (27066 files)


Downloading: 100%|██████████| 27066/27066 [03:00<00:00, 150.07it/s]
                                                                                                  

Uploading ANAE_WIT_result_2025_04-part-2.zip to S3...

📦 Processing ANAE_WIT_result_2025_04-part-3.zip (27066 files)


Downloading: 100%|██████████| 27066/27066 [02:57<00:00, 152.48it/s]
                                                                                                  

Uploading ANAE_WIT_result_2025_04-part-3.zip to S3...

📦 Processing ANAE_WIT_result_2025_04-part-4.zip (27066 files)


Downloading: 100%|██████████| 27066/27066 [02:58<00:00, 151.93it/s]
                                                                                                  

Uploading ANAE_WIT_result_2025_04-part-4.zip to S3...

📦 Processing ANAE_WIT_result_2025_04-part-5.zip (27066 files)


Downloading: 100%|██████████| 27066/27066 [02:58<00:00, 151.32it/s]
                                                                                                  

Uploading ANAE_WIT_result_2025_04-part-5.zip to S3...

📦 Processing ANAE_WIT_result_2025_04-part-6.zip (27066 files)


Downloading: 100%|██████████| 27066/27066 [02:54<00:00, 155.05it/s]
                                                                                                  

Uploading ANAE_WIT_result_2025_04-part-6.zip to S3...

📦 Processing ANAE_WIT_result_2025_04-part-7.zip (27066 files)


Downloading: 100%|██████████| 27066/27066 [03:15<00:00, 138.69it/s]
                                                                                                  

Uploading ANAE_WIT_result_2025_04-part-7.zip to S3...

📦 Processing ANAE_WIT_result_2025_04-part-8.zip (27066 files)


Downloading: 100%|██████████| 27066/27066 [03:09<00:00, 142.83it/s]
                                                                                                  

Uploading ANAE_WIT_result_2025_04-part-8.zip to S3...

📦 Processing ANAE_WIT_result_2025_04-part-9.zip (27059 files)


Downloading: 100%|██████████| 27059/27059 [03:09<00:00, 142.73it/s]
                                                                                                  

Uploading ANAE_WIT_result_2025_04-part-9.zip to S3...

✅ All zip files created and uploaded to S3 successfully!


### Verification before sending

In [16]:
import boto3
import zipfile
import os
import tempfile
import shutil
from collections import Counter
from tqdm import tqdm

# === Config ===
destination_bucket = "dea-public-data-dev"
destination_prefix = "projects/WIT/ANAEv3_WIT_result_22042025/"
# Expected number of entries per archive: nine full parts plus a shorter last part
expected_counts = [27066] * 9 + [27059]
total_expected_files = sum(expected_counts)

zip_names = [f"ANAE_WIT_result_2025_04-part-{i}.zip" for i in range(10)]

# === Setup ===
# NOTE(review): this rebinds the notebook-wide `s3_client` to a client created
# without explicit credentials — confirm that is intended if the upload cells
# are re-run after this one.
s3_client = boto3.client('s3')
verify_temp_dir = tempfile.mkdtemp()
all_filenames = []

print("🔍 Starting verification...\n")

# try/finally guarantees the scratch directory is removed even if a check raises.
try:
    # === Verify each zip file ===
    for i, zip_name in enumerate(tqdm(zip_names, desc="Verifying zip files")):
        zip_path = os.path.join(verify_temp_dir, zip_name)
        destination_key = f"{destination_prefix}{zip_name}"

        # Download zip
        try:
            s3_client.download_file(destination_bucket, destination_key, zip_path)
        except Exception as e:
            # Report and move on; the missing names make the total-count check fail below.
            print(f"❌ Failed to download {zip_name}: {e}")
            continue

        try:
            with zipfile.ZipFile(zip_path, 'r') as zipf:
                # Test for corruption (testzip returns the first bad member, or None)
                bad_file = zipf.testzip()
                if bad_file:
                    print(f"❌ Corrupt file in {zip_name}: {bad_file}")
                    continue

                file_list = zipf.namelist()
                all_filenames.extend(file_list)

                # Check count
                actual_count = len(file_list)
                expected_count = expected_counts[i]
                if actual_count != expected_count:
                    print(f"⚠️ {zip_name} has {actual_count} files, expected {expected_count}")
                else:
                    print(f"✅ {zip_name}: {actual_count} files")

                # Check for empty files
                empty_files = [f.filename for f in zipf.infolist() if f.file_size == 0]
                if empty_files:
                    print(f"⚠️ Empty files in {zip_name}: {empty_files}")

        except zipfile.BadZipFile:
            print(f"❌ {zip_name} is not a valid zip file.")
        finally:
            # Drop each archive once inspected so at most one sits on disk at a time.
            if os.path.exists(zip_path):
                os.remove(zip_path)

    # === Global checks ===

    # Check total number of files
    total_files = len(all_filenames)
    print(f"\n📦 Total files across all zips: {total_files}")
    if total_files != total_expected_files:
        print(f"❌ Expected {total_expected_files} files, but found {total_files}")
    else:
        print("✅ Total file count matches expected.")

    # Check uniqueness of filenames
    file_counts = Counter(all_filenames)
    duplicates = [f for f, count in file_counts.items() if count > 1]

    if duplicates:
        print(f"\n❌ Found duplicate filenames across zips ({len(duplicates)}):")
        for dup in duplicates:
            print(f" - {dup}")
    else:
        print("\n✅ All filenames are unique across all zip files.")
finally:
    # === Cleanup ===
    shutil.rmtree(verify_temp_dir, ignore_errors=True)
    print("\n🧹 Temp files cleaned up.")

print("\n🎉 Verification complete!")


🔍 Starting verification...



Verifying zip files:  10%|█         | 1/10 [00:12<01:53, 12.66s/it]

✅ ANAE_WIT_result_2025_04-part-0.zip: 27066 files


Verifying zip files:  20%|██        | 2/10 [00:27<01:49, 13.68s/it]

✅ ANAE_WIT_result_2025_04-part-1.zip: 27066 files


Verifying zip files:  30%|███       | 3/10 [00:40<01:35, 13.70s/it]

✅ ANAE_WIT_result_2025_04-part-2.zip: 27066 files


Verifying zip files:  40%|████      | 4/10 [00:54<01:21, 13.57s/it]

✅ ANAE_WIT_result_2025_04-part-3.zip: 27066 files


Verifying zip files:  50%|█████     | 5/10 [01:08<01:08, 13.72s/it]

✅ ANAE_WIT_result_2025_04-part-4.zip: 27066 files


Verifying zip files:  60%|██████    | 6/10 [01:22<00:55, 13.87s/it]

✅ ANAE_WIT_result_2025_04-part-5.zip: 27066 files


Verifying zip files:  70%|███████   | 7/10 [01:34<00:39, 13.19s/it]

✅ ANAE_WIT_result_2025_04-part-6.zip: 27066 files


Verifying zip files:  80%|████████  | 8/10 [01:46<00:26, 13.00s/it]

✅ ANAE_WIT_result_2025_04-part-7.zip: 27066 files


Verifying zip files:  90%|█████████ | 9/10 [02:00<00:13, 13.11s/it]

✅ ANAE_WIT_result_2025_04-part-8.zip: 27066 files


Verifying zip files: 100%|██████████| 10/10 [02:12<00:00, 13.25s/it]

✅ ANAE_WIT_result_2025_04-part-9.zip: 27059 files

📦 Total files across all zips: 270653
✅ Total file count matches expected.

✅ All filenames are unique across all zip files.






🧹 Temp files cleaned up.

🎉 Verification complete!


### Dry run: test the pipeline on a small sample

In [9]:
# DRY RUN: push a small sample through the same download -> zip -> upload path.
# Note that the test archive is really uploaded to the destination prefix.
dry_run_file_count = 100
group_files = file_keys[:dry_run_file_count]

# Archive name that marks this as a test artefact
zip_name = "ANAE_WIT_result_2025_04-part-TEST.zip"
tqdm.write(f"\n🚧 DRY RUN: Processing {zip_name} with {len(group_files)} files")

# Fetch the sample into the scratch directory
download_files(group_files, temp_dir)

# Files land flat under their base names, so rebuild the local paths that way
local_file_paths = [
    os.path.join(temp_dir, os.path.basename(key))
    for key in group_files
]
zip_files(local_file_paths, zip_name)

# Send the test archive to the destination prefix
destination_key = f"{destination_prefix}{zip_name}"
upload_zip(os.path.join(temp_dir, zip_name), destination_key)

# Reset the scratch directory to an empty state
shutil.rmtree(temp_dir)
os.makedirs(temp_dir, exist_ok=True)

print("\n✅ DRY RUN complete — test zip uploaded!")



🚧 DRY RUN: Processing ANAE_WIT_result_2025_04-part-TEST.zip with 100 files


                                                                                                

Uploading ANAE_WIT_result_2025_04-part-TEST.zip to S3...

✅ DRY RUN complete — test zip uploaded!
