# Zip the files into 10 zip archives and upload them to S3

In [None]:
import boto3
import zipfile
import os
import shutil
from tqdm import tqdm


In [None]:
# Temporary AWS session credentials; fill these in before running the
# notebook and never commit real values.
temp_aws_access_key_id = ""
temp_aws_secret_access_key = ""
temp_aws_session_token = ""

In [None]:
# Build the S3 client from the temporary session credentials.
s3_client = boto3.client(
    's3',
    aws_access_key_id=temp_aws_access_key_id,
    aws_secret_access_key=temp_aws_secret_access_key,
    aws_session_token=temp_aws_session_token,
)

In [None]:
# Where the merged WIT result files live.
source_bucket = 'dea-public-data-dev'
source_prefix = 'projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/merged/'

# Quick sanity check: one un-paginated listing call, which only returns the
# first page of keys under the prefix.
response = s3_client.list_objects_v2(
    Bucket=source_bucket,
    Prefix=source_prefix,
)

# 'Contents' is read with a default so an empty listing yields no keys.
file_keys = [entry['Key'] for entry in response.get('Contents', [])]

# Show the first 10 keys as the cell output.
file_keys[:10]


In [None]:
# Bucket and key prefix that receive the finished zip archives.
destination_bucket = 'dea-public-data-dev'
destination_prefix = 'projects/WIT/ANAEv3_WIT_result_22042025/'

In [None]:
# Scratch directory that holds the downloaded files and the zip being built.
temp_dir = '/tmp/s3_zip_temp'
os.makedirs(temp_dir, exist_ok=True)

# Walk every page of the listing so the complete set of keys is collected.
print("Listing files in S3...")
paginator = s3_client.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket=source_bucket, Prefix=source_prefix)

file_keys = []
for page in tqdm(page_iterator, desc="Fetching file list"):
    # A key ending in '/' has an empty base name, so it cannot be saved as a
    # local file by the download step (such keys are typically "folder"
    # placeholder objects). Skip them instead of failing mid-run.
    file_keys.extend(
        obj['Key']
        for obj in page.get('Contents', [])
        if not obj['Key'].endswith('/')
    )

print(f"Total files: {len(file_keys)}")

In [None]:
from concurrent.futures import ThreadPoolExecutor

def download_files(file_keys, download_dir, max_workers=16):
    """Download S3 objects into ``download_dir`` using a thread pool.

    Each key is fetched from ``source_bucket`` with the module-level
    ``s3_client`` and stored under its base name, i.e. the key's prefix is
    dropped locally.

    Args:
        file_keys: Iterable of S3 object keys to download.
        download_dir: Local directory that receives the files; it is created
            if it does not exist yet.
        max_workers: Maximum number of concurrent download threads.

    Raises:
        ValueError: If two or more keys share the same base name. They would
            be written to the same local path and silently overwrite each
            other, leaving the batch incomplete.
    """
    from collections import Counter

    # Materialise once: the keys are scanned for clashes, counted for the
    # progress bar and then iterated for the downloads.
    file_keys = list(file_keys)

    name_counts = Counter(os.path.basename(key) for key in file_keys)
    clashes = sorted(name for name, count in name_counts.items() if count > 1)
    if clashes:
        preview = ", ".join(repr(name) for name in clashes[:5])
        raise ValueError(
            f"{len(clashes)} base name(s) occur more than once and would "
            f"overwrite each other in {download_dir!r}: {preview}"
        )

    os.makedirs(download_dir, exist_ok=True)

    def download(key):
        filename = os.path.basename(key)
        dest_path = os.path.join(download_dir, filename)
        s3_client.download_file(source_bucket, key, dest_path)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Draining the map iterator drives the progress bar and re-raises any
        # exception raised by a worker.
        list(tqdm(executor.map(download, file_keys), total=len(file_keys), desc="Downloading"))

def zip_files(files, zip_name):
    """Pack ``files`` into a deflate-compressed archive inside ``temp_dir``.

    Every entry is stored under its base name only, so the archive is flat.
    A transient progress bar tracks the files as they are added.
    """
    archive_path = os.path.join(temp_dir, zip_name)
    with zipfile.ZipFile(archive_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        progress = tqdm(files, desc=f"Zipping {zip_name}", leave=False)
        for source_path in progress:
            entry_name = os.path.basename(source_path)
            archive.write(source_path, entry_name)

def upload_zip(zip_file, destination_key):
    """Send the local archive ``zip_file`` to ``destination_bucket``.

    The object is stored under ``destination_key``; a status line is written
    through tqdm so it does not break an active progress bar.
    """
    archive_name = os.path.basename(zip_file)
    tqdm.write(f"Uploading {archive_name} to S3...")
    s3_client.upload_file(
        Filename=zip_file,
        Bucket=destination_bucket,
        Key=destination_key,
    )


In [None]:
# Number of source files packed into each archive; the final archive takes
# whatever remains.
files_per_zip = 27066
# Expected size of the final archive. Used only as a sanity check: slicing the
# last group to this fixed length would silently drop files whenever the real
# remainder is larger.
last_zip_files = 27059
total_files = len(file_keys)

for i in range(0, total_files, files_per_zip):
    # Slicing past the end of a list is safe, so the short final group needs
    # no special case.
    group_files = file_keys[i:i + files_per_zip]
    if i + files_per_zip > total_files and len(group_files) != last_zip_files:
        tqdm.write(
            f"⚠️ Final group has {len(group_files)} files, expected {last_zip_files}"
        )

    part_num = i // files_per_zip
    zip_name = f"ANAE_WIT_result_2025_04-part-{part_num}.zip"
    tqdm.write(f"\n📦 Processing {zip_name} ({len(group_files)} files)")

    try:
        # download
        download_files(group_files, temp_dir)

        # zip
        local_file_paths = [os.path.join(temp_dir, os.path.basename(f)) for f in group_files]
        zip_files(local_file_paths, zip_name)

        # upload to S3
        destination_key = f"{destination_prefix}{zip_name}"
        upload_zip(os.path.join(temp_dir, zip_name), destination_key)
    finally:
        # Reset the scratch directory even when a step fails, so a failed
        # group never leaves its files behind for the next run.
        shutil.rmtree(temp_dir, ignore_errors=True)
        os.makedirs(temp_dir, exist_ok=True)

print("\n✅ All zip files created and uploaded to S3 successfully!")


### Verify the uploaded zip files before sending

In [None]:
import boto3
import zipfile
import os
import tempfile
import shutil
from collections import Counter
from tqdm import tqdm

# === Config ===
destination_bucket = "dea-public-data-dev"
destination_prefix = "projects/WIT/ANAEv3_WIT_result_22042025/"
# Nine full archives followed by one shorter final archive.
expected_counts = [27066] * 9 + [27059]
total_expected_files = sum(expected_counts)

zip_names = [f"ANAE_WIT_result_2025_04-part-{i}.zip" for i in range(10)]

# === Setup ===
# NOTE: this rebinds s3_client to a client created without explicit
# credentials.
s3_client = boto3.client('s3')
verify_temp_dir = tempfile.mkdtemp()
all_filenames = []

print("🔍 Starting verification...\n")

try:
    # verify each zip file
    for i, zip_name in enumerate(tqdm(zip_names, desc="Verifying zip files")):
        zip_path = os.path.join(verify_temp_dir, zip_name)
        destination_key = f"{destination_prefix}{zip_name}"

        # download zip
        try:
            s3_client.download_file(destination_bucket, destination_key, zip_path)
        except Exception as e:
            # Report and move on; the missing archive shows up again in the
            # total-count check below.
            print(f"❌ Failed to download {zip_name}: {e}")
            continue

        try:
            with zipfile.ZipFile(zip_path, 'r') as zipf:
                # test for corruption (reads every member and checks its CRC)
                bad_file = zipf.testzip()
                if bad_file:
                    print(f"❌ Corrupt file in {zip_name}: {bad_file}")
                    continue

                file_list = zipf.namelist()
                all_filenames.extend(file_list)

                # Check count
                actual_count = len(file_list)
                expected_count = expected_counts[i]
                if actual_count != expected_count:
                    print(f"⚠️ {zip_name} has {actual_count} files, expected {expected_count}")
                else:
                    print(f"✅ {zip_name}: {actual_count} files")

                # check for empty files
                empty_files = [f.filename for f in zipf.infolist() if f.file_size == 0]
                if empty_files:
                    print(f"⚠️ Empty files in {zip_name}: {empty_files}")

        except zipfile.BadZipFile:
            print(f"❌ {zip_name} is not a valid zip file.")
        finally:
            # The archive is only needed while it is being inspected; removing
            # it now keeps at most one zip on disk at a time.
            if os.path.exists(zip_path):
                os.remove(zip_path)

    # global checks

    # check total number of files
    total_files = len(all_filenames)
    print(f"\n📦 Total files across all zips: {total_files}")
    if total_files != total_expected_files:
        print(f"❌ Expected {total_expected_files} files, but found {total_files}")
    else:
        print("✅ Total file count matches expected.")

    # check uniqueness of filenames
    file_counts = Counter(all_filenames)
    duplicates = [f for f, count in file_counts.items() if count > 1]

    if duplicates:
        print(f"\n❌ Found duplicate filenames across zips ({len(duplicates)}):")
        for dup in duplicates:
            print(f" - {dup}")
    else:
        print("\n✅ All filenames are unique across all zip files.")
finally:
    # clean up, even if a check above raised unexpectedly
    shutil.rmtree(verify_temp_dir, ignore_errors=True)
    print("\n🧹 Temp files cleaned up.")

print("\n🎉 Verification complete!")


### Dry run: test the pipeline on a small sample

In [None]:
# DRY RUN: Use only the first 100 files
dry_run_file_count = 100
group_files = file_keys[:dry_run_file_count]

# Set a test zip name. The archive is uploaded under the same destination
# prefix as the real parts.
zip_name = "ANAE_WIT_result_2025_04-part-TEST.zip"
tqdm.write(f"\n🚧 DRY RUN: Processing {zip_name} with {len(group_files)} files")

try:
    # Download files
    download_files(group_files, temp_dir)

    # Zip files
    local_file_paths = [os.path.join(temp_dir, os.path.basename(f)) for f in group_files]
    zip_files(local_file_paths, zip_name)

    # Upload zip to S3
    destination_key = f"{destination_prefix}{zip_name}"
    upload_zip(os.path.join(temp_dir, zip_name), destination_key)
finally:
    # Clean up even when a step fails, so the scratch directory is empty for
    # the next run.
    shutil.rmtree(temp_dir, ignore_errors=True)
    os.makedirs(temp_dir, exist_ok=True)

print("\n✅ DRY RUN complete — test zip uploaded!")
