## Unzip the files and upload them to an S3 folder

In [None]:
import boto3
import tempfile
import zipfile
import io
import os
import shutil
from tqdm import tqdm

In [None]:
# Credential values for [451924316694_DEA-Dev] (presumably a section of an AWS
# credentials file - confirm). Fill these in before running the cells below,
# and blank them again before sharing or committing the notebook.
temp_aws_access_key_id = ""
temp_aws_secret_access_key = ""
temp_aws_session_token = ""

In [None]:
# Module-level S3 client built from the temp_aws_* values;
# upload_file_to_s3 sends its uploads through this client.
s3 = boto3.client(
    "s3",
    aws_access_key_id=temp_aws_access_key_id,
    aws_secret_access_key=temp_aws_secret_access_key,
    aws_session_token=temp_aws_session_token,
)

In [None]:
def upload_file_to_s3(file_path, bucket_name, object_key):
    """
    Copy one local file into an S3 bucket.

    The transfer goes through the module-level ``s3`` client, so that client
    must already exist when this function is called.

    Parameters:
    - file_path: Path of the local file to send.
    - bucket_name: Name of the destination S3 bucket.
    - object_key: Key (path inside the bucket) to store the object under.
    """
    s3.upload_file(Filename=file_path, Bucket=bucket_name, Key=object_key)

In [None]:
# this was used to test the upload_file_to_s3 function works
# bucket_name = 'dea-public-data-dev'
# object_key = 'projects/WIT/MDBA_ANAE_WIT_04_2025/polygon_base_result/test.txt'
# upload_file_to_s3('test.txt', bucket_name, object_key)

In [None]:
def process_zip_from_s3(zip_filename, source_prefix, destination_prefix, bucket_name):
    """
    Unpack a zip archive stored in S3 and upload its members back to S3.

    The object at ``source_prefix + zip_filename`` is read fully into memory,
    extracted into a temporary local directory, and every extracted file is
    uploaded to ``destination_prefix + <file name>`` in the same bucket.
    Directory structure inside the archive is flattened: only each file's base
    name is kept in the destination key, so two members with the same name
    end up on the same key (a warning is printed when that happens).

    Progress is reported with print() and a tqdm progress bar; nothing is
    returned. The temporary directory is removed whether or not an error
    occurs.

    Parameters:
    - zip_filename: File name of the archive, appended to source_prefix.
    - source_prefix: Key prefix under which the archive is stored.
    - destination_prefix: Key prefix the extracted files are uploaded under.
    - bucket_name: Bucket used for both the download and the uploads.
    """
    # A single client, built from the temporary credentials, serves both the
    # download and the uploads. Going through the module-level `s3` for the
    # uploads would pick up whatever that name was last rebound to.
    s3 = boto3.client('s3', aws_access_key_id=temp_aws_access_key_id,
                      aws_secret_access_key=temp_aws_secret_access_key,
                      aws_session_token=temp_aws_session_token)

    source_key = source_prefix + zip_filename
    print(f"Downloading: {source_key}")

    try:
        # The context manager removes the directory on exit, error or not.
        with tempfile.TemporaryDirectory() as temp_dir:
            # Download the zip file into memory
            zip_obj = s3.get_object(Bucket=bucket_name, Key=source_key)
            buffer = io.BytesIO(zip_obj['Body'].read())

            # Extract zip file to temp_dir
            with zipfile.ZipFile(buffer) as z:
                z.extractall(temp_dir)
                print(f"Extracted {len(z.namelist())} files to temp dir")

            # List all the files to upload, flattening any sub-directories
            files_to_upload = []
            seen_keys = set()
            for root, _dirs, files in os.walk(temp_dir):
                for file in files:
                    s3_key = destination_prefix + file
                    if s3_key in seen_keys:
                        # Same base name in two archive folders: the later
                        # upload replaces the earlier one.
                        print(f"Warning: duplicate file name, will overwrite: {s3_key}")
                    seen_keys.add(s3_key)
                    files_to_upload.append((os.path.join(root, file), s3_key))

            # Upload each file to S3 with a progress bar
            for local_path, s3_key in tqdm(files_to_upload, desc="Uploading files", unit="file"):
                s3.upload_file(local_path, bucket_name, s3_key)

            print(f"Done processing: {zip_filename}")
    finally:
        print(f"Cleaned up local files for: {zip_filename}")

In [None]:
# Which archive to unpack, where it is stored, and where its contents go.
bucket_name = 'dea-public-data-dev'
source_prefix = 'projects/WIT/ANAEv3_WIT_result_22022024/'
zip_filename = 'ANAE_WIT_result_2024_02-part-9.zip'
destination_prefix = 'projects/WIT/MDBA_ANAE_WIT_04_2025/polygon_base_result/'

process_zip_from_s3(
    zip_filename=zip_filename,
    source_prefix=source_prefix,
    destination_prefix=destination_prefix,
    bucket_name=bucket_name,
)


In [None]:
# Count the objects stored under the destination prefix.

bucket_name = 'dea-public-data-dev'
prefix = 'projects/WIT/MDBA_ANAE_WIT_04_2025/polygon_base_result/'

# Use a separately named client: rebinding the module-level `s3` here would
# swap out the credentialed client that upload_file_to_s3 uploads with.
listing_client = boto3.client('s3')

paginator = listing_client.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

count = 0
for page in page_iterator:
    # 'Contents' may be absent from a page, so fall back to an empty list.
    count += len(page.get('Contents', []))

print(f"Total files: {count}")

In [None]:
# Expected grand total: nine counts of 27066 plus one of 27059 (presumably the
# per-archive file counts - compare with the "Total files" figure printed above).
9 * 27066 + 27059

In [None]:
# Tally the file extensions found under the prefix. Everything should be .csv, but occasionally
# a folder marker (or some other stray object) gets uploaded; this helps you spot the impostor file.

import boto3
import os
from collections import Counter

def list_s3_file_extensions(bucket_name, prefix):
    """
    Print how many objects under an S3 prefix carry each file extension.

    Extensions are lower-cased before counting; keys without an extension are
    grouped under '[no extension]'. Nothing is returned.

    Parameters:
    - bucket_name: Bucket to list.
    - prefix: Only keys starting with this prefix are considered.
    """
    client = boto3.client('s3')
    listing = client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=prefix
    )

    tally = Counter()
    for page in listing:
        # A page without 'Contents' contributes nothing.
        for entry in page.get('Contents', []):
            suffix = os.path.splitext(entry['Key'])[1].lower()
            tally[suffix or '[no extension]'] += 1

    print("🔍 Unique file extensions in S3:")
    for suffix, total in tally.items():
        print(f"{suffix}: {total}")

# Report the extension breakdown for the uploaded results folder
list_s3_file_extensions(
    'dea-public-data-dev',
    'projects/WIT/MDBA_ANAE_WIT_04_2025/polygon_base_result/',
)


In [None]:
# Print the key of each impostor file, i.e. every object whose key has no extension

import boto3
import os

def find_files_with_no_extension(bucket_name, prefix):
    """
    Print the key of every object under an S3 prefix that has no extension.

    A header line is always printed first; if no such key exists, a
    "None found" line follows it. Nothing is returned.

    Parameters:
    - bucket_name: Bucket to list.
    - prefix: Only keys starting with this prefix are considered.
    """
    client = boto3.client('s3')
    listing = client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=prefix
    )

    print("📁 Files in S3 with NO extension:")
    hits = 0

    for page in listing:
        # A page without 'Contents' contributes nothing.
        for entry in page.get('Contents', []):
            name = entry['Key']
            if not os.path.splitext(name)[1]:
                print("  🔸", name)
                hits += 1

    if hits == 0:
        print("✅ None found!")

# List the extension-less keys in the uploaded results folder
find_files_with_no_extension(
    'dea-public-data-dev',
    'projects/WIT/MDBA_ANAE_WIT_04_2025/polygon_base_result/',
)
