* notebook created by nov05 on 2025-01-12  
* Registry of Open Data on AWS: [**Amazon Bin Image Dataset**](https://registry.opendata.aws/amazon-bin-imagery/)      
  https://us-east-1.console.aws.amazon.com/s3/buckets/aft-vbi-pds  

In [None]:
## windows cmd to launch notepad to edit aws config and credential files
# !notepad C:\Users\guido\.aws\config
!notepad C:\Users\guido\.aws\credentials

In [4]:
## reset the session after updating credentials
import boto3 # type: ignore
boto3.DEFAULT_SESSION = None
import sagemaker # type: ignore
from sagemaker import get_execution_role # type: ignore

# Extract and print the account ID
sts_client = boto3.client('sts')
response = sts_client.get_caller_identity() 
account_id = response['Account']

role_arn = get_execution_role()  ## get role ARN
if 'AmazonSageMaker-ExecutionRole' in role_arn:
    ## The current role already is a SageMaker execution role: use it directly.
    ## Both names must be defined in every branch because they are printed
    ## (and sagemaker_role_arn is reused) further down in the notebook.
    voclabs_role_arn = None
    sagemaker_role_arn = role_arn
else:
    ## Running under another role (e.g. voclabs when local).
    ## Go to "IAM - Roles", search for "SageMaker", find the execution role.
    voclabs_role_arn = role_arn
    sagemaker_role_arn = "arn:aws:iam::570668189909:role/service-role/AmazonSageMaker-ExecutionRole-20250126T194519"
session = sagemaker.Session()  ## "default"
region = session.boto_region_name
bucket = session.default_bucket()

print(f"Current AWS Account ID: {account_id}")
print("AWS Region: {}".format(region))
print("Default Bucket: {}".format(bucket))
print(f"Role voclabs ARN: {voclabs_role_arn}") ## If local, Role ARN: arn:aws:iam::807711953667:role/voclabs
print("SageMaker Role ARN: {}".format(sagemaker_role_arn)) 

## generate secrets.env. remember to add it to .gitignore  
import wandb
wandb.sagemaker_auth(path="../secrets") 

## get my own AWS account info
def get_secrets(name):
    """Read a secret stored as the first line of ``../secrets/<name>``.

    Args:
        name: File name inside the ``../secrets/`` folder.

    Returns:
        The first line of the file with surrounding whitespace stripped,
        or None when the file is empty.
    """
    secret_file = '../secrets/' + name
    with open(secret_file, 'r') as handle:
        ## only the first line matters; None signals an empty file
        first_line = next(handle, None)
    if first_line is None:
        return None
    return first_line.strip()
aws_account_number = get_secrets('aws_account_number')
aws_account_profile = get_secrets('aws_account_profile')

Current AWS Account ID: 570668189909
AWS Region: us-east-1
Default Bucket: sagemaker-us-east-1-570668189909
Role voclabs ARN: arn:aws:iam::570668189909:role/voclabs
SageMaker Role ARN: arn:aws:iam::570668189909:role/service-role/AmazonSageMaker-ExecutionRole-20250126T194519


# 👉 **Download metadata from S3** 

Download a portion of the metadata from the public S3 bucket containing the **Amazon Bin Image Dataset** to your local system.  

In [None]:
## example code to download a file from s3 bucket
import boto3
from botocore import UNSIGNED
from botocore.client import Config
# Create an S3 client with unsigned requests (public access)
s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
s3_client.download_file(
    Bucket='aft-vbi-pds',
    Key='bin-images/100313.jpg',
    Filename='../data/bin-images/100313.jpg'
)

<pre style="white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace"><span style="color: #7fbfbf; text-decoration-color: #7fbfbf">[02/05/25 10:26:24] </span><span style="color: #0069ff; text-decoration-color: #0069ff; font-weight: bold">INFO    </span> Skipping checksum validation. Response did not contain one of the  <a href="file://d:\Users\guido\miniconda3\envs\sagemaker_py310\lib\site-packages\botocore\httpchecksum.py" target="_blank"><span style="color: #7f7f7f; text-decoration-color: #7f7f7f">httpchecksum.py</span></a><span style="color: #7f7f7f; text-decoration-color: #7f7f7f">:</span><a href="file://d:\Users\guido\miniconda3\envs\sagemaker_py310\lib\site-packages\botocore\httpchecksum.py#481" target="_blank"><span style="color: #7f7f7f; text-decoration-color: #7f7f7f">481</span></a>
<span style="color: #7fbfbf; text-decoration-color: #7fbfbf">                    </span>         following algorithms: <span style="font-weight: bold">[</span><span style="color: #008700; text-decoration-color: #008700">'crc32'</span>, <span style="color: #008700; text-decoration-color: #008700">'sha1'</span>, <span style="color: #008700; text-decoration-color: #008700">'sha256'</span><span style="font-weight: bold">]</span>.                 <span style="color: #7f7f7f; text-decoration-color: #7f7f7f">                   </span>
</pre>

In [None]:
import os
import json
from tqdm import tqdm
import boto3
from botocore import UNSIGNED
from botocore.client import Config

def download_and_arrange_data(
        prefix='bin-images', 
        file_extension='.jpg',
        download_dir='../data/train',
        partition=True):
    """Download the files listed in ``file_list.json`` from the public ``aft-vbi-pds`` bucket.

    ``file_list.json`` (read from the current working directory) maps a class
    label to a list of file paths. For every listed path, the base name
    (without extension) is combined with ``file_extension`` and fetched from
    ``<prefix>/<name><file_extension>`` in the bucket.

    Args:
        prefix: S3 key prefix to download from, e.g. 'bin-images' or 'metadata'.
        file_extension: Extension of the objects to download, e.g. '.jpg' or '.json'.
        download_dir: Local root directory for the downloaded files.
        partition: When True, files of each class go to ``download_dir/<label>``;
            when False, all files go directly into ``download_dir``.
    """
    ## unsigned requests: the bucket is accessed anonymously
    s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED))

    with open('file_list.json', 'r') as f:
        files_by_label = json.load(f)

    for label, file_paths in files_by_label.items():  ## one entry per class label
        print(f"Downloading images/metadata of images with {label} object...")
        ## Derive the target from the *original* download_dir on every iteration;
        ## reassigning download_dir here would nest the classes (1, 1/2, 1/2/3, ...).
        target_dir = os.path.join(download_dir, label) if partition else download_dir
        os.makedirs(target_dir, exist_ok=True)
        for file_path in tqdm(file_paths):
            file_name = os.path.basename(file_path).split('.')[0] + file_extension
            s3_client.download_file(
                'aft-vbi-pds', 
                prefix + '/' + file_name,  ## e.g. metadata/100313.json
                os.path.join(target_dir, file_name))
            
## download metadata, 17.9 MB, 56m 57.4s
download_and_arrange_data(
    prefix='metadata', 
    file_extension='.json',
    download_dir='../data/metadata',
    partition=False)

```text
Downloading images/metadata of images with 1 object...
100%|██████████| 1228/1228 [06:36<00:00,  3.09it/s]
Downloading images/metadata of images with 2 object...
100%|██████████| 2299/2299 [12:38<00:00,  3.03it/s]
Downloading images/metadata of images with 3 object...
100%|██████████| 2666/2666 [14:35<00:00,  3.04it/s]
Downloading images/metadata of images with 4 object...
100%|██████████| 2373/2373 [12:54<00:00,  3.06it/s]
Downloading images/metadata of images with 5 object...
100%|██████████| 1875/1875 [10:11<00:00,  3.07it/s]  
```

In [1]:
print("total metadata file number:", 1228 + 2299 + 2666 + 2373 + 1875)

total metadata file number: 10441


# 👉 **Upload metadata to S3**

Upload this portion of the metadata to my own S3 bucket for further experimental analysis using AWS Glue, Athena, and other services.  

In [4]:
## example code: upload a file to s3. mind the profile that is used.
import boto3
session = boto3.Session(profile_name=aws_account_profile)  ## use the profile name in the credentials file
s3_client = session.client('s3')
bucket = 'dataset-aft-vbi-pds'
key = 'metadata/100313.json'
filename = '../data/metadata/100313.json'
s3_client.upload_file(
    Filename=filename,
    Key=key,
    Bucket=bucket
)

In [16]:
## example code of directory traversal
import os
local_folder = '../data/metadata'
## `subdirs` instead of `dir`: a loop variable named `dir` would shadow the
## builtin dir() for the rest of the notebook session.
for root, subdirs, files in os.walk(local_folder):
    print(root, subdirs)
    ## only show the first file of each directory as an example
    for file in files[:1]:
        local_file = os.path.join(root, file)
        print(local_file)
        relative_path = os.path.relpath(local_file, '../data/')
        print(relative_path)

../data/metadata []
../data/metadata\00004.json
metadata\00004.json


In [None]:
## Upload 10441 metadata json files to s3 (my own account)
## 53m 23.5s for uploading 10441 json files
import os
from tqdm import tqdm
import boto3
from botocore.exceptions import NoCredentialsError
def upload_folder_to_s3(local_folder, bucket_name, s3_folder=''):
    """Upload every file under ``local_folder`` to an S3 bucket, keeping the folder layout.

    The S3 client is created from the profile named by the notebook-level
    ``aws_account_profile`` variable.

    Args:
        local_folder: Local directory that is walked recursively.
        bucket_name: Destination S3 bucket.
        s3_folder: Optional key prefix; each file's path relative to
            ``local_folder`` is appended to it.

    Returns:
        None. If credentials are missing, a message is printed once and the
        upload stops, since every remaining file would fail the same way.
    """
    session = boto3.Session(profile_name=aws_account_profile)  ## use the profile name in the credentials file
    s3_client = session.client('s3')
    for root, _, files in os.walk(local_folder):
        for file in tqdm(files):
            local_file = os.path.join(root, file)
            relative_path = os.path.relpath(local_file, local_folder)  # Get relative file path
            ## S3 keys use forward slashes, also when running on Windows
            s3_file = os.path.join(s3_folder, relative_path).replace("\\", "/")
            try:
                s3_client.upload_file(local_file, bucket_name, s3_file)
            except NoCredentialsError:
                print("AWS credentials not available.")
                return  ## do not repeat the same failure for every remaining file
bucket = 'dataset-aft-vbi-pds'
local_folder = '../data/metadata/'  # Local folder path
s3_folder = 'metadata/'  # The folder in S3 to upload to (optional)
upload_folder_to_s3(local_folder, bucket, s3_folder)

# 👉 **Get the 10K dataset file list from S3**

In [None]:
## example code: get jpg-json file name pairs 
import json
import random
def get_file_list(s3_uri):
    """Load a label -> file-path JSON object from S3 and flatten it into shuffled pairs.

    Args:
        s3_uri: Location of the JSON file, e.g. "s3://bucket/path/file_list.json".

    Returns:
        A list of ``(file_stem, label)`` tuples in random order, where
        ``file_stem`` is the file name without directory and extension.
    """
    s3 = boto3.client('s3')
    ## "s3://bucket/some/key" -> ("bucket", "some/key")
    bucket_name, object_key = s3_uri.replace("s3://", "").split("/", 1)
    raw_body = s3.get_object(Bucket=bucket_name, Key=object_key)["Body"].read()
    paths_by_label = json.loads(raw_body.decode("utf-8"))
    pairs = [
        (path.split("/")[-1].split(".")[0], label)
        for label, paths in paths_by_label.items()
        for path in paths
    ]
    random.shuffle(pairs)
    return pairs
FILE_LIST_KEY = "s3://p5-amazon-bin-images/file_list.json"
file_list = get_file_list(FILE_LIST_KEY)
print("total image file number:", len(file_list))
print("Example file list:", file_list[:5])

<pre style="white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace"><span style="color: #7fbfbf; text-decoration-color: #7fbfbf">[01/31/25 05:43:31] </span><span style="color: #0069ff; text-decoration-color: #0069ff; font-weight: bold">INFO    </span> Skipping checksum validation. Response did not contain one of the  <a href="file://d:\Users\guido\miniconda3\envs\sagemaker_py310\lib\site-packages\botocore\httpchecksum.py" target="_blank"><span style="color: #7f7f7f; text-decoration-color: #7f7f7f">httpchecksum.py</span></a><span style="color: #7f7f7f; text-decoration-color: #7f7f7f">:</span><a href="file://d:\Users\guido\miniconda3\envs\sagemaker_py310\lib\site-packages\botocore\httpchecksum.py#481" target="_blank"><span style="color: #7f7f7f; text-decoration-color: #7f7f7f">481</span></a>
<span style="color: #7fbfbf; text-decoration-color: #7fbfbf">                    </span>         following algorithms: <span style="font-weight: bold">[</span><span style="color: #008700; text-decoration-color: #008700">'crc32'</span>, <span style="color: #008700; text-decoration-color: #008700">'sha1'</span>, <span style="color: #008700; text-decoration-color: #008700">'sha256'</span><span style="font-weight: bold">]</span>.                 <span style="color: #7f7f7f; text-decoration-color: #7f7f7f">                   </span>
</pre>

total image file number: 10441  
Example file list: [('03146', '4'), ('102813', '4'), ('100517', '5'), ('08340', '4'), ('06487', '2')]  

# 👉 **Convert data to WebDataset and upload to AWS S3**

* [ScriptProcessor](https://sagemaker.readthedocs.io/en/stable/api/training/processing.html#sagemaker.processing.ScriptProcessor) official documentation    
* [My tutorial](https://docs.google.com/document/d/17KzWVf84xQJVNH1jd6yh_FLgr781QcdKng1JIF6P5X4): Create custom docker image for SageMaker data processing jobs, create AWS ECR private repo, and upload the image to the repo   
* [AWS re:Post](https://repost.aws/en/knowledge-center/secondary-account-access-ecr), pull ECR image from the repo of another account  

In [None]:
## There is no need to run this cell. Just update permissions of the ECR repo 
## to allow pulling from another AWS account, and add the "AmazonEC2ContainerRegistryPowerUser"
## policy permissions to the SageMaker role of this account.

## To pull ECR image from another AWS account 
import boto3
import subprocess
import base64
ecr_client = boto3.client('ecr', region_name='us-east-1')
# Retrieve the authentication token from ECR
response = ecr_client.get_authorization_token()
authorization_data = response['authorizationData'][0]
token = authorization_data['authorizationToken']
registry_uri = authorization_data['proxyEndpoint']
# The token is base64("<username>:<password>"); split only on the first colon
# so a colon inside the password cannot break the unpacking.
decoded_token = base64.b64decode(token).decode('utf-8')
username, password = decoded_token.split(':', 1)
# Docker login command: pass the password on stdin instead of the command line
# (keeps it out of process listings), and use an argument list without a shell
# so no value is interpreted by the shell.
login_command = ["docker", "login", "--username", username, "--password-stdin", registry_uri]
subprocess.run(login_command, input=password.encode('utf-8'), check=True)
# Now you can use this image in your SageMaker processing job

In [None]:
## example code for webdataset.TarWriter() conversion
## this cell reads the first 20 jpg-json pairs and writes them to 2 tar files
from sagemaker.processing import ScriptProcessor
processor = ScriptProcessor(
    command=['python3'],
    ## You can use a custom image or use the default SageMaker image
    ## You can pull from AWS ECR or DockerHub
    image_uri=f'{aws_account_number}.dkr.ecr.us-east-1.amazonaws.com/udacity/p5-amazon-bin-images:latest', 
    role=sagemaker_role_arn,  # Execution role
    instance_count=1,
    instance_type='ml.t3.large',  # Use the appropriate instance type
    volume_size_in_gb=10,  # Minimal disk space since we're streaming
    base_job_name='p5-amazon-bin-images' 
)
processor.run(
    ## ⚠️ I made a terrible mistake here by naming the script as "webdataset.py" 
    ## which is the same as the package name. You know what happened next.
    code='../scripts_process/test_convert_to_webdataset.py',  # Your script to process data
    arguments=[
        '--SM_INPUT_BUCKET', 'aft-vbi-pds',
        '--SM_INPUT_PREFIX_IMAGES', 'bin-images/',
        '--SM_INPUT_PREFIX_METADATA', 'metadata/',
        '--SM_OUTPUT_BUCKET', 'p5-amazon-bin-images',
        '--SM_OUTPUT_PREFIX', 'webdataset/',
    ]
)

<pre style="white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace"><span style="color: #7fbfbf; text-decoration-color: #7fbfbf">[01/30/25 20:17:20] </span><span style="color: #0069ff; text-decoration-color: #0069ff; font-weight: bold">INFO    </span> Creating processing-job with name                                      <a href="file://d:\Users\guido\miniconda3\envs\sagemaker_py310\lib\site-packages\sagemaker\session.py" target="_blank"><span style="color: #7f7f7f; text-decoration-color: #7f7f7f">session.py</span></a><span style="color: #7f7f7f; text-decoration-color: #7f7f7f">:</span><a href="file://d:\Users\guido\miniconda3\envs\sagemaker_py310\lib\site-packages\sagemaker\session.py#1575" target="_blank"><span style="color: #7f7f7f; text-decoration-color: #7f7f7f">1575</span></a>
<span style="color: #7fbfbf; text-decoration-color: #7fbfbf">                    </span>         p5-amazon-bin-images-<span style="color: #008080; text-decoration-color: #008080; font-weight: bold">2025</span>-01-31-02-17-16-724                           <span style="color: #7f7f7f; text-decoration-color: #7f7f7f">               </span>
</pre>

.......👉 image_keys: ['bin-images/', 'bin-images/00001.jpg', 'bin-images/00002.jpg', 'bin-images/00003.jpg', 'bin-images/00004.jpg', 'bin-images/00005.jpg', 'bin-images/00006.jpg', 'bin-images/00007.jpg', 'bin-images/00008.jpg', 'bin-images/00009.jpg']  
⚠️ Skipping non-image file: bin-images/  
🟢 Successfully uploaded tar file to s3://p5-amazon-bin-images/webdataset/data_0.tar  
👉 image_keys: ['bin-images/00010.jpg', 'bin-images/00011.jpg', 'bin-images/00012.jpg', 'bin-images/00013.jpg', 'bin-images/00014.jpg', 'bin-images/00015.jpg', 'bin-images/00016.jpg', 'bin-images/00017.jpg', 'bin-images/00018.jpg', 'bin-images/00019.jpg']  
🟢 Successfully uploaded tar file to s3://p5-amazon-bin-images/webdataset/data_1.tar  

In [None]:
job_name = processor.latest_job.job_name
processing_job_desc = processor.sagemaker_session.describe_processing_job(job_name)
job_status = processing_job_desc['ProcessingJobStatus']
print(f"Processing job status: {job_status}")
## Processing job status: Completed