# weather-classification-aws.ipynb



## Startup cells

In [0]:
# Set environment variables for sagemaker_studio imports

import os

# Apply all four DataZone settings to the process environment in one call.
os.environ.update(
    {
        'DataZoneProjectId': '4qujod1i4mic5v',
        'DataZoneDomainId': 'dzd-b06qjg2gc96f2r',
        'DataZoneEnvironmentId': '6aojsxq51b2bdf',
        'DataZoneDomainRegion': 'eu-north-1',
    }
)

# Metadata is reachable two ways: through the lazy accessor below and through
# the ready-made `metadata` variable.
_resource_metadata = None


def _get_resource_metadata():
    """Return the cached resource metadata dict, building it on the first call."""
    global _resource_metadata
    if _resource_metadata is not None:
        return _resource_metadata

    datazone_settings = dict(
        DataZoneProjectId="4qujod1i4mic5v",
        DataZoneDomainId="dzd-b06qjg2gc96f2r",
        DataZoneEnvironmentId="6aojsxq51b2bdf",
        DataZoneDomainRegion="eu-north-1",
    )
    _resource_metadata = {"AdditionalMetadata": datazone_settings}
    return _resource_metadata


metadata = _get_resource_metadata()

In [0]:
"""
Logging Configuration

Purpose:
--------
This sets up the logging framework for code executed in the user namespace.
"""

from typing import Optional


def _set_logging(log_dir: str, log_file: str, log_name: Optional[str] = None):
    """Point a logger at a rotating log file, replacing any handlers it already has.

    Args:
        log_dir: Directory that will hold the log file. It is created if missing;
            if it cannot be created, "/tmp/kernels/" is used instead.
        log_file: Name of the log file inside the directory.
        log_name: Name of the logger to configure. The root logger is configured
            when this is None or empty.

    Raises:
        OSError: If the fallback directory cannot be created or the log file
            cannot be opened.
    """
    import os
    import logging
    from logging.handlers import RotatingFileHandler

    level = logging.INFO
    max_bytes = 5 * 1024 * 1024  # rotate once the file reaches 5 MiB
    backup_count = 5

    # Best effort: fall back to a /tmp directory when the requested location
    # cannot be created (helpful for local dev setups without /var/log access).
    try:
        os.makedirs(log_dir, exist_ok=True)
    except Exception:
        log_dir = "/tmp/kernels/"
        os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, log_file)

    logger = logging.getLogger(log_name) if log_name else logging.getLogger()

    # Detach AND close the previous handlers. Simply dropping them would keep
    # their log files open, leaking a file descriptor every time the cell is re-run.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    logger.setLevel(level)

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    # Rotating file handler
    fh = RotatingFileHandler(filename=log_path, maxBytes=max_bytes, backupCount=backup_count, encoding="utf-8")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    logger.info(f"Logging initialized for {log_name}.")


# Configure the root logger (kernel.log) and the dedicated "metrics" logger.
for _log_dir, _log_file, _log_name in (
    ("/var/log/computeEnvironments/kernel/", "kernel.log", None),
    ("/var/log/studio/data-notebook-kernel-server/", "metrics.log", "metrics"),
):
    _set_logging(_log_dir, _log_file, _log_name)

In [0]:
import logging
from sagemaker_studio import ClientConfig, sqlutils, sparkutils, dataframeutils

# Module-level logger used by the startup code in this notebook.
logger = logging.getLogger(__name__)
logger.info("Initializing sparkutils")
# `spark` is kept as a notebook global: the shutdown cell looks it up by name
# in the IPython user namespace and calls .stop() on it.
spark = sparkutils.init()
logger.info("Finished initializing sparkutils")

In [0]:
def _reset_os_path(target_dir="/home/sagemaker-user"):
    """
    Reset the process's working directory to handle mount timing issues.

    This function resolves a race condition where the Python process starts
    before the filesystem mount is complete, causing the process to reference
    old mount paths and inodes. By explicitly changing to the mounted directory
    (/home/sagemaker-user by default), we ensure the process uses the correct,
    up-to-date mount point.

    The function logs stat information (device ID and inode) before and after
    the directory change to verify that the working directory is properly
    updated to reference the new mount. The logging is diagnostic only: if it
    fails, the directory change is still attempted.

    Args:
        target_dir: Directory to change into. Defaults to
            "/home/sagemaker-user".

    Returns:
        None. Failures are logged rather than raised.

    Note:
        This is executed at module import time to ensure the fix is applied
        as early as possible in the kernel initialization process.
    """
    import os
    import logging

    # Bound outside every try block so the except handlers can always use it.
    logger = logging.getLogger(__name__)

    def _log_cwd_state(banner):
        """Log the CWD and the device/inode of '.' and target_dir; never raises."""
        try:
            logger.info(banner)
            logger.info("CWD: %s", os.getcwd())
            # One stat call per path so device and inode come from the same snapshot.
            here = os.stat('.')
            logger.info("stat('.'): %s %s", here.st_dev, here.st_ino)
            there = os.stat(target_dir)
            logger.info("stat(%r): %s %s", target_dir, there.st_dev, there.st_ino)
        except Exception as e:
            # A stale or missing directory must not prevent the chdir below.
            logger.warning("Could not collect working-directory diagnostics: %s", e)

    _log_cwd_state("---------Before------")
    try:
        os.chdir(target_dir)
    except Exception as e:
        logger.exception(f"Failed to reset working directory: {e}")
        return
    _log_cwd_state("---------After------")


_reset_os_path()

## Notebook

In [0]:
import json
import boto3
import numpy as np
import pandas as pd

s3 = boto3.client('s3')
bucket = 'weather-vit-aws'
pred_prefix = 'predictions/'

# Image extensions that may remain in a prediction object's name once the
# trailing ".out" is removed (e.g. "Cloud_1.jpg.out").
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')


def _image_id_from_key(key):
    """Derive the submission id from a prediction object's S3 key.

    Takes the last path component, drops a trailing ".out", then drops one
    trailing image extension (case-insensitive). Only suffixes are removed, so
    a name such as "my.outdoor.png.out" becomes "my.outdoor", not "mydoor".

    Args:
        key: Full S3 object key of a prediction file.

    Returns:
        The image id as a string.
    """
    name = key.rsplit('/', 1)[-1]
    if name.endswith('.out'):
        name = name[:-len('.out')]
    lowered = name.lower()
    for ext in _IMAGE_EXTENSIONS:
        if lowered.endswith(ext):
            return name[:-len(ext)]
    return name


submission = []
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket, Prefix=pred_prefix)

for page in pages:
    # Pages without matching objects carry no 'Contents' key.
    for obj in page.get('Contents', []):
        key = obj['Key']
        if not key.endswith('.out'):
            continue
        response = s3.get_object(Bucket=bucket, Key=key)
        pred_json = json.loads(response['Body'].read().decode('utf-8'))
        pred_probs = pred_json['prediction']
        # The predicted label is the index of the highest class probability.
        label = int(np.argmax(pred_probs))
        submission.append({'id': _image_id_from_key(key), 'label': label})

# Explicit columns keep the CSV header intact even when no predictions were found.
df = pd.DataFrame(submission, columns=['id', 'label'])
df.to_csv('/home/sagemaker-user/submission.csv', index=False)
print(df.head(20))
print("\nDone! Download 'submission.csv' from Files tab → submit to Kaggle.")

          id  label
0    Cloud_1      0
1    Cloud_2      0
2    Cloud_3      3
3    Cloud_4      3
4    foggy_1      2
5   foggy_10      1
6    foggy_2      0
7    foggy_3      1
8    foggy_4      1
9    foggy_5      0
10   foggy_6      1
11   foggy_7      4
12   foggy_8      1
13   foggy_9      2
14    rain_1      2
15    rain_2      2
16    rain_3      2
17    rain_4      2
18    rain_5      2
19    rain_6      2

Done! Download 'submission.csv' from Files tab → submit to Kaggle.


In [0]:
# NOTE: this cell uses `ic` and `bucket` from the kernel namespace. In this
# notebook `ic` is created by the training cell that appears further down, so
# that cell has to be executed before this one.
predictions_uri = f's3://{bucket}/predictions/'
test_images_uri = f's3://{bucket}/dataset/alien_test/'

transformer = ic.transformer(
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=predictions_uri,
)

# Each object under the test prefix is sent as one image request.
transform_options = dict(
    data=test_images_uri,
    data_type='S3Prefix',
    content_type='application/x-image',
    split_type='None',
)
transformer.transform(**transform_options)

transformer.wait()
print("Predictions saved to:", transformer.output_path)

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loaded entry point class algorithm.serve.server_config:config_api[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loading entry points[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loaded request iterator application/x-image[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loaded request iterator image/jpeg[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loaded request iterator image/png[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loaded response encoder application/json[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loaded response encoder application/jsonlines[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loaded entry point class algorithm:model[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] Number of server workers: 3[0m
[34m[20


[32m2026-02-10T00:18:29.460:[sagemaker logs]: MaxConcurrentTransforms=3, MaxPayloadInMB=6, BatchStrategy=SINGLE_RECORD[0m


[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
[35mDocker entrypoint called with argument(s): serve[0m
[35mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loaded entry point class algorithm.serve.server_config:config_api[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loading entry points[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loaded request iterator application/x-image[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loaded request iterator image/jpeg[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loaded request iterator image/png[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loaded response encoder application/json[0m
[34m[02/10/2026 00:18:27 INFO 140538468493120] loaded response encoder application/jsonlines[0m
[34m[02/10/2

In [0]:
import boto3
import random
import os
import sagemaker
from sagemaker import get_execution_role
from sagemaker.image_uris import retrieve
from sagemaker.inputs import TrainingInput
import numpy as np
import pandas as pd
from datetime import datetime

bucket = 'weather-vit-aws'
prefix = 'dataset/'
region = sagemaker.Session().boto_region_name

role = get_execution_role()
s3_client = boto3.client('s3')

# Class folder name (second path component of the S3 key) -> integer label.
class_map = {
    'cloudy': 0,
    'foggy': 1,
    'rainy': 2,
    'shine': 3,
    'sunrise': 4
}

# Fixed seed so the shuffled train/validation split is identical on every run.
split_seed = 42

print("=== Generating LST files ===")

# Each entry is one .lst line: "<index>\t<label>\t<path below the first key component>".
lst_lines = []
paginator = s3_client.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket, Prefix=prefix)

index = 0
for page in pages:
    for obj in page.get('Contents', []):
        key = obj['Key']
        if not key.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue
        parts = key.split('/')
        # Need at least "<prefix>/<class folder>/<file>"; folders that are not
        # in class_map are skipped.
        if len(parts) < 3:
            continue
        cls_folder = parts[1]
        if cls_folder not in class_map:
            continue
        rel_path = '/'.join(parts[1:])
        lst_lines.append(f"{index}\t{class_map[cls_folder]}\t{rel_path}")
        index += 1
        if index % 100 == 0:
            print(f"Processed {index} images...")

print(f"Total images found: {len(lst_lines)}")

if len(lst_lines) == 0:
    raise ValueError("No images found! Check prefix/class folders in S3.")

# Seeded, local RNG: reproducible split without touching the global random state.
random.Random(split_seed).shuffle(lst_lines)
split = int(0.8 * len(lst_lines))
train_lines = lst_lines[:split]
val_lines = lst_lines[split:]

if not train_lines or not val_lines:
    raise ValueError(
        f"Need at least 2 images for an 80/20 split, found {len(lst_lines)}."
    )

print(f"Train images: {len(train_lines)}, Val images: {len(val_lines)}")

local_train_lst = '/home/sagemaker-user/train.lst'
local_val_lst = '/home/sagemaker-user/val.lst'

with open(local_train_lst, 'w') as f:
    f.write('\n'.join(train_lines))
with open(local_val_lst, 'w') as f:
    f.write('\n'.join(val_lines))

s3_client.upload_file(local_train_lst, bucket, 'train.lst')
s3_client.upload_file(local_val_lst, bucket, 'val.lst')

print("LST files generated and uploaded!")

print("\n=== Setting up training job ===")

container = retrieve('image-classification', region)

ic = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.g4dn.xlarge',
    volume_size=50,
    output_path=f's3://{bucket}/output',
    sagemaker_session=sagemaker.Session(),
    max_run=3600 * 2  # two hours, in seconds
)

ic.set_hyperparameters(
    num_layers=18,
    use_pretrained_model=0,
    image_shape='3,224,224',
    # Derived from class_map so the two cannot drift apart.
    num_classes=len(class_map),
    num_training_samples=len(train_lines),
    epochs=15,
    learning_rate=0.001,
    mini_batch_size=32,
    resize=256
)


def _image_channel(s3_uri):
    """Build a TrainingInput for `s3_uri` with the image content type used by every channel here."""
    return TrainingInput(s3_data=s3_uri, content_type='application/x-image')


# Both image channels point at the same prefix; the .lst files decide which
# images belong to training and which to validation.
train_channel = _image_channel(f's3://{bucket}/{prefix}')
val_channel = _image_channel(f's3://{bucket}/{prefix}')
train_lst_channel = _image_channel(f's3://{bucket}/train.lst')
val_lst_channel = _image_channel(f's3://{bucket}/val.lst')

print("Channels ready. Starting fit...")

ic.fit(
    {
        'train': train_channel,
        'validation': val_channel,
        'train_lst': train_lst_channel,
        'validation_lst': val_lst_channel
    },
    wait=True, logs=True
)

print("Training complete! Check SageMaker console > Training jobs for logs/metrics.")
print(f"Model artifacts in: s3://{bucket}/output/")

sagemaker.config INFO - Fetched defaults config from location: /etc/xdg/sagemaker/config.yaml


sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


=== Generating LST files ===


Processed 100 images...
Processed 200 images...
Processed 300 images...
Processed 400 images...
Processed 500 images...
Processed 600 images...
Processed 700 images...
Processed 800 images...
Processed 900 images...
Processed 1000 images...
Processed 1100 images...
Processed 1200 images...
Processed 1300 images...
Processed 1400 images...
Total images found: 1498
Train images: 1198, Val images: 300
LST files generated and uploaded!

=== Setting up training job ===


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


Channels ready. Starting fit...


2026-02-10 00:04:00 Starting - Starting the training job.

.

.

.

.

.


2026-02-10 00:05:02 Pending - Training job waiting for capacity.

.

.


2026-02-10 00:05:17 Pending - Preparing the instances for training.

.

.


2026-02-10 00:05:42 Downloading - Downloading input data.

.

.


2026-02-10 00:06:18 Downloading - Downloading the training image.

.

.

.

.

.

.

.

.


2026-02-10 00:07:49 Training - Training image download completed. Training in progress..

[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34mNvidia gpu devices, drivers and cuda toolkit versions (only available on hosts with GPU):[0m
[34mTue Feb 10 00:07:56 2026       [0m
[34m+-----------------------------------------------------------------------------------------+[0m
[34m| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |[0m
[34m|-----------------------------------------+------------------------+----------------------+[0m
[34m| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |[0m
[34m| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |[0m
[34m|                                         |                        |               MIG M. |[0m
[34m|   0  Tesla T4                       On  |   00000000:00:1E.0 Off |                    0 |[0m
[34m| N/A   21C    P8              9

[34m[00:08:13] /opt/brazil-pkg-cache/packages/AIAlgorithmsMXNet/AIAlgorithmsMXNet-1.4.x_ecl_Cuda_11.1.x.441.0/AL2_x86_64/generic-flavor/src/src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:97: Running performance tests to find the best convolution algorithm, this can take a while... (setting env variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)[0m
[34m[02/10/2026 00:08:15 INFO 140495229204288] Epoch[0] Batch [20]#011Speed: 194.835 samples/sec#011accuracy=0.431548[0m
[34m[02/10/2026 00:08:17 INFO 140495229204288] Epoch[0] Train-accuracy=0.523649[0m
[34m[02/10/2026 00:08:17 INFO 140495229204288] Epoch[0] Time cost=4.785[0m
[34m[02/10/2026 00:08:17 INFO 140495229204288] Epoch[0] Validation-accuracy=0.663194[0m
[34m[02/10/2026 00:08:17 INFO 140495229204288] Storing the best model with validation accuracy: 0.663194[0m
[34m[02/10/2026 00:08:17 INFO 140495229204288] Saved checkpoint to "/opt/ml/model/image-classification-0001.params"[0m
[34m[02/10/2026 00:08:19 INFO 140495229

[34m[02/10/2026 00:08:24 INFO 140495229204288] Epoch[2] Batch [20]#011Speed: 331.281 samples/sec#011accuracy=0.782738[0m
[34m[02/10/2026 00:08:25 INFO 140495229204288] Epoch[2] Train-accuracy=0.789696[0m
[34m[02/10/2026 00:08:25 INFO 140495229204288] Epoch[2] Time cost=3.429[0m
[34m[02/10/2026 00:08:26 INFO 140495229204288] Epoch[2] Validation-accuracy=0.725694[0m
[34m[02/10/2026 00:08:26 INFO 140495229204288] Storing the best model with validation accuracy: 0.725694[0m
[34m[02/10/2026 00:08:26 INFO 140495229204288] Saved checkpoint to "/opt/ml/model/image-classification-0003.params"[0m
[34m[02/10/2026 00:08:28 INFO 140495229204288] Epoch[3] Batch [20]#011Speed: 330.656 samples/sec#011accuracy=0.818452[0m
[34m[02/10/2026 00:08:29 INFO 140495229204288] Epoch[3] Train-accuracy=0.822635[0m
[34m[02/10/2026 00:08:29 INFO 140495229204288] Epoch[3] Time cost=3.437[0m
[34m[02/10/2026 00:08:30 INFO 140495229204288] Epoch[3] Validation-accuracy=0.760417[0m
[34m[02/10/2026 00

[34m[02/10/2026 00:08:34 INFO 140495229204288] Saved checkpoint to "/opt/ml/model/image-classification-0005.params"[0m
[34m[02/10/2026 00:08:36 INFO 140495229204288] Epoch[5] Batch [20]#011Speed: 331.158 samples/sec#011accuracy=0.845238[0m
[34m[02/10/2026 00:08:37 INFO 140495229204288] Epoch[5] Train-accuracy=0.848818[0m
[34m[02/10/2026 00:08:37 INFO 140495229204288] Epoch[5] Time cost=3.439[0m
[34m[02/10/2026 00:08:38 INFO 140495229204288] Epoch[5] Validation-accuracy=0.781250[0m
[34m[02/10/2026 00:08:38 INFO 140495229204288] Storing the best model with validation accuracy: 0.781250[0m
[34m[02/10/2026 00:08:38 INFO 140495229204288] Saved checkpoint to "/opt/ml/model/image-classification-0006.params"[0m
[34m[02/10/2026 00:08:40 INFO 140495229204288] Epoch[6] Batch [20]#011Speed: 331.348 samples/sec#011accuracy=0.840774[0m
[34m[02/10/2026 00:08:42 INFO 140495229204288] Epoch[6] Train-accuracy=0.854730[0m
[34m[02/10/2026 00:08:42 INFO 140495229204288] Epoch[6] Time cos

[34m[02/10/2026 00:08:46 INFO 140495229204288] Epoch[7] Train-accuracy=0.863176[0m
[34m[02/10/2026 00:08:46 INFO 140495229204288] Epoch[7] Time cost=3.438[0m
[34m[02/10/2026 00:08:46 INFO 140495229204288] Epoch[7] Validation-accuracy=0.781250[0m
[34m[02/10/2026 00:08:48 INFO 140495229204288] Epoch[8] Batch [20]#011Speed: 330.642 samples/sec#011accuracy=0.863095[0m
[34m[02/10/2026 00:08:50 INFO 140495229204288] Epoch[8] Train-accuracy=0.875845[0m
[34m[02/10/2026 00:08:50 INFO 140495229204288] Epoch[8] Time cost=3.438[0m
[34m[02/10/2026 00:08:50 INFO 140495229204288] Epoch[8] Validation-accuracy=0.812500[0m
[34m[02/10/2026 00:08:50 INFO 140495229204288] Storing the best model with validation accuracy: 0.812500[0m
[34m[02/10/2026 00:08:51 INFO 140495229204288] Saved checkpoint to "/opt/ml/model/image-classification-0009.params"[0m
[34m[02/10/2026 00:08:53 INFO 140495229204288] Epoch[9] Batch [20]#011Speed: 324.710 samples/sec#011accuracy=0.888393[0m
[34m[02/10/2026 00

[34m[02/10/2026 00:08:57 INFO 140495229204288] Epoch[10] Batch [20]#011Speed: 330.401 samples/sec#011accuracy=0.913690[0m
[34m[02/10/2026 00:08:58 INFO 140495229204288] Epoch[10] Train-accuracy=0.913007[0m
[34m[02/10/2026 00:08:58 INFO 140495229204288] Epoch[10] Time cost=3.452[0m
[34m[02/10/2026 00:08:59 INFO 140495229204288] Epoch[10] Validation-accuracy=0.822917[0m
[34m[02/10/2026 00:08:59 INFO 140495229204288] Storing the best model with validation accuracy: 0.822917[0m
[34m[02/10/2026 00:08:59 INFO 140495229204288] Saved checkpoint to "/opt/ml/model/image-classification-0011.params"[0m
[34m[02/10/2026 00:09:01 INFO 140495229204288] Epoch[11] Batch [20]#011Speed: 330.023 samples/sec#011accuracy=0.906250[0m


[34m[02/10/2026 00:09:02 INFO 140495229204288] Epoch[11] Train-accuracy=0.909628[0m
[34m[02/10/2026 00:09:02 INFO 140495229204288] Epoch[11] Time cost=3.467[0m
[34m[02/10/2026 00:09:03 INFO 140495229204288] Epoch[11] Validation-accuracy=0.847222[0m
[34m[02/10/2026 00:09:03 INFO 140495229204288] Storing the best model with validation accuracy: 0.847222[0m
[34m[02/10/2026 00:09:03 INFO 140495229204288] Saved checkpoint to "/opt/ml/model/image-classification-0012.params"[0m
[34m[02/10/2026 00:09:05 INFO 140495229204288] Epoch[12] Batch [20]#011Speed: 329.045 samples/sec#011accuracy=0.919643[0m
[34m[02/10/2026 00:09:06 INFO 140495229204288] Epoch[12] Train-accuracy=0.921453[0m
[34m[02/10/2026 00:09:06 INFO 140495229204288] Epoch[12] Time cost=3.460[0m
[34m[02/10/2026 00:09:07 INFO 140495229204288] Epoch[12] Validation-accuracy=0.821875[0m
[34m[02/10/2026 00:09:09 INFO 140495229204288] Epoch[13] Batch [20]#011Speed: 330.194 samples/sec#011accuracy=0.925595[0m
[34m[02/10

[34m[02/10/2026 00:09:13 INFO 140495229204288] Epoch[14] Batch [20]#011Speed: 328.884 samples/sec#011accuracy=0.907738[0m
[34m[02/10/2026 00:09:15 INFO 140495229204288] Epoch[14] Train-accuracy=0.915541[0m
[34m[02/10/2026 00:09:15 INFO 140495229204288] Epoch[14] Time cost=3.456[0m
[34m[02/10/2026 00:09:15 INFO 140495229204288] Epoch[14] Validation-accuracy=0.836806[0m



2026-02-10 00:09:32 Uploading - Uploading generated training model
2026-02-10 00:09:32 Completed - Training job completed


Training seconds: 230
Billable seconds: 230
Training complete! Check SageMaker console > Training jobs for logs/metrics.
Model artifacts in: s3://weather-vit-aws/output/


In [0]:
import pandas as pd

# Read from the same absolute path the submission cell writes to, so the
# result does not depend on the kernel's current working directory.
df_csv_4c0ju4mvw = pd.read_csv('/home/sagemaker-user/submission.csv')
df_csv_4c0ju4mvw


Unnamed: 0,id,label
0,Cloud_1,0
1,Cloud_2,0
2,Cloud_3,3
3,Cloud_4,3
4,foggy_1,2
5,foggy_10,1
6,foggy_2,0
7,foggy_3,1
8,foggy_4,1
9,foggy_5,0


## Shutdown cells

In [0]:
"""
Stop spark session and associated Athena Spark session
"""

from IPython import get_ipython as _get_ipython

# Look up the `spark` object in the interactive user namespace and stop it.
_user_namespace = _get_ipython().user_ns
_user_namespace["spark"].stop()