In [1]:
!pip install --upgrade pip -U sagemaker awscli boto3 numpy ipywidgets torch torchvision s3fs scikit-learn json
!pip install Pillow==7.1.2

Collecting sagemaker
  Downloading sagemaker-2.216.0-py3-none-any.whl.metadata (14 kB)
Collecting awscli
  Downloading awscli-1.32.87-py3-none-any.whl.metadata (11 kB)
Collecting boto3
  Downloading boto3-1.34.87-py3-none-any.whl.metadata (6.6 kB)
Collecting s3fs
  Using cached s3fs-2024.3.1-py3-none-any.whl.metadata (1.6 kB)
[31mERROR: Could not find a version that satisfies the requirement json (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for json[0m[31m


In [5]:
# Create the local working directories used by this notebook (-p: no error if they already exist).
!mkdir -p data src output fakedata

## Create Session and Environment


In [1]:
import os
from sagemaker.s3 import S3Uploader as s3up
from sagemaker.session import Session
from sagemaker import get_execution_role

# SageMaker session used to resolve the default bucket for this account/region.
sess = Session()

# Everything this notebook stores in S3 lives in the session's default bucket
# under a single key prefix. Point `bucket` elsewhere to use a different bucket.
bucket = sess.default_bucket()
prefix = "EnergyData"

# S3 destinations: packaged training code (tar.gz), training results, checkpoints.
s3_custom_code_upload_location = f"s3://{bucket}/{prefix}/customcode"
s3_model_artifacts_location = f"s3://{bucket}/{prefix}/artifacts/"
checkpoint_s3_bucket = f"s3://{bucket}/{prefix}/checkpoints"

# Checkpoint directory handed to the estimator as `checkpoint_local_path`.
checkpoint_local_path = "/opt/ml/checkpoints"

# Upload the local dataset folder to S3; the returned S3 URI is printed below.
dataroot = "/home/ec2-user/SageMaker/WCGAN-Energy-Time-Series/data"
local_dataset_dir = os.path.join(dataroot, "EnergyData")
s3_data_location = s3up.upload(local_dataset_dir, f"s3://{bucket}/{prefix}/data")

print(s3_data_location)

# IAM execution role that lets SageMaker access resources in this AWS account,
# looked up from the notebook environment via the SageMaker Python SDK.
role = get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
s3://sagemaker-ap-southeast-1-220295937604/EnergyData/data


In [2]:
import json

# Hyperparameters forwarded to the training script (src/main.py) by the estimator.
hps = dict(
    batch_size=256,
    epochs=50000,
    latent_dim=1464,
    ts_dim=1464,
    conditional=720,  # month
    lr_a=6e-5,
    lr_b=1e-4,
)

# Pretty-print the configuration so it is recorded in the notebook output.
str_hps = json.dumps(hps, indent=4)
print(str_hps)

{
    "batch_size": 256,
    "epochs": 50000,
    "latent_dim": 1464,
    "ts_dim": 1464,
    "conditional": 720,
    "lr_a": 6e-05,
    "lr_b": 0.0001
}


In [3]:
from sagemaker.pytorch import PyTorch
import s3fs

# Capture group for one metric value: an optionally signed decimal number,
# optionally in scientific notation (e.g. "0.42", "-3.1", "6e-05").
# The previous pattern "([0-9\\.]+)" could not match a leading minus sign or an
# exponent, so negative losses (a Wasserstein critic/generator loss can
# legitimately be negative) were never captured -- which also starves the
# tuner of its "Generator Loss" objective.
_METRIC_VALUE = "([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?)"

# Metrics scraped from the training job's log output. Each "Regex" must contain
# exactly one capture group holding the numeric value that follows the label.
metric_definitions = [
    {
        "Name": "RMSE",
        "Regex": "RMSE: " + _METRIC_VALUE
    },
    {
        "Name": "Discriminator Loss",
        "Regex": "D_loss: " + _METRIC_VALUE
    },
    {
        "Name": "Generator Loss",
        "Regex": "G_loss: " + _METRIC_VALUE
    },
    {
        "Name": "Gradient Penalty",
        "Regex": "grad_pen: " + _METRIC_VALUE
    }
]


# Training-job definition for the WCGAN model. Keyword arguments are grouped by
# concern; their order has no effect on the resulting estimator.
estimator = PyTorch(
    # --- training code and its hyperparameters ---
    entry_point="main.py",
    source_dir="./src",
    hyperparameters=hps,
    code_location=s3_custom_code_upload_location,
    # --- container ---
    image_uri="763104351884.dkr.ecr.ap-southeast-1.amazonaws.com/pytorch-training:1.5.0-gpu-py3",
    framework_version="1.5.0",
    py_version="py3",
    # --- compute: a single GPU instance on spot capacity ---
    role=role,
    instance_count=1,
    instance_type="ml.g4dn.16xlarge",
    use_spot_instances=True,
    max_run=150000,   # training time limit, in seconds
    max_wait=180000,  # limit including time spent waiting for spot capacity, in seconds
    # --- data in ---
    # NOTE(review): `model_data` is not a documented estimator argument; presumably it is
    # absorbed by **kwargs -- confirm whether it has any effect. The training data itself
    # is supplied through `fit(inputs=...)`.
    model_data={"S3DataSource": {"S3Uri": s3_data_location, "S3DataType": "S3Prefix", "CompressionType": "None"}},
    input_mode="File",
    # --- artifacts, checkpoints and metrics out ---
    output_path=s3_model_artifacts_location,
    disable_output_compression=True,
    checkpoint_s3_uri=checkpoint_s3_bucket,
    checkpoint_local_path=checkpoint_local_path,
    metric_definitions=metric_definitions,
    # --- job bookkeeping ---
    base_job_name="WCGAN-DR",
    debugger_hook_config=False
    #early_stopping_type="Auto"
)


## Initiate Training

In [4]:
%%time
import s3fs
# Start training
from sagemaker.inputs import TrainingInput
import sagemaker.utils

# Wrap the uploaded dataset location as the job's input, then launch training.
training_input = TrainingInput(s3_data=s3_data_location)
estimator.fit(inputs=training_input)

INFO:sagemaker:Creating training-job with name: WCGAN-DR-2024-05-08-03-21-08-291


2024-05-08 03:21:08 Starting - Starting the training job...
2024-05-08 03:21:24 Starting - Preparing the instances for training...
2024-05-08 03:22:02 Downloading - Downloading input data...
2024-05-08 03:22:17 Downloading - Downloading the training image........[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-05-08 03:23:44,602 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-05-08 03:23:44,634 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-05-08 03:23:44,636 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-05-08 03:23:44,996 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "training": "/opt/ml/input/data/training"
    },
   

KeyboardInterrupt: 

In [12]:
## %%time
from sagemaker.estimator import Estimator
from sagemaker.s3 import S3Downloader as s3down

# Locate the output of the most recent training job started from `estimator`.
training_job_name = estimator.latest_training_job.name

# To fetch the results of an earlier run instead, set its name explicitly:
#training_job_name = 'WCGAN-DR-2024-03-28-04-47-29-058'

# Build the job's output prefix from the artifacts location configured for the
# estimator (`output_path`) instead of hard-coding the account-specific bucket,
# so the cell keeps working in another account, region or bucket.
# `s3_model_artifacts_location` already ends with "/".
path = f"{s3_model_artifacts_location}{training_job_name}/output/"
print(path)

# Download the model files and the job's extra output files. The last call is
# left as the cell's final expression so the list of downloaded files is displayed.
s3down.download(path + "model/", "./tmp")
s3down.download(path + "output/", "./output")



INFO:sagemaker:Nothing to download from bucket: sagemaker-ap-southeast-1-220295937604, key_prefix: EnergyData/artifacts/WCGAN-DR-2024-05-02-01-33-04-883/output/model/.


s3://sagemaker-ap-southeast-1-220295937604/EnergyData/artifacts/WCGAN-DR-2024-05-02-01-33-04-883/output/


['./output/256-100000-1464-1464-720-6e-05-0.0001/GP.png',
 './output/256-100000-1464-1464-720-6e-05-0.0001/LR_.png',
 './output/256-100000-1464-1464-720-6e-05-0.0001/Loss_0.png',
 './output/256-100000-1464-1464-720-6e-05-0.0001/Loss_1.png',
 './output/256-100000-1464-1464-720-6e-05-0.0001/Loss_2.png',
 './output/256-100000-1464-1464-720-6e-05-0.0001/Loss_3.png',
 './output/256-100000-1464-1464-720-6e-05-0.0001/Loss_4.png',
 './output/256-100000-1464-1464-720-6e-05-0.0001/Loss_5.png',
 './output/256-100000-1464-1464-720-6e-05-0.0001/Loss_6.png',
 './output/256-100000-1464-1464-720-6e-05-0.0001/Loss_7.png',
 './output/256-100000-1464-1464-720-6e-05-0.0001/Loss_8.png',
 './output/256-100000-1464-1464-720-6e-05-0.0001/Loss_9.png',
 './output/256-100000-1464-1464-720-6e-05-0.0001/dis.pth',
 './output/256-100000-1464-1464-720-6e-05-0.0001/gen.pth',
 './output/256-100000-1464-1464-720-6e-05-0.0001/line_generation/Iteration_91999.png',
 './output/256-100000-1464-1464-720-6e-05-0.0001/line_gene

## Hyperparameter Tuning

In [6]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# Search space explored by the hyperparameter tuner; keys match the
# hyperparameter names passed to the estimator in `hps`.
hpt_ranges = dict(
    # learning rates: continuous ranges
    lr_a=ContinuousParameter(0.00001, 0.0001),
    lr_b=ContinuousParameter(0.00001, 0.0001),
    # model / data dimensions and batch size: integer ranges
    latent_dim=IntegerParameter(12, 168),
    ts_dim=IntegerParameter(168, 336),
    conditional=IntegerParameter(12, 24),
    batch_size=IntegerParameter(128, 256),
)

In [7]:
# Tuning objective: the metric to optimise (a "Name" from `metric_definitions`)
# and the direction of optimisation.
objective_metric_name, objective_type = "Generator Loss", "Minimize"


In [None]:
%%time

import s3fs
# Start training
from sagemaker.inputs import TrainingInput

# Hyperparameter search built on the estimator defined above.
tuner = HyperparameterTuner(
    estimator=estimator,
    # what to vary
    hyperparameter_ranges=hpt_ranges,
    # what to optimise, and how the value is read from the training logs
    objective_metric_name=objective_metric_name,
    objective_type=objective_type,
    metric_definitions=metric_definitions,
    # budget: at most 20 training jobs, one at a time
    max_jobs=20,
    max_parallel_jobs=1,
)

# Launch the tuning job on the same dataset location used for plain training.
tuning_input = TrainingInput(s3_data=s3_data_location)
tuner.fit(inputs=tuning_input)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [63]:
import boto3

# Name of the training job to inspect -- replace with the job of interest.
training_job_name = 'WGAN-financial-2024-02-15-05-01-10-308'

# Low-level SageMaker API client.
sagemaker_client = boto3.client('sagemaker')

# Fetch the job description and read the training container image it ran with.
response = sagemaker_client.describe_training_job(
    TrainingJobName=training_job_name
)
image_uri = response['AlgorithmSpecification']['TrainingImage']

print("Image URI:", image_uri)


Image URI: 763104351884.dkr.ecr.ap-southeast-1.amazonaws.com/pytorch-training:1.5.0-gpu-py3
