In [15]:
import boto3

import sagemaker as sm

from sagemaker import script_uris
from sagemaker import image_uris
from sagemaker import model_uris
from sagemaker import hyperparameters

from sagemaker.utils import name_from_base
from sagemaker.estimator import Estimator


ROLE = sm.get_execution_role()
REGION = boto3.Session().region_name
session = sm.Session()

In [16]:
DEFAULT_BUCKET = session.default_bucket()
TRAIN_DATA_INPUT_PREFIX = 'js-input/artmoca/'
TRAIN_DATA_INPUT_S3_PATH = f's3://{DEFAULT_BUCKET}/{TRAIN_DATA_INPUT_PREFIX}'
!aws s3 cp ./img {TRAIN_DATA_INPUT_S3_PATH} --recursive

upload: img/.ipynb_checkpoints/dataset_info-Copy1-checkpoint.json to s3://sagemaker-eu-central-1-106877348565/js-input/artmoca/.ipynb_checkpoints/dataset_info-Copy1-checkpoint.json
upload: img/.ipynb_checkpoints/dataset_info-checkpoint.json to s3://sagemaker-eu-central-1-106877348565/js-input/artmoca/.ipynb_checkpoints/dataset_info-checkpoint.json
upload: img/artmoca_1.jpeg to s3://sagemaker-eu-central-1-106877348565/js-input/artmoca/artmoca_1.jpeg
upload: img/artmoca_3.jpeg to s3://sagemaker-eu-central-1-106877348565/js-input/artmoca/artmoca_3.jpeg
upload: img/dataset_info.json to s3://sagemaker-eu-central-1-106877348565/js-input/artmoca/dataset_info.json
upload: img/artmoca_2.jpeg to s3://sagemaker-eu-central-1-106877348565/js-input/artmoca/artmoca_2.jpeg
upload: img/artmoca_4.jpeg to s3://sagemaker-eu-central-1-106877348565/js-input/artmoca/artmoca_4.jpeg
upload: img/artmoca_5.jpeg to s3://sagemaker-eu-central-1-106877348565/js-input/artmoca/artmoca_5.jpeg
upload: img/artmoca_7.jpeg

In [20]:
MODEL_ID = 'model-txt2img-stabilityai-stable-diffusion-v2-1-base'
MODEL_VERSION = '*' 
IMG_SCOPE = 'training'
TRAIN_INSTANCE_TYPE = 'ml.g4dn.2xlarge'


train_image_uri = image_uris.retrieve(
    region=REGION, 
    framework=None,
    model_id=MODEL_ID, 
    model_version=MODEL_VERSION, 
    image_scope=IMG_SCOPE, 
    instance_type=TRAIN_INSTANCE_TYPE
)

train_source_uri = script_uris.retrieve(
    model_id=MODEL_ID, 
    model_version=MODEL_VERSION,
    script_scope=IMG_SCOPE
)


train_model_uri = model_uris.retrieve(
    model_id=MODEL_ID, 
    model_version=MODEL_VERSION, 
    model_scope=IMG_SCOPE
)

TRAIN_DATA_OUTPUT_PREFIX = 'js-output'
TRAIN_DATA_OUTPUT_S3_PATH = f's3://{DEFAULT_BUCKET}/{TRAIN_DATA_OUTPUT_PREFIX}'

print( "Image URI: {} ".format(train_image_uri))
print( "Source URI: {} ".format(train_source_uri))
print( "Model URI: {} ".format(train_model_uri))
print( "Output Storage: {} ".format(TRAIN_DATA_OUTPUT_S3_PATH))

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Image URI: 763104351884.dkr.ecr.eu-central-1.amazonaws.com/huggingface-pytorch-training:1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04 
Source URI: s3://jumpstart-cache-prod-eu-central-1/source-directory-tarballs/stabilityai/transfer_learning/txt2img/v1.0.3/sourcedir.tar.gz 
Model URI: s3://jumpstart-cache-prod-eu-central-1/stabilityai-training/train-model-txt2img-stabilityai-stable-diffusion-v2-1-base.tar.gz 
Output Storage: s3://sagemaker-eu-central-1-106877348565/js-output 


In [21]:
hyperparams = hyperparameters.retrieve_default(
    model_id=MODEL_ID, 
    model_version=MODEL_VERSION
)

hyperparams['max_steps'] = '400'
# hyperparams['seed'] = '123'
# hyperparams['with_prior_preservation'] = 'True'
hyperparams

{'epochs': '20',
 'max_steps': '400',
 'batch_size': '1',
 'with_prior_preservation': 'False',
 'num_class_images': '100',
 'learning_rate': '2e-06',
 'prior_loss_weight': '1.0',
 'center_crop': 'False',
 'lr_scheduler': 'constant',
 'adam_weight_decay': '0.01',
 'adam_beta1': '0.9',
 'adam_beta2': '0.999',
 'adam_epsilon': '1e-08',
 'gradient_accumulation_steps': '1',
 'max_grad_norm': '1.0',
 'compute_fid': 'False',
 'seed': '0'}

In [None]:
MAX_RUN = 360000

model_prefix = name_from_base(f'genai-paris-avatar-{MODEL_ID}-')
training_job_name = f'{model_prefix}-finetuning'

estimator = Estimator(
    role=ROLE, 
    image_uri=train_image_uri, 
    source_dir=train_source_uri, 
    model_uri=train_model_uri, 
    entry_point='transfer_learning.py', 
    instance_count=1, 
    instance_type=TRAIN_INSTANCE_TYPE, 
    max_run=MAX_RUN, 
    hyperparameters=hyperparams, 
    output_path=TRAIN_DATA_OUTPUT_S3_PATH, 
    base_job_name=training_job_name
)

estimator.fit({'training': TRAIN_DATA_INPUT_S3_PATH}, logs=False)

INFO:sagemaker:Creating training-job with name: genai-paris-avatar-model-txt2img-stabil-2023-04-03-23-39-53-809



2023-04-03 23:39:53 Starting - Starting the training job......
2023-04-03 23:40:29 Starting - Preparing the instances for training...