In [None]:
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.inputs import TrainingInput
from sagemaker import get_execution_role
import time

# S3 bucket that holds the training data and receives the job output.
bucket = "say1-5team-bucket"
# IAM role of the environment this notebook is running in.
role = get_execution_role()

# Forwarded by SageMaker to the entry-point script as its hyperparameters.
train_hyperparameters = {
    "backbone": "efficientnet_b4",
    "epochs": 20,
    "freeze-epochs": 2,
    "img-size": 380,  # recommended input resolution for B4
    "batch-size": 16,  # conservative choice for g4dn.2xlarge
    "lr": 3e-4,
    "weight-decay": 3e-4,
    "dropout": 0.2,  # <- the only value that needs changing between runs
    "label-smoothing": 0.05,
    "seed": 42,
}

# NOTE(review): the S3 prefixes below say "densenet" although this job trains
# EfficientNet-B4 -- presumably carried over from an earlier experiment; confirm.
est = PyTorch(
    entry_point="train-eff-b4.py",  # the train.py variant with B4 support
    source_dir=".",
    role=role,
    framework_version="2.1",
    py_version="py310",
    instance_count=1,
    instance_type="ml.g4dn.2xlarge",  # 16GB VRAM
    hyperparameters=train_hyperparameters,
    output_path=f"s3://{bucket}/densenet-output/",
    base_job_name="effnetb4-skin",
)

# One input channel per dataset split, all under the same S3 prefix.
data_root = f"s3://{bucket}/densenet-training-data"
inputs = {
    channel: TrainingInput(f"{data_root}/{channel}")
    for channel in ("train", "val", "test")
}

# An explicit, timestamped job name so the job can be looked up again later.
launch_stamp = time.strftime("%Y-%m-%d-%H-%M-%S")
job_name = f"effnetb4-skin-job-test-{launch_stamp}"

# Start the training job and stream its logs into the notebook.
est.fit(inputs=inputs, job_name=job_name, logs=True)
print("model_data:", est.model_data)

In [None]:
# ⛓️ 재접속 + 상태/로그 보기
import boto3, sagemaker
from sagemaker.estimator import Estimator

# To re-attach after a kernel restart, set the job name explicitly, e.g.:
# job_name = "densenet121-skin-job-2025-08-25-02-02-44"

# Fail early, before any AWS client is created, when no job name is available.
# Without this the cell only fails at describe_training_job with a bare
# "name 'job_name' is not defined".
try:
    job_name
except NameError:
    raise NameError(
        "job_name is not defined; assign the name of the training job to "
        "inspect (see the commented example above) before running this cell"
    ) from None

region = "ap-northeast-2"  # Seoul

# Low-level SageMaker client plus an SDK session, both pinned to `region`.
boto_sess = boto3.Session(region_name=region)
sm = boto_sess.client("sagemaker")
sess = sagemaker.Session(boto_session=boto_sess)

# 1) Check the job's current status.
desc = sm.describe_training_job(TrainingJobName=job_name)
print("Status:", desc["TrainingJobStatus"])
print("Secondary:", desc["SecondaryStatus"])
print("ModelArtifacts:", desc["ModelArtifacts"]["S3ModelArtifacts"])
# .get() because FailureReason may be absent from the response.
print("FailureReason:", desc.get("FailureReason"))

# 2) Re-stream the job's logs (through to the end of the job).
print("\n--- Streaming logs ---")
sess.logs_for_job(job_name, wait=True)
