In [2]:
# Runtime parameters
import os

# Select the named AWS profile through the AWS_PROFILE environment variable.
# This is set unconditionally, so any value already present is overwritten.
os.environ.update({"AWS_PROFILE": "sandbox"})

In [13]:
# Generate data
import os

import pandas as pd
import numpy as np

# Generate random data (fixed seed so the dataset is identical on every run)
np.random.seed(0)
data_size = 1000
X1 = np.random.rand(data_size)
X2 = np.random.rand(data_size)
y = X1 + X2 + np.random.rand(data_size) * 0.1  # simple linear relationship plus uniform noise in [0, 0.1)

# Build the DataFrame
df = pd.DataFrame({"X1": X1, "X2": X2, "y": y})

# Save as a CSV file locally. pandas does not create missing parent
# directories, so make sure ./data exists first; otherwise this cell fails
# with OSError on a fresh checkout.
os.makedirs("./data", exist_ok=True)
df.to_csv("./data/demo_data.csv", index=False)

# Save as a CSV file to S3 (disabled: s3_client and bucket_name are not defined in this cell)
# s3_client.put_object(Bucket=bucket_name, Key='demo_data.csv', Body=df.to_csv(index=False))

In [3]:
import sagemaker
from sagemaker.sklearn import SKLearn

# SageMaker execution role and S3 locations
role = "arn:aws-cn:iam::278103880173:role/sandbox-SageMakerExecutorRole"
bucket = 'sandbox-sagemaker'
prefix = 'demo'

# Local CSV produced by the data-generation cell. It must match the path that
# cell writes to ("./data/demo_data.csv"); uploading a bare 'demo_data.csv'
# from the working directory would fail or pick up a stale file.
local_data_path = './data/demo_data.csv'

# Create the SageMaker session
sagemaker_session = sagemaker.Session()

# Upload the training data to S3; the returned value is passed to fit() below
# as the 'train' channel.
input_data = sagemaker_session.upload_data(local_data_path, bucket=bucket, key_prefix=f'{prefix}/data')

# Create and configure the SKLearn Estimator
sklearn_estimator = SKLearn(
    entry_point='train.py',
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    framework_version='0.23-1',
    py_version='py3',
    source_dir='.',  # source directory is specified so that requirements.txt is included
    # NOTE(review): requirements.txt already lives in source_dir, so listing it
    # here looks redundant — confirm before removing.
    dependencies=['requirements.txt'],
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sagemaker_session,
    hyperparameters={
        # S3 location handed to the training script as a hyperparameter;
        # presumably where train.py writes the model — confirm in train.py.
        'output-model-dir': f's3://{bucket}/{prefix}/output/model'
    }
)

# Launch the training job
sklearn_estimator.fit({'train': input_data})

# Download the training results. Kept as the last expression of the cell so
# the list of downloaded local paths is displayed as the cell output.
sagemaker_session.download_data(
    path='./output',
    bucket=bucket,
    key_prefix=f'{prefix}/output/model'
)

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2024-08-05-08-48-04-782


2024-08-05 08:51:55 Starting - Starting the training job...
2024-08-05 08:52:10 Starting - Preparing the instances for training...
2024-08-05 08:52:56 Downloading - Downloading the training image......
2024-08-05 08:53:37 Training - Training image download completed. Training in progress.2024-08-05 08:53:41,632 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-08-05 08:53:41,636 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-05 08:53:41,677 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-08-05 08:54:06,510 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
/miniconda3/bin/python -m pip install -r requirements.txt
[notice] A new release of pip is available: 23.0 -> 24.0
[notice] To update, run: pip install --upgrade pip
2024-08-05 08:54:07,882 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-05 08

['./output/model.pkl']