# Hugging Faceで公開されている大規模言語モデルをSageMakerにデプロイ


* 対象モデル
  
  matsuo-lab/weblab-10b-instruction-sft
  
  https://huggingface.co/matsuo-lab/weblab-10b-instruction-sft

* Hugging Face Text Generation Inference Containers

  https://huggingface.co/blog/sagemaker-huggingface-llm

  https://aws.amazon.com/jp/blogs/machine-learning/announcing-the-launch-of-new-hugging-face-llm-inference-containers-on-amazon-sagemaker/


### SageMakerライブラリーのインストール

In [None]:
# Install the sagemaker package, upgrading it if it is already installed.
%pip install sagemaker --upgrade


---

### パラメーターを指定

In [None]:
# Hugging Face Hub ID of the model to deploy (passed to the container as HF_MODEL_ID).
model_id = 'matsuo-lab/weblab-10b-instruction-sft'
# SageMaker instance type used when deploying the endpoint.
instance_type = 'ml.g5.12xlarge'
# GPU count handed to the container as SM_NUM_GPUS; kept as a string.
gpus = '2'  # ml.g5.12xlarge has 4 GPUs, but 2 must be specified here.

### インポート

In [None]:
import sagemaker
import boto3


### IAMロールの取得

In [None]:
# Resolve the IAM role ARN that SageMaker will use for the model and endpoint.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() could not provide a role (presumably because the
    # notebook is not running with a SageMaker execution role -- TODO confirm),
    # so look the ARN up by name through IAM instead.
    iam = boto3.client('iam')
    # Role name with `AmazonSageMakerFullAccess` policy attached
    role_name = 'AmazonSageMaker-ExecutionRole-20230617T201891'
    role_response = iam.get_role(RoleName=role_name)
    role = role_response['Role']['Arn']


### SageMakerへデプロイ

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the container image URI for the Hugging Face LLM inference container.
image_uri = get_huggingface_llm_image_uri(
    backend='huggingface',  # or lmi
    # region=region
)

# Hub model configuration <https://huggingface.co/models>
# These entries become environment variables of the serving container.
hub = {
    'HF_MODEL_ID': model_id,       # model_id from hf.co/models
    'HF_TASK': 'text-generation',  # NLP task you want to use for predictions
    # 'HF_MODEL_QUANTIZE': 'bitsandbytes',
    # Container environment values must be strings, so coerce here; this keeps
    # the cell working even if `gpus` is changed to an int such as 2.
    'SM_NUM_GPUS': str(gpus),
}

# Create the Hugging Face model object that describes what to deploy.
huggingface_model = HuggingFaceModel(
    env=hub,              # configuration for loading the model from the Hub
    role=role,            # IAM role with permissions to create an endpoint
    image_uri=image_uri,  # container image resolved above
)

# Deploy the model to a SageMaker real-time inference endpoint.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    # Startup health-check timeout (seconds, per the SageMaker SDK docs).
    container_startup_health_check_timeout=600,
)


### 推論

In [None]:
# The instruction to send to the model.
text = "大規模言語モデルについて説明してください。"
# Wrap the instruction in the instruction/response prompt template.
text = f'以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。\n\n### 指示:\n{text}\n\n### 応答:'

# Generation settings sent along with the prompt.
generation_parameters = {
    'max_new_tokens': 100,
    'do_sample': True,
    'temperature': 0.7,
    'top_p': 0.95,
}

# Request payload: the prompt plus its generation settings.
data = {
    'inputs': text,
    'parameters': generation_parameters,
}

# Send the request to the deployed endpoint.
result = predictor.predict(data)

# Bare expression so the notebook shows the response as the cell output.
result


### エンドポイントとモデルの削除

In [None]:
# Delete the endpoint. delete_endpoint_config=False is passed explicitly, so the
# endpoint configuration is presumably left in place.
# NOTE(review): confirm whether the leftover endpoint configuration should be
# removed separately; the flag and the call order may be deliberate.
predictor.delete_endpoint(delete_endpoint_config=False)
# Delete the model associated with this predictor.
predictor.delete_model()
