# Training and Deploying Mistral 7B

This notebook was exported from AWS SageMaker to illustrate the workflow for configuring and deploying the model.

In [2]:
!pip install bitsandbytes


Collecting bitsandbytes
  Using cached bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Using cached bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1


In [3]:
!pip install peft

Collecting peft
  Using cached peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Using cached peft-0.10.0-py3-none-any.whl (199 kB)
Installing collected packages: peft
Successfully installed peft-0.10.0


In [1]:
import io
import json
import os

import boto3
import sagemaker
from sagemaker.huggingface import get_huggingface_llm_image_uri
from sagemaker.huggingface import HuggingFaceModel

ModuleNotFoundError: No module named 'sagemaker'

# Modeling

In [29]:
# Bootstrap session, used here only to look up the default bucket.
sess = sagemaker.Session()

# Bucket SageMaker uses for uploaded data, model artifacts and logs.
# Leave it as None to fall back to the session's default bucket
# (SageMaker creates that bucket when it does not exist yet).
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role SageMaker acts under. get_execution_role() is tried
# first; if it raises ValueError, look the role up by name through IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Final session, bound explicitly to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::566086704797:role/service-role/AmazonSageMaker-ExecutionRole-20240222T123141
sagemaker session region: us-west-2


## Hugging Face Deep Learning Container

In [3]:
# Look up the ECR URI of the Hugging Face LLM inference container (version 1.1.0)
llm_image = get_huggingface_llm_image_uri(backend="huggingface", version="1.1.0")

# Show which image will be deployed
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04


## Configuration
The sharded Mistral 7B checkpoint requires less VRAM to load the model, and quantisation reduces memory usage further.

In [21]:

# SageMaker hosting configuration
instance_type = "ml.g5.xlarge"  # instance type passed to deploy()
n_gpu = 1                       # forwarded to the container as SM_NUM_GPUS
health_check_timeout = 800      # seconds; passed to deploy() as the container startup health-check timeout

# Model configuration for text generation inference with Hugging Face.
# Container environment values are strings, hence json.dumps() on the numeric settings.
config = {
    'HF_MODEL_ID': "filipealmeida/Mistral-7B-Instruct-v0.1-sharded",  # model_id for Mistral 7B
    'SM_NUM_GPUS': json.dumps(n_gpu),
    'MAX_INPUT_LENGTH': json.dumps(2048),  # Max length of input text
    'MAX_TOTAL_TOKENS': json.dumps(4096),  # Max length of the generation (including input text)
    'MAX_BATCH_TOTAL_TOKENS': json.dumps(8192),  # Limits the number of tokens that can be processed in parallel during the generation
    'HF_MODEL_QUANTIZE': "bitsandbytes",  # Enable bitsandbytes quantization
    # NOTE(review): QUANTIZATION_BITS does not appear to be a setting the TGI
    # container reads (bit width is normally implied by HF_MODEL_QUANTIZE) - confirm.
    'QUANTIZATION_BITS': json.dumps(8),
}

# Never hardcode access tokens in a notebook: read the token from the environment.
# It is passed as a plain string (json.dumps would wrap it in literal double quotes)
# and only when it is actually set.
# NOTE: a token was previously committed in this cell; it should be revoked.
hf_hub_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
if hf_hub_token:
    config['HUGGING_FACE_HUB_TOKEN'] = hf_hub_token

# Model object tying together the IAM role, container image and environment
hf_model = HuggingFaceModel(
    role=role,
    image_uri=llm_image,
    env=config,
)

## Deploy Mistral 7B

In [22]:
# Create a SageMaker endpoint hosting the HuggingFaceModel.
# deploy() returns the object used to refer to that endpoint afterwards.
deploy_kwargs = dict(
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=health_check_timeout,
)
llm = hf_model.deploy(**deploy_kwargs)

----------------!

## Structure and configure response

In [9]:
# User question, wrapped in the [INST] ... [/INST] instruction markers
message = "what are the ingredients of a meat pie"
prompt = "[INST] " + message + " [/INST]"

### Hyper params

In [10]:
# Sampling settings sent along with the prompt
generation_params = dict(
    do_sample=True,
    top_p=0.6,
    temperature=0.3,  # kept low to favour consistent answers
    top_k=50,
    max_new_tokens=512,
    repetition_penalty=1.03,
)

# Request payload: the prompt plus its generation parameters
input_data = {'inputs': prompt, 'parameters': generation_params}

# The endpoint is called with a JSON body, so serialise the payload
body_input_data_json = json.dumps(input_data)

### Request the response

In [11]:
# SageMaker runtime client used to call the deployed endpoint
sagemaker_runtime = boto3.client('sagemaker-runtime')

# Take the endpoint name from the deploy cell rather than hardcoding a
# timestamped name from an earlier deployment, which goes stale on every
# redeploy. To query an already-running endpoint without redeploying,
# assign its name here instead.
endpoint = llm.endpoint_name
content_type = 'application/json'

# Request inference from the SageMaker endpoint (model deployed: Mistral 7B)
response = sagemaker_runtime.invoke_endpoint(
    EndpointName=endpoint,
    ContentType=content_type,
    Body=body_input_data_json.encode('utf-8'),
)

### Parse the response
The response is returned in JSON format.

In [12]:
# Read the response body, decode it as UTF-8 text, then parse the JSON
raw_body = response['Body'].read()
response_body = raw_body.decode('utf-8')
response_json = json.loads(response_body)

In [13]:
# Bare expression: the notebook displays the parsed response as this cell's output
response_json

[{'generated_text': '[INST] what are the ingredients of a meat pie [/INST] The ingredients for a meat pie can vary depending on the recipe, but some common ingredients include:\n\n* Flaky pie crust\n* Ground beef, pork, or a combination of the two\n* Onion\n* Garlic\n* Carrots\n* Peas\n* Thyme\n* Rosemary\n* Salt\n* Pepper\n* Egg wash (optional)\n\nSome recipes may also include other ingredients such as mushrooms, celery, or diced tomatoes.'}]

### Extract and Print Response

In [14]:
# The parsed response is a list; the first element holds the generated text
generated_text = response_json[0]['generated_text']

# In the recorded run the endpoint echoed the prompt before the answer.
# Strip it only when it is really there, so the answer is not truncated if
# the endpoint returns the completion alone.
if generated_text.startswith(prompt):
    completion = generated_text[len(prompt):]
else:
    completion = generated_text
print(completion)

 The ingredients for a meat pie can vary depending on the recipe, but some common ingredients include:

* Flaky pie crust
* Ground beef, pork, or a combination of the two
* Onion
* Garlic
* Carrots
* Peas
* Thyme
* Rosemary
* Salt
* Pepper
* Egg wash (optional)

Some recipes may also include other ingredients such as mushrooms, celery, or diced tomatoes.
