# Fine-tune Gemma on SageMaker with QLoRA

- The notebook itself runs on a different EC2 instance type than the training job and the endpoint
- Uses Hugging Face libraries
  - SQL generator dataset from the Hugging Face Hub
  - Transformers with the Gemma model
  - SFT Trainer from the TRL library by Hugging Face
  - QLoRA-based training with PEFT
  - Deploy using the artifacts saved on S3
  - Evaluate the model

In [1]:
! pip install -r ./requirements_local.txt



# Add `HF_TOKEN` to `./.env` (it is loaded in the setup cell and passed to the training job)

# Setup SageMaker

In [2]:
import dotenv
import sagemaker
import boto3
from botocore.exceptions import ClientError

# Load variables from the local .env file and print load_dotenv's return value.
print(dotenv.load_dotenv('./.env'))

# Per-user prefix that keeps the bucket, job and model names distinct.
USER_ID = 'u1'
s3_bucket = f"{USER_ID}-av-llmops-sagemaker-workshop"
job_name = f"{USER_ID}-qlora-gemma-2b-sql-generator"
deploy_model_name = f"{USER_ID}-sql-generator-model"

# role_name = "llmops_workshop_sagemaker_exec_role"

# Instance type for the training job and for the endpoint, plus the base model id.
train_instance = deploy_instance = 'ml.g5.2xlarge'
model_id = "google/gemma-2b-it"

# Local scratch files and the S3 object keys they are uploaded to.
train_local = './tmp/train.jsonl'
train_path = 'dataset/train.jsonl'
test_local = './tmp/test.jsonl'
test_path = 'dataset/test.jsonl'



# s3_bucket = sess.default_bucket()
def create_bucket(bucket_name, region="ap-south-1"):
    """Create the S3 bucket ``bucket_name`` in ``region`` on a best-effort basis.

    Args:
        bucket_name: Name of the bucket to create.
        region: AWS region for the S3 client and the bucket location.

    Returns:
        None. A ``ClientError`` from S3 is not raised; its error code is
        printed instead so the notebook keeps running.
    """
    s3_client = boto3.client('s3', region_name=region)

    create_kwargs = {'Bucket': bucket_name}
    # us-east-1 is the S3 default location and must NOT be sent as an explicit
    # LocationConstraint; every other region has to be named.
    if region != 'us-east-1':
        create_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': region}

    try:
        s3_client.create_bucket(**create_kwargs)
    except ClientError as e:
        print(f"Bucket {bucket_name} got response {e.response['Error']['Code']}")

# Make sure the workshop bucket exists before anything is uploaded to it.
create_bucket(s3_bucket)

# Use the execution role SageMaker reports; when that lookup raises ValueError,
# fall back to the fixed workshop role ARN.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    # role = iam.get_role(RoleName=role_name)['Role']['Arn']
    fallback_role_arn = "arn:aws:iam::009676737623:role/llmops_workshop_sagemaker_exec_role"
    role = fallback_role_arn

# Session whose default bucket is the workshop bucket.
sess = sagemaker.Session(default_bucket=s3_bucket)

# Echo the resolved settings for a quick sanity check.
print(f"{role =}")
print(f"{s3_bucket =}")
print(f"{sess.boto_region_name =}")

True


# Dataset Overview

In [3]:
import datasets

# Hub identifier of the text-to-SQL dataset used for fine-tuning.
DATASET = "gretelai/synthetic_text_to_sql"
ds = datasets.load_dataset(path=DATASET)

In [4]:
# Print the (rows, columns) shape of every split, then display the DatasetDict itself.
print(ds.shape)
ds

{'train': (100000, 11), 'test': (5851, 11)}


DatasetDict({
    train: Dataset({
        features: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'],
        num_rows: 5851
    })
})

In [5]:
# Show the column names and value types of the training split.
ds["train"].features

{'id': Value(dtype='int32', id=None),
 'domain': Value(dtype='string', id=None),
 'domain_description': Value(dtype='string', id=None),
 'sql_complexity': Value(dtype='string', id=None),
 'sql_complexity_description': Value(dtype='string', id=None),
 'sql_task_type': Value(dtype='string', id=None),
 'sql_task_type_description': Value(dtype='string', id=None),
 'sql_prompt': Value(dtype='string', id=None),
 'sql_context': Value(dtype='string', id=None),
 'sql': Value(dtype='string', id=None),
 'sql_explanation': Value(dtype='string', id=None)}

In [6]:
# Dump every field of the first training row, one field per paragraph.
first_row = ds["train"][0]
for field_name in first_row:
    print(f"{field_name}: {first_row[field_name]}\n\n")

id: 5097


domain: forestry


domain_description: Comprehensive data on sustainable forest management, timber production, wildlife habitat, and carbon sequestration in forestry.


sql_complexity: single join


sql_complexity_description: only one join (specify inner, outer, cross)


sql_task_type: analytics and reporting


sql_task_type_description: generating reports, dashboards, and analytical insights


sql_prompt: What is the total volume of timber sold by each salesperson, sorted by salesperson?


sql_context: CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_id INT, volume REAL, sale_date DATE); INSERT INTO timber_sales (sales_id, salesperson_id, volume, sale_date) VALUES (1, 1, 120, '2021-01-01'), (2, 1, 150, '2021-02-01'), (3, 2, 180, '2021-01-01');


sql: SELECT salesperson_id, name, SUM(vo

# Prepare Dataset

In [7]:
# Prompt for the user turn. The placeholders ({sql_context}, {sql_task_type},
# {sql_task_type_description}, {domain}, {domain_description}, {sql_prompt}) are
# filled from the dataset columns of the same name via str.format.
# NOTE: the trailing spaces inside the literal are part of the prompt text.
USER_PROMPT_TEMPLATE = """ 
You are a database management system expert, proficient in Structured Query Language (SQL). 
Your job is to write an SQL query that answers the following question, based on the given database schema and any additional information provided. 
Use SQLite syntax and please output only SQL without any kind of explanations. 
### Schema: {sql_context} 
 
### Knowledge: This "{sql_task_type}" type task is commonly used for {sql_task_type_description} in the domain of {domain}, which involves {domain_description}. 
 
### Question: {sql_prompt} 
"""

In [8]:
def get_messages(item_dict):
    """Convert one dataset row into a two-turn chat example.

    The user turn is USER_PROMPT_TEMPLATE filled in with the row's fields; the
    assistant turn is the row's ``sql`` value.

    Args:
        item_dict: Mapping of column name to value for a single row. It must
            supply every placeholder used by USER_PROMPT_TEMPLATE plus ``sql``.

    Returns:
        A dict with a single ``messages`` key holding the user and assistant turns.
    """
    user_turn = {"role": "user", "content": USER_PROMPT_TEMPLATE.format(**item_dict)}
    assistant_turn = {"role": "assistant", "content": item_dict["sql"]}
    return {"messages": [user_turn, assistant_turn]}

In [9]:
from transformers import AutoTokenizer

# Tokenizer of the base model.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Turn every row into a chat example and drop all original columns, so only the
# "messages" column produced by get_messages remains.
original_columns = ds["train"].column_names
formated_ds = ds.map(get_messages, remove_columns=original_columns, batched=False)
formated_ds["train"][5]



{'messages': [{'content': ' \nYou are a database management system expert, proficient in Structured Query Language (SQL). \nYour job is to write an SQL query that answers the following question, based on the given database schema and any additional information provided. \nUse SQLite syntax and please output only SQL without any kind of explanations. \n### Schema: CREATE SCHEMA if not exists defense; CREATE TABLE if not exists eu_humanitarian_assistance (id INT PRIMARY KEY, year INT, spending INT); INSERT INTO defense.eu_humanitarian_assistance (id, year, spending) VALUES (1, 2019, 1500), (2, 2020, 1800), (3, 2021, 2100); \n \n### Knowledge: This "analytics and reporting" type task is commonly used for generating reports, dashboards, and analytical insights in the domain of defense operations, which involves Defense data on military innovation, peacekeeping operations, defense diplomacy, and humanitarian assistance.. \n \n### Question: What is the total spending on humanitarian assistan

In [10]:
# Render one example through the tokenizer's chat template as plain text
# (tokenize=False) to inspect the string built from the messages.
tokenizer.apply_chat_template(formated_ds["train"][5]['messages'], tokenize=False)

'<bos><start_of_turn>user\nYou are a database management system expert, proficient in Structured Query Language (SQL). \nYour job is to write an SQL query that answers the following question, based on the given database schema and any additional information provided. \nUse SQLite syntax and please output only SQL without any kind of explanations. \n### Schema: CREATE SCHEMA if not exists defense; CREATE TABLE if not exists eu_humanitarian_assistance (id INT PRIMARY KEY, year INT, spending INT); INSERT INTO defense.eu_humanitarian_assistance (id, year, spending) VALUES (1, 2019, 1500), (2, 2020, 1800), (3, 2021, 2100); \n \n### Knowledge: This "analytics and reporting" type task is commonly used for generating reports, dashboards, and analytical insights in the domain of defense operations, which involves Defense data on military innovation, peacekeeping operations, defense diplomacy, and humanitarian assistance.. \n \n### Question: What is the total spending on humanitarian assistance 

# Upload Dataset to S3

In [12]:
import boto3
import os

# Make sure the local scratch directories exist before writing; nothing earlier
# in the notebook creates ./tmp.
for local_file in (train_local, test_local):
    os.makedirs(os.path.dirname(local_file) or ".", exist_ok=True)

# Fixed seed so a re-run exports (and trains/evaluates on) the same subset.
SHUFFLE_SEED = 42

# pandas orient="records" for jsonl. List like [{column -> value}, ... , {column -> value}]
formated_ds["train"].shuffle(seed=SHUFFLE_SEED).select(range(1000)).to_json(train_local, orient="records")
formated_ds["test"].shuffle(seed=SHUFFLE_SEED).select(range(100)).to_json(test_local, orient="records")

# Upload both files under the dataset/ prefix of the workshop bucket.
s3 = boto3.client('s3')
s3.upload_file(train_local, s3_bucket, train_path)
s3.upload_file(test_local, s3_bucket, test_path)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

# Training

In [13]:
# Settings handed to the training entry point through the estimator's
# `hyperparameters` argument.
hyperparameters = {
    # --- data and model ---
    "dataset_path": "/opt/ml/input/data/training/train.jsonl",
    "model_id": model_id,
    "max_seq_len": 3072,
    "use_qlora": True,

    # --- optimisation ---
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "gradient_checkpointing": True,
    "optim": "adamw_torch_fused",
    "logging_steps": 5,
    "save_strategy": "epoch",
    "learning_rate": 2e-4,

    # --- precision, schedule, reporting and output ---
    "bf16": True,
    "tf32": True,
    "max_grad_norm": 0.3,
    "warmup_ratio": 0.03,
    "lr_scheduler_type": "constant",
    "report_to": "tensorboard",
    "output_dir": "/tmp/tun",
    "merge_adapters": True,
}

In [14]:
from sagemaker.huggingface import HuggingFace

# image_uri = "763104351884.dkr.ecr.ap-south-1.amazonaws.com/huggingface-pytorch-training:2.1.0-transformers4.36.0-gpu-py310-cu121-ubuntu20.04"

# Environment variables for the training container; HF_TOKEN is taken from this
# notebook's own environment.
training_env = {
    "HUGGINGFACE_HUB_CACHE": "/tmp/.cache",
    "HF_TOKEN": os.environ["HF_TOKEN"],
}

# Regexes that extract metric values from the training log output.
training_metrics = [
    {'Name': 'loss', 'Regex': "'loss': (.*?),"},
    {'Name': 'grad_norm', 'Regex': "'grad_norm': (.*?),"},
    {'Name': 'learning_rate', 'Regex': "'learning_rate': (.*?),"},
    {'Name': 'epoch', 'Regex': "'epoch': (.*?)}"},
]

huggingface_estimator = HuggingFace(
    base_job_name=job_name,
    # image_uri=image_uri,
    # if not image_uri
    transformers_version='4.36.0',
    pytorch_version='2.1.0',
    py_version='py310',
    instance_type=train_instance,
    instance_count=1,
    max_run=int(3600 * 0.5),  # upper bound on job runtime, in seconds
    role=role,
    environment=training_env,
    entry_point='qlora.py',
    source_dir=".",  # Copy source to S3 and auto installs the requirements.txt file
    hyperparameters=hyperparameters,
    disable_output_compression=True,  # not compress output to save training time and cost
    metric_definitions=training_metrics,
)

In [15]:
# Point the "training" input channel at the dataset prefix in S3, then start the
# job and wait for it to finish.
data = {'training': f's3://{s3_bucket}/dataset'}
huggingface_estimator.fit(inputs=data, wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: qlora-gemma-2b-sql-generator-2024-08-07-01-36-42-465


2024-08-07 01:36:43 Starting - Starting the training job...
2024-08-07 01:37:02 Starting - Preparing the instances for training...
2024-08-07 01:37:31 Downloading - Downloading input data...
2024-08-07 01:37:51 Downloading - Downloading the training image........................
2024-08-07 01:42:03 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-08-07 01:42:15,774 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-08-07 01:42:15,791 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-08-07 01:42:15,803 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-08-07 01:42:15,809 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-08-07 01:42:17

## Demo S3 Models and Instance metrics in SageMaker training job

# Create Model and Endpoint

In [16]:
from sagemaker.huggingface import get_huggingface_llm_image_uri
import sagemaker
import boto3

# sess = sagemaker.Session()
# s3_bucket = sess.default_bucket()
# role = sagemaker.get_execution_role()

# S3 location of the model artifacts, read from the finished estimator.
estimator_model_source = huggingface_estimator.model_data["S3DataSource"]
model_s3_path = estimator_model_source["S3Uri"]
# Example:
# model_s3_path = "s3://sagemaker-ap-south-1-466407698387/qlora-gemma-2b-sql-generator-2024-08-06-18-15-52-352/output/model/"

# Container image used for inference; the list of available images is at
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
llm_image = "763104351884.dkr.ecr.ap-south-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.3.0-tgi2.2.0-gpu-py310-cu121-ubuntu22.04-v2.0"

In [17]:
import json
from sagemaker.huggingface import HuggingFaceModel
 

# Environment for the inference container; every value is passed as a string.
config = {
    'HF_MODEL_ID': "/opt/ml/model",
    'SM_NUM_GPUS': '1',
    'MAX_INPUT_LENGTH': json.dumps(1024),
    # req prompt tokens + req generated tokens in the GPU for this req
    'MAX_TOTAL_TOKENS': json.dumps(2048),
}

# Uncompressed artifacts stored under an S3 prefix.
artifact_location = {
    'S3DataSource': {
        'S3Uri': model_s3_path,
        'S3DataType': 'S3Prefix',
        'CompressionType': 'None',
    }
}

llm_model = HuggingFaceModel(
    name=deploy_model_name,
    role=role,
    image_uri=llm_image,
    model_data=artifact_location,
    env=config,
)

In [18]:
# Endpoint settings: one instance of the configured type and a 300-second
# container startup health-check timeout.
deploy_options = dict(
    initial_instance_count=1,
    instance_type=deploy_instance,
    container_startup_health_check_timeout=300,
)
llm = llm_model.deploy(**deploy_options)

INFO:sagemaker:Creating model with name: sql-generator-model
INFO:sagemaker:Creating endpoint-config with name sql-generator-model-2024-08-07-01-49-07-055
INFO:sagemaker:Creating endpoint with name sql-generator-model-2024-08-07-01-49-07-055


-----------!

# Evaluation

In [22]:
import os
from transformers import AutoTokenizer
from sagemaker import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
from datasets import load_dataset

# Local scratch directory for the downloaded test file.
os.makedirs('./tmp', exist_ok=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Download the uploaded test samples and load them as a dataset.
s3_client = boto3.client('s3')
s3_client.download_file(s3_bucket, test_path, test_local)
test_dataset = load_dataset("json", data_files=test_local, split="train")

# or put endpoint name from UI
endpoint_name = llm.endpoint_name
deployed_llm = Predictor(
    endpoint_name=endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

def request(sample):
    """Send a chat-formatted prompt to the deployed endpoint and return the reply.

    Args:
        sample: List of chat messages that is rendered into a prompt with the
            tokenizer's chat template (generation prompt appended).

    Returns:
        A dict with role ``assistant`` whose content is the generated text of
        the first result, stripped of surrounding whitespace.
    """
    prompt = tokenizer.apply_chat_template(sample, tokenize=False, add_generation_prompt=True)
    print(f"prompt:\n {prompt} \n\n")

    generation_params = {
        "max_new_tokens": 512,
        "temperature": 0.01,
        "return_full_text": False,
    }
    payload = {"inputs": prompt, "parameters": generation_params}
    response = deployed_llm.predict(payload)

    generated_text = response[0]["generated_text"]
    return {"role": "assistant", "content": generated_text.strip()}

# Take one test example, show its user turn and reference answer, then ask the
# endpoint for a completion of the user turn only.
random_sample = test_dataset[10]
sample_messages = random_sample['messages']
print(f"input message:\n {sample_messages[0]} \n\n")
print(f"expected output:\n {sample_messages[1]} \n\n")
generated_message = request([sample_messages[0]])
print(f"generated output:\n {generated_message} \n\n")



Generating train split: 0 examples [00:00, ? examples/s]

input message:
 {'content': ' \nYou are a database management system expert, proficient in Structured Query Language (SQL). \nYour job is to write an SQL query that answers the following question, based on the given database schema and any additional information provided. \nUse SQLite syntax and please output only SQL without any kind of explanations. \n### Schema: CREATE TABLE students (id INT, department VARCHAR(255)); INSERT INTO students (id, department) VALUES (1, \'education\'), (2, \'engineering\'), (3, \'education\'); CREATE TABLE equipment (id INT, student_id INT, cost DECIMAL(10,2)); INSERT INTO equipment (id, student_id, cost) VALUES (1, 1, 500.00), (2, 1, 200.00), (3, 3, 300.00), (4, 3, 100.00), (5, 2, 700.00); \n \n### Knowledge: This "analytics and reporting" type task is commonly used for generating reports, dashboards, and analytical insights in the domain of disability services, which involves Comprehensive data on disability accommodations, support programs, policy ad

In [23]:
# Clean up the deployment: delete the model first, then the endpoint.
deployed_llm.delete_model()
deployed_llm.delete_endpoint()

INFO:sagemaker:Deleting model with name: sql-generator-model
INFO:sagemaker:Deleting endpoint configuration with name: sql-generator-model-2024-08-07-01-49-07-055
INFO:sagemaker:Deleting endpoint with name: sql-generator-model-2024-08-07-01-49-07-055
