In [1]:
#!pip install -Uq transformers datasets evaluate accelerate torch sagemaker boto3 botocore pip

In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [3]:
!mkdir -p src

In [4]:
import pandas as pd
import altair as alt
import boto3
sm = boto3.client('sagemaker')

In [5]:
%%writefile src/requirements.txt
torch~=2.2
tiktoken
bitsandbytes~=0.43
peft~=0.10
transformers~=4.39.2 
accelerate~=0.28 
datasets~=2.18
evaluate~=0.4.1

pynvml
#tensorboardX

Overwriting src/requirements.txt


In [27]:
%%writefile src/train.py
import sys
import os
import time
from threading import Thread
import logging
import argparse

import torch
import numpy as np

import datasets
import evaluate
import transformers
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)

import peft
from peft import (
    get_peft_model,
    replace_lora_weights_loftq,
    prepare_model_for_kbit_training,
    LoraConfig,
    IA3Config,
    TaskType
)

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger.setLevel(logging.DEBUG)

transformers.logging.set_verbosity_info()

logger.info(f'transformers {transformers.__version__}')
logger.info(f'peft {peft.__version__}')
logger.info(f'torch {torch.__version__}')

def count_parameters(m, verbose=True):
    total_count = 0
    learnable_count = 0
    if verbose:
        logger.debug("Parameters (name, tunable, count):")

    output_width = max([len(n) for n, _ in m.named_parameters()])
    for n, p in m.named_parameters():
        count = p.data.numel()
        if verbose:
            logger.debug(f" {n:{output_width}} {p.requires_grad:5b} {count:>11d}")
        total_count += count
        if p.requires_grad:
            learnable_count += count

    logger.info(
        f"Total parameters: {total_count:,}, "
        f"thereof learnable: {learnable_count:,} "
        f"({learnable_count/total_count*100.:5.4f}%)"
    )

    return total_count, learnable_count

def compute_metrics(eval_pred):
    load_accuracy = evaluate.load("accuracy")
 
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = load_accuracy.compute(
        predictions=predictions, references=labels)["accuracy"]
 
    metrics = {f"accuracy": accuracy}
 
    return metrics

def fit(args):
    ### model / tokenizer

    # 'mistralai/Mistral-7B-Instruct-v0.2'
    conf = AutoConfig.from_pretrained(args.hf_ckp, num_labels=2)
    print('conf', conf)
    
    ## model
    
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bits_use_double_quant=True
    )
      
    model = AutoModelForSequenceClassification.from_pretrained(
        args.hf_ckp, 
        config=conf, 
        #torch_dtype=torch.bfloat16, 
        #load_in_8bit=True,
        #load_in_4bit=True,
        quantization_config=bnb_config
    )
 
    model = prepare_model_for_kbit_training(model) 

    if args.use_hf_lora:
        
        
        peft_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=4,
            lora_alpha=4,
            use_dora=False,
            lora_dropout=0.1,
            target_modules='all-linear' # ['query_key_value','dense_h_to_4h','dense_4h_to_h']
        ) 
#         peft_config = LoraConfig(
#             task_type=TaskType.SEQ_CLS,
#             r=4,
#             lora_alpha=4,
#             use_dora=False,
#             lora_dropout=0.1,
#             target_modules='all-linear' # ['query_key_value','dense_h_to_4h','dense_4h_to_h']
#         )
        
        # peft_config = IA3Config(
        #     task_type=TaskType.SEQ_CLS,
        #     target_modules='all-linear' # 
        # )
        
        model = get_peft_model(model, peft_config)
        replace_lora_weights_loftq(model)
    
    count_parameters(model, verbose=args.use_hf_lora)
    
    #model = torch.compile(model)
    
    ## tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.hf_ckp)
    
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    collator = DataCollatorWithPadding(tokenizer=tokenizer)

 
    ### data 
    datasets.logging.disable_progress_bar()
    dataset = datasets.load_dataset("glue", "sst2")
    train = dataset["train"]
    valid = dataset["validation"]

    def preprocess_function(examples):
        return tokenizer(examples["sentence"], padding=False, truncation=True)

    tokenized_train = train.map(preprocess_function, batched=False, remove_columns=['idx', 'sentence'])
    tokenized_valid = valid.map(preprocess_function, batched=False, remove_columns=['idx', 'sentence'])

    ### Trainer
    
    log_steps = len(tokenized_train) // args.batch_size // 50 # 2 log outputs per epoch
    use_bf16 = True if args.use_bf16 and torch.cuda.is_available() else False
    print(f'Using bf16: {use_bf16}, LoRA: {args.use_hf_lora}')
    
    training_args = TrainingArguments(
        output_dir=args.model_dir if args.model_dir else "out",    
        learning_rate=args.learning_rate,
        #auto_find_batch_size=True,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=2,
        weight_decay=0.1,
        push_to_hub=False,
        bf16=use_bf16,
        warmup_steps=150,
        max_steps=500,
        save_steps=log_steps,
        logging_steps=log_steps,
        evaluation_strategy="steps",
        save_strategy="no", #"steps",
        #save_total_limit=1,
        
        gradient_accumulation_steps=args.gradient_acc_steps,
        gradient_checkpointing=args.use_gradient_checkpointing,
        
        disable_tqdm=True,
        lr_scheduler_type="cosine",
        optim="paged_adamw_8bit",
        #report_to="tensorboard",
        #logging_dir="/opt/ml/output/tensorboard",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
        # no early stopping for simplicity
    )

 
    ### train
    trainer.train()
    trainer.evaluate()
    
if __name__ == "__main__":

    def schedule_gpu_memory_logging():
        def log_gpu_usage():
            if not torch.cuda.is_available():
                return

            from pynvml.smi import nvidia_smi

            nvsmi = nvidia_smi.getInstance()
            res = nvsmi.DeviceQuery("memory.free, memory.total, memory.used")["gpu"][0][
                "fb_memory_usage"
            ]
            res["percentage"] = res["used"] / res["total"] * 100
            logger.info(
                f'GPU Usage. Used: {res["used"]:5.3f} Total: {res["total"]:5.3f} ({res["percentage"]:3.1f}% used). Free: {res["free"]:5.3f}'
            )
        
        def log_loop():
            while True:
                log_gpu_usage()
                time.sleep(60)
    
        t = Thread(target=log_loop, daemon=True)
        t.start()

    schedule_gpu_memory_logging()

    parser = argparse.ArgumentParser()
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--use-bf16", type=int, default=1, help='1 yes, 0 no') 
    parser.add_argument("--use-gradient-checkpointing", type=int, default=1, help='1 yes, 0 no') 
    parser.add_argument("--use-hf-lora", type=int, default=0, help='1 yes, 0 no')
    parser.add_argument("--learning-rate", type=float, default=4e-5)
    parser.add_argument("--batch-size", type=int, default=224)
    parser.add_argument("--gradient-acc-steps", type=int, default=1)
    parser.add_argument("--hf-ckp", type=str, default='roberta-base')

    args, _ = parser.parse_known_args()

    fit(args)


Overwriting src/train.py


In [28]:
import sagemaker
import datetime
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
from sagemaker.debugger import TensorBoardOutputConfig
from sagemaker.utils import name_from_base

job_name = name_from_base('QLoRA', short=True)
SAGEMAKER_BUCKET = sagemaker.Session().default_bucket()
ymd = datetime.date.today().strftime('%Y/%m/%d')

tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path              = f's3://{SAGEMAKER_BUCKET}/tensorboard/{ymd}/{job_name}',
    container_local_output_path = "/opt/ml/output/tensorboard"
)

hyperparameters = {
    'use-bf16'  : 1,
    'use-hf-lora': 1,
    'learning-rate': 3e-5,
    'batch-size': 128+32,
    'gradient-acc-steps': 1, 
    'use-gradient-checkpointing': 0,
    'hf-ckp': 'roberta-base'
}

metric_definitions = [
    
    {'Name': 'train_samples_per_second', 'Regex': '\'train_samples_per_second\': (-?[0-9\\.]+)'},
    {'Name': 'valid_acc', 'Regex': '\'eval_accuracy\': (-?[0-9\\.]+)'},
    {'Name': 'train_loss', 'Regex': '\'loss\': (-?[0-9\\.]+)'},
    {'Name': 'valid_loss', 'Regex': '\'eval_loss\': (-?[0-9\\.]+)'},
    {'Name': 'gpu_mem', 'Regex': 'GPU Usage.*?(-?[0-9\\.]+)% used'}
]
print('tb', tensorboard_output_config)
estimator_parameters = dict(
    source_dir         = 'src',
    entry_point        = 'train.py',
    instance_type      = 'ml.g5.xlarge',#'ml.g4dn.xlarge',
    instance_count     = 1,
    framework_version  = '2.2',
    py_version         = 'py310',
    use_spot_instances = True,
    max_run            = 24*60*60, # one day in seconds
    max_wait           = 24*60*60, 
    role               = get_execution_role(),
    metric_definitions = metric_definitions,
    hyperparameters    = hyperparameters,
    tensorboard_output_config = tensorboard_output_config
)

tb <sagemaker.debugger.debugger.TensorBoardOutputConfig object at 0x11e005600>


In [29]:
est = PyTorch(**estimator_parameters)
ckp = 'mistralai/Mistral-7B-Instruct-v0.2'
est.set_hyperparameters(**{'hf-ckp': ckp})
est.hyperparameters()

{'use-bf16': '1',
 'use-hf-lora': '1',
 'learning-rate': '3e-05',
 'batch-size': '160',
 'gradient-acc-steps': '1',
 'use-gradient-checkpointing': '0',
 'hf-ckp': '"\\"mistralai/Mistral-7B-Instruct-v0.2\\""'}

In [30]:
print(est.tensorboard_output_config.s3_output_path)
print(est.tensorboard_output_config.container_local_output_path)

s3://sagemaker-eu-west-1-753739741425/tensorboard/2024/03/31/QLoRA-240331-1331
/opt/ml/output/tensorboard


In [31]:
est.fit(job_name=job_name, wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: QLoRA-240331-1331


2024-03-31 11:31:10 Starting - Starting the training job...
2024-03-31 11:31:24 Starting - Preparing the instances for training......
2024-03-31 11:32:18 Downloading - Downloading input data...
2024-03-31 11:32:48 Downloading - Downloading the training image..................
2024-03-31 11:35:59 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-03-31 11:36:08,368 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-03-31 11:36:08,386 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-03-31 11:36:08,396 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-03-31 11:36:08,404 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-03-31 11:36:09,722